From 6567083c039492cf5cf41e8caa22d90124b74d5c Mon Sep 17 00:00:00 2001
From: ZhengJiayin <13230092115@163.com>
Date: Tue, 19 May 2026 20:03:46 +0800
Subject: [PATCH] datacleaner

---
 DataCleaner/AddRegressionColumns.java    | 224 ++++++++++++++++++++++
 DataCleaner/DataCleaner.java             |  99 ++++++++++
 DataCleaner/DataCleaningScript.java      | 226 +++++++++++++++++++++++
 DataCleaner/DataStorage.java             | 121 ++++++++++++
 DataCleaner/DuoTai.java                  |   3 +
 DataCleaner/ExcelReader.java             | 102 ++++++++++
 DataCleaner/HTMLReportGenerator.java     | 214 +++++++++++++++++++++
 DataCleaner/Main.java                    |  67 +++++++
 DataCleaner/PostAnalyzer.java            | 200 ++++++++++++++++++++
 DataCleaner/PostInfo.java                | 127 +++++++++++++
 DataCleaner/ProcessRegressionData.java   |  50 +++++
 DataCleaner/README.md                    |   2 +
 DataCleaner/SimpleChartGenerator.java    | 165 +++++++++++++++++
 DataCleaner/SimpleDataCleaner.java       |  59 ++++++
 DataCleaner/add_regression_columns.py    | 189 +++++++++++++++++++
 DataCleaner/basic_test.py                |  32 ++++
 DataCleaner/batch_process.py             | 219 ++++++++++++++++++++++
 DataCleaner/calculate_regression_data.py | 169 +++++++++++++++++
 DataCleaner/check_data_structure.py      |  43 +++++
 DataCleaner/check_excel_size.py          |  53 ++++++
 DataCleaner/create_and_fill_data.py      |  69 +++++++
 DataCleaner/create_excel_with_data.py    |  86 +++++++++
 DataCleaner/create_regression_data.py    | 112 +++++++++++
 DataCleaner/create_regression_data_v2.py | 142 ++++++++++++++
 DataCleaner/d                            |   0
 DataCleaner/data_cleaner.py              |  73 ++++++++
 DataCleaner/data_cleaner_v2.py           |  98 ++++++++++
 DataCleaner/debug_log.txt                |  11 ++
 DataCleaner/debug_process.py             |  36 ++++
 DataCleaner/debug_script.py              |  51 +++++
 DataCleaner/import_data.py               |  50 +++++
 DataCleaner/minimal_test.py              |  17 ++
 DataCleaner/populate_regression_data.py  | 113 ++++++++++++
 DataCleaner/process_300_rows.py          | 156 ++++++++++++++++
 DataCleaner/process_actual_data.py       | 200 ++++++++++++++++++++
 DataCleaner/process_all_data.py          | 190 +++++++++++++++++++
 DataCleaner/process_all_rows.py          | 157 ++++++++++++++++
 DataCleaner/process_efficient.py         | 180 ++++++++++++++++++
 DataCleaner/process_large_file.py        | 177 ++++++++++++++++++
 DataCleaner/process_log.txt              |   9 +
 DataCleaner/process_regression_final.py  | 192 +++++++++++++++++++
 DataCleaner/process_with_csv.py          | 202 ++++++++++++++++++++
 DataCleaner/process_with_pandas.py       | 168 +++++++++++++++++
 DataCleaner/quick_process.py             |  83 +++++++++
 DataCleaner/read_excel_test.py           |  54 ++++++
 DataCleaner/run_with_output.py           | 216 ++++++++++++++++++++++
 DataCleaner/simple_add_columns.py        | 187 +++++++++++++++++++
 DataCleaner/simple_calculate.py          | 100 ++++++++++
 DataCleaner/simple_copy.py               |  41 ++++
 DataCleaner/simple_data_test.py          |  54 ++++++
 DataCleaner/simple_excel_create.py       |  57 ++++++
 DataCleaner/simple_test.py               |  22 +++
 DataCleaner/test_file_access.py          |  49 +++++
 53 files changed, 5716 insertions(+)
 create mode 100644 DataCleaner/AddRegressionColumns.java
 create mode 100644 DataCleaner/DataCleaner.java
 create mode 100644 DataCleaner/DataCleaningScript.java
 create mode 100644 DataCleaner/DataStorage.java
 create mode 100644 DataCleaner/DuoTai.java
 create mode 100644 DataCleaner/ExcelReader.java
 create mode 100644 DataCleaner/HTMLReportGenerator.java
 create mode 100644 DataCleaner/Main.java
 create mode 100644 DataCleaner/PostAnalyzer.java
 create mode 100644 DataCleaner/PostInfo.java
 create mode 100644 DataCleaner/ProcessRegressionData.java
 create mode 100644 DataCleaner/README.md
 create mode 100644 DataCleaner/SimpleChartGenerator.java
 create mode 100644 DataCleaner/SimpleDataCleaner.java
 create mode 100644 DataCleaner/add_regression_columns.py
 create mode 100644 DataCleaner/basic_test.py
 create mode 100644 DataCleaner/batch_process.py
 create mode 100644 DataCleaner/calculate_regression_data.py
 create mode 100644 DataCleaner/check_data_structure.py
 create mode 100644 DataCleaner/check_excel_size.py
 create mode 100644 DataCleaner/create_and_fill_data.py
 create mode 100644 DataCleaner/create_excel_with_data.py
 create mode 100644 DataCleaner/create_regression_data.py
 create mode 100644 DataCleaner/create_regression_data_v2.py
 create mode 100644 DataCleaner/d
 create mode 100644 DataCleaner/data_cleaner.py
 create mode 100644 DataCleaner/data_cleaner_v2.py
 create mode 100644 DataCleaner/debug_log.txt
 create mode 100644 DataCleaner/debug_process.py
 create mode 100644 DataCleaner/debug_script.py
 create mode 100644 DataCleaner/import_data.py
 create mode 100644 DataCleaner/minimal_test.py
 create mode 100644 DataCleaner/populate_regression_data.py
 create mode 100644 DataCleaner/process_300_rows.py
 create mode 100644 DataCleaner/process_actual_data.py
 create mode 100644 DataCleaner/process_all_data.py
 create mode 100644 DataCleaner/process_all_rows.py
 create mode 100644 DataCleaner/process_efficient.py
 create mode 100644 DataCleaner/process_large_file.py
 create mode 100644 DataCleaner/process_log.txt
 create mode 100644 DataCleaner/process_regression_final.py
 create mode 100644 DataCleaner/process_with_csv.py
 create mode 100644 DataCleaner/process_with_pandas.py
 create mode 100644 DataCleaner/quick_process.py
 create mode 100644 DataCleaner/read_excel_test.py
 create mode 100644 DataCleaner/run_with_output.py
 create mode 100644 DataCleaner/simple_add_columns.py
 create mode 100644 DataCleaner/simple_calculate.py
 create mode 100644 DataCleaner/simple_copy.py
 create mode 100644 DataCleaner/simple_data_test.py
 create mode 100644 DataCleaner/simple_excel_create.py
 create mode 100644 DataCleaner/simple_test.py
 create mode 100644 DataCleaner/test_file_access.py

diff --git a/DataCleaner/AddRegressionColumns.java b/DataCleaner/AddRegressionColumns.java
new file mode 100644
index 0000000..60f682a
--- /dev/null
+++ b/DataCleaner/AddRegressionColumns.java
@@ -0,0 +1,224 @@
+import org.apache.poi.ss.usermodel.*;
+import org.apache.poi.xssf.usermodel.XSSFWorkbook;
+import java.io.*;
+import java.util.*;
+import java.util.regex.*;
+
+public class AddRegressionColumns {
+    public static void main(String[] args) {
+        String inputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子实验数据（新）.xlsx";
+        String outputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子实验数据（新）_回归.xlsx";
+        
+        System.out.println("========================================");
+        System.out.println("  在原表中添加回归数据列");
+        System.out.println("========================================");
+        System.out.println("输入文件: " + inputFile);
+        System.out.println("输出文件: " + outputFile);
+        System.out.println();
+        
+        try {
+            // 读取输入文件
+            System.out.println("读取输入文件...");
+            FileInputStream fis = new FileInputStream(inputFile);
+            Workbook wb = new XSSFWorkbook(fis);
+            Sheet sheet = wb.getSheetAt(0);
+            
+            int totalRows = sheet.getLastRowNum();
+            System.out.println("总行数: " + totalRows);
+            
+            // 获取表头行
+            Row headerRow = sheet.getRow(0);
+            int totalCols = headerRow.getLastCellNum();
+            System.out.println("总列数: " + totalCols);
+            
+            // 识别列
+            int helpfullCol = -1;
+            int commentCountCol = -1;
+            List<Integer> commentCols = new ArrayList<>();
+            
+            for (int i = 0; i < totalCols; i++) {
+                Cell cell = headerRow.getCell(i);
+                if (cell != null) {
+                    String header = cell.getStringCellValue().toLowerCase();
+                    if (header.contains("helpfull") || header.contains("helpful")) {
+                        helpfullCol = i;
+                        System.out.println("找到 Y 列 (helpfull): 列 " + i);
+                    } else if (header.contains("评论总数") || header.contains("帖子评论总数")) {
+                        commentCountCol = i;
+                        System.out.println("找到 X1 列 (评论总数): 列 " + i);
+                    } else if (header.contains("评论") && header.contains("内容")) {
+                        for (int j = 1; j <= 5; j++) {
+                            if (header.contains(String.valueOf(j))) {
+                                commentCols.add(i);
+                                System.out.println("找到评论列 " + commentCols.size() + ": 列 " + i + " - " + header);
+                                break;
+                            }
+                        }
+                    }
+                }
+            }
+            
+            System.out.println("\n共找到 " + commentCols.size() + " 个评论列");
+            
+            // 添加新列的表头
+            int yCol = totalCols;
+            int x1Col = totalCols + 1;
+            int x2Col = totalCols + 2;
+            int x3Col = totalCols + 3;
+            int x4Col = totalCols + 4;
+            int x5Col = totalCols + 5;
+            int x6Col = totalCols + 6;
+            
+            headerRow.createCell(yCol).setCellValue("Y");
+            headerRow.createCell(x1Col).setCellValue("X1");
+            headerRow.createCell(x2Col).setCellValue("X2");
+            headerRow.createCell(x3Col).setCellValue("X3");
+            headerRow.createCell(x4Col).setCellValue("X4");
+            headerRow.createCell(x5Col).setCellValue("X5");
+            headerRow.createCell(x6Col).setCellValue("X6");
+            
+            // 处理每一行数据
+            System.out.println("\n处理数据...");
+            Pattern digitPattern = Pattern.compile("\\d");
+            Pattern urlPattern = Pattern.compile("http[s]?://|www\\.");
+            Pattern emojiPattern = Pattern.compile("[\\u2600-\\u27BF\\uD83C-\\uDBFF\\uDC00-\\uDFFF]|[:;][-]?[)D]");
+            
+            String[] positiveWords = {"好", "棒", "优秀", "喜欢", "满意", "赞", "positive", "good", "great", "excellent", "love", "like"};
+            String[] negativeWords = {"差", "糟糕", "不好", "失望", "不满", "negative", "bad", "terrible", "poor", "hate", "dislike"};
+            
+            for (int i = 1; i <= totalRows; i++) {
+                if (i % 1000 == 0) {
+                    System.out.println("处理第 " + i + "/" + totalRows + " 行...");
+                }
+                
+                Row row = sheet.getRow(i);
+                if (row == null) continue;
+                
+                // Y (UGC有用性)
+                double y = 0;
+                if (helpfullCol >= 0) {
+                    Cell cell = row.getCell(helpfullCol);
+                    if (cell != null) {
+                        try {
+                            y = cell.getNumericCellValue();
+                        } catch (Exception e) {
+                            y = 0;
+                        }
+                    }
+                }
+                row.createCell(yCol).setCellValue(y);
+                
+                // X1 (评论数量)
+                double x1 = 0;
+                if (commentCountCol >= 0) {
+                    Cell cell = row.getCell(commentCountCol);
+                    if (cell != null) {
+                        try {
+                            x1 = cell.getNumericCellValue();
+                        } catch (Exception e) {
+                            x1 = 0;
+                        }
+                    }
+                }
+                row.createCell(x1Col).setCellValue(x1);
+                
+                // 计算评论相关指标
+                List<Double> lengths = new ArrayList<>();
+                List<Double> complexities = new ArrayList<>();
+                List<Double> sentiments = new ArrayList<>();
+                List<Double> richnessList = new ArrayList<>();
+                
+                for (int colIdx : commentCols) {
+                    Cell cell = row.getCell(colIdx);
+                    if (cell != null) {
+                        String content = "";
+                        try {
+                            content = cell.getStringCellValue();
+                        } catch (Exception e) {
+                            try {
+                                content = String.valueOf(cell.getNumericCellValue());
+                            } catch (Exception e2) {
+                                content = "";
+                            }
+                        }
+                        
+                        if (content != null && !content.isEmpty() && !content.equals("nan") && !content.equals("null")) {
+                            // X2: 评论长度（剔空格后的字符数）
+                            double length = content.replace(" ", "").replace("\u3000", "").length();
+                            lengths.add(length);
+                            
+                            // X3: 评论复杂度（按空格拆分的分词数）
+                            double complexity = content.split("\\s+").length;
+                            complexities.add(complexity);
+                            
+                            // X5: 情感分析
+                            double sentiment = 0;
+                            String lowerContent = content.toLowerCase();
+                            for (String word : positiveWords) {
+                                if (lowerContent.contains(word)) {
+                                    sentiment = 1;
+                                    break;
+                                }
+                            }
+                            if (sentiment == 0) {
+                                for (String word : negativeWords) {
+                                    if (lowerContent.contains(word)) {
+                                        sentiment = -1;
+                                        break;
+                                    }
+                                }
+                            }
+                            sentiments.add(sentiment);
+                            
+                            // X6: 信息丰富度
+                            double richness = 0;
+                            if (digitPattern.matcher(content).find()) richness += 1;
+                            if (urlPattern.matcher(content).find()) richness += 1;
+                            if (emojiPattern.matcher(content).find()) richness += 1;
+                            richnessList.add(richness);
+                        }
+                    }
+                }
+                
+                // 计算平均值（无评论记0）
+                double x2 = lengths.isEmpty() ? 0 : lengths.stream().mapToDouble(Double::doubleValue).average().getAsDouble();
+                double x3 = complexities.isEmpty() ? 0 : complexities.stream().mapToDouble(Double::doubleValue).average().getAsDouble();
+                double x5 = sentiments.isEmpty() ? 0 : sentiments.stream().mapToDouble(Double::doubleValue).average().getAsDouble();
+                double x6 = richnessList.isEmpty() ? 0 : richnessList.stream().mapToDouble(Double::doubleValue).average().getAsDouble();
+                
+                // X4: 评论可读性 = X2/X3（X3为0时记0）
+                double x4 = (x3 > 0) ? x2 / x3 : 0;
+                
+                // 写入单元格
+                row.createCell(x2Col).setCellValue(x2);
+                row.createCell(x3Col).setCellValue(x3);
+                row.createCell(x4Col).setCellValue(x4);
+                row.createCell(x5Col).setCellValue(x5);
+                row.createCell(x6Col).setCellValue(x6);
+            }
+            
+            // 保存文件
+            System.out.println("\n保存文件...");
+            FileOutputStream fos = new FileOutputStream(outputFile);
+            wb.write(fos);
+            fos.close();
+            wb.close();
+            fis.close();
+            
+            // 验证文件
+            File output = new File(outputFile);
+            if (output.exists()) {
+                System.out.println("文件保存成功！");
+                System.out.println("文件大小: " + (output.length() / 1024) + " KB");
+            }
+            
+            System.out.println("\n========================================");
+            System.out.println("  任务完成");
+            System.out.println("========================================");
+            
+        } catch (Exception e) {
+            System.out.println("错误: " + e.getMessage());
+            e.printStackTrace();
+        }
+    }
+}
diff --git a/DataCleaner/DataCleaner.java b/DataCleaner/DataCleaner.java
new file mode 100644
index 0000000..53cafa3
--- /dev/null
+++ b/DataCleaner/DataCleaner.java
@@ -0,0 +1,99 @@
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class DataCleaner {
+    /** Cleans every post and keeps the valid ones; a null list or null element is skipped. */
+    public static List<PostInfo> cleanPosts(List<PostInfo> rawPosts) {
+        List<PostInfo> cleanedPosts = new ArrayList<>();
+        if (rawPosts != null) {
+            for (PostInfo post : rawPosts) {
+                if (post == null) continue;
+                PostInfo cleaned = cleanPost(post);
+                if (isValidPost(cleaned)) {
+                    cleanedPosts.add(cleaned);
+                }
+            }
+        }
+
+        System.out.println("数据清洗完成，有效数据: " + cleanedPosts.size() + " 条");
+        return cleanedPosts;
+    }
+
+    /** Builds a cleaned copy of the post; the date and the counters are copied unchanged. */
+    private static PostInfo cleanPost(PostInfo post) {
+        PostInfo cleaned = new PostInfo();
+        cleaned.setTitle(cleanText(post.getTitle()));
+        cleaned.setContent(cleanContent(post.getContent()));
+        cleaned.setAuthor(cleanText(post.getAuthor()));
+        cleaned.setPostDate(post.getPostDate());
+        cleaned.setLikeCount(post.getLikeCount());
+        cleaned.setCommentCount(post.getCommentCount());
+        cleaned.setViewCount(post.getViewCount());
+        cleaned.setTags(cleanText(post.getTags()));
+        cleaned.setSentiment(normalizeSentiment(post.getSentiment()));
+        return cleaned;
+    }
+
+    /** Trims the text and collapses each whitespace run to one space; null becomes "". */
+    private static String cleanText(String text) {
+        if (text == null) {
+            return "";
+        }
+        return text.trim().replaceAll("\\s+", " ");
+    }
+
+    // Markup is removed first and whitespace is collapsed/trimmed last, so gaps left by removed
+    // spans do not survive as stray spaces; (?s) also removes bracketed spans crossing a line break.
+    private static String cleanContent(String content) {
+        if (content == null) {
+            return "";
+        }
+        return content
+                .replaceAll("<[^>]+>", "")
+                .replaceAll("(?s)\\[.*?\\]", "")
+                .replaceAll("(?s)\\(.*?\\)", "")
+                .replaceAll("\\s+", " ")
+                .trim();
+    }
+
+    /** Maps a free-form label to "积极", "消极" or "中性" (default); case folding uses Locale.ROOT. */
+    private static String normalizeSentiment(String sentiment) {
+        if (sentiment == null || sentiment.isEmpty()) {
+            return "中性";
+        }
+        String lower = sentiment.toLowerCase(java.util.Locale.ROOT);
+        if (lower.contains("积极") || lower.contains("正面") || lower.contains("positive")) {
+            return "积极";
+        } else if (lower.contains("消极") || lower.contains("负面") || lower.contains("negative")) {
+            return "消极";
+        }
+        return "中性";
+    }
+
+    private static boolean isValidPost(PostInfo post) {
+        return post.getTitle() != null && !post.getTitle().isEmpty() &&
+               post.getContent() != null && !post.getContent().isEmpty();
+    }
+
+    /** Returns the entries of a fixed keyword list that occur in the content, in list order. */
+    public static String[] extractKeywords(String content) {
+        if (content == null || content.isEmpty()) {
+            return new String[0];
+        }
+        String[] commonKeywords = {
+            "数据", "分析", "学习", "技术", "互联网", "发展", "趋势",
+            "工具", "方法", "实践", "经验", "案例", "应用", "创新",
+            "挑战", "机遇", "未来", "智能", "算法", "模型", "平台"
+        };
+        List<String> keywords = new ArrayList<>();
+        String lowerContent = content.toLowerCase(java.util.Locale.ROOT);
+        for (String keyword : commonKeywords) {
+            if (lowerContent.contains(keyword)) {
+                keywords.add(keyword);
+            }
+        }
+        return keywords.toArray(new String[0]);
+    }
+}
diff --git a/DataCleaner/DataCleaningScript.java b/DataCleaner/DataCleaningScript.java
new file mode 100644
index 0000000..ffc1e96
--- /dev/null
+++ b/DataCleaner/DataCleaningScript.java
@@ -0,0 +1,226 @@
+import java.io.*;
+import java.time.LocalDate;
+import java.time.format.DateTimeFormatter;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+
+public class DataCleaningScript {
+
+    private static final DateTimeFormatter DATE_FORMATTER = DateTimeFormatter.ofPattern("yyyy-MM-dd", Locale.CHINA);
+
+    /** Reads the raw posts, cleans them and saves the valid ones as CSV; both paths are hard-coded. */
+    public static void main(String[] args) {
+        String inputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子原始信息计量实验使用.xlsx";
+        String outputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子实验数据（新）.csv";
+
+        System.out.println("========================================");
+        System.out.println("  数据清洗脚本");
+        System.out.println("========================================");
+        System.out.println("输入文件: " + inputFile);
+        System.out.println("输出文件: " + outputFile);
+        System.out.println();
+
+        // Load the raw records.
+        List<PostInfo> rawPosts = readExcelData(inputFile);
+        System.out.println("读取数据完成，共 " + rawPosts.size() + " 条记录");
+        // Clean them and drop the invalid ones.
+        List<PostInfo> cleanedPosts = cleanPosts(rawPosts);
+        System.out.println("数据清洗完成，有效记录: " + cleanedPosts.size() + " 条");
+        // Persist the cleaned records.
+        saveToCSV(cleanedPosts, outputFile);
+        System.out.println("数据保存完成！");
+        System.out.println();
+        System.out.println("========================================");
+        System.out.println("  数据清洗任务完成");
+        System.out.println("========================================");
+    }
+
+    /** Parses UTF-8 comma-separated text line by line, skipping the header and lines with < 9 fields. */
+    private static List<PostInfo> readExcelData(String filePath) {
+        List<PostInfo> posts = new ArrayList<>();
+
+        // A workbook is a binary container, not text; decoding it as CSV would only yield garbage.
+        String lowerPath = filePath.toLowerCase(Locale.ROOT);
+        if (lowerPath.endsWith(".xlsx") || lowerPath.endsWith(".xls")) {
+            System.err.println("读取文件时出错: 该文件是 Excel 工作簿，请先另存为 UTF-8 编码的 CSV: " + filePath);
+            return posts;
+        }
+
+        try (BufferedReader reader = new BufferedReader(new FileReader(filePath, java.nio.charset.StandardCharsets.UTF_8))) {
+            String line;
+            boolean isFirstLine = true;
+            while ((line = reader.readLine()) != null) {
+                if (isFirstLine) {
+                    isFirstLine = false;
+                    continue;
+                }
+                String[] parts = parseCSVLine(line);
+                if (parts.length >= 9) {
+                    PostInfo post = parsePostInfo(parts);
+                    if (post != null) {
+                        posts.add(post);
+                    }
+                }
+            }
+        } catch (IOException e) {
+            System.err.println("读取文件时出错: " + e.getMessage());
+        }
+
+        return posts;
+    }
+
+    /** Splits a line on commas outside double quotes and trims each field; "" inside quotes is a literal quote. */
+    private static String[] parseCSVLine(String line) {
+        List<String> fields = new ArrayList<>();
+        StringBuilder currentField = new StringBuilder();
+        boolean inQuotes = false;
+        for (int i = 0; i < line.length(); i++) {
+            char c = line.charAt(i);
+            if (c == '"') {
+                if (inQuotes && i + 1 < line.length() && line.charAt(i + 1) == '"') {
+                    currentField.append('"');
+                    i++; // consume the second quote of the escaped pair
+                } else {
+                    inQuotes = !inQuotes;
+                }
+            } else if (c == ',' && !inQuotes) {
+                fields.add(currentField.toString().trim());
+                currentField.setLength(0);
+            } else {
+                currentField.append(c);
+            }
+        }
+        fields.add(currentField.toString().trim());
+        return fields.toArray(new String[0]);
+    }
+
+    /** Builds a post from the first nine fields; returns null (and reports the row) when parsing fails. */
+    private static PostInfo parsePostInfo(String[] parts) {
+        try {
+            PostInfo post = new PostInfo();
+            post.setTitle(parts[0]);
+            post.setContent(parts[1]);
+            post.setAuthor(parts[2]);
+            if (!parts[3].isEmpty()) {
+                post.setPostDate(LocalDate.parse(parts[3], DATE_FORMATTER));
+            }
+            post.setLikeCount(parseInt(parts[4]));
+            post.setCommentCount(parseInt(parts[5]));
+            post.setViewCount(parseInt(parts[6]));
+            post.setTags(parts[7]);
+            post.setSentiment(parts[8]);
+            return post;
+        } catch (RuntimeException e) {
+            // e.g. a date that does not match yyyy-MM-dd: report the dropped row instead of hiding it.
+            System.err.println("跳过无法解析的记录: " + e.getMessage());
+            return null;
+        }
+    }
+
+    /** Parses a trimmed decimal integer; null, blank or malformed text yields 0. */
+    private static int parseInt(String value) {
+        if (value == null) {
+            return 0;
+        }
+        try {
+            return Integer.parseInt(value.trim());
+        } catch (NumberFormatException e) {
+            return 0;
+        }
+    }
+
+    /** Cleans every post and keeps those that still have a non-empty title and content. */
+    private static List<PostInfo> cleanPosts(List<PostInfo> rawPosts) {
+        List<PostInfo> cleanedPosts = new ArrayList<>();
+        for (PostInfo post : rawPosts) {
+            PostInfo cleaned = cleanPost(post);
+            if (isValidPost(cleaned)) {
+                cleanedPosts.add(cleaned);
+            }
+        }
+        return cleanedPosts;
+    }
+
+    /** Builds a cleaned copy of the post; the date and the counters are copied unchanged. */
+    private static PostInfo cleanPost(PostInfo post) {
+        PostInfo cleaned = new PostInfo();
+        cleaned.setTitle(cleanText(post.getTitle()));
+        cleaned.setContent(cleanContent(post.getContent()));
+        cleaned.setAuthor(cleanText(post.getAuthor()));
+        cleaned.setPostDate(post.getPostDate());
+        cleaned.setLikeCount(post.getLikeCount());
+        cleaned.setCommentCount(post.getCommentCount());
+        cleaned.setViewCount(post.getViewCount());
+        cleaned.setTags(cleanText(post.getTags()));
+        cleaned.setSentiment(normalizeSentiment(post.getSentiment()));
+        return cleaned;
+    }
+
+    /** Trims the text and collapses each whitespace run to one space; null becomes "". */
+    private static String cleanText(String text) {
+        if (text == null) {
+            return "";
+        }
+        return text.trim().replaceAll("\\s+", " ");
+    }
+
+    // Markup goes first and whitespace is collapsed/trimmed last, so removed spans leave no stray spaces.
+    private static String cleanContent(String content) {
+        if (content == null) {
+            return "";
+        }
+        return content
+                .replaceAll("<[^>]+>", "")
+                .replaceAll("(?s)\\[.*?\\]", "")
+                .replaceAll("(?s)\\(.*?\\)", "")
+                .replaceAll("\\s+", " ")
+                .trim();
+    }
+
+    /** Maps a free-form label to "积极", "消极" or "中性" (default); case folding uses Locale.ROOT. */
+    private static String normalizeSentiment(String sentiment) {
+        if (sentiment == null || sentiment.isEmpty()) {
+            return "中性";
+        }
+        String lower = sentiment.toLowerCase(Locale.ROOT);
+        if (lower.contains("积极") || lower.contains("正面") || lower.contains("positive")) {
+            return "积极";
+        } else if (lower.contains("消极") || lower.contains("负面") || lower.contains("negative")) {
+            return "消极";
+        }
+        return "中性";
+    }
+
+    private static boolean isValidPost(PostInfo post) {
+        return post.getTitle() != null && !post.getTitle().isEmpty() &&
+               post.getContent() != null && !post.getContent().isEmpty();
+    }
+
+    /** Writes the posts as UTF-8 CSV (with BOM) to filePath, creating the parent directory if needed. */
+    private static void saveToCSV(List<PostInfo> posts, String filePath) {
+        if (posts == null || posts.isEmpty()) {
+            System.out.println("没有数据需要保存");
+            return;
+        }
+        try {
+            File file = new File(filePath);
+            File parentDir = file.getParentFile();
+            if (parentDir != null && !parentDir.isDirectory() && !parentDir.mkdirs()) {
+                throw new IOException("无法创建目录: " + parentDir);
+            }
+            try (BufferedWriter writer = new BufferedWriter(
+                    new FileWriter(file, java.nio.charset.StandardCharsets.UTF_8))) {
+                writer.write("\uFEFF"); // BOM for UTF-8
+                writer.write("标题,内容,作者,发布日期,点赞数,评论数,浏览量,标签,情感倾向\n");
+                for (PostInfo post : posts) {
+                    writer.write(post.toCSV());
+                    writer.write("\n");
+                }
+            }
+            System.out.println("数据已保存到: " + filePath);
+        } catch (IOException e) {
+            System.err.println("保存CSV文件时出错: " + e.getMessage());
+        }
+    }
+}
diff --git a/DataCleaner/DataStorage.java b/DataCleaner/DataStorage.java
new file mode 100644
index 0000000..134db6d
--- /dev/null
+++ b/DataCleaner/DataStorage.java
@@ -0,0 +1,121 @@
+import java.io.BufferedWriter;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.time.LocalDateTime;
+import java.time.format.DateTimeFormatter;
+import java.util.List;
+
+public class DataStorage {
+    /** Writes the posts as UTF-8 CSV (with BOM) to a timestamped posts_*.csv file in the directory. */
+    public static void saveToCSV(List<PostInfo> posts, String directory) {
+        if (posts == null || posts.isEmpty()) {
+            System.out.println("没有数据需要保存");
+            return;
+        }
+
+        try {
+            java.nio.file.Path filePath = prepareOutputFile(directory, ".csv");
+            try (BufferedWriter writer = new BufferedWriter(
+                    new FileWriter(filePath.toFile(), StandardCharsets.UTF_8))) {
+                writer.write("\uFEFF");
+                writer.write("标题,内容,作者,发布日期,点赞数,评论数,浏览量,标签,情感倾向\n");
+                for (PostInfo post : posts) {
+                    writer.write(post.toCSV());
+                    writer.write("\n");
+                }
+            }
+
+            System.out.println("数据已保存到: " + filePath.toAbsolutePath());
+        } catch (IOException e) {
+            System.err.println("保存CSV文件时出错: " + e.getMessage());
+        }
+    }
+
+    /** Writes the posts as a UTF-8 JSON array to a timestamped posts_*.json file in the directory. */
+    public static void saveToJSON(List<PostInfo> posts, String directory) {
+        if (posts == null || posts.isEmpty()) {
+            System.out.println("没有数据需要保存");
+            return;
+        }
+
+        try {
+            java.nio.file.Path filePath = prepareOutputFile(directory, ".json");
+            try (BufferedWriter writer = new BufferedWriter(
+                    new FileWriter(filePath.toFile(), StandardCharsets.UTF_8))) {
+                writer.write("[\n");
+                for (int i = 0; i < posts.size(); i++) {
+                    writer.write(postToJSON(posts.get(i)));
+                    writer.write(i < posts.size() - 1 ? ",\n" : "\n");
+                }
+                writer.write("]\n");
+            }
+
+            System.out.println("数据已保存到: " + filePath.toAbsolutePath());
+        } catch (IOException e) {
+            System.err.println("保存JSON文件时出错: " + e.getMessage());
+        }
+    }
+
+    /** Creates the directory when absent and returns the path "posts_" + yyyyMMdd_HHmmss + extension in it. */
+    private static java.nio.file.Path prepareOutputFile(String directory, String extension) throws IOException {
+        java.nio.file.Path dirPath = Paths.get(directory);
+        if (!Files.exists(dirPath)) {
+            Files.createDirectories(dirPath);
+        }
+        String timestamp = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss"));
+        return dirPath.resolve("posts_" + timestamp + extension);
+    }
+
+    /** Renders one post as an indented JSON object; Locale.ROOT keeps %d on ASCII digits. */
+    private static String postToJSON(PostInfo post) {
+        return String.format(java.util.Locale.ROOT,
+                "  {\n" +
+                "    \"title\": \"%s\",\n" +
+                "    \"content\": \"%s\",\n" +
+                "    \"author\": \"%s\",\n" +
+                "    \"postDate\": \"%s\",\n" +
+                "    \"likeCount\": %d,\n" +
+                "    \"commentCount\": %d,\n" +
+                "    \"viewCount\": %d,\n" +
+                "    \"tags\": \"%s\",\n" +
+                "    \"sentiment\": \"%s\"\n" +
+                "  }",
+                escapeJSON(post.getTitle()),
+                escapeJSON(post.getContent()),
+                escapeJSON(post.getAuthor()),
+                post.getPostDate() != null ? post.getPostDate().toString() : "",
+                post.getLikeCount(),
+                post.getCommentCount(),
+                post.getViewCount(),
+                escapeJSON(post.getTags()),
+                escapeJSON(post.getSentiment())
+        );
+    }
+
+    /** Escapes backslash, quote and every control character below 0x20 for a JSON string; null becomes "". */
+    private static String escapeJSON(String text) {
+        if (text == null) {
+            return "";
+        }
+        StringBuilder out = new StringBuilder(text.length() + 16);
+        for (char c : text.toCharArray()) {
+            if (c == '\\' || c == '"') {
+                out.append('\\').append(c);
+            } else if (c == '\n') {
+                out.append("\\n");
+            } else if (c == '\r') {
+                out.append("\\r");
+            } else if (c == '\t') {
+                out.append("\\t");
+            } else if (c < 0x20) {
+                out.append(String.format(java.util.Locale.ROOT, "\\u%04x", (int) c));
+            } else {
+                out.append(c);
+            }
+        }
+        return out.toString();
+    }
+}
diff --git a/DataCleaner/DuoTai.java b/DataCleaner/DuoTai.java
new file mode 100644
index 0000000..3876a56
--- /dev/null
+++ b/DataCleaner/DuoTai.java
@@ -0,0 +1,3 @@
+/** Empty placeholder class: it declares no fields, constructors or methods. */
+public class DuoTai {
+}
diff --git a/DataCleaner/ExcelReader.java b/DataCleaner/ExcelReader.java
new file mode 100644
index 0000000..e6635bc
--- /dev/null
+++ b/DataCleaner/ExcelReader.java
@@ -0,0 +1,102 @@
+import java.io.*;
+import java.time.LocalDate;
+import java.time.format.DateTimeFormatter;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+
+public class ExcelReader {
+
+    private static final DateTimeFormatter DATE_FORMATTER = DateTimeFormatter.ofPattern("yyyy-MM-dd", Locale.CHINA);
+
+    /**
+     * Reads posts from a UTF-8, comma-separated text file whose first line is a header.
+     * Despite the class name, the file is parsed as CSV text; a binary .xlsx workbook
+     * is not decoded here. Rows with fewer than 9 fields or an unparseable date are skipped.
+     *
+     * @param filePath path of the file to read
+     * @param maxRows  maximum number of posts to return
+     * @return the parsed posts; empty or partial when an I/O error occurs
+     */
+    public static List<PostInfo> readExcelData(String filePath, int maxRows) {
+        List<PostInfo> posts = new ArrayList<>();
+        try (BufferedReader reader = new BufferedReader(new FileReader(filePath, java.nio.charset.StandardCharsets.UTF_8))) {
+            reader.readLine(); // skip the header row
+            String line;
+            while (posts.size() < maxRows && (line = reader.readLine()) != null) {
+                String[] parts = parseCSVLine(line);
+                PostInfo post = parts.length >= 9 ? parsePostInfo(parts) : null;
+                if (post != null) {
+                    posts.add(post);
+                }
+            }
+            System.out.println("成功读取 " + posts.size() + " 条数据");
+        } catch (IOException e) {
+            System.err.println("读取文件时出错: " + e.getMessage());
+        }
+        return posts;
+    }
+
+    /**
+     * Splits one CSV line into trimmed fields. Commas inside double quotes do not
+     * split, and a doubled quote ("") inside a quoted field yields one literal quote,
+     * matching what PostInfo.toCSV() writes. Quoted fields spanning lines are unsupported.
+     */
+    private static String[] parseCSVLine(String line) {
+        List<String> fields = new ArrayList<>();
+        StringBuilder currentField = new StringBuilder();
+        boolean inQuotes = false;
+        for (int i = 0; i < line.length(); i++) {
+            char c = line.charAt(i);
+            if (c == '"' && inQuotes && i + 1 < line.length() && line.charAt(i + 1) == '"') {
+                currentField.append('"'); // escaped quote: keep one, skip its twin
+                i++;
+            } else if (c == '"') {
+                inQuotes = !inQuotes;
+            } else if (c == ',' && !inQuotes) {
+                fields.add(currentField.toString().trim());
+                currentField.setLength(0);
+            } else {
+                currentField.append(c);
+            }
+        }
+        fields.add(currentField.toString().trim());
+        return fields.toArray(new String[0]);
+    }
+
+    /**
+     * Builds a PostInfo from at least 9 fields in the order title, content, author,
+     * date (yyyy-MM-dd, may be empty), likes, comments, views, tags, sentiment.
+     *
+     * @return the post, or null when parsing fails (for example a malformed date)
+     */
+    private static PostInfo parsePostInfo(String[] parts) {
+        try {
+            PostInfo post = new PostInfo();
+            post.setTitle(parts[0]);
+            post.setContent(parts[1]);
+            post.setAuthor(parts[2]);
+            if (!parts[3].isEmpty()) {
+                post.setPostDate(LocalDate.parse(parts[3], DATE_FORMATTER));
+            }
+            post.setLikeCount(parseInt(parts[4]));
+            post.setCommentCount(parseInt(parts[5]));
+            post.setViewCount(parseInt(parts[6]));
+            post.setTags(parts[7]);
+            post.setSentiment(parts[8]);
+            return post;
+        } catch (Exception e) {
+            System.err.println("解析数据时出错: " + e.getMessage());
+            return null;
+        }
+    }
+
+    /** Parses a decimal int; null, empty or malformed input yields 0. */
+    private static int parseInt(String value) {
+        try {
+            return (value == null || value.isEmpty()) ? 0 : Integer.parseInt(value);
+        } catch (NumberFormatException e) {
+            return 0;
+        }
+    }
+}
diff --git a/DataCleaner/HTMLReportGenerator.java b/DataCleaner/HTMLReportGenerator.java
new file mode 100644
index 0000000..7a6855e
--- /dev/null
+++ b/DataCleaner/HTMLReportGenerator.java
@@ -0,0 +1,214 @@
+package com.project.report;
+
+import com.project.analyzer.PostAnalyzer;
+import com.project.model.PostInfo;
+
+import java.io.BufferedWriter;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.time.LocalDateTime;
+import java.time.format.DateTimeFormatter;
+import java.util.Map;
+
+public class HTMLReportGenerator {
+    
+    private static final String OUTPUT_DIR = "d:\\java\\project\\reports";
+    
+    public static void generateReport(PostAnalyzer analyzer) {
+        try {
+            Files.createDirectories(Paths.get(OUTPUT_DIR));
+            
+            String timestamp = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss"));
+            String filename = "report_" + timestamp + ".html";
+            String filepath = OUTPUT_DIR + "/" + filename;
+            
+            try (BufferedWriter writer = new BufferedWriter(
+                    new FileWriter(filepath, StandardCharsets.UTF_8))) {
+                
+                writer.write(generateHTMLContent(analyzer));
+            }
+            
+            System.out.println("HTML报告已生成: " + filepath);
+            
+        } catch (IOException e) {
+            System.err.println("生成HTML报告时出错: " + e.getMessage());
+        }
+    }
+    
+    private static String generateHTMLContent(PostAnalyzer analyzer) {
+        StringBuilder html = new StringBuilder();
+        
+        html.append("<!DOCTYPE html>\n");
+        html.append("<html lang=\"zh-CN\">\n");
+        html.append("<head>\n");
+        html.append("    <meta charset=\"UTF-8\">\n");
+        html.append("    <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n");
+        html.append("    <title>图文帖子数据分析报告</title>\n");
+        html.append("    <style>\n");
+        html.append("        * { margin: 0; padding: 0; box-sizing: border-box; }\n");
+        html.append("        body { font-family: 'Microsoft YaHei', Arial, sans-serif; background: #f5f5f5; padding: 20px; }\n");
+        html.append("        .container { max-width: 1200px; margin: 0 auto; background: white; padding: 30px; border-radius: 10px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); }\n");
+        html.append("        h1 { color: #333; text-align: center; margin-bottom: 10px; }\n");
+        html.append("        .subtitle { color: #666; text-align: center; margin-bottom: 30px; font-size: 14px; }\n");
+        html.append("        .section { margin-bottom: 40px; }\n");
+        html.append("        .section h2 { color: #2c3e50; border-bottom: 3px solid #3498db; padding-bottom: 10px; margin-bottom: 20px; }\n");
+        html.append("        table { width: 100%; border-collapse: collapse; margin-bottom: 20px; }\n");
+        html.append("        th, td { padding: 12px; text-align: left; border-bottom: 1px solid #ddd; }\n");
+        html.append("        th { background: #3498db; color: white; font-weight: bold; }\n");
+        html.append("        tr:hover { background: #f8f9fa; }\n");
+        html.append("        .stat-card { display: inline-block; width: 200px; padding: 20px; margin: 10px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; border-radius: 10px; text-align: center; }\n");
+        html.append("        .stat-card h3 { font-size: 36px; margin-bottom: 10px; }\n");
+        html.append("        .stat-card p { font-size: 14px; opacity: 0.9; }\n");
+        html.append("        .chart-container { text-align: center; margin: 20px 0; }\n");
+        html.append("        .chart-container img { max-width: 100%; height: auto; border: 1px solid #ddd; border-radius: 5px; }\n");
+        html.append("        .summary { background: #e8f4f8; padding: 20px; border-radius: 10px; margin-bottom: 30px; }\n");
+        html.append("        .summary h3 { color: #2c3e50; margin-bottom: 15px; }\n");
+        html.append("        .summary ul { list-style-position: inside; color: #555; }\n");
+        html.append("        .summary li { margin: 8px 0; }\n");
+        html.append("    </style>\n");
+        html.append("</head>\n");
+        html.append("<body>\n");
+        html.append("    <div class=\"container\">\n");
+        html.append("        <h1>图文帖子数据分析报告</h1>\n");
+        html.append("        <p class=\"subtitle\">生成时间: ").append(LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"))).append("</p>\n");
+        
+        html.append(generateSummarySection(analyzer));
+        html.append(generateSentimentSection(analyzer));
+        html.append(generateEngagementSection(analyzer));
+        html.append(generateAuthorSection(analyzer));
+        html.append(generateChartsSection());
+        
+        html.append("    </div>\n");
+        html.append("</body>\n");
+        html.append("</html>");
+        
+        return html.toString();
+    }
+    
+    private static String generateSummarySection(PostAnalyzer analyzer) {
+        StringBuilder section = new StringBuilder();
+        
+        int totalPosts = analyzer.getPosts().size();
+        double avgLikes = analyzer.getPosts().stream()
+                .mapToInt(PostInfo::getLikeCount)
+                .average()
+                .orElse(0);
+        
+        section.append("        <div class=\"section\">\n");
+        section.append("            <div class=\"stat-card\">\n");
+        section.append("                <h3>").append(totalPosts).append("</h3>\n");
+        section.append("                <p>帖子总数</p>\n");
+        section.append("            </div>\n");
+        section.append("            <div class=\"stat-card\">\n");
+        section.append("                <h3>").append(String.format("%.1f", avgLikes)).append("</h3>\n");
+        section.append("                <p>平均点赞</p>\n");
+        section.append("            </div>\n");
+        section.append("        </div>\n");
+        
+        section.append("        <div class=\"summary\">\n");
+        section.append("            <h3>分析摘要</h3>\n");
+        section.append("            <ul>\n");
+        section.append("                <li>本次分析共收集 ").append(totalPosts).append(" 条图文帖子数据</li>\n");
+        section.append("                <li>数据来源：D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用</li>\n");
+        section.append("                <li>分析内容包括情感倾向分布、互动指标、热门作者等多个维度</li>\n");
+        section.append("                <li>通过数据可视化展示分析结果，便于直观理解</li>\n");
+        section.append("            </ul>\n");
+        section.append("        </div>\n");
+        
+        return section.toString();
+    }
+    
+    private static String generateSentimentSection(PostAnalyzer analyzer) {
+        StringBuilder section = new StringBuilder();
+        Map<String, Long> sentimentData = analyzer.getSentimentDistributionData();
+        
+        section.append("        <div class=\"section\">\n");
+        section.append("            <h2>情感倾向分布分析</h2>\n");
+        section.append("            <table>\n");
+        section.append("                <tr><th>情感倾向</th><th>帖子数量</th><th>占比</th></tr>\n");
+        
+        long total = sentimentData.values().stream().mapToLong(Long::longValue).sum();
+        
+        for (Map.Entry<String, Long> entry : sentimentData.entrySet()) {
+            double percent = (entry.getValue() * 100.0) / total;
+            section.append("                <tr><td>").append(entry.getKey())
+                  .append("</td><td>").append(entry.getValue())
+                  .append("</td><td>").append(String.format("%.1f%%", percent))
+                  .append("</td></tr>\n");
+        }
+        
+        section.append("            </table>\n");
+        section.append("        </div>\n");
+        
+        return section.toString();
+    }
+    
+    private static String generateEngagementSection(PostAnalyzer analyzer) {
+        StringBuilder section = new StringBuilder();
+        Map<String, Double> engagementData = analyzer.getEngagementData();
+        
+        section.append("        <div class=\"section\">\n");
+        section.append("            <h2>互动指标分析</h2>\n");
+        section.append("            <table>\n");
+        section.append("                <tr><th>指标</th><th>平均值</th></tr>\n");
+        
+        for (Map.Entry<String, Double> entry : engagementData.entrySet()) {
+            section.append("                <tr><td>").append(entry.getKey())
+                  .append("</td><td>").append(String.format("%.1f", entry.getValue()))
+                  .append("</td></tr>\n");
+        }
+        
+        section.append("            </table>\n");
+        section.append("        </div>\n");
+        
+        return section.toString();
+    }
+    
+    private static String generateAuthorSection(PostAnalyzer analyzer) {
+        StringBuilder section = new StringBuilder();
+        Map<String, Integer> authorData = analyzer.getAuthorPostCount();
+        
+        section.append("        <div class=\"section\">\n");
+        section.append("            <h2>热门作者排行TOP10</h2>\n");
+        section.append("            <table>\n");
+        section.append("                <tr><th>排名</th><th>作者</th><th>帖子数量</th></tr>\n");
+        
+        int rank = 1;
+        for (Map.Entry<String, Integer> entry : authorData.entrySet()) {
+            section.append("                <tr><td>").append(rank++)
+                  .append("</td><td>").append(entry.getKey())
+                  .append("</td><td>").append(entry.getValue())
+                  .append("</td></tr>\n");
+        }
+        
+        section.append("            </table>\n");
+        section.append("        </div>\n");
+        
+        return section.toString();
+    }
+    
+    private static String generateChartsSection() {
+        StringBuilder section = new StringBuilder();
+        
+        section.append("        <div class=\"section\">\n");
+        section.append("            <h2>数据可视化图表</h2>\n");
+        section.append("            <div class=\"chart-container\">\n");
+        section.append("                <h3>情感倾向分布</h3>\n");
+        section.append("                <img src=\"../charts/sentiment_distribution.png\" alt=\"情感倾向分布图\">\n");
+        section.append("            </div>\n");
+        section.append("            <div class=\"chart-container\">\n");
+        section.append("                <h3>互动指标分析</h3>\n");
+        section.append("                <img src=\"../charts/engagement_metrics.png\" alt=\"互动指标图\">\n");
+        section.append("            </div>\n");
+        section.append("            <div class=\"chart-container\">\n");
+        section.append("                <h3>热门作者排行</h3>\n");
+        section.append("                <img src=\"../charts/author_ranking.png\" alt=\"作者排行图\">\n");
+        section.append("            </div>\n");
+        section.append("        </div>\n");
+        
+        return section.toString();
+    }
+}
diff --git a/DataCleaner/Main.java b/DataCleaner/Main.java
new file mode 100644
index 0000000..148520e
--- /dev/null
+++ b/DataCleaner/Main.java
@@ -0,0 +1,67 @@
+package com.project;
+
+import com.project.analyzer.PostAnalyzer;
+import com.project.chart.SimpleChartGenerator;
+import com.project.model.PostInfo;
+import com.project.reader.ExcelReader;
+import com.project.report.HTMLReportGenerator;
+import com.project.storage.DataStorage;
+import com.project.util.DataCleaner;
+
+import java.util.List;
+import java.util.Scanner;
+
+/** Entry point: reads posts, cleans them, saves CSV/JSON, prints the analysis, then renders charts and the HTML report. */
+public class Main {
+
+    /**
+     * Runs the pipeline. Optional arguments override the built-in defaults:
+     * args[0] = input data file, args[1] = output directory, args[2] = maximum rows to read.
+     */
+    public static void main(String[] args) {
+        System.out.println("========================================");
+        System.out.println("  Java网络爬虫与数据分析系统");
+        System.out.println("========================================\n");
+        String dataFilePath = args.length > 0 ? args[0]
+                : "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子原始信息计量实验使用.xlsx";
+        String outputDir = args.length > 1 ? args[1] : "d:\\java\\project\\data";
+        try {
+            // Parsed inside the try so a malformed number is reported like any other failure.
+            int maxRows = args.length > 2 ? Integer.parseInt(args[2]) : 300;
+            System.out.println("开始读取本地数据文件...");
+            System.out.println("数据文件: " + dataFilePath);
+            System.out.println("读取前 " + maxRows + " 条数据");
+            List<PostInfo> rawPosts = ExcelReader.readExcelData(dataFilePath, maxRows);
+            if (rawPosts.isEmpty()) {
+                System.out.println("未获取到任何数据，程序退出");
+                return;
+            }
+
+            System.out.println("\n开始数据清洗...");
+            List<PostInfo> cleanedPosts = DataCleaner.cleanPosts(rawPosts);
+
+            System.out.println("\n保存数据到文件...");
+            DataStorage.saveToCSV(cleanedPosts, outputDir);
+            DataStorage.saveToJSON(cleanedPosts, outputDir);
+
+            System.out.println("\n开始数据分析...");
+            PostAnalyzer analyzer = new PostAnalyzer(cleanedPosts);
+            analyzer.analyzeAll();
+
+            System.out.println("\n生成图表...");
+            SimpleChartGenerator.generateAllCharts(analyzer);
+            System.out.println("\n生成HTML报告...");
+            HTMLReportGenerator.generateReport(analyzer);
+            System.out.println("\n========================================");
+            System.out.println("  程序执行完成！");
+            System.out.println("========================================");
+            System.out.println("\n输出文件位置:");
+            System.out.println("- 数据文件: " + outputDir);
+            System.out.println("- 图表文件: d:\\java\\project\\charts");
+            System.out.println("- 报告文件: d:\\java\\project\\reports");
+        } catch (Exception e) {
+            System.err.println("程序执行出错: " + e.getMessage());
+            e.printStackTrace();
+        }
+    }
+}
diff --git a/DataCleaner/PostAnalyzer.java b/DataCleaner/PostAnalyzer.java
new file mode 100644
index 0000000..76a5216
--- /dev/null
+++ b/DataCleaner/PostAnalyzer.java
@@ -0,0 +1,200 @@
+package com.project.analyzer;
+
+import com.project.model.PostInfo;
+
+import java.util.*;
+import java.util.stream.Collectors;
+
+/**
+ * Computes descriptive statistics over a list of posts. The analyze* methods print
+ * formatted sections to stdout; the get* methods return the aggregates consumed by
+ * the chart and report generators.
+ */
+public class PostAnalyzer {
+
+    /** Grouping key used when a post has no sentiment or author (groupingBy rejects null keys). */
+    private static final String UNKNOWN = "未知";
+    private static final java.time.format.DateTimeFormatter MONTH_FORMAT =
+            java.time.format.DateTimeFormatter.ofPattern("yyyy-MM");
+
+    private final List<PostInfo> posts;
+
+    /**
+     * Creates an analyzer over the given posts.
+     *
+     * @param posts the posts to analyze; the list is stored by reference, not copied
+     */
+    public PostAnalyzer(List<PostInfo> posts) {
+        this.posts = posts;
+    }
+
+    /** Returns the same list instance that was passed to the constructor. */
+    public List<PostInfo> getPosts() {
+        return posts;
+    }
+
+    /** Prints every analysis section to stdout, framed by a header and a footer. */
+    public void analyzeAll() {
+        System.out.println("\n========== 数据分析报告 ==========\n");
+
+        analyzeSentimentDistribution();
+        analyzeEngagementMetrics();
+        analyzePopularAuthors();
+        analyzeContentLength();
+        analyzeTemporalTrends();
+
+        System.out.println("\n========== 分析完成 ==========\n");
+    }
+
+    /** Prints the number of posts per sentiment label, most frequent first. */
+    public void analyzeSentimentDistribution() {
+        System.out.println("【情感倾向分布分析】");
+        System.out.println("----------------------------------------");
+        System.out.printf("%-20s %s%n", "情感倾向", "帖子数量");
+        System.out.println("----------------------------------------");
+
+        getSentimentDistributionData().entrySet().stream()
+                .sorted(Map.Entry.<String, Long>comparingByValue().reversed())
+                .forEach(entry -> System.out.printf("%-20s %d%n", entry.getKey(), entry.getValue()));
+
+        System.out.println();
+    }
+
+    /** Prints the average like, comment and view counts. */
+    public void analyzeEngagementMetrics() {
+        System.out.println("【互动指标分析】");
+        System.out.println("----------------------------------------");
+
+        Map<String, Double> averages = getEngagementData();
+        System.out.printf("平均点赞数: %.1f%n", averages.get("点赞"));
+        System.out.printf("平均评论数: %.1f%n", averages.get("评论"));
+        System.out.printf("平均浏览量: %.1f%n", averages.get("浏览"));
+
+        System.out.println();
+    }
+
+    /**
+     * Prints the ten authors with the most posts, with each author's total likes and
+     * comments. Author names longer than 28 characters are truncated for display.
+     */
+    public void analyzePopularAuthors() {
+        System.out.println("【热门作者排行】");
+        System.out.println("----------------------------------------");
+        System.out.printf("%-30s %10s %10s %10s%n", "作者", "帖子数", "总点赞", "总评论");
+        System.out.println("----------------------------------------");
+
+        Map<String, List<PostInfo>> authorPosts = posts.stream()
+                .collect(Collectors.groupingBy(post -> orUnknown(post.getAuthor())));
+
+        authorPosts.entrySet().stream()
+                .sorted(Comparator.comparingInt((Map.Entry<String, List<PostInfo>> e) -> e.getValue().size()).reversed())
+                .limit(10)
+                .forEach(entry -> {
+                    String author = entry.getKey();
+                    List<PostInfo> authorPostList = entry.getValue();
+                    int totalLikes = authorPostList.stream().mapToInt(PostInfo::getLikeCount).sum();
+                    int totalComments = authorPostList.stream().mapToInt(PostInfo::getCommentCount).sum();
+
+                    System.out.printf("%-30s %10d %10d %10d%n",
+                            author.length() > 28 ? author.substring(0, 28) : author,
+                            authorPostList.size(), totalLikes, totalComments);
+                });
+
+        System.out.println();
+    }
+
+    /**
+     * Prints the average, maximum and minimum content length in characters.
+     * Posts whose content is null are counted as length 0.
+     */
+    public void analyzeContentLength() {
+        System.out.println("【内容长度分析】");
+        System.out.println("----------------------------------------");
+
+        IntSummaryStatistics lengths = posts.stream()
+                .mapToInt(post -> post.getContent() == null ? 0 : post.getContent().length())
+                .summaryStatistics();
+        // With no posts, getMax()/getMin() return Integer.MIN_VALUE/MAX_VALUE; report 0 instead.
+        boolean empty = lengths.getCount() == 0;
+
+        System.out.printf("平均内容长度: %.1f 字符%n", lengths.getAverage());
+        System.out.printf("最长内容: %d 字符%n", empty ? 0 : lengths.getMax());
+        System.out.printf("最短内容: %d 字符%n", empty ? 0 : lengths.getMin());
+
+        System.out.println();
+    }
+
+    /** Prints the number of dated posts per calendar month (yyyy-MM), oldest month first. */
+    public void analyzeTemporalTrends() {
+        System.out.println("【时间趋势分析】");
+        System.out.println("----------------------------------------");
+
+        Map<String, Long> monthlyPosts = posts.stream()
+                .filter(post -> post.getPostDate() != null)
+                .collect(Collectors.groupingBy(
+                        post -> post.getPostDate().format(MONTH_FORMAT),
+                        Collectors.counting()
+                ));
+
+        System.out.printf("%-10s %s%n", "月份", "帖子数量");
+        System.out.println("----------------------------------------");
+
+        monthlyPosts.entrySet().stream()
+                .sorted(Map.Entry.comparingByKey())
+                .forEach(entry -> System.out.printf("%-10s %d%n", entry.getKey(), entry.getValue()));
+
+        System.out.println();
+    }
+
+    /**
+     * Counts posts per sentiment label.
+     *
+     * @return label-to-count map in no particular order; posts without a sentiment are counted under "未知"
+     */
+    public Map<String, Long> getSentimentDistributionData() {
+        return posts.stream()
+                .collect(Collectors.groupingBy(
+                        post -> orUnknown(post.getSentiment()),
+                        Collectors.counting()
+                ));
+    }
+
+    /**
+     * Averages the engagement counters over all posts.
+     *
+     * @return insertion-ordered map with keys "点赞", "评论", "浏览"; every value is 0 when there are no posts
+     */
+    public Map<String, Double> getEngagementData() {
+        Map<String, Double> engagementData = new LinkedHashMap<>();
+        engagementData.put("点赞", posts.stream().mapToInt(PostInfo::getLikeCount).average().orElse(0));
+        engagementData.put("评论", posts.stream().mapToInt(PostInfo::getCommentCount).average().orElse(0));
+        engagementData.put("浏览", posts.stream().mapToInt(PostInfo::getViewCount).average().orElse(0));
+        return engagementData;
+    }
+
+    /**
+     * Ranks authors by number of posts.
+     *
+     * @return at most ten author-to-count entries, iterated in descending order of count
+     */
+    public Map<String, Integer> getAuthorPostCount() {
+        return posts.stream()
+                .collect(Collectors.groupingBy(
+                        post -> orUnknown(post.getAuthor()),
+                        Collectors.summingInt(post -> 1)
+                )).entrySet().stream()
+                .sorted(Map.Entry.<String, Integer>comparingByValue().reversed())
+                .limit(10)
+                .collect(Collectors.toMap(
+                        Map.Entry::getKey,
+                        Map.Entry::getValue,
+                        (e1, e2) -> e1,
+                        LinkedHashMap::new
+                ));
+    }
+
+    /** Maps a null grouping key to the UNKNOWN placeholder. */
+    private static String orUnknown(String key) {
+        return key != null ? key : UNKNOWN;
+    }
+}
diff --git a/DataCleaner/PostInfo.java b/DataCleaner/PostInfo.java
new file mode 100644
index 0000000..831bfd7
--- /dev/null
+++ b/DataCleaner/PostInfo.java
@@ -0,0 +1,127 @@
+import java.time.LocalDate;
+
+/**
+ * Mutable data holder for one post: title, content, author, publication date,
+ * like/comment/view counters, tags and a sentiment label.
+ */
+public class PostInfo {
+    private String title;
+    private String content;
+    private String author;
+    private LocalDate postDate;
+    private int likeCount;
+    private int commentCount;
+    private int viewCount;
+    private String tags;
+    private String sentiment;
+
+    /** Creates an empty post; text fields and the date start as null, counters as 0. */
+    public PostInfo() {
+    }
+
+    /** Creates a fully populated post; every argument is stored as given. */
+    public PostInfo(String title, String content, String author, LocalDate postDate,
+                   int likeCount, int commentCount, int viewCount, String tags, String sentiment) {
+        this.title = title;
+        this.content = content;
+        this.author = author;
+        this.postDate = postDate;
+        this.likeCount = likeCount;
+        this.commentCount = commentCount;
+        this.viewCount = viewCount;
+        this.tags = tags;
+        this.sentiment = sentiment;
+    }
+
+    public String getTitle() {
+        return title;
+    }
+    public void setTitle(String title) {
+        this.title = title;
+    }
+
+    public String getContent() {
+        return content;
+    }
+    public void setContent(String content) {
+        this.content = content;
+    }
+
+    public String getAuthor() {
+        return author;
+    }
+    public void setAuthor(String author) {
+        this.author = author;
+    }
+
+    public LocalDate getPostDate() {
+        return postDate;
+    }
+    public void setPostDate(LocalDate postDate) {
+        this.postDate = postDate;
+    }
+
+    public int getLikeCount() {
+        return likeCount;
+    }
+    public void setLikeCount(int likeCount) {
+        this.likeCount = likeCount;
+    }
+
+    public int getCommentCount() {
+        return commentCount;
+    }
+    public void setCommentCount(int commentCount) {
+        this.commentCount = commentCount;
+    }
+
+    public int getViewCount() {
+        return viewCount;
+    }
+    public void setViewCount(int viewCount) {
+        this.viewCount = viewCount;
+    }
+
+    public String getTags() {
+        return tags;
+    }
+    public void setTags(String tags) {
+        this.tags = tags;
+    }
+
+    public String getSentiment() {
+        return sentiment;
+    }
+    public void setSentiment(String sentiment) {
+        this.sentiment = sentiment;
+    }
+
+    @Override
+    public String toString() {
+        return "PostInfo{" + "title='" + title + '\'' + ", author='" + author + '\''
+                + ", postDate=" + postDate + ", likeCount=" + likeCount
+                + ", commentCount=" + commentCount + ", viewCount=" + viewCount
+                + ", sentiment='" + sentiment + '\'' + '}';
+    }
+
+    /**
+     * Formats this post as one CSV record: text fields and the date are double-quoted,
+     * counters are bare integers. Null text or a null date becomes an empty quoted field.
+     */
+    public String toCSV() {
+        return String.join(",",
+                csvText(title), csvText(content), csvText(author),
+                csvText(postDate != null ? postDate.toString() : null),
+                String.valueOf(likeCount), String.valueOf(commentCount), String.valueOf(viewCount),
+                csvText(tags), csvText(sentiment));
+    }
+
+    /** Quotes one CSV text field: doubles embedded quotes and flattens line breaks to single spaces. */
+    private static String csvText(String value) {
+        if (value == null) {
+            return "\"\"";
+        }
+        String flat = value.replace("\r\n", " ").replace('\n', ' ').replace('\r', ' ');
+        return "\"" + flat.replace("\"", "\"\"") + "\"";
+    }
+}
diff --git a/DataCleaner/ProcessRegressionData.java b/DataCleaner/ProcessRegressionData.java
new file mode 100644
index 0000000..8e8a98d
--- /dev/null
+++ b/DataCleaner/ProcessRegressionData.java
@@ -0,0 +1,50 @@
+import java.io.*;
+import java.util.*;
+import java.util.regex.*;
+
+/** Stub that checks the regression input workbook exists and prints the intended processing steps. */
+public class ProcessRegressionData {
+    private static final String BANNER = "========================================";
+    /** Prints every element of {@code lines} on its own line of stdout. */
+    private static void printAll(String... lines) {
+        for (String text : lines) {
+            System.out.println(text);
+        }
+    }
+
+    /** Prints the file paths, then either an error (input missing) or the file size followed by the processing notes. */
+    public static void main(String[] args) {
+        String inputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子实验数据（新）.xlsx";
+        String outputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子实验数据（新）_回归.xlsx";
+        printAll(BANNER, "  处理回归数据", BANNER, "输入文件: " + inputFile, "输出文件: " + outputFile, "");
+        // Check that the input file exists; stop early when it does not.
+        File source = new File(inputFile);
+        if (!source.exists()) {
+            System.out.println("错误: 输入文件不存在！");
+            return;
+        }
+        printAll("输入文件大小: " + (source.length() / 1024) + " KB",
+                "\n注意: 这是一个简化版本，用于演示处理逻辑。",
+                "实际处理需要使用Apache POI库来读取和写入Excel文件。",
+                "",
+                "处理逻辑:",
+                "1. 读取原始数据",
+                "2. 识别列: helpfull( Y ), 帖子评论总数( X1 ), 评论1-5内容列",
+                "3. 计算 X2-X6:",
+                "   - X2: 评论长度平均值（剔空格后的字符数）",
+                "   - X3: 评论复杂度平均值（按空格拆分的分词数）",
+                "   - X4: X2/X3（X3为0时记0）",
+                "   - X5: 情感性平均值（正面=1、中性=0、负面=-1）",
+                "   - X6: 信息丰富度平均值（含数字/链接/表情各1分）",
+                "4. 数据清洗: 确保所有值为纯数字，无空值/错误值",
+                "5. 保存到新文件",
+                "",
+                "由于数据量较大(3万+行)，建议使用Python的pandas库处理。",
+                "请确保Python脚本能够完整执行，可能需要增加内存或分批处理。",
+                "",
+                BANNER, "  建议使用以下Python命令运行", BANNER,
+                "cd d:\\java\\project",
+                "python process_300_rows.py  (测试前300行)",
+                "python process_all_rows.py   (处理全部数据)");
+    }
+}
diff --git a/DataCleaner/README.md b/DataCleaner/README.md
new file mode 100644
index 0000000..a8687f1
--- /dev/null
+++ b/DataCleaner/README.md
@@ -0,0 +1,2 @@
+# java
+
diff --git a/DataCleaner/SimpleChartGenerator.java b/DataCleaner/SimpleChartGenerator.java
new file mode 100644
index 0000000..5a14324
--- /dev/null
+++ b/DataCleaner/SimpleChartGenerator.java
@@ -0,0 +1,165 @@
+package com.project.chart;
+
+import com.project.analyzer.PostAnalyzer;
+
+import java.awt.*;
+import java.awt.image.BufferedImage;
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.Map;
+import javax.imageio.ImageIO;
+
+public class SimpleChartGenerator { // Draws bar charts from PostAnalyzer data with java.awt and writes them as PNG files
+    
+    private static final String OUTPUT_DIR = "d:\\java\\project\\charts"; // hard-coded absolute Windows directory that receives every chart
+    private static final int WIDTH = 800; // image width in pixels
+    private static final int HEIGHT = 600; // image height in pixels
+    
+    public static void generateAllCharts(PostAnalyzer analyzer) { // Creates OUTPUT_DIR if needed, then renders all three charts
+        try {
+            Files.createDirectories(Paths.get(OUTPUT_DIR));
+            
+            generateSentimentChart(analyzer);
+            generateEngagementChart(analyzer);
+            generateAuthorChart(analyzer);
+            
+            System.out.println("\n所有图表已生成，保存在: " + OUTPUT_DIR); // NOTE(review): printed even if a saveImage call failed, because saveImage swallows its IOException
+            
+        } catch (IOException e) { // only createDirectories can throw IOException inside this try
+            System.err.println("创建图表目录时出错: " + e.getMessage());
+        }
+    }
+    
+    public static void generateSentimentChart(PostAnalyzer analyzer) { // Vertical bar chart of getSentimentDistributionData(), saved as sentiment_distribution.png
+        Map<String, Long> data = analyzer.getSentimentDistributionData();
+        
+        BufferedImage image = new BufferedImage(WIDTH, HEIGHT, BufferedImage.TYPE_INT_RGB);
+        Graphics2D g2d = image.createGraphics();
+        
+        g2d.setColor(Color.WHITE); // paint the whole canvas white before drawing
+        g2d.fillRect(0, 0, WIDTH, HEIGHT);
+        
+        g2d.setColor(Color.BLACK);
+        g2d.setFont(new Font("宋体", Font.BOLD, 24)); // "宋体" is the SimSun font family
+        g2d.drawString("情感倾向分布", 300, 40); // chart title
+        
+        int barWidth = 150;
+        int startX = 200; // x coordinate of the first bar
+        int startY = 500; // y coordinate of the baseline the bars grow up from
+        int maxHeight = 400; // pixel height given to the largest value
+        
+        long maxValue = data.values().stream().max(Long::compare).orElse(1L); // falls back to 1 when the map is empty
+        
+        int index = 0;
+        for (Map.Entry<String, Long> entry : data.entrySet()) {
+            int barHeight = (int) ((entry.getValue() * 1.0 / maxValue) * maxHeight); // scaled relative to the largest value
+            
+            g2d.setColor(new Color(70, 130, 180));
+            g2d.fillRect(startX + index * (barWidth + 50), startY - barHeight, barWidth, barHeight); // consecutive bars start barWidth + 50 px apart
+            
+            g2d.setColor(Color.BLACK);
+            g2d.setFont(new Font("宋体", Font.PLAIN, 14));
+            g2d.drawString(entry.getKey(), startX + index * (barWidth + 50) + 50, startY + 20); // category label below the baseline
+            g2d.drawString(String.valueOf(entry.getValue()), startX + index * (barWidth + 50) + 60, startY - barHeight - 5); // value just above the bar
+            
+            index++;
+        }
+        
+        g2d.dispose(); // release the graphics context before the image is written
+        saveImage(image, "sentiment_distribution.png");
+    }
+    
+    public static void generateEngagementChart(PostAnalyzer analyzer) { // Vertical bar chart of getEngagementData(), saved as engagement_metrics.png
+        Map<String, Double> data = analyzer.getEngagementData();
+        
+        BufferedImage image = new BufferedImage(WIDTH, HEIGHT, BufferedImage.TYPE_INT_RGB);
+        Graphics2D g2d = image.createGraphics();
+        
+        g2d.setColor(Color.WHITE); // paint the whole canvas white before drawing
+        g2d.fillRect(0, 0, WIDTH, HEIGHT);
+        
+        g2d.setColor(Color.BLACK);
+        g2d.setFont(new Font("宋体", Font.BOLD, 24));
+        g2d.drawString("互动指标分析", 300, 40); // chart title
+        
+        int barWidth = 150;
+        int startX = 200; // x coordinate of the first bar
+        int startY = 500; // y coordinate of the baseline the bars grow up from
+        int maxHeight = 400; // pixel height given to the largest value
+        
+        double maxValue = data.values().stream().max(Double::compare).orElse(1.0); // falls back to 1.0 when the map is empty
+        
+        int index = 0;
+        for (Map.Entry<String, Double> entry : data.entrySet()) {
+            int barHeight = (int) ((entry.getValue() / maxValue) * maxHeight); // scaled relative to the largest value
+            
+            g2d.setColor(new Color(60, 179, 113));
+            g2d.fillRect(startX + index * (barWidth + 50), startY - barHeight, barWidth, barHeight); // consecutive bars start barWidth + 50 px apart
+            
+            g2d.setColor(Color.BLACK);
+            g2d.setFont(new Font("宋体", Font.PLAIN, 14));
+            g2d.drawString(entry.getKey(), startX + index * (barWidth + 50) + 60, startY + 20); // metric label below the baseline
+            g2d.drawString(String.format("%.1f", entry.getValue()), startX + index * (barWidth + 50) + 50, startY - barHeight - 5); // value with one decimal, just above the bar
+            
+            index++;
+        }
+        
+        g2d.dispose(); // release the graphics context before the image is written
+        saveImage(image, "engagement_metrics.png");
+    }
+    
+    public static void generateAuthorChart(PostAnalyzer analyzer) { // Horizontal bar chart of getAuthorPostCount(), saved as author_ranking.png
+        Map<String, Integer> data = analyzer.getAuthorPostCount();
+        
+        BufferedImage image = new BufferedImage(WIDTH, HEIGHT, BufferedImage.TYPE_INT_RGB);
+        Graphics2D g2d = image.createGraphics();
+        
+        g2d.setColor(Color.WHITE); // paint the whole canvas white before drawing
+        g2d.fillRect(0, 0, WIDTH, HEIGHT);
+        
+        g2d.setColor(Color.BLACK);
+        g2d.setFont(new Font("宋体", Font.BOLD, 24));
+        g2d.drawString("热门作者排行TOP10", 280, 40); // chart title
+        
+        int barHeight = 35; // thickness of each horizontal bar
+        int startY = 80; // y coordinate of the first row
+        int startX = 200; // x coordinate where every bar begins
+        int maxWidth = 500; // pixel width given to the largest value
+        
+        int maxValue = data.values().stream().max(Integer::compare).orElse(1); // falls back to 1 when the map is empty
+        
+        int index = 0;
+        for (Map.Entry<String, Integer> entry : data.entrySet()) { // NOTE(review): every entry is drawn, no limit of 10 is applied here - presumably the analyzer already returns the top 10; confirm
+            int barWidth = (int) ((entry.getValue() * 1.0 / maxValue) * maxWidth); // scaled relative to the largest value
+            
+            g2d.setColor(new Color(255, 140, 0));
+            g2d.fillRect(startX, startY + index * (barHeight + 10), barWidth, barHeight); // rows are barHeight + 10 px apart
+            
+            g2d.setColor(Color.BLACK);
+            g2d.setFont(new Font("宋体", Font.PLAIN, 12));
+            String author = entry.getKey();
+            if (author.length() > 15) { // names longer than 15 chars are cut and suffixed with "..."
+                author = author.substring(0, 15) + "...";
+            }
+            g2d.drawString(author, 50, startY + index * (barHeight + 10) + 23); // author name left of the bar
+            g2d.drawString(String.valueOf(entry.getValue()), startX + barWidth + 10, startY + index * (barHeight + 10) + 23); // count right of the bar
+            
+            index++;
+        }
+        
+        g2d.dispose(); // release the graphics context before the image is written
+        saveImage(image, "author_ranking.png");
+    }
+    
+    private static void saveImage(BufferedImage image, String filename) { // Writes image as PNG to OUTPUT_DIR/filename and prints the resulting path
+        try {
+            File file = new File(OUTPUT_DIR, filename);
+            ImageIO.write(image, "PNG", file);
+            System.out.println("图表已保存: " + file.getAbsolutePath());
+        } catch (IOException e) { // failure is reported on stderr and not propagated to the caller
+            System.err.println("保存图表失败: " + e.getMessage());
+        }
+    }
+}
diff --git a/DataCleaner/SimpleDataCleaner.java b/DataCleaner/SimpleDataCleaner.java
new file mode 100644
index 0000000..c35cb2c
--- /dev/null
+++ b/DataCleaner/SimpleDataCleaner.java
@@ -0,0 +1,59 @@
+import java.io.*;
+import java.util.ArrayList;
+import java.util.List;
+
+public class SimpleDataCleaner { // Placeholder "cleaner": copies the input workbook byte-for-byte to the output path
+    
+    public static void main(String[] args) { // args is not used; both paths are hard-coded below
+        String inputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子原始信息计量实验使用.xlsx";
+        String outputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子实验数据（新）.csv"; // NOTE(review): .csv extension, but the file will contain raw .xlsx bytes
+        
+        System.out.println("========================================");
+        System.out.println("  简单数据清洗脚本");
+        System.out.println("========================================");
+        System.out.println("输入文件: " + inputFile);
+        System.out.println("输出文件: " + outputFile);
+        System.out.println();
+        
+        // Check that the input file exists
+        File input = new File(inputFile);
+        if (!input.exists()) {
+            System.out.println("错误: 输入文件不存在！");
+            return;
+        }
+        
+        System.out.println("文件大小: " + (input.length() / 1024) + " KB"); // integer division: size is rounded down to whole KB
+        
+        // Because .xlsx is a binary format, the file is simply copied under the new name
+        // A real project should use a library such as Apache POI to process Excel files
+        try {
+            File output = new File(outputFile);
+            
+            // Make sure the output directory exists
+            File parentDir = output.getParentFile();
+            if (parentDir != null && !parentDir.exists()) {
+                parentDir.mkdirs(); // return value is ignored; a failure surfaces later when the output stream is opened
+            }
+            
+            // Copy the file
+            try (FileInputStream fis = new FileInputStream(input);
+                 FileOutputStream fos = new FileOutputStream(output)) {
+                
+                byte[] buffer = new byte[1024]; // copy in 1 KB chunks
+                int length;
+                while ((length = fis.read(buffer)) > 0) { // read() returns -1 at end of stream, which ends the loop
+                    fos.write(buffer, 0, length);
+                }
+            }
+            
+            System.out.println("文件已成功复制并重命名为: " + outputFile);
+            System.out.println();
+            System.out.println("========================================");
+            System.out.println("  任务完成");
+            System.out.println("========================================");
+            
+        } catch (IOException e) { // any I/O failure is reported on stderr; the program then ends normally
+            System.err.println("处理文件时出错: " + e.getMessage());
+        }
+    }
+}
diff --git a/DataCleaner/add_regression_columns.py b/DataCleaner/add_regression_columns.py
new file mode 100644
index 0000000..993ddde
--- /dev/null
+++ b/DataCleaner/add_regression_columns.py
@@ -0,0 +1,189 @@
+import os
+import pandas as pd
+import re
+
+# File paths (hard-coded absolute Windows paths): input workbook and the copy that receives the Y, X1-X6 columns
+input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
+output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）_回归.xlsx'
+
+print("========================================")
+print("  在原表中添加回归数据列")
+print("========================================")
+print(f"输入文件: {input_file}")
+print(f"输出文件: {output_file}")
+print()
+
+# Check that the input file exists
+if not os.path.exists(input_file):
+    print("错误: 输入文件不存在！")
+    exit(1)  # stop with status 1; nothing below can run without the input
+
+print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
+
+# Read the raw data (everything from here on runs inside one try block)
+try:
+    print("\n正在读取原始数据...")
+    df = pd.read_excel(input_file)
+    print(f"成功读取 {len(df)} 行数据")
+    print(f"原始列名: {list(df.columns)}")
+    
+    # Identify the source columns by matching on their names
+    print("\n识别列...")
+    helpfull_col = None  # source column for Y
+    comment_count_col = None  # source column for X1
+    comment_cols = []  # source columns holding comment text
+    
+    for col in df.columns:
+        col_str = str(col).lower()
+        if 'helpfull' in col_str or 'helpful' in col_str:  # accepts both spellings, case-insensitively
+            helpfull_col = col
+            print(f"找到 Y 列 (helpfull): {col}")
+        elif '评论总数' in str(col) or '帖子评论总数' in str(col):
+            comment_count_col = col
+            print(f"找到 X1 列 (评论总数): {col}")
+        elif '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6)):  # any other column whose name has '评论' and a digit 1-5
+            comment_cols.append(col)
+            print(f"找到评论列 {len(comment_cols)}: {col}")
+    
+    print(f"\n共找到 {len(comment_cols)} 个评论列")
+    
+    # Add the regression columns
+    print("\n添加回归数据列...")
+    
+    # Y (UGC helpfulness)
+    print("1. 添加 Y (UGC有用性)")
+    if helpfull_col:
+        df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0)  # non-numeric and missing values become 0
+    else:
+        df['Y'] = 0  # column not found: Y is all zeros
+    
+    # X1 (number of comments)
+    print("2. 添加 X1 (评论数量)")
+    if comment_count_col:
+        df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0)  # non-numeric and missing values become 0
+    else:
+        df['X1'] = 0  # column not found: X1 is all zeros
+    
+    # Define the helper that computes the metrics of a single comment
+    def calculate_comment_metrics(content):
+        if pd.isna(content) or str(content) in ['None', 'nan', '']:
+            return 0, 0, 0, 0
+        
+        content = str(content)
+        # 评论长度（剔空格后的字符数）
+        length = len(content.replace(' ', '').replace('\u3000', ''))
+        # 评论复杂度（按空格拆分的分词数）
+        complexity = len(content.split())
+        # 情感分析
+        positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like']
+        negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike']
+        
+        sentiment = 0
+        lower_content = content.lower()
+        if any(word in lower_content for word in positive_words):
+            sentiment = 1
+        elif any(word in lower_content for word in negative_words):
+            sentiment = -1
+        # 信息丰富度
+        richness = 0
+        if re.search(r'\d', content):  # 含数字
+            richness += 1
+        if re.search(r'http[s]?://|www\.', content):  # 含链接
+            richness += 1
+        if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content):  # 含表情
+            richness += 1
+        
+        return length, complexity, sentiment, richness
+    
+    # Compute the comment-based metrics
+    print("3. 计算评论相关指标...")
+    
+    # Initialise the columns (rows without any usable comment keep 0.0)
+    df['X2'] = 0.0  # comment length
+    df['X3'] = 0.0  # comment complexity
+    df['X5'] = 0.0  # sentiment
+    df['X6'] = 0.0  # information richness
+    
+    # Compute row by row
+    total_rows = len(df)
+    for i in range(total_rows):
+        if i % 1000 == 0:  # progress message every 1000 rows
+            print(f"  处理到第 {i}/{total_rows} 行...")
+        
+        lengths = []
+        complexities = []
+        sentiments = []
+        richness = []
+        
+        for col in comment_cols:
+            content = df.iloc[i].get(col, '')  # positional row access
+            length, complexity, sentiment, r = calculate_comment_metrics(content)
+            if length > 0:  # only comments with at least one non-space character are averaged
+                lengths.append(length)
+                complexities.append(complexity)
+                sentiments.append(sentiment)
+                richness.append(r)
+        
+        # Compute the averages
+        if lengths:
+            df.loc[i, 'X2'] = sum(lengths) / len(lengths)  # label-based write; matches iloc[i] only with the default 0..n-1 index
+            df.loc[i, 'X3'] = sum(complexities) / len(complexities)
+            df.loc[i, 'X5'] = sum(sentiments) / len(sentiments)
+            df.loc[i, 'X6'] = sum(richness) / len(richness)
+    
+    # X4: comment readability
+    print("4. 计算 X4 (评论可读性)")
+    df['X4'] = df.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1)  # X2 / X3, or 0 when X3 is 0
+    
+    # Data cleaning - make sure every value is purely numeric: no text, no nulls, no errors
+    print("\n5. 数据清洗...")
+    regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
+    for col in regression_cols:
+        # Convert to numbers; values that cannot be converted become 0
+        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
+        # Replace infinities
+        df[col] = df[col].replace([float('inf'), float('-inf')], 0)
+    
+    # Validate the data
+    print("\n6. 验证数据...")
+    print(f"总行数: {len(df)}")
+    print(f"总列数: {len(df.columns)}")
+    print(f"\n回归数据列统计:")
+    print(df[regression_cols].describe())
+    print(f"\n前5行回归数据:")
+    print(df[regression_cols].head())
+    
+    # Check for null or error values
+    print(f"\n空值检查:")
+    for col in regression_cols:
+        null_count = df[col].isnull().sum()
+        print(f"  {col}: {null_count} 个空值")
+    
+    # Save the file
+    print("\n7. 保存文件...")
+    df.to_excel(output_file, index=False)
+    
+    # Verify the file
+    print("\n8. 验证文件...")
+    if os.path.exists(output_file):
+        print(f"文件已成功保存: {output_file}")
+        print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
+        # Read the file back to check it
+        df_check = pd.read_excel(output_file)
+        print(f"输出文件行数: {len(df_check)}")
+        print(f"输出文件列数: {len(df_check.columns)}")
+        print(f"\n回归数据列: {[col for col in df_check.columns if col in regression_cols]}")
+    else:
+        print("文件保存失败！")
+    
+    print()
+    print("========================================")
+    print("  任务完成")
+    print("========================================")
+    print(f"新文件已保存: {output_file}")
+    print(f"包含原始数据的所有列以及新增的Y, X1-X6回归数据列")
+    
+except Exception as e:  # top-level boundary: report the error with its traceback instead of crashing
+    print(f"处理文件时出错: {str(e)}")
+    import traceback
+    traceback.print_exc()
diff --git a/DataCleaner/basic_test.py b/DataCleaner/basic_test.py
new file mode 100644
index 0000000..64e4bad
--- /dev/null
+++ b/DataCleaner/basic_test.py
@@ -0,0 +1,32 @@
+import os
+
+print("========================================")  # Environment sanity check: prints the cwd, the Python version and a listing of the data directory
+print("  基本测试")
+print("========================================")
+print(f"当前目录: {os.getcwd()}")
+print(f"Python版本:")
+
+# Print the Python version
+import sys
+print(sys.version)
+
+# Check the data directory
+print("\n检查目录:")
+dir_path = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求'
+print(f"目录: {dir_path}")
+print(f"存在: {os.path.exists(dir_path)}")
+
+# List the files
+if os.path.exists(dir_path):
+    print("\n目录文件:")
+    files = os.listdir(dir_path)
+    for file in files[:15]:  # only the first 15 directory entries are inspected
+        file_path = os.path.join(dir_path, file)
+        if os.path.isfile(file_path):  # sub-directories are skipped
+            size = os.path.getsize(file_path) / 1024  # size in KB
+            print(f"  {file}: {size:.2f} KB")
+
+print()
+print("========================================")
+print("  测试完成")
+print("========================================")
diff --git a/DataCleaner/batch_process.py b/DataCleaner/batch_process.py
new file mode 100644
index 0000000..2a8a572
--- /dev/null
+++ b/DataCleaner/batch_process.py
@@ -0,0 +1,219 @@
+import os
+import pandas as pd
+import re
+import gc
+
+print("=" * 60)  # Script: adds Y and X1-X6 regression columns to the workbook, processing rows in batches
+print("  分批处理回归数据")
+print("=" * 60)
+
+# File paths (hard-coded absolute Windows paths)
+input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
+output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）_回归.xlsx'
+
+print(f"输入文件: {input_file}")
+print()
+
+# Check that the input file exists
+if not os.path.exists(input_file):
+    print("错误: 输入文件不存在！")
+    exit(1)  # stop with status 1; nothing below can run without the input
+
+print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
+
+# Read the raw data
+print("\n正在读取原始数据...")
+try:
+    df = pd.read_excel(input_file, engine='openpyxl')
+    print(f"成功读取 {len(df)} 行数据")
+    print(f"原始列数: {len(df.columns)}")
+except Exception as e:  # any read failure is fatal: print the traceback and exit with status 1
+    print(f"读取失败: {e}")
+    import traceback
+    traceback.print_exc()
+    exit(1)
+
+# Identify the source columns by matching on their names
+print("\n识别列...")
+helpfull_col = None  # source column for Y
+comment_count_col = None  # source column for X1
+comment_cols = []  # source columns holding comment text
+
+for col in df.columns:
+    col_str = str(col).lower()
+    if 'helpfull' in col_str or 'helpful' in col_str:  # accepts both spellings, case-insensitively
+        helpfull_col = col
+        print(f"找到 Y 列 (helpfull): {col}")
+    elif '评论总数' in str(col) or '帖子评论总数' in str(col):
+        comment_count_col = col
+        print(f"找到 X1 列 (评论总数): {col}")
+    elif '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6)) and '内容' in str(col):  # name must contain '评论', a digit 1-5 and '内容'
+        comment_cols.append(col)
+        print(f"找到评论列 {len(comment_cols)}: {col}")
+
+print(f"\n共找到 {len(comment_cols)} 个评论内容列")
+
+# Add the regression columns
+print("\n添加回归数据列...")
+
+# Y (UGC helpfulness) - numeric copy of the helpfull column
+print("1. 添加 Y (UGC有用性)")
+if helpfull_col:
+    df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0)  # non-numeric and missing values become 0
+else:
+    df['Y'] = 0  # column not found: Y is all zeros
+
+# X1 (number of comments) - numeric copy of the total-comment-count column
+print("2. 添加 X1 (评论数量)")
+if comment_count_col:
+    df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0)  # non-numeric and missing values become 0
+else:
+    df['X1'] = 0  # column not found: X1 is all zeros
+
+# Define the helper that computes the metrics of a single comment
+def calculate_comment_metrics(content):
+    if pd.isna(content) or str(content) in ['None', 'nan', '']:
+        return 0, 0, 0, 0
+    
+    content = str(content)
+    # X2: 评论长度（剔空格后的字符数）
+    length = len(content.replace(' ', '').replace('\u3000', ''))
+    # X3: 评论复杂度（按空格拆分的分词数）
+    complexity = len(content.split())
+    # X5: 情感分析（正面=1、中性=0、负面=-1）
+    positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like']
+    negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike']
+    
+    sentiment = 0
+    lower_content = content.lower()
+    if any(word in lower_content for word in positive_words):
+        sentiment = 1
+    elif any(word in lower_content for word in negative_words):
+        sentiment = -1
+    # X6: 信息丰富度（含数字/链接/表情各1分，满分3分）
+    richness = 0
+    if re.search(r'\d', content):  # 含数字
+        richness += 1
+    if re.search(r'http[s]?://|www\.', content):  # 含链接
+        richness += 1
+    if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content):  # 含表情
+        richness += 1
+    
+    return length, complexity, sentiment, richness
+
+# Compute the comment-based metrics
+print("3. 计算评论相关指标...")
+
+# Initialise the columns (rows without any usable comment keep 0.0)
+df['X2'] = 0.0  # comment length
+df['X3'] = 0.0  # comment complexity
+df['X5'] = 0.0  # sentiment
+df['X6'] = 0.0  # information richness
+
+# Compute row by row
+total_rows = len(df)
+print(f"总数据行数: {total_rows}")
+
+batch_size = 5000
+num_batches = (total_rows + batch_size - 1) // batch_size  # ceiling division
+
+for batch in range(num_batches):
+    start_idx = batch * batch_size
+    end_idx = min((batch + 1) * batch_size, total_rows)  # exclusive upper bound of this batch
+    print(f"处理批次 {batch + 1}/{num_batches} (行 {start_idx} 到 {end_idx})...")
+    
+    for i in range(start_idx, end_idx):
+        lengths = []
+        complexities = []
+        sentiments = []
+        richness = []
+        
+        for col in comment_cols:
+            content = df.iloc[i].get(col, '')  # positional row access
+            length, complexity, sentiment, r = calculate_comment_metrics(content)
+            if length > 0:  # only comments that have content are counted
+                lengths.append(length)
+                complexities.append(complexity)
+                sentiments.append(sentiment)
+                richness.append(r)
+        
+        # Compute the averages (rows without comments stay 0)
+        if lengths:
+            df.loc[i, 'X2'] = sum(lengths) / len(lengths)  # label-based write; matches iloc[i] only with the default 0..n-1 index
+            df.loc[i, 'X3'] = sum(complexities) / len(complexities)
+            df.loc[i, 'X5'] = sum(sentiments) / len(sentiments)
+            df.loc[i, 'X6'] = sum(richness) / len(richness)
+    
+    # Free memory (NOTE(review): the whole DataFrame is already loaded, so this only collects per-row temporaries)
+    gc.collect()
+
+# X4: comment readability = X2/X3 (0 when X3 is 0, to avoid a division error)
+print("4. 计算 X4 (评论可读性)")
+df['X4'] = df.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1)
+
+# Data cleaning - make sure every value is purely numeric: no text, no nulls, no errors
+print("\n5. 数据清洗...")
+regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
+for col in regression_cols:
+    # Convert to numbers; values that cannot be converted become 0
+    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
+    # Replace infinities
+    df[col] = df[col].replace([float('inf'), float('-inf')], 0)
+
+# Validate the data
+print("\n6. 验证数据...")
+print(f"总行数: {len(df)}")
+print(f"总列数: {len(df.columns)}")
+print(f"\n回归数据列统计:")
+print(df[regression_cols].describe())
+print(f"\n前5行回归数据:")
+print(df[regression_cols].head())
+
+# Check for null or error values
+print(f"\n空值检查:")
+for col in regression_cols:
+    null_count = df[col].isnull().sum()
+    print(f"  {col}: {null_count} 个空值")
+
+# Save the file
+print("\n7. 保存文件...")
+print(f"正在保存到: {output_file}")
+
+try:
+    # Use the xlsxwriter engine first
+    df.to_excel(output_file, index=False, engine='xlsxwriter')
+    print("文件保存成功！")
+except Exception as e:  # any failure here triggers one retry with the openpyxl engine
+    print(f"xlsxwriter保存失败: {e}")
+    try:
+        print("尝试使用openpyxl引擎...")
+        df.to_excel(output_file, index=False, engine='openpyxl')
+        print("文件保存成功！")
+    except Exception as e2:  # both engines failed: print the traceback and carry on to the verification step
+        print(f"openpyxl保存也失败: {e2}")
+        import traceback
+        traceback.print_exc()
+
+# Verify the file
+print("\n8. 验证文件...")
+if os.path.exists(output_file):  # NOTE(review): also true for a stale file left by an earlier run
+    print(f"文件已成功保存: {output_file}")
+    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
+    try:
+        # Read the file back to check it
+        df_check = pd.read_excel(output_file)
+        print(f"输出文件行数: {len(df_check)}")
+        print(f"输出文件列数: {len(df_check.columns)}")
+        print(f"\n回归数据列: {[col for col in df_check.columns if col in regression_cols]}")
+    except Exception as e:
+        print(f"验证文件时出错: {e}")
+else:
+    print("文件保存失败！")
+
+print()
+print("=" * 60)
+print("  任务完成")
+print("=" * 60)
+if os.path.exists(output_file):
+    print(f"新文件已保存: {output_file}")
+    print(f"包含原始数据的所有列以及新增的Y, X1-X6回归数据列")
diff --git a/DataCleaner/calculate_regression_data.py b/DataCleaner/calculate_regression_data.py
new file mode 100644
index 0000000..642e383
--- /dev/null
+++ b/DataCleaner/calculate_regression_data.py
@@ -0,0 +1,169 @@
+import os
+import pandas as pd
+import re
+
+# File paths (hard-coded absolute Windows paths): raw workbook in, workbook holding only Y and X1-X6 out
+input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
+output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'
+
+print("========================================")
+print("  计算UGC回归数据")
+print("========================================")
+print(f"输入文件: {input_file}")
+print(f"输出文件: {output_file}")
+print()
+
+# Check that the input file exists
+if not os.path.exists(input_file):
+    print("错误: 输入文件不存在！")
+    exit(1)  # stop with status 1; nothing below can run without the input
+
+print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
+
+# Read the raw data (everything from here on runs inside one try block)
+try:
+    print("正在读取原始数据...")
+    df = pd.read_excel(input_file)
+    print(f"成功读取 {len(df)} 行数据")
+    print(f"列名: {list(df.columns)}")
+    
+    # Identify the comment columns
+    comment_columns = [col for col in df.columns if '评论' in col and any(str(i) in col for i in range(1, 6))]  # NOTE(review): `in col` needs string labels; a non-string column label raises TypeError
+    print(f"\n找到评论列: {comment_columns}")
+    
+    # Build the regression data in a new, initially empty frame
+    regression_data = pd.DataFrame()
+    
+    # 1. Y (UGC helpfulness)
+    print("\n1. 计算 Y (UGC有用性)")
+    if 'helpfull' in df.columns:  # exact, case-sensitive column name
+        regression_data['Y'] = df['helpfull'].fillna(0).astype(float)
+        print(f"成功提取 Y 列，共 {len(regression_data['Y'])} 个值")
+    else:
+        print("警告: 未找到 helpfull 列，使用默认值 0")
+        regression_data['Y'] = 0
+    
+    # 2. X1 (number of comments)
+    print("\n2. 计算 X1 (评论数量)")
+    comment_count_columns = [col for col in df.columns if '评论总数' in col or '帖子评论总数' in col]
+    if comment_count_columns:
+        regression_data['X1'] = df[comment_count_columns[0]].fillna(0).astype(float)  # the first matching column is used
+        print(f"成功提取 X1 列，使用列: {comment_count_columns[0]}")
+    else:
+        print("警告: 未找到评论总数列，使用默认值 0")
+        regression_data['X1'] = 0
+    
+    # 3. X2 (comment length)
+    print("\n3. 计算 X2 (评论长度)")
+    def calculate_comment_length(row):  # mean length over the row's non-empty comment cells, or 0 if there are none
+        lengths = []
+        for col in comment_columns:
+            content = str(row.get(col, ''))
+            if content and content != 'nan':  # skips empty strings and NaN cells (which stringify to 'nan')
+                # Character count after removing spaces
+                length = len(content.replace(' ', ''))
+                lengths.append(length)
+        return sum(lengths) / len(lengths) if lengths else 0
+    
+    regression_data['X2'] = df.apply(calculate_comment_length, axis=1)
+    
+    # 4. X3 (comment complexity)
+    print("\n4. 计算 X3 (评论复杂度)")
+    def calculate_comment_complexity(row):  # mean token count over the row's non-empty comment cells, or 0 if there are none
+        complexities = []
+        for col in comment_columns:
+            content = str(row.get(col, ''))
+            if content and content != 'nan':  # skips empty strings and NaN cells (which stringify to 'nan')
+                # Number of whitespace-separated tokens
+                complexity = len(content.split())
+                complexities.append(complexity)
+        return sum(complexities) / len(complexities) if complexities else 0
+    
+    regression_data['X3'] = df.apply(calculate_comment_complexity, axis=1)
+    
+    # 5. X4 (comment readability)
+    print("\n5. 计算 X4 (评论可读性)")
+    regression_data['X4'] = regression_data.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1)  # X2 / X3, or 0 when X3 is 0
+    
+    # 6. X5 (content sentiment)
+    print("\n6. 计算 X5 (内容情感性)")
+    def calculate_sentiment(row):
+        sentiments = []
+        for col in comment_columns:
+            content = str(row.get(col, ''))
+            if content and content != 'nan':
+                # 简单的情感分析
+                positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive']
+                negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative']
+                
+                sentiment = 0
+                lower_content = content.lower()
+                
+                if any(word in lower_content for word in positive_words):
+                    sentiment = 1
+                elif any(word in lower_content for word in negative_words):
+                    sentiment = -1
+                
+                sentiments.append(sentiment)
+        return sum(sentiments) / len(sentiments) if sentiments else 0
+    
+    regression_data['X5'] = df.apply(calculate_sentiment, axis=1)  # one averaged sentiment score per row
+    
+    # 7. X6 (information richness)
+    print("\n7. 计算 X6 (信息丰富度)")
+    def calculate_information_richness(row):  # mean 0-3 richness score over the row's non-empty comment cells, or 0 if there are none
+        richness_scores = []
+        for col in comment_columns:
+            content = str(row.get(col, ''))
+            if content and content != 'nan':  # skips empty strings and NaN cells (which stringify to 'nan')
+                score = 0
+                # Contains a digit
+                if re.search(r'\d', content):
+                    score += 1
+                # Contains a link
+                if re.search(r'http[s]?://', content):
+                    score += 1
+                # Contains an emoji or emoticon (simple check)
+                if re.search(r'[\u2600-\u27BF]|[:;][-]?[)D]', content):
+                    score += 1
+                richness_scores.append(score)
+        return sum(richness_scores) / len(richness_scores) if richness_scores else 0
+    
+    regression_data['X6'] = df.apply(calculate_information_richness, axis=1)
+    
+    # Data cleaning
+    print("\n8. 数据清洗")
+    # Make sure every value is numeric; anything that cannot be converted becomes 0
+    for col in regression_data.columns:
+        regression_data[col] = pd.to_numeric(regression_data[col], errors='coerce').fillna(0)
+    
+    # Validate the data
+    print("\n9. 数据验证")
+    print(f"行数: {len(regression_data)}")
+    print(f"列数: {len(regression_data.columns)}")
+    print(f"列名: {list(regression_data.columns)}")
+    print(f"数据类型:")
+    print(regression_data.dtypes)
+    print(f"\n前5行数据:")
+    print(regression_data.head())
+    
+    # Save the file
+    print("\n10. 保存文件")
+    regression_data.to_excel(output_file, index=False)
+    
+    # Verify that the file was created
+    if os.path.exists(output_file):
+        print(f"文件已成功保存到: {output_file}")
+        print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
+    else:
+        print("错误: 文件保存失败")
+    
+    print()
+    print("========================================")
+    print("  任务完成")
+    print("========================================")
+    
+except Exception as e:  # top-level boundary: report the error with its traceback instead of crashing
+    print(f"处理文件时出错: {str(e)}")
+    import traceback
+    traceback.print_exc()
diff --git a/DataCleaner/check_data_structure.py b/DataCleaner/check_data_structure.py
new file mode 100644
index 0000000..9489ed3
--- /dev/null
+++ b/DataCleaner/check_data_structure.py
@@ -0,0 +1,43 @@
+import os
+import pandas as pd
+
+# File path (hard-coded absolute Windows path) of the workbook whose structure is printed
+input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
+
+print("========================================")
+print("  检查数据结构")
+print("========================================")
+print(f"输入文件: {input_file}")
+print()
+
+# Check that the input file exists
+if not os.path.exists(input_file):
+    print("错误: 输入文件不存在！")
+    exit(1)  # stop with status 1; nothing below can run without the input
+
+print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
+
+# Read the raw data and print its shape, column names, first rows and dtypes
+try:
+    print("正在读取原始数据...")
+    df = pd.read_excel(input_file)
+    print(f"成功读取 {len(df)} 行数据")
+    print(f"列数: {len(df.columns)}")
+    print(f"\n所有列名:")
+    for i, col in enumerate(df.columns, 1):  # numbered from 1
+        print(f"{i}. {col}")
+    
+    print("\n前3行数据:")
+    print(df.head(3))
+    
+    print("\n数据类型:")
+    print(df.dtypes)
+    
+    print("\n========================================")
+    print("  数据结构检查完成")
+    print("========================================")
+    
+except Exception as e:  # top-level boundary: report the error with its traceback instead of crashing
+    print(f"处理文件时出错: {str(e)}")
+    import traceback
+    traceback.print_exc()
diff --git a/DataCleaner/check_excel_size.py b/DataCleaner/check_excel_size.py
new file mode 100644
index 0000000..de8d514
--- /dev/null
+++ b/DataCleaner/check_excel_size.py
@@ -0,0 +1,53 @@
+import os
+import openpyxl
+
+# File paths (hard-coded absolute Windows paths) of the two workbooks whose dimensions are reported
+input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
+output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'
+
+print("========================================")
+print("  检查Excel文件大小")
+print("========================================")
+print(f"输入文件: {input_file}")
+print(f"输出文件: {output_file}")
+print()
+
+# Check the input file
+if os.path.exists(input_file):
+    print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
+    try:
+        wb = openpyxl.load_workbook(input_file)  # NOTE(review): the workbook is never closed explicitly
+        ws = wb.active  # only the active sheet is measured
+        print(f"输入文件行数: {ws.max_row}")
+        print(f"输入文件列数: {ws.max_column}")
+    except Exception as e:
+        print(f"读取输入文件出错: {e}")
+else:
+    print("输入文件不存在！")
+
+# Check the output file
+if os.path.exists(output_file):
+    print(f"\n输出文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
+    try:
+        wb = openpyxl.load_workbook(output_file)  # NOTE(review): the workbook is never closed explicitly
+        ws = wb.active  # only the active sheet is measured
+        print(f"输出文件行数: {ws.max_row}")
+        print(f"输出文件列数: {ws.max_column}")
+        
+        # Show the first 10 sheet rows (row 1 included)
+        print("\n前10行数据:")
+        for row in range(1, min(11, ws.max_row + 1)):
+            row_data = []
+            for col in range(1, ws.max_column + 1):
+                value = ws.cell(row=row, column=col).value
+                row_data.append(value)
+            print(f"行 {row}: {row_data}")
+    except Exception as e:
+        print(f"读取输出文件出错: {e}")
+else:
+    print("输出文件不存在！")
+
+print()
+print("========================================")
+print("  检查完成")
+print("========================================")
diff --git a/DataCleaner/create_and_fill_data.py b/DataCleaner/create_and_fill_data.py
new file mode 100644
index 0000000..980417a
--- /dev/null
+++ b/DataCleaner/create_and_fill_data.py
@@ -0,0 +1,69 @@
+import os
+import csv
+
+# 文件路径
+output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.csv'
+
+print("========================================")
+print("  创建并填充UGC回归数据")
+print("========================================")
+print(f"输出文件: {output_file}")
+print()
+
+# 检查输出目录是否存在
+output_dir = os.path.dirname(output_file)
+print(f"输出目录: {output_dir}")
+print(f"目录存在: {os.path.exists(output_dir)}")
+
+if not os.path.exists(output_dir):
+    print("正在创建输出目录...")
+    try:
+        os.makedirs(output_dir)
+        print("目录创建成功")
+    except Exception as e:
+        print(f"创建目录失败: {e}")
+        exit(1)
+
+# 创建并填充CSV文件
+try:
+    print("\n创建并填充CSV文件...")
+    with open(output_file, 'w', newline='', encoding='utf-8-sig') as f:
+        writer = csv.writer(f)
+        
+        # 写入表头
+        headers = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
+        writer.writerow(headers)
+        
+        # 写入示例数据（前10行）
+        for i in range(1, 11):
+            row = [
+                i * 0.5,      # Y: UGC有用性
+                i * 2,        # X1: 评论数量
+                i * 10,       # X2: 评论长度
+                i * 2,        # X3: 评论复杂度
+                5.0,          # X4: 评论可读性
+                (i % 3) - 1,  # X5: 内容情感性
+                i * 0.3       # X6: 信息丰富度
+            ]
+            writer.writerow(row)
+    
+    print(f"文件已成功创建: {output_file}")
+    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
+    
+    # 读取并显示文件内容
+    print("\n文件内容:")
+    with open(output_file, 'r', encoding='utf-8-sig') as f:
+        reader = csv.reader(f)
+        for i, row in enumerate(reader):
+            if i < 5:
+                print(f"行 {i+1}: {row}")
+    
+    print()
+    print("========================================")
+    print("  任务完成")
+    print("========================================")
+    
+except Exception as e:
+    print(f"处理文件时出错: {str(e)}")
+    import traceback
+    traceback.print_exc()
diff --git a/DataCleaner/create_excel_with_data.py b/DataCleaner/create_excel_with_data.py
new file mode 100644
index 0000000..a256d27
--- /dev/null
+++ b/DataCleaner/create_excel_with_data.py
@@ -0,0 +1,86 @@
+import os
+import openpyxl
+
+# 文件路径
+output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'
+
+print("========================================")
+print("  创建Excel文件并填充数据")
+print("========================================")
+print(f"输出文件: {output_file}")
+print()
+
+# 检查输出目录是否存在
+output_dir = os.path.dirname(output_file)
+print(f"输出目录: {output_dir}")
+print(f"目录存在: {os.path.exists(output_dir)}")
+
+if not os.path.exists(output_dir):
+    print("正在创建输出目录...")
+    try:
+        os.makedirs(output_dir)
+        print("目录创建成功")
+    except Exception as e:
+        print(f"创建目录失败: {e}")
+        exit(1)
+
+# 创建Excel文件
+try:
+    print("\n创建Excel文件...")
+    wb = openpyxl.Workbook()
+    ws = wb.active
+    
+    # 写入表头
+    headers = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
+    for i, header in enumerate(headers, 1):
+        ws.cell(row=1, column=i, value=header)
+    
+    # 写入示例数据（前10行）
+    print("填充示例数据...")
+    for i in range(1, 11):
+        ws.cell(row=i+1, column=1, value=i * 0.5)      # Y: UGC有用性
+        ws.cell(row=i+1, column=2, value=i * 2)        # X1: 评论数量
+        ws.cell(row=i+1, column=3, value=i * 10)       # X2: 评论长度
+        ws.cell(row=i+1, column=4, value=i * 2)        # X3: 评论复杂度
+        ws.cell(row=i+1, column=5, value=5.0)          # X4: 评论可读性
+        ws.cell(row=i+1, column=6, value=(i % 3) - 1)  # X5: 内容情感性
+        ws.cell(row=i+1, column=7, value=i * 0.3)       # X6: 信息丰富度
+    
+    # 保存文件
+    print("保存文件...")
+    wb.save(output_file)
+    
+    print(f"文件已成功创建: {output_file}")
+    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
+    
+    # 验证文件
+    print("\n验证文件...")
+    if os.path.exists(output_file):
+        print("文件创建成功！")
+        # 重新打开文件读取内容
+        wb_check = openpyxl.load_workbook(output_file)
+        ws_check = wb_check.active
+        print(f"工作表名称: {ws_check.title}")
+        print(f"行数: {ws_check.max_row}")
+        print(f"列数: {ws_check.max_column}")
+        
+        # 显示前5行
+        print("\n前5行数据:")
+        for row in range(1, min(6, ws_check.max_row + 1)):
+            row_data = []
+            for col in range(1, ws_check.max_column + 1):
+                value = ws_check.cell(row=row, column=col).value
+                row_data.append(value)
+            print(f"行 {row}: {row_data}")
+    else:
+        print("文件创建失败！")
+    
+    print()
+    print("========================================")
+    print("  任务完成")
+    print("========================================")
+    
+except Exception as e:
+    print(f"处理文件时出错: {str(e)}")
+    import traceback
+    traceback.print_exc()
diff --git a/DataCleaner/create_regression_data.py b/DataCleaner/create_regression_data.py
new file mode 100644
index 0000000..9100b20
--- /dev/null
+++ b/DataCleaner/create_regression_data.py
@@ -0,0 +1,112 @@
+import os
+import pandas as pd
+import numpy as np
+import re
+
+# 文件路径
+input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
+output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'
+
+print("========================================")
+print("  创建UGC回归数据文件")
+print("========================================")
+print(f"输入文件: {input_file}")
+print(f"输出文件: {output_file}")
+print()
+
+# 检查输入文件是否存在
+if not os.path.exists(input_file):
+    print("错误: 输入文件不存在！")
+    exit(1)
+
+print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
+
+# 读取原始数据
+try:
+    print("正在读取原始数据...")
+    df = pd.read_excel(input_file)
+    print(f"成功读取 {len(df)} 行数据")
+    print(f"列名: {list(df.columns)}")
+    print()
+    
+    # 创建新的回归数据DataFrame
+    regression_data = pd.DataFrame()
+    
+    # 1. 提取因变量Y (helpfull列)
+    print("1. 提取因变量Y (helpfull列)")
+    if 'helpfull' in df.columns:
+        regression_data['Y'] = df['helpfull'].fillna(0)
+        print(f"成功提取 Y 列，共 {len(regression_data['Y'])} 个值")
+    else:
+        print("警告: 未找到 helpfull 列，使用默认值 0")
+        regression_data['Y'] = 0
+    
+    # 2. 提取X1 (评论总数列)
+    print("\n2. 提取X1 (评论总数列)")
+    comment_columns = [col for col in df.columns if '评论' in col and '总数' in col]
+    if comment_columns:
+        regression_data['X1'] = df[comment_columns[0]].fillna(0)
+        print(f"成功提取 X1 列，使用列: {comment_columns[0]}")
+    else:
+        print("警告: 未找到评论总数列，使用默认值 0")
+        regression_data['X1'] = 0
+    
+    # 3. 计算X2-X6
+    print("\n3. 计算X2-X6")
+    
+    # X2: 评论长度
+    print("   - 计算X2 (评论长度)")
+    regression_data['X2'] = 0
+    
+    # X3: 评论复杂度
+    print("   - 计算X3 (评论复杂度)")
+    regression_data['X3'] = 0
+    
+    # X4: 评论可读性
+    print("   - 计算X4 (评论可读性)")
+    regression_data['X4'] = 0
+    
+    # X5: 内容情感性
+    print("   - 计算X5 (内容情感性)")
+    regression_data['X5'] = 0
+    
+    # X6: 信息丰富度
+    print("   - 计算X6 (信息丰富度)")
+    regression_data['X6'] = 0
+    
+    # 4. 数据清洗
+    print("\n4. 数据清洗")
+    # 确保所有值都是数字
+    for col in regression_data.columns:
+        regression_data[col] = pd.to_numeric(regression_data[col], errors='coerce').fillna(0)
+    
+    # 5. 验证数据
+    print("\n5. 数据验证")
+    print(f"行数: {len(regression_data)}")
+    print(f"列数: {len(regression_data.columns)}")
+    print(f"列名: {list(regression_data.columns)}")
+    print(f"数据类型:")
+    print(regression_data.dtypes)
+    print(f"\n前5行数据:")
+    print(regression_data.head())
+    
+    # 6. 保存文件
+    print("\n6. 保存文件")
+    regression_data.to_excel(output_file, index=False)
+    
+    # 验证文件是否创建成功
+    if os.path.exists(output_file):
+        print(f"文件已成功保存到: {output_file}")
+        print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
+    else:
+        print("错误: 文件保存失败")
+    
+    print()
+    print("========================================")
+    print("  任务完成")
+    print("========================================")
+    
+except Exception as e:
+    print(f"处理文件时出错: {str(e)}")
+    import traceback
+    traceback.print_exc()
diff --git a/DataCleaner/create_regression_data_v2.py b/DataCleaner/create_regression_data_v2.py
new file mode 100644
index 0000000..6e18bed
--- /dev/null
+++ b/DataCleaner/create_regression_data_v2.py
@@ -0,0 +1,142 @@
+import os
+import pandas as pd
+import numpy as np
+
+# 文件路径
+input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
+output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'
+
+print("========================================")
+print("  创建UGC回归数据文件 v2")
+print("========================================")
+print(f"输入文件: {input_file}")
+print(f"输出文件: {output_file}")
+print()
+
+# 检查输入文件是否存在
+if not os.path.exists(input_file):
+    print("错误: 输入文件不存在！")
+    print(f"检查路径: {input_file}")
+    exit(1)
+
+print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
+print(f"文件存在: {os.path.exists(input_file)}")
+
+# 检查输出目录是否存在
+output_dir = os.path.dirname(output_file)
+print(f"输出目录: {output_dir}")
+print(f"目录存在: {os.path.exists(output_dir)}")
+
+if not os.path.exists(output_dir):
+    print("正在创建输出目录...")
+    try:
+        os.makedirs(output_dir)
+        print("目录创建成功")
+    except Exception as e:
+        print(f"创建目录失败: {e}")
+        exit(1)
+
+# 读取原始数据
+try:
+    print("\n正在读取原始数据...")
+    # 尝试读取文件
+    df = pd.read_excel(input_file)
+    print(f"成功读取 {len(df)} 行数据")
+    print(f"列名: {list(df.columns)}")
+    
+    # 显示前几行数据以了解结构
+    print("\n前3行数据:")
+    print(df.head(3))
+    
+    # 创建新的回归数据DataFrame
+    regression_data = pd.DataFrame()
+    
+    # 1. 提取因变量Y (helpfull列)
+    print("\n1. 提取因变量Y (helpfull列)")
+    if 'helpfull' in df.columns:
+        regression_data['Y'] = df['helpfull'].fillna(0)
+        print(f"成功提取 Y 列，共 {len(regression_data['Y'])} 个值")
+        print(f"Y列前5个值: {list(regression_data['Y'].head())}")
+    else:
+        print("警告: 未找到 helpfull 列，使用默认值 0")
+        regression_data['Y'] = 0
+    
+    # 2. Extract X1 (the total-comment-count column)
+    print("\n2. 提取X1 (评论总数列)")
+    # Labels are passed through str() so non-string headers cannot raise TypeError; the
+    # stable sort moves labels containing '总数' first so a comment-text column is a fallback only.
+    comment_columns = sorted((col for col in df.columns if '评论' in str(col)), key=lambda col: '总数' not in str(col))
+    print(f"找到评论相关列: {comment_columns}")
+    if comment_columns:
+        regression_data['X1'] = df[comment_columns[0]].fillna(0)
+        print(f"成功提取 X1 列，使用列: {comment_columns[0]}")
+        print(f"X1列前5个值: {list(regression_data['X1'].head())}")
+    else:
+        print("警告: 未找到评论列，使用默认值 0")
+        regression_data['X1'] = 0
+    
+    # 3. 计算X2-X6
+    print("\n3. 计算X2-X6")
+    
+    # X2: 评论长度
+    print("   - 计算X2 (评论长度)")
+    regression_data['X2'] = 0
+    
+    # X3: 评论复杂度
+    print("   - 计算X3 (评论复杂度)")
+    regression_data['X3'] = 0
+    
+    # X4: 评论可读性
+    print("   - 计算X4 (评论可读性)")
+    regression_data['X4'] = 0
+    
+    # X5: 内容情感性
+    print("   - 计算X5 (内容情感性)")
+    regression_data['X5'] = 0
+    
+    # X6: 信息丰富度
+    print("   - 计算X6 (信息丰富度)")
+    regression_data['X6'] = 0
+    
+    # 4. 数据清洗
+    print("\n4. 数据清洗")
+    # 确保所有值都是数字
+    for col in regression_data.columns:
+        regression_data[col] = pd.to_numeric(regression_data[col], errors='coerce').fillna(0)
+    
+    # 5. 验证数据
+    print("\n5. 数据验证")
+    print(f"行数: {len(regression_data)}")
+    print(f"列数: {len(regression_data.columns)}")
+    print(f"列名: {list(regression_data.columns)}")
+    print(f"数据类型:")
+    print(regression_data.dtypes)
+    print(f"\n前5行数据:")
+    print(regression_data.head())
+    
+    # 6. Save the file
+    print("\n6. 保存文件")
+    print(f"保存路径: {output_file}")
+    
+    try:
+        regression_data.to_excel(output_file, index=False)
+        print("文件保存成功")
+    except Exception as e:
+        print(f"保存文件失败: {e}")
+        raise  # hand over to the outer handler; a stale file from an earlier run would pass the check below
+    # Confirm the output file is actually on disk after the write
+    if os.path.exists(output_file):
+        print(f"文件已成功保存到: {output_file}")
+        print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
+    else:
+        print("错误: 文件保存失败，未找到输出文件")
+    
+    print()
+    print("========================================")
+    print("  任务完成")
+    print("========================================")
+    
+except Exception as e:
+    print(f"处理文件时出错: {str(e)}")
+    import traceback
+    traceback.print_exc()
diff --git a/DataCleaner/d b/DataCleaner/d
new file mode 100644
index 0000000..e69de29
diff --git a/DataCleaner/data_cleaner.py b/DataCleaner/data_cleaner.py
new file mode 100644
index 0000000..d9f2d42
--- /dev/null
+++ b/DataCleaner/data_cleaner.py
@@ -0,0 +1,73 @@
+import os
+import pandas as pd
+
+# 输入输出文件路径
+input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
+output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.csv'
+
+print("========================================")
+print("  Python 数据清洗脚本")
+print("========================================")
+print(f"输入文件: {input_file}")
+print(f"输出文件: {output_file}")
+print()
+
+# 检查文件是否存在
+if not os.path.exists(input_file):
+    print("错误: 输入文件不存在！")
+    exit(1)
+
+print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
+
+# 读取Excel文件
+try:
+    print("正在读取Excel文件...")
+    df = pd.read_excel(input_file)
+    print(f"成功读取 {len(df)} 行数据")
+    
+    # 数据清洗
+    print("正在清洗数据...")
+    
+    # 1. 处理缺失值
+    df = df.fillna('')
+    
+    # 2. 去除文本中的多余空格
+    for col in df.columns:
+        if df[col].dtype == 'object':
+            df[col] = df[col].astype(str).str.strip()
+            df[col] = df[col].str.replace('\\s+', ' ', regex=True)
+    
+    # 3. 规范化情感倾向
+    if '情感倾向' in df.columns:
+        def normalize_sentiment(sentiment):
+            # Map free-form sentiment text onto '积极' / '消极', defaulting to '中性'.
+            if pd.isna(sentiment) or sentiment == '':
+                return '中性'
+            text = str(sentiment).lower()
+            # Groups are tried in order, so positive keywords win over negative ones.
+            for label, keywords in (('积极', ('积极', '正面', 'positive')), ('消极', ('消极', '负面', 'negative'))):
+                if any(keyword in text for keyword in keywords):
+                    return label
+            return '中性'
+        
+        df['情感倾向'] = df['情感倾向'].apply(normalize_sentiment)
+    
+    # 4. 确保输出目录存在
+    output_dir = os.path.dirname(output_file)
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+    
+    # 保存为CSV文件
+    print("正在保存清洗后的数据...")
+    df.to_csv(output_file, index=False, encoding='utf-8-sig')
+    
+    print(f"数据已成功保存到: {output_file}")
+    print(f"保存了 {len(df)} 行清洗后的数据")
+    
+    print()
+    print("========================================")
+    print("  数据清洗任务完成")
+    print("========================================")
+    
+except Exception as e:
+    print(f"处理文件时出错: {str(e)}")
diff --git a/DataCleaner/data_cleaner_v2.py b/DataCleaner/data_cleaner_v2.py
new file mode 100644
index 0000000..a27eef6
--- /dev/null
+++ b/DataCleaner/data_cleaner_v2.py
@@ -0,0 +1,98 @@
+import os
+import pandas as pd
+
+# 输入输出文件路径
+input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
+output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.csv'
+
+print("========================================")
+print("  Python 数据清洗脚本 v2")
+print("========================================")
+print(f"输入文件: {input_file}")
+print(f"输出文件: {output_file}")
+print()
+
+# 检查文件是否存在
+if not os.path.exists(input_file):
+    print("错误: 输入文件不存在！")
+    print(f"检查路径: {input_file}")
+    exit(1)
+
+print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
+print(f"文件存在: {os.path.exists(input_file)}")
+
+# 读取Excel文件
+try:
+    print("正在读取Excel文件...")
+    # 尝试读取前10行数据
+    df = pd.read_excel(input_file, nrows=10)
+    print(f"成功读取 {len(df)} 行示例数据")
+    print(f"列名: {list(df.columns)}")
+    
+    # 读取全部数据
+    print("正在读取全部数据...")
+    df = pd.read_excel(input_file)
+    print(f"成功读取 {len(df)} 行完整数据")
+    
+    # 数据清洗
+    print("正在清洗数据...")
+    
+    # 1. 处理缺失值
+    print(f"清洗前 - 缺失值统计:")
+    print(df.isnull().sum())
+    df = df.fillna('')
+    
+    # 2. 去除文本中的多余空格
+    for col in df.columns:
+        if df[col].dtype == 'object':
+            df[col] = df[col].astype(str).str.strip()
+            df[col] = df[col].str.replace('\\s+', ' ', regex=True)
+    
+    # 3. 规范化情感倾向
+    if '情感倾向' in df.columns:
+        def normalize_sentiment(sentiment):
+            # Map free-form sentiment text onto '积极' / '消极', defaulting to '中性'.
+            if pd.isna(sentiment) or sentiment == '':
+                return '中性'
+            text = str(sentiment).lower()
+            # Groups are tried in order, so positive keywords win over negative ones.
+            for label, keywords in (('积极', ('积极', '正面', 'positive')), ('消极', ('消极', '负面', 'negative'))):
+                if any(keyword in text for keyword in keywords):
+                    return label
+            return '中性'
+        
+        df['情感倾向'] = df['情感倾向'].apply(normalize_sentiment)
+        print("情感倾向规范化完成")
+    
+    # 4. 确保输出目录存在
+    output_dir = os.path.dirname(output_file)
+    print(f"输出目录: {output_dir}")
+    print(f"目录存在: {os.path.exists(output_dir)}")
+    
+    if not os.path.exists(output_dir):
+        print("正在创建输出目录...")
+        os.makedirs(output_dir)
+    
+    # 保存为CSV文件
+    print("正在保存清洗后的数据...")
+    print(f"保存路径: {output_file}")
+    
+    df.to_csv(output_file, index=False, encoding='utf-8-sig')
+    
+    # 验证文件是否创建成功
+    if os.path.exists(output_file):
+        print(f"数据已成功保存到: {output_file}")
+        print(f"保存文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
+        print(f"保存了 {len(df)} 行清洗后的数据")
+    else:
+        print("错误: 文件保存失败，未找到输出文件")
+    
+    print()
+    print("========================================")
+    print("  数据清洗任务完成")
+    print("========================================")
+    
+except Exception as e:
+    print(f"处理文件时出错: {str(e)}")
+    import traceback
+    traceback.print_exc()
diff --git a/DataCleaner/debug_log.txt b/DataCleaner/debug_log.txt
new file mode 100644
index 0000000..743022f
--- /dev/null
+++ b/DataCleaner/debug_log.txt
@@ -0,0 +1,11 @@
+开始调试...
+当前目录: D:\java\project
+pandas导入成功
+输入文件: D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx
+文件存在: True
+文件大小: 21607.43 KB
+开始读取...
+读取成功: 30308 行
+列数: 68
+前5列: ['作者', '作者链接', '标题', '内容', 'tag']
+调试结束
diff --git a/DataCleaner/debug_process.py b/DataCleaner/debug_process.py
new file mode 100644
index 0000000..4edd81f
--- /dev/null
+++ b/DataCleaner/debug_process.py
@@ -0,0 +1,36 @@
+import os
+import sys
+
+# Send every print() to a log file; stdout is restored in the `finally` clause below.
+log_file = open(r'D:\java\project\debug_log.txt', 'w', encoding='utf-8')
+original_stdout = sys.stdout
+sys.stdout = log_file
+try:
+    print("开始调试...")
+    print(f"当前目录: {os.getcwd()}")
+    
+    import pandas as pd
+    print("pandas导入成功")
+    
+    input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
+    print(f"输入文件: {input_file}")
+    print(f"文件存在: {os.path.exists(input_file)}")
+    
+    if os.path.exists(input_file):
+        print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
+        print("开始读取...")
+        df = pd.read_excel(input_file, engine='openpyxl')
+        print(f"读取成功: {len(df)} 行")
+        print(f"列数: {len(df.columns)}")
+        print(f"前5列: {list(df.columns)[:5]}")
+except Exception as e:
+    print(f"错误: {e}")
+    import traceback
+    # print_exc() defaults to stderr, which is not redirected; write the traceback to the log instead.
+    traceback.print_exc(file=sys.stdout)
+finally:
+    # Runs on every exit path so the console is restored and the log is flushed and closed.
+    print("调试结束")
+    sys.stdout = original_stdout
+    log_file.close()
+print("日志已保存")
diff --git a/DataCleaner/debug_script.py b/DataCleaner/debug_script.py
new file mode 100644
index 0000000..12d0b28
--- /dev/null
+++ b/DataCleaner/debug_script.py
@@ -0,0 +1,51 @@
+import os
+import sys
+
+print("========================================")
+print("  调试脚本")
+print("========================================")
+print(f"Python版本: {sys.version}")
+print(f"当前目录: {os.getcwd()}")
+print()
+
+# 检查pandas
+print("检查pandas...")
+try:
+    import pandas as pd
+    print(f"pandas版本: {pd.__version__}")
+except ImportError as e:
+    print(f"pandas未安装: {e}")
+    exit(1)
+
+# 检查openpyxl
+print("\n检查openpyxl...")
+try:
+    import openpyxl
+    print(f"openpyxl版本: {openpyxl.__version__}")
+except ImportError as e:
+    print(f"openpyxl未安装: {e}")
+    exit(1)
+
+# 检查文件
+input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
+print(f"\n检查输入文件:")
+print(f"路径: {input_file}")
+print(f"存在: {os.path.exists(input_file)}")
+if os.path.exists(input_file):
+    print(f"大小: {os.path.getsize(input_file) / 1024:.2f} KB")
+    
+    # 尝试读取
+    print("\n尝试读取文件...")
+    try:
+        df = pd.read_excel(input_file, nrows=5)  # 只读前5行
+        print(f"成功读取 {len(df)} 行")
+        print(f"列名: {list(df.columns)}")
+    except Exception as e:
+        print(f"读取失败: {e}")
+        import traceback
+        traceback.print_exc()
+
+print()
+print("========================================")
+print("  调试完成")
+print("========================================")
diff --git a/DataCleaner/import_data.py b/DataCleaner/import_data.py
new file mode 100644
index 0000000..74b2473
--- /dev/null
+++ b/DataCleaner/import_data.py
@@ -0,0 +1,50 @@
+import os
+import pandas as pd
+
+# 文件路径
+input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'
+output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'
+
+print("========================================")
+print("  数据导入操作")
+print("========================================")
+print(f"输入文件: {input_file}")
+print(f"输出文件: {output_file}")
+print()
+
+# 检查文件是否存在
+if not os.path.exists(input_file):
+    print("错误: 输入文件不存在！")
+    exit(1)
+
+print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
+
+# 读取数据
+try:
+    print("正在读取数据...")
+    df = pd.read_excel(input_file)
+    print(f"成功读取 {len(df)} 行数据")
+    print(f"列名: {list(df.columns)}")
+    print(f"数据类型:")
+    print(df.dtypes)
+    
+    print("\n前5行数据:")
+    print(df.head())
+    
+    # Write back to the same workbook: output_file is the same path as input_file, so this overwrites the file just read
+    print("\n写入数据到目标文件...")
+    df.to_excel(output_file, index=False)
+    
+    print(f"数据已成功导入到: {output_file}")
+    print(f"总行数: {len(df)}")
+    print(f"总列数: {len(df.columns)}")
+    
+    print()
+    print("========================================")
+    print("  数据导入完成")
+    print("========================================")
+    
+except Exception as e:
+    print(f"处理文件时出错: {str(e)}")
+    import traceback
+    traceback.print_exc()
diff --git a/DataCleaner/minimal_test.py b/DataCleaner/minimal_test.py
new file mode 100644
index 0000000..d62139b
--- /dev/null
+++ b/DataCleaner/minimal_test.py
@@ -0,0 +1,17 @@
+import os
+print("测试开始")
+print(f"当前目录: {os.getcwd()}")
+
+input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
+print(f"文件存在: {os.path.exists(input_file)}")
+
+if os.path.exists(input_file):
+    print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
+    print("尝试读取...")
+    try:
+        import pandas as pd
+        df = pd.read_excel(input_file, nrows=10)
+        print(f"成功读取 {len(df)} 行")
+        print("测试完成")
+    except Exception as e:
+        print(f"错误: {e}")
diff --git a/DataCleaner/populate_regression_data.py b/DataCleaner/populate_regression_data.py
new file mode 100644
index 0000000..65cec2e
--- /dev/null
+++ b/DataCleaner/populate_regression_data.py
@@ -0,0 +1,113 @@
+import os
+import pandas as pd
+import openpyxl
+
+# 文件路径
+input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
+output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'
+
+print("========================================")
+print("  填充UGC回归数据")
+print("========================================")
+print(f"输入文件: {input_file}")
+print(f"输出文件: {output_file}")
+print()
+
+# 检查文件是否存在
+if not os.path.exists(input_file):
+    print("错误: 输入文件不存在！")
+    exit(1)
+
+if not os.path.exists(output_file):
+    print("错误: 输出文件不存在！")
+    exit(1)
+
+# 读取原始数据
+try:
+    print("正在读取原始数据...")
+    df = pd.read_excel(input_file)
+    print(f"成功读取 {len(df)} 行数据")
+    print(f"列名: {list(df.columns)}")
+    
+    # 打开输出文件
+    print("\n打开输出文件...")
+    wb = openpyxl.load_workbook(output_file)
+    ws = wb.active
+    
+    # 提取数据并填充
+    print("\n填充数据...")
+    
+    # Fill sheet column 1 (Y) from 'helpfull'; data starts at sheet row 2, below the header row
+    print("1. 填充Y列 (helpfull)")
+    if 'helpfull' in df.columns:
+        # errors='coerce' turns non-numeric cells into NaN (then 0), so one stray text
+        # value no longer aborts the whole run inside float().
+        y_values = pd.to_numeric(df['helpfull'], errors='coerce').fillna(0)
+        for i, value in enumerate(y_values, 2):
+            ws.cell(row=i, column=1, value=float(value))
+        print(f"成功填充 Y 列，共 {len(df)} 行")
+    else:
+        print("警告: 未找到 helpfull 列，使用默认值 0")
+        for i in range(2, len(df) + 2):
+            ws.cell(row=i, column=1, value=0)
+    
+    # Fill sheet column 2 (X1) from the comment-count column
+    print("\n2. 填充X1列 (评论总数)")
+    # str() guards non-string labels; the stable sort puts labels containing '总数' first,
+    # so a comment-text column is used only when no count column exists.
+    comment_columns = sorted((col for col in df.columns if '评论' in str(col)), key=lambda col: '总数' not in str(col))
+    if comment_columns:
+        x1_values = pd.to_numeric(df[comment_columns[0]], errors='coerce').fillna(0)
+        for i, value in enumerate(x1_values, 2):
+            ws.cell(row=i, column=2, value=float(value))
+        print(f"成功填充 X1 列，使用列: {comment_columns[0]}")
+    else:
+        print("警告: 未找到评论列，使用默认值 0")
+        for i in range(2, len(df) + 2):
+            ws.cell(row=i, column=2, value=0)
+    
+    # 计算X2-X6
+    print("\n3. 计算X2-X6")
+    
+    # X2: 评论长度
+    print("   - 填充X2 (评论长度)")
+    for i in range(2, len(df) + 2):
+        ws.cell(row=i, column=3, value=0)
+    
+    # X3: 评论复杂度
+    print("   - 填充X3 (评论复杂度)")
+    for i in range(2, len(df) + 2):
+        ws.cell(row=i, column=4, value=0)
+    
+    # X4: 评论可读性
+    print("   - 填充X4 (评论可读性)")
+    for i in range(2, len(df) + 2):
+        ws.cell(row=i, column=5, value=0)
+    
+    # X5: 内容情感性
+    print("   - 填充X5 (内容情感性)")
+    for i in range(2, len(df) + 2):
+        ws.cell(row=i, column=6, value=0)
+    
+    # X6: 信息丰富度
+    print("   - 填充X6 (信息丰富度)")
+    for i in range(2, len(df) + 2):
+        ws.cell(row=i, column=7, value=0)
+    
+    # 保存文件
+    print("\n4. 保存文件")
+    wb.save(output_file)
+    
+    print(f"文件已成功保存: {output_file}")
+    print(f"总行数: {len(df) + 1} (包括表头)")
+    print(f"总列数: 7")
+    
+    print()
+    print("========================================")
+    print("  任务完成")
+    print("========================================")
+    
+except Exception as e:
+    print(f"处理文件时出错: {str(e)}")
+    import traceback
+    traceback.print_exc()
diff --git a/DataCleaner/process_300_rows.py b/DataCleaner/process_300_rows.py
new file mode 100644
index 0000000..2bdb307
--- /dev/null
+++ b/DataCleaner/process_300_rows.py
@@ -0,0 +1,156 @@
+import os
+import pandas as pd
+import re
+
+print("=" * 60)
+print("  处理前300行数据作为测试")
+print("=" * 60)
+
+# 文件路径
+input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
+output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）_回归_300.xlsx'
+
+print(f"输入文件: {input_file}")
+print(f"输出文件: {output_file}")
+print()
+
+# 读取前300行
+print("读取前300行数据...")
+df = pd.read_excel(input_file, engine='openpyxl', nrows=300)
+print(f"成功读取 {len(df)} 行数据")
+print(f"原始列数: {len(df.columns)}")
+
+# 识别列
+print("\n识别列...")
+helpfull_col = None
+comment_count_col = None
+comment_cols = []
+
+for col in df.columns:
+    col_str = str(col).lower()
+    if 'helpfull' in col_str or 'helpful' in col_str:
+        helpfull_col = col
+        print(f"找到 Y 列 (helpfull): {col}")
+    elif '评论总数' in str(col) or '帖子评论总数' in str(col):
+        comment_count_col = col
+        print(f"找到 X1 列 (评论总数): {col}")
+    elif '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6)) and '内容' in str(col):
+        comment_cols.append(col)
+        print(f"找到评论列 {len(comment_cols)}: {col}")
+
+print(f"\n共找到 {len(comment_cols)} 个评论内容列")
+
+# 添加回归数据列
+print("\n添加回归数据列...")
+
+# Y (UGC有用性)
+print("1. 添加 Y (UGC有用性)")
+if helpfull_col:
+    df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0)
+else:
+    df['Y'] = 0
+
+# X1 (评论数量)
+print("2. 添加 X1 (评论数量)")
+if comment_count_col:
+    df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0)
+else:
+    df['X1'] = 0
+
+# 定义函数计算评论指标
+def calculate_comment_metrics(content):
+    """Return (length, complexity, sentiment, richness) for one comment cell.
+
+    length = characters excluding ASCII/ideographic spaces; complexity = whitespace-separated
+    tokens; sentiment = 1/-1/0 by keyword; richness = digit + URL + emoji/emoticon hits (0-3).
+    """
+    if pd.isna(content) or str(content) in ['None', 'nan', '']:
+        return 0, 0, 0, 0
+    content = str(content)
+    length = len(content.replace(' ', '').replace('\u3000', ''))
+    complexity = len(content.split())
+    positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like']
+    negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike']
+    lower_content = content.lower()
+    # Negative phrases can embed positive keywords ('不好' has '好', 'dislike' has 'like'),
+    # so positive keywords are searched only in the text left after masking negative ones.
+    masked = lower_content
+    for word in negative_words:
+        masked = masked.replace(word, ' ')
+    has_positive = any(word in masked for word in positive_words)
+    has_negative = any(word in lower_content for word in negative_words)
+    sentiment = 1 if has_positive else (-1 if has_negative else 0)
+    # One richness point per pattern that occurs: digit, URL, emoji/emoticon.
+    patterns = (r'\d', r'http[s]?://|www\.', r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]')
+    richness = sum(1 for pattern in patterns if re.search(pattern, content))
+    
+    return length, complexity, sentiment, richness
+
+# 计算评论相关指标
+print("3. 计算评论相关指标...")
+
+df['X2'] = 0.0
+df['X3'] = 0.0
+df['X5'] = 0.0
+df['X6'] = 0.0
+
+# Average each metric over the row's non-empty comments (X2/X3/X5/X6).
+for row_pos in range(len(df)):
+    row = df.iloc[row_pos]
+    # One (length, complexity, sentiment, richness) tuple per comment column.
+    scored = [calculate_comment_metrics(row.get(name, '')) for name in comment_cols]
+    # Zero-length comments (missing or blank cells) do not enter the averages.
+    kept = [metrics for metrics in scored if metrics[0] > 0]
+    if not kept:
+        # No usable comment in this row: the 0.0 defaults set above stay in place.
+        continue
+    
+    count = len(kept)
+    # Transpose the kept tuples into one sequence per metric.
+    lengths, complexities, sentiments, richness = zip(*kept)
+    # Positions are used as .loc labels, exactly as the original .loc[i, ...] writes did;
+    # NOTE(review): this relies on the default integer index from read_excel - confirm.
+    df.loc[row_pos, 'X2'] = sum(lengths) / count
+    df.loc[row_pos, 'X3'] = sum(complexities) / count
+    df.loc[row_pos, 'X5'] = sum(sentiments) / count
+    df.loc[row_pos, 'X6'] = sum(richness) / count
+
+# X4: 评论可读性
+print("4. 计算 X4 (评论可读性)")
+df['X4'] = df.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1)
+
+# 数据清洗
+print("\n5. 数据清洗...")
+regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
+for col in regression_cols:
+    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
+    df[col] = df[col].replace([float('inf'), float('-inf')], 0)
+
+# 验证数据
+print("\n6. 验证数据...")
+print(f"总行数: {len(df)}")
+print(f"总列数: {len(df.columns)}")
+print(f"\n回归数据列统计:")
+print(df[regression_cols].describe())
+print(f"\n前5行回归数据:")
+print(df[regression_cols].head())
+
+# 保存文件
+print("\n7. 保存文件...")
+df.to_excel(output_file, index=False, engine='openpyxl')
+
+# 验证文件
+print("\n8. 验证文件...")
+if os.path.exists(output_file):
+    print(f"文件已成功保存: {output_file}")
+    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
+    df_check = pd.read_excel(output_file)
+    print(f"输出文件行数: {len(df_check)}")
+    print(f"输出文件列数: {len(df_check.columns)}")
+else:
+    print("文件保存失败！")
+
+print()
+print("=" * 60)
+print("  任务完成")
+print("=" * 60)
diff --git a/DataCleaner/process_actual_data.py b/DataCleaner/process_actual_data.py
new file mode 100644
index 0000000..ddc09d0
--- /dev/null
+++ b/DataCleaner/process_actual_data.py
@@ -0,0 +1,200 @@
+import os
+import openpyxl
+import re
+
+# 文件路径
+input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
+output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'
+
+print("========================================")
+print("  根据实际原始数据计算回归数据")
+print("========================================")
+print(f"输入文件: {input_file}")
+print(f"输出文件: {output_file}")
+print()
+
+# Abort early when the input workbook is missing.
+if not os.path.exists(input_file):
+    print("错误: 输入文件不存在！")
+    raise SystemExit(1)  # exit() is a site-module REPL helper, not guaranteed in scripts
+
+print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
+
+# 读取原始数据
+try:
+    print("正在读取原始数据...")
+    wb_input = openpyxl.load_workbook(input_file)
+    ws_input = wb_input.active
+    
+    print(f"工作表名称: {ws_input.title}")
+    print(f"最大行数: {ws_input.max_row}")
+    print(f"最大列数: {ws_input.max_column}")
+    
+    # 识别列
+    print("\n识别列...")
+    headers = []
+    helpfull_col = None
+    comment_count_col = None
+    comment_cols = []
+    
+    for col in range(1, ws_input.max_column + 1):
+        header = ws_input.cell(row=1, column=col).value
+        headers.append(header)
+        if not header:
+            continue  # untitled column: recorded in headers, nothing to classify
+        title = str(header)
+        if 'helpful' in title.lower():  # substring test also matches 'helpfull'
+            helpfull_col = col
+            print(f"找到 Y 列 (helpfull): 列 {col}")
+        elif '评论总数' in title:  # substring test also matches '帖子评论总数'
+            comment_count_col = col
+            print(f"找到 X1 列 (评论总数): 列 {col}")
+        elif '评论' in title and any(digit in title for digit in '12345'):
+            comment_cols.append(col)
+            print(f"找到评论列 {len(comment_cols)}: 列 {col} - {header}")
+    
+    print(f"\n共找到 {len(comment_cols)} 个评论列")
+    
+    # Create the output workbook, or reuse the one left by a previous run.
+    if os.path.exists(output_file):
+        print("\n打开现有输出文件...")
+        wb_output = openpyxl.load_workbook(output_file)
+    else:
+        print("\n创建新的输出文件...")
+        wb_output = openpyxl.Workbook()
+    ws_output = wb_output.active
+    # Always (re)write the header row: a reused workbook is not guaranteed to
+    # carry the Y/X1..X6 titles in columns 1-7 that the row loop fills.
+    headers_output = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
+    for i, header in enumerate(headers_output, 1):
+        ws_output.cell(row=1, column=i, value=header)
+    
+    # Compute and fill the data rows.
+    print("\n计算并填充数据...")
+    total_rows = ws_input.max_row - 1
+    print(f"总数据行数: {total_rows}")
+    
+    # A reused workbook may be longer than the current input; drop the stale
+    # tail so rows from an earlier run do not leak into the regression data.
+    if ws_output.max_row > ws_input.max_row:
+        ws_output.delete_rows(ws_input.max_row + 1, ws_output.max_row - ws_input.max_row)
+    for row in range(2, ws_input.max_row + 1):
+        if row % 100 == 0:
+            print(f"处理到第 {row-1} 行...")
+        if row % 1000 == 0:
+            print(f"已处理 {row-1} 行，共 {total_rows} 行")
+        
+        # Y (UGC helpfulness) -> column 1, X1 (comment count) -> column 2.
+        # Blank cells and cells that do not parse as a number become 0, so one
+        # stray text value no longer aborts the whole run before saving.
+        for out_col, src_col in ((1, helpfull_col), (2, comment_count_col)):
+            raw = ws_input.cell(row=row, column=src_col).value if src_col else None
+            try:
+                number = float(raw) if raw else 0
+            except (TypeError, ValueError):
+                number = 0
+            ws_output.cell(row=row, column=out_col, value=number)
+        
+        # Per-comment metrics for this post; averaged after the inner loop.
+        comment_lengths = []
+        comment_complexities = []
+        comment_sentiments = []
+        comment_richness = []
+        
+        for col in comment_cols:
+            content = str(ws_input.cell(row=row, column=col).value)
+            if content and content != 'None' and content != 'nan':
+                # X2: comment length (characters, ASCII spaces removed)
+                length = len(content.replace(' ', ''))
+                comment_lengths.append(length)
+                
+                # X3: comment complexity (whitespace-separated token count)
+                complexity = len(content.split())
+                comment_complexities.append(complexity)
+                
+                # X5: sentiment (positive=1, neutral=0, negative=-1)
+                positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent']
+                negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor']
+                
+                lower_content = content.lower()
+                # Blank out negative phrases before looking for positive ones:
+                # '不好' contains '好', so an unmasked positive check scored
+                # "不好" as +1 and that negative entry could never match.
+                masked = lower_content
+                for word in negative_words:
+                    masked = masked.replace(word, ' ')
+                sentiment = 0
+                if any(word in masked for word in positive_words):
+                    sentiment = 1
+                elif masked != lower_content:  # a negative phrase was present
+                    sentiment = -1
+                comment_sentiments.append(sentiment)
+                
+                # X6: richness (one point each for digit / link / emoji, max 3)
+                richness = 0
+                if re.search(r'\d', content):
+                    richness += 1
+                if re.search(r'http[s]?://', content):
+                    richness += 1
+                if re.search(r'[\u2600-\u27BF]|[:;][-]?[)D]', content):
+                    richness += 1
+                comment_richness.append(richness)
+        
+        # X2: mean comment length
+        x2_value = sum(comment_lengths) / len(comment_lengths) if comment_lengths else 0
+        ws_output.cell(row=row, column=3, value=x2_value)
+        
+        # X3: mean comment complexity
+        x3_value = sum(comment_complexities) / len(comment_complexities) if comment_complexities else 0
+        ws_output.cell(row=row, column=4, value=x3_value)
+        
+        # X4: readability (X2 / X3, recorded as 0 when X3 is 0)
+        x4_value = x2_value / x3_value if x3_value > 0 else 0
+        ws_output.cell(row=row, column=5, value=x4_value)
+        
+        # X5: mean sentiment
+        x5_value = sum(comment_sentiments) / len(comment_sentiments) if comment_sentiments else 0
+        ws_output.cell(row=row, column=6, value=x5_value)
+        
+        # X6: mean richness
+        x6_value = sum(comment_richness) / len(comment_richness) if comment_richness else 0
+        ws_output.cell(row=row, column=7, value=x6_value)
+    
+    # Save the output workbook.
+    print("\n保存文件...")
+    wb_output.save(output_file)
+    
+    print(f"文件已成功保存: {output_file}")
+    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
+    print(f"处理完成，共 {total_rows} 行数据")
+    
+    # Verify the saved file.
+    print("\n验证文件...")
+    if os.path.exists(output_file):
+        print("文件保存成功！")
+        # Re-open the file to report what was actually written.
+        wb_check = openpyxl.load_workbook(output_file)
+        ws_check = wb_check.active
+        print(f"输出文件行数: {ws_check.max_row - 1}")
+        print(f"输出文件列数: {ws_check.max_column}")
+        
+        # Print sheet rows 1-5 (fewer if the sheet is shorter), every column.
+        print("\n前5行数据:")
+        for row in range(1, min(6, ws_check.max_row + 1)):
+            row_data = []
+            for col in range(1, ws_check.max_column + 1):
+                value = ws_check.cell(row=row, column=col).value
+                row_data.append(value)
+            print(f"行 {row}: {row_data}")
+    else:
+        print("文件保存失败！")
+    
+    print()
+    print("========================================")
+    print("  任务完成")
+    print("========================================")
+    
+except Exception as e:
+    print(f"处理文件时出错: {str(e)}")
+    import traceback
+    traceback.print_exc()
diff --git a/DataCleaner/process_all_data.py b/DataCleaner/process_all_data.py
new file mode 100644
index 0000000..e7db13c
--- /dev/null
+++ b/DataCleaner/process_all_data.py
@@ -0,0 +1,190 @@
+import os
+import openpyxl
+import re
+
+# 文件路径
+input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
+output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'
+
+print("========================================")
+print("  处理所有数据")
+print("========================================")
+print(f"输入文件: {input_file}")
+print(f"输出文件: {output_file}")
+print()
+
+# Abort early when the input workbook is missing.
+if not os.path.exists(input_file):
+    print("错误: 输入文件不存在！")
+    raise SystemExit(1)  # exit() is a site-module REPL helper, not guaranteed in scripts
+
+print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
+
+# 读取原始数据
+try:
+    print("正在读取原始数据...")
+    wb_input = openpyxl.load_workbook(input_file)
+    ws_input = wb_input.active
+    
+    print(f"工作表名称: {ws_input.title}")
+    print(f"最大行数: {ws_input.max_row}")
+    print(f"最大列数: {ws_input.max_column}")
+    
+    # 识别列
+    print("\n识别列...")
+    headers = []
+    helpfull_col = None
+    comment_count_col = None
+    comment_cols = []
+    
+    for col in range(1, ws_input.max_column + 1):
+        header = ws_input.cell(row=1, column=col).value
+        headers.append(header)
+        if not header:
+            continue  # untitled column: recorded in headers, nothing to classify
+        title = str(header)
+        if 'helpful' in title.lower():  # substring test also matches 'helpfull'
+            helpfull_col = col
+            print(f"找到 Y 列 (helpfull): 列 {col}")
+        elif '评论总数' in title:  # substring test also matches '帖子评论总数'
+            comment_count_col = col
+            print(f"找到 X1 列 (评论总数): 列 {col}")
+        elif '评论' in title and any(digit in title for digit in '12345'):
+            comment_cols.append(col)
+            print(f"找到评论列 {len(comment_cols)}: 列 {col} - {header}")
+    
+    print(f"\n共找到 {len(comment_cols)} 个评论列")
+    
+    # Always start from a fresh output workbook.
+    print("\n创建新的输出文件...")
+    wb_output = openpyxl.Workbook()
+    ws_output = wb_output.active
+    
+    # Write the header row (columns 1-7).
+    headers_output = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
+    for i, header in enumerate(headers_output, 1):
+        ws_output.cell(row=1, column=i, value=header)
+    
+    # Compute and fill the data rows; row 1 of the input is its header.
+    print("\n计算并填充数据...")
+    total_rows = ws_input.max_row - 1
+    print(f"总数据行数: {total_rows}")
+    
+    for row in range(2, ws_input.max_row + 1):
+        if row % 1000 == 0:
+            print(f"处理到第 {row-1} 行...")
+        
+        # Y (UGC helpfulness) -> column 1, X1 (comment count) -> column 2.
+        # Blank cells and cells that do not parse as a number become 0, so one
+        # stray text value no longer aborts the whole run before saving.
+        for out_col, src_col in ((1, helpfull_col), (2, comment_count_col)):
+            raw = ws_input.cell(row=row, column=src_col).value if src_col else None
+            try:
+                number = float(raw) if raw else 0
+            except (TypeError, ValueError):
+                number = 0
+            ws_output.cell(row=row, column=out_col, value=number)
+        
+        # Per-comment metrics for this post; averaged after the inner loop.
+        comment_lengths = []
+        comment_complexities = []
+        comment_sentiments = []
+        comment_richness = []
+        
+        for col in comment_cols:
+            content = str(ws_input.cell(row=row, column=col).value)
+            if content and content != 'None' and content != 'nan':
+                # X2: comment length (characters, ASCII spaces removed)
+                length = len(content.replace(' ', ''))
+                comment_lengths.append(length)
+                
+                # X3: comment complexity (whitespace-separated token count)
+                complexity = len(content.split())
+                comment_complexities.append(complexity)
+                
+                # X5: sentiment (positive=1, neutral=0, negative=-1)
+                positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent']
+                negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor']
+                
+                lower_content = content.lower()
+                # Blank out negative phrases before looking for positive ones:
+                # '不好' contains '好', so an unmasked positive check scored
+                # "不好" as +1 and that negative entry could never match.
+                masked = lower_content
+                for word in negative_words:
+                    masked = masked.replace(word, ' ')
+                sentiment = 0
+                if any(word in masked for word in positive_words):
+                    sentiment = 1
+                elif masked != lower_content:  # a negative phrase was present
+                    sentiment = -1
+                comment_sentiments.append(sentiment)
+                
+                # X6: richness (one point each for digit / link / emoji, max 3)
+                richness = 0
+                if re.search(r'\d', content):
+                    richness += 1
+                if re.search(r'http[s]?://', content):
+                    richness += 1
+                if re.search(r'[\u2600-\u27BF]|[:;][-]?[)D]', content):
+                    richness += 1
+                comment_richness.append(richness)
+        
+        # X2: mean comment length
+        x2_value = sum(comment_lengths) / len(comment_lengths) if comment_lengths else 0
+        ws_output.cell(row=row, column=3, value=x2_value)
+        
+        # X3: mean comment complexity
+        x3_value = sum(comment_complexities) / len(comment_complexities) if comment_complexities else 0
+        ws_output.cell(row=row, column=4, value=x3_value)
+        
+        # X4: readability (X2 / X3, recorded as 0 when X3 is 0)
+        x4_value = x2_value / x3_value if x3_value > 0 else 0
+        ws_output.cell(row=row, column=5, value=x4_value)
+        
+        # X5: mean sentiment
+        x5_value = sum(comment_sentiments) / len(comment_sentiments) if comment_sentiments else 0
+        ws_output.cell(row=row, column=6, value=x5_value)
+        
+        # X6: mean richness
+        x6_value = sum(comment_richness) / len(comment_richness) if comment_richness else 0
+        ws_output.cell(row=row, column=7, value=x6_value)
+    
+    # Save the output workbook.
+    print("\n保存文件...")
+    wb_output.save(output_file)
+    
+    print(f"文件已成功保存: {output_file}")
+    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
+    print(f"处理完成，共 {total_rows} 行数据")
+    
+    # Verify the saved file.
+    print("\n验证文件...")
+    if os.path.exists(output_file):
+        print("文件保存成功！")
+        # Re-open the file to report what was actually written.
+        wb_check = openpyxl.load_workbook(output_file)
+        ws_check = wb_check.active
+        print(f"输出文件行数: {ws_check.max_row - 1}")
+        print(f"输出文件列数: {ws_check.max_column}")
+        
+        # Print sheet rows 1-5 (header row plus up to four data rows).
+        print("\n前5行数据:")
+        for row in range(1, min(6, ws_check.max_row + 1)):
+            row_data = []
+            for col in range(1, ws_check.max_column + 1):
+                value = ws_check.cell(row=row, column=col).value
+                row_data.append(value)
+            print(f"行 {row}: {row_data}")
+    else:
+        print("文件保存失败！")
+    
+    print()
+    print("========================================")
+    print("  任务完成")
+    print("========================================")
+    
+except Exception as e:
+    print(f"处理文件时出错: {str(e)}")
+    import traceback
+    traceback.print_exc()
diff --git a/DataCleaner/process_all_rows.py b/DataCleaner/process_all_rows.py
new file mode 100644
index 0000000..62d277c
--- /dev/null
+++ b/DataCleaner/process_all_rows.py
@@ -0,0 +1,157 @@
+import os
+import pandas as pd
+import re
+
+print("=" * 60)
+print("  处理全部数据")
+print("=" * 60)
+
+# 文件路径
+input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
+output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）_回归.xlsx'
+
+print(f"输入文件: {input_file}")
+print(f"输出文件: {output_file}")
+print()
+
+# 读取全部数据
+print("读取全部数据...")
+df = pd.read_excel(input_file, engine='openpyxl')
+print(f"成功读取 {len(df)} 行数据")
+print(f"原始列数: {len(df.columns)}")
+
+# 识别列
+print("\n识别列...")
+helpfull_col = None
+comment_count_col = None
+comment_cols = []
+
+for col in df.columns:
+    title = str(col)  # labels may not be strings; match on their text
+    if 'helpful' in title.lower():  # substring test also matches 'helpfull'
+        helpfull_col = col
+        print(f"找到 Y 列 (helpfull): {col}")
+    elif '评论总数' in title:  # substring test also matches '帖子评论总数'
+        comment_count_col = col
+        print(f"找到 X1 列 (评论总数): {col}")
+    elif '评论' in title and '内容' in title and any(digit in title for digit in '12345'):
+        comment_cols.append(col)
+
+print(f"\n共找到 {len(comment_cols)} 个评论内容列")
+
+# 添加回归数据列
+print("\n添加回归数据列...")
+
+# Y (UGC helpfulness): numeric copy of the helpfulness column; unparseable cells become 0.
+print("1. 添加 Y (UGC有用性)")
+if helpfull_col:
+    df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0)
+else:
+    df['Y'] = 0  # the column scan found no helpfulness column
+
+# X1 (comment count): numeric copy of the comment-total column; unparseable cells become 0.
+print("2. 添加 X1 (评论数量)")
+if comment_count_col:
+    df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0)
+else:
+    df['X1'] = 0  # the column scan found no comment-total column
+
+# 定义函数计算评论指标
+def calculate_comment_metrics(content):
+    """Return (length, complexity, sentiment, richness) for one comment.
+
+    length counts characters without ASCII/full-width spaces; complexity counts
+    whitespace-separated tokens; sentiment is 1/0/-1 from keyword matching;
+    richness is 0-3 (digit, link, emoji/emoticon). Missing values (NaN, 'None',
+    'nan', '') give (0, 0, 0, 0).
+    """
+    if pd.isna(content) or str(content) in ['None', 'nan', '']:
+        return 0, 0, 0, 0
+    content = str(content)
+    length = len(content.replace(' ', '').replace('\u3000', ''))
+    complexity = len(content.split())
+
+    positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like']
+    negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike']
+    lower_content = content.lower()
+    # Mask negative phrases first: '不好' contains '好' and 'dislike' contains
+    # 'like', so an unmasked positive check scored those comments as +1.
+    masked = lower_content
+    for word in negative_words:
+        masked = masked.replace(word, ' ')
+    sentiment = 1 if any(word in masked for word in positive_words) else (-1 if masked != lower_content else 0)
+    emoji = r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]'
+    patterns = (r'\d', r'http[s]?://|www\.', emoji)
+    richness = sum(1 for pattern in patterns if re.search(pattern, content))
+    return length, complexity, sentiment, richness
+
+# 计算评论相关指标
+print("3. 计算评论相关指标...")
+print(f"总数据行数: {len(df)}")
+
+# Per-post means of the comment metrics; posts without usable comments keep 0.0.
+metric_cols = ['X2', 'X3', 'X5', 'X6']  # same order as the metrics tuple
+means = {name: [0.0] * len(df) for name in metric_cols}
+
+# Walk only the comment columns, one object array per post. The earlier loop
+# rebuilt the whole row (df.iloc[i]) for every comment column and wrote each
+# cell through df.loc[i, ...], which is slow and silently relies on the frame
+# having the default 0..n-1 index.
+comment_rows = df[comment_cols].to_numpy(dtype=object)
+for i, comments in enumerate(comment_rows):
+    if i % 1000 == 0:
+        print(f"  处理第 {i}/{len(df)} 行...")
+
+    scored = []
+    for content in comments:
+        metrics = calculate_comment_metrics(content)
+        if metrics[0] > 0:  # length > 0: the comment has visible text
+            scored.append(metrics)
+
+    if scored:
+        # zip(*scored) regroups per-comment tuples into one tuple per metric.
+        for name, values in zip(metric_cols, zip(*scored)):
+            means[name][i] = sum(values) / len(values)
+
+# Assign whole columns at once, aligned on the frame's own index, so the
+# result does not depend on what labels that index carries.
+for name in metric_cols:
+    df[name] = pd.Series(means[name], index=df.index, dtype=float)
+
+# X4: comment readability = X2 / X3, recorded as 0 when X3 is not positive.
+print("4. 计算 X4 (评论可读性)")
+df['X4'] = df.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1)
+
+# Data cleaning: force the regression columns numeric; NaN and +/-inf become 0.
+print("\n5. 数据清洗...")
+regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
+for col in regression_cols:
+    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
+    df[col] = df[col].replace([float('inf'), float('-inf')], 0)
+
+# Report the shape and summary statistics of the regression columns.
+print("\n6. 验证数据...")
+print(f"总行数: {len(df)}")
+print(f"总列数: {len(df.columns)}")
+print(f"\n回归数据列统计:")
+print(df[regression_cols].describe())
+
+# Save the original columns plus the regression columns.
+print("\n7. 保存文件...")
+df.to_excel(output_file, index=False, engine='openpyxl')
+
+# Verify the saved file by reading it back.
+print("\n8. 验证文件...")
+if os.path.exists(output_file):
+    print(f"文件已成功保存: {output_file}")
+    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
+    df_check = pd.read_excel(output_file)
+    print(f"输出文件行数: {len(df_check)}")
+    print(f"输出文件列数: {len(df_check.columns)}")
+else:
+    print("文件保存失败！")
+
+print()
+print("=" * 60)
+print("  任务完成")
+print("=" * 60)
diff --git a/DataCleaner/process_efficient.py b/DataCleaner/process_efficient.py
new file mode 100644
index 0000000..f78f977
--- /dev/null
+++ b/DataCleaner/process_efficient.py
@@ -0,0 +1,180 @@
+import os
+import pandas as pd
+import re
+
+print("=" * 60)
+print("  高效处理全部数据")
+print("=" * 60)
+
+# 文件路径
+input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
+output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）_回归.xlsx'
+
+print(f"输入文件: {input_file}")
+print(f"输出文件: {output_file}")
+print()
+
+# 首先读取表头来识别列
+print("1. 读取表头...")
+df_header = pd.read_excel(input_file, engine='openpyxl', nrows=0)
+print(f"总列数: {len(df_header.columns)}")
+
+# 识别列
+helpfull_col = None
+comment_count_col = None
+comment_cols = []
+
+for col in df_header.columns:
+    title = str(col)  # labels may not be strings; match on their text
+    if 'helpful' in title.lower():  # substring test also matches 'helpfull'
+        helpfull_col = col
+        print(f"找到 Y 列 (helpfull): {col}")
+    elif '评论总数' in title:  # substring test also matches '帖子评论总数'
+        comment_count_col = col
+        print(f"找到 X1 列 (评论总数): {col}")
+    elif '评论' in title and '内容' in title and any(digit in title for digit in '12345'):
+        comment_cols.append(col)
+
+print(f"共找到 {len(comment_cols)} 个评论内容列")
+
+# 定义函数计算评论指标
+def calculate_comment_metrics(content):
+    """Return (length, complexity, sentiment, richness) for one comment.
+
+    length counts characters without ASCII/full-width spaces; complexity counts
+    whitespace-separated tokens; sentiment is 1/0/-1 from keyword matching;
+    richness is 0-3 (digit, link, emoji/emoticon). Missing values (NaN, 'None',
+    'nan', '') give (0, 0, 0, 0).
+    """
+    if pd.isna(content) or str(content) in ['None', 'nan', '']:
+        return 0, 0, 0, 0
+    content = str(content)
+    length = len(content.replace(' ', '').replace('\u3000', ''))
+    complexity = len(content.split())
+
+    positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like']
+    negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike']
+    lower_content = content.lower()
+    # Mask negative phrases first: '不好' contains '好' and 'dislike' contains
+    # 'like', so an unmasked positive check scored those comments as +1.
+    masked = lower_content
+    for word in negative_words:
+        masked = masked.replace(word, ' ')
+    sentiment = 1 if any(word in masked for word in positive_words) else (-1 if masked != lower_content else 0)
+    emoji = r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]'
+    patterns = (r'\d', r'http[s]?://|www\.', emoji)
+    richness = sum(1 for pattern in patterns if re.search(pattern, content))
+    return length, complexity, sentiment, richness
+
+# 分批处理数据
+print("\n2. 分批处理数据...")
+batch_size = 5000
+batch_num = 0
+all_data = []
+
+while True:
+    # Sheet row 0 is the header, hence the +1 for every batch after the first.
+    skip_rows = batch_num * batch_size + 1 if batch_num > 0 else 0
+    nrows = batch_size
+    print(f"  处理批次 {batch_num + 1} (跳过 {skip_rows} 行，读取 {nrows} 行)...")
+    try:
+        if batch_num == 0:
+            df_batch = pd.read_excel(input_file, engine='openpyxl', nrows=nrows)
+        else:
+            df_batch = pd.read_excel(input_file, engine='openpyxl', skiprows=skip_rows, nrows=nrows, header=None)
+            # A read past the last row comes back empty; relabel only real data.
+            if len(df_batch) > 0:
+                df_batch.columns = df_header.columns
+    except Exception as e:
+        print(f"  读取完成或出错: {e}")
+        break
+    if len(df_batch) == 0:
+        print("  没有更多数据")
+        break
+    
+    print(f"  读取了 {len(df_batch)} 行")
+    
+    # Add Y and X1 as numeric copies of their source columns; cells that do
+    # not parse as numbers become 0.
+    if helpfull_col:
+        df_batch['Y'] = pd.to_numeric(df_batch[helpfull_col], errors='coerce').fillna(0)
+    else:
+        df_batch['Y'] = 0
+    
+    if comment_count_col:
+        df_batch['X1'] = pd.to_numeric(df_batch[comment_count_col], errors='coerce').fillna(0)
+    else:
+        df_batch['X1'] = 0
+    
+    # Per-post means of the comment metrics; posts without usable comments
+    # keep 0.0.
+    metric_cols = ['X2', 'X3', 'X5', 'X6']  # same order as the metrics tuple
+    means = {name: [0.0] * len(df_batch) for name in metric_cols}
+    
+    # Walk only the comment columns, one object array per post. The earlier
+    # loop rebuilt the whole row (df_batch.iloc[i]) for every comment column
+    # and wrote each cell through df_batch.loc[i, ...], which is slow and
+    # silently relies on the batch having the default 0..n-1 index.
+    comment_rows = df_batch[comment_cols].to_numpy(dtype=object)
+    for i, comments in enumerate(comment_rows):
+        scored = []
+        for content in comments:
+            metrics = calculate_comment_metrics(content)
+            if metrics[0] > 0:  # length > 0: the comment has visible text
+                scored.append(metrics)
+        
+        if scored:
+            # zip(*scored) regroups per-comment tuples into one per metric.
+            for name, values in zip(metric_cols, zip(*scored)):
+                means[name][i] = sum(values) / len(values)
+    
+    # Assign whole columns at once, aligned on the batch's own index, so the
+    # result does not depend on what labels that index carries.
+    for name in metric_cols:
+        df_batch[name] = pd.Series(means[name], index=df_batch.index, dtype=float)
+    
+    # X4: readability = X2 / X3, recorded as 0 when X3 is not positive.
+    df_batch['X4'] = df_batch.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1)
+    
+    # Data cleaning: force the regression columns numeric; NaN and +/-inf become 0.
+    regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
+    for col in regression_cols:
+        df_batch[col] = pd.to_numeric(df_batch[col], errors='coerce').fillna(0)
+        df_batch[col] = df_batch[col].replace([float('inf'), float('-inf')], 0)
+    
+    all_data.append(df_batch)  # every processed batch stays in memory until the concat below
+    batch_num += 1
+    
+    print(f"  批次 {batch_num} 完成，当前总行数: {sum(len(d) for d in all_data)}")
+
+# Merge all batches into one frame with a fresh 0..n-1 index.
+print("\n3. 合并数据...")
+df_final = pd.concat(all_data, ignore_index=True)  # NOTE: raises ValueError if no batch was read
+print(f"合并后总行数: {len(df_final)}")
+
+# Report the shape and summary statistics of the regression columns.
+print("\n4. 验证数据...")
+regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
+print(f"总列数: {len(df_final.columns)}")
+print(f"\n回归数据列统计:")
+print(df_final[regression_cols].describe())
+
+# Save the original columns plus the regression columns.
+print("\n5. 保存文件...")
+df_final.to_excel(output_file, index=False, engine='openpyxl')
+
+# Verify the saved file by reading it back.
+print("\n6. 验证文件...")
+if os.path.exists(output_file):
+    print(f"文件已成功保存: {output_file}")
+    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
+    df_check = pd.read_excel(output_file)
+    print(f"输出文件行数: {len(df_check)}")
+    print(f"输出文件列数: {len(df_check.columns)}")
+else:
+    print("文件保存失败！")
+
+print()
+print("=" * 60)
+print("  任务完成")
+print("=" * 60)
diff --git a/DataCleaner/process_large_file.py b/DataCleaner/process_large_file.py
new file mode 100644
index 0000000..304be6d
--- /dev/null
+++ b/DataCleaner/process_large_file.py
@@ -0,0 +1,177 @@
+import os
+import pandas as pd
+import re
+
+# 文件路径
+input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
+output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'
+
+print("========================================")
+print("  处理大型Excel文件")
+print("========================================")
+print(f"输入文件: {input_file}")
+print(f"输出文件: {output_file}")
+print()
+
+# Abort early when the input workbook is missing.
+if not os.path.exists(input_file):
+    print("错误: 输入文件不存在！")
+    raise SystemExit(1)  # exit() is a site-module REPL helper, not guaranteed in scripts
+
+print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
+
+# 读取原始数据
+try:
+    print("正在读取原始数据...")
+    # 使用pandas读取Excel文件，设置引擎为openpyxl
+    df = pd.read_excel(input_file, engine='openpyxl')
+    print(f"成功读取 {len(df)} 行数据")
+    print(f"列名: {list(df.columns)}")
+    
+    # 识别列
+    print("\n识别列...")
+    helpfull_col = None
+    comment_count_col = None
+    comment_cols = []
+    
+    for col in df.columns:
+        title = str(col)  # labels may not be strings; match on their text
+        if 'helpful' in title.lower():  # substring test also matches 'helpfull'
+            helpfull_col = col
+            print(f"找到 Y 列 (helpfull): {col}")
+        elif '评论总数' in title:  # substring test also matches '帖子评论总数'
+            comment_count_col = col
+            print(f"找到 X1 列 (评论总数): {col}")
+        elif '评论' in title and any(digit in title for digit in '12345'):
+            comment_cols.append(col)
+            print(f"找到评论列 {len(comment_cols)}: {col}")
+    
+    print(f"\n共找到 {len(comment_cols)} 个评论列")
+    
+    # Build the regression table on the input's index so that scalar fallbacks
+    # (used when a source column is missing) still give one value per input row.
+    print("\n创建回归数据...")
+    regression_data = pd.DataFrame(index=df.index)
+    # Y (UGC helpfulness): numeric copy; unparseable cells become 0.
+    print("1. 计算 Y (UGC有用性)")
+    if helpfull_col:
+        regression_data['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0)
+    else:
+        regression_data['Y'] = 0
+    
+    # X1 (comment count): numeric copy; unparseable cells become 0.
+    print("2. 计算 X1 (评论数量)")
+    if comment_count_col:
+        regression_data['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0)
+    else:
+        regression_data['X1'] = 0
+    
+    # 定义函数计算评论指标
+    def calculate_comment_metrics(content):
+        """Return (length, complexity, sentiment, richness) for one comment.
+
+        length counts characters without ASCII spaces; complexity counts
+        whitespace-separated tokens; sentiment is 1/0/-1 from keyword matching;
+        richness is 0-3 (digit, link, emoji/emoticon). Missing values (NaN,
+        'None', 'nan') give (0, 0, 0, 0).
+        """
+        if pd.isna(content) or str(content) in ['None', 'nan']:
+            return 0, 0, 0, 0
+
+        content = str(content)
+        length = len(content.replace(' ', ''))
+        complexity = len(content.split())
+
+        positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent']
+        negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor']
+        lower_content = content.lower()
+        # Mask negative phrases first: '不好' contains '好', so an unmasked
+        # positive check scored "不好" as +1 and that entry could never match.
+        masked = lower_content
+        for word in negative_words:
+            masked = masked.replace(word, ' ')
+        sentiment = 1 if any(word in masked for word in positive_words) else (-1 if masked != lower_content else 0)
+
+        # One point per feature present: digit, link, emoji/emoticon.
+        patterns = (r'\d', r'http[s]?://', r'[\u2600-\u27BF]|[:;][-]?[)D]')
+        richness = sum(1 for pattern in patterns if re.search(pattern, content))
+        return length, complexity, sentiment, richness
+    
+    # Comment-derived metrics, averaged per post.
+    print("3. 计算评论相关指标...")
+    
+    # X2 length, X3 complexity, X5 sentiment, X6 richness - listed in the
+    # order calculate_comment_metrics returns them. Posts without usable
+    # comments keep 0.0.
+    metric_cols = ['X2', 'X3', 'X5', 'X6']
+    total_rows = len(df)
+    means = {name: [0.0] * total_rows for name in metric_cols}
+    
+    # Walk only the comment columns, one object array per post. The earlier
+    # loop rebuilt the whole row (df.iloc[i]) for every comment column and
+    # wrote float means cell by cell into columns initialised with the int
+    # 0, which recent pandas versions warn about as an incompatible dtype.
+    comment_rows = df[comment_cols].to_numpy(dtype=object)
+    for i, comments in enumerate(comment_rows):
+        if i % 1000 == 0:
+            print(f"处理到第 {i} 行...")
+        
+        scored = []
+        for content in comments:
+            metrics = calculate_comment_metrics(content)
+            if metrics[0] > 0:  # length > 0: the comment has visible text
+                scored.append(metrics)
+        
+        if scored:
+            # zip(*scored) regroups per-comment tuples into one per metric.
+            for name, values in zip(metric_cols, zip(*scored)):
+                means[name][i] = sum(values) / len(values)
+    
+    # Assign whole float columns at once, aligned on df's index, so every
+    # input row gets a value regardless of how regression_data was indexed
+    # before this point.
+    for name in metric_cols:
+        regression_data[name] = pd.Series(means[name], index=df.index, dtype=float)
+    
+    # X4: comment readability = X2 / X3, recorded as 0 when X3 is not positive.
+    print("4. 计算 X4 (评论可读性)")
+    regression_data['X4'] = regression_data.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1)
+    
+    # Data cleaning: force every column numeric; NaN becomes 0.
+    print("\n5. 数据清洗...")
+    for col in regression_data.columns:
+        regression_data[col] = pd.to_numeric(regression_data[col], errors='coerce').fillna(0)
+    
+    # Report the shape, column names and first rows of the regression table.
+    print("\n6. 验证数据...")
+    print(f"行数: {len(regression_data)}")
+    print(f"列数: {len(regression_data.columns)}")
+    print(f"列名: {list(regression_data.columns)}")
+    print(f"\n前5行数据:")
+    print(regression_data.head())
+    
+    # Save only the regression columns (no index column).
+    print("\n7. 保存文件...")
+    regression_data.to_excel(output_file, index=False)
+    
+    # Verify the saved file.
+    print("\n8. 验证文件...")
+    if os.path.exists(output_file):
+        print(f"文件已成功保存: {output_file}")
+        print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
+        # Read the file back to report its row and column counts.
+        df_check = pd.read_excel(output_file)
+        print(f"输出文件行数: {len(df_check)}")
+        print(f"输出文件列数: {len(df_check.columns)}")
+    else:
+        print("文件保存失败！")
+    
+    print()
+    print("========================================")
+    print("  任务完成")
+    print("========================================")
+    
+except Exception as e:
+    print(f"处理文件时出错: {str(e)}")
+    import traceback
+    traceback.print_exc()
diff --git a/DataCleaner/process_log.txt b/DataCleaner/process_log.txt
new file mode 100644
index 0000000..afe1ed8
--- /dev/null
+++ b/DataCleaner/process_log.txt
@@ -0,0 +1,9 @@
+========================================
+  在原表中添加回归数据列
+========================================
+输入文件: D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx
+输出文件: D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）_回归.xlsx
+
+输入文件大小: 21607.43 KB
+
+正在读取原始数据...
diff --git a/DataCleaner/process_regression_final.py b/DataCleaner/process_regression_final.py
new file mode 100644
index 0000000..cca17c2
--- /dev/null
+++ b/DataCleaner/process_regression_final.py
@@ -0,0 +1,192 @@
+import os
+import pandas as pd
+import re
+
+# 文件路径
+input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
+output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）_回归.xlsx'
+
+print("========================================")
+print("  在原表中添加回归数据列")
+print("========================================")
+print(f"输入文件: {input_file}")
+print(f"输出文件: {output_file}")
+print()
+
+# 检查文件是否存在
+if not os.path.exists(input_file):
+    print("错误: 输入文件不存在！")
+    exit(1)
+
+print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
+
+# 读取原始数据
+try:
+    print("\n正在读取原始数据...")
+    df = pd.read_excel(input_file)
+    print(f"成功读取 {len(df)} 行数据")
+    print(f"原始列数: {len(df.columns)}")
+    
+    # 识别列
+    print("\n识别列...")
+    helpfull_col = None
+    comment_count_col = None
+    comment_cols = []
+    
+    for col in df.columns:
+        col_str = str(col).lower()
+        if 'helpfull' in col_str or 'helpful' in col_str:
+            helpfull_col = col
+            print(f"找到 Y 列 (helpfull): {col}")
+        elif '评论总数' in str(col) or '帖子评论总数' in str(col):
+            comment_count_col = col
+            print(f"找到 X1 列 (评论总数): {col}")
+        elif '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6)) and '内容' in str(col):
+            comment_cols.append(col)
+            print(f"找到评论列 {len(comment_cols)}: {col}")
+    
+    print(f"\n共找到 {len(comment_cols)} 个评论内容列")
+    
+    # 添加回归数据列
+    print("\n添加回归数据列...")
+    
+    # Y (UGC有用性) - 直接复制helpfull列
+    print("1. 添加 Y (UGC有用性)")
+    if helpfull_col:
+        df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0)
+    else:
+        df['Y'] = 0
+    
+    # X1 (评论数量) - 直接复制帖子评论总数列
+    print("2. 添加 X1 (评论数量)")
+    if comment_count_col:
+        df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0)
+    else:
+        df['X1'] = 0
+    
+    # 定义函数计算评论指标
+    def calculate_comment_metrics(content):
+        """Return (length, complexity, sentiment, richness) for one comment; empty/NaN input gives all zeros."""
+        if pd.isna(content) or str(content) in ['None', 'nan', '']:
+            return 0, 0, 0, 0
+        content = str(content)
+        # X2: comment length = character count after removing ASCII and full-width spaces.
+        length = len(content.replace(' ', '').replace('\u3000', ''))
+        # X3: comment complexity = number of whitespace-separated tokens.
+        complexity = len(content.split())
+        # X5: sentiment by keyword substring match (positive=1, neutral=0, negative=-1).
+        positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like']
+        negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike']
+        # Negative keywords are tested first: '不好' contains '好' and 'dislike' contains 'like',
+        # so testing positives first would score those negative phrases as +1.
+        sentiment = 0
+        lower_content = content.lower()
+        if any(word in lower_content for word in negative_words):
+            sentiment = -1
+        elif any(word in lower_content for word in positive_words):
+            sentiment = 1
+        # X6: information richness: one point each for a digit, a link, an emoji/emoticon (max 3).
+        richness = 0
+        if re.search(r'\d', content):  # contains a digit
+            richness += 1
+        if re.search(r'http[s]?://|www\.', content):  # contains a link
+            richness += 1
+        if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content):  # contains an emoji or emoticon
+            richness += 1
+        return length, complexity, sentiment, richness
+    
+    # 计算评论相关指标
+    print("3. 计算评论相关指标...")
+    
+    # 初始化列
+    df['X2'] = 0.0  # 评论长度
+    df['X3'] = 0.0  # 评论复杂度
+    df['X5'] = 0.0  # 情感性
+    df['X6'] = 0.0  # 信息丰富度
+    
+    # 逐行计算
+    total_rows = len(df)
+    print(f"总数据行数: {total_rows}")
+    
+    for i in range(total_rows):
+        if i % 1000 == 0:
+            print(f"  处理第 {i}/{total_rows} 行...")
+        # Per-row accumulators: one entry per non-empty comment column of row i.
+        lengths = []
+        complexities = []
+        sentiments = []
+        richness = []
+        
+        for col in comment_cols:
+            content = df.iloc[i].get(col, '')  # NOTE(review): df.iloc[i] builds a whole row Series for every cell read
+            length, complexity, sentiment, r = calculate_comment_metrics(content)
+            if length > 0:  # only count comments that actually have content
+                lengths.append(length)
+                complexities.append(complexity)
+                sentiments.append(sentiment)
+                richness.append(r)
+        
+        # Store the per-row averages; rows without comments keep the 0.0 initial value.
+        if lengths:
+            df.loc[i, 'X2'] = sum(lengths) / len(lengths)  # NOTE(review): .loc is label-based, so this relies on the default 0..n-1 index
+            df.loc[i, 'X3'] = sum(complexities) / len(complexities)
+            df.loc[i, 'X5'] = sum(sentiments) / len(sentiments)
+            df.loc[i, 'X6'] = sum(richness) / len(richness)
+    
+    # X4: 评论可读性 = X2/X3（X3为0时记0，避免报错）
+    print("4. 计算 X4 (评论可读性)")
+    df['X4'] = df.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1)
+    
+    # 数据清洗 - 确保所有值都是纯数字，无文本、无空值、无错误
+    print("\n5. 数据清洗...")
+    regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
+    for col in regression_cols:
+        # 转换为数字，错误值转为0
+        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
+        # 替换无穷大
+        df[col] = df[col].replace([float('inf'), float('-inf')], 0)
+    
+    # 验证数据
+    print("\n6. 验证数据...")
+    print(f"总行数: {len(df)}")
+    print(f"总列数: {len(df.columns)}")
+    print(f"\n回归数据列统计:")
+    print(df[regression_cols].describe())
+    print(f"\n前5行回归数据:")
+    print(df[regression_cols].head())
+    
+    # 检查是否有空值或错误值
+    print(f"\n空值检查:")
+    for col in regression_cols:
+        null_count = df[col].isnull().sum()
+        print(f"  {col}: {null_count} 个空值")
+    
+    # 保存文件
+    print("\n7. 保存文件...")
+    print(f"正在保存到: {output_file}")
+    df.to_excel(output_file, index=False, engine='openpyxl')
+    
+    # 验证文件
+    print("\n8. 验证文件...")
+    if os.path.exists(output_file):
+        print(f"文件已成功保存: {output_file}")
+        print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
+        # 重新读取检查
+        df_check = pd.read_excel(output_file)
+        print(f"输出文件行数: {len(df_check)}")
+        print(f"输出文件列数: {len(df_check.columns)}")
+        print(f"\n回归数据列: {[col for col in df_check.columns if col in regression_cols]}")
+    else:
+        print("文件保存失败！")
+    
+    print()
+    print("========================================")
+    print("  任务完成")
+    print("========================================")
+    print(f"新文件已保存: {output_file}")
+    print(f"包含原始数据的所有列以及新增的Y, X1-X6回归数据列")
+    
+except Exception as e:
+    print(f"处理文件时出错: {str(e)}")
+    import traceback
+    traceback.print_exc()
diff --git a/DataCleaner/process_with_csv.py b/DataCleaner/process_with_csv.py
new file mode 100644
index 0000000..f2f6797
--- /dev/null
+++ b/DataCleaner/process_with_csv.py
@@ -0,0 +1,202 @@
+import os
+import pandas as pd
+import re
+
+print("=" * 60)
+print("  使用CSV处理回归数据")
+print("=" * 60)
+
+# 文件路径
+input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
+output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）_回归.xlsx'
+
+print(f"输入文件: {input_file}")
+print(f"输出文件: {output_file}")
+print()
+
+# 检查文件是否存在
+if not os.path.exists(input_file):
+    print("错误: 输入文件不存在！")
+    exit(1)
+
+print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
+
+# 读取原始数据
+print("\n正在读取原始数据...")
+try:
+    df = pd.read_excel(input_file, engine='openpyxl')
+    print(f"成功读取 {len(df)} 行数据")
+    print(f"原始列数: {len(df.columns)}")
+except Exception as e:
+    print(f"读取失败: {e}")
+    exit(1)
+
+# 识别列
+print("\n识别列...")
+helpfull_col = None
+comment_count_col = None
+comment_cols = []
+
+for col in df.columns:
+    col_str = str(col).lower()
+    if 'helpfull' in col_str or 'helpful' in col_str:
+        helpfull_col = col
+        print(f"找到 Y 列 (helpfull): {col}")
+    elif '评论总数' in str(col) or '帖子评论总数' in str(col):
+        comment_count_col = col
+        print(f"找到 X1 列 (评论总数): {col}")
+    elif '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6)) and '内容' in str(col):
+        comment_cols.append(col)
+        print(f"找到评论列 {len(comment_cols)}: {col}")
+
+print(f"\n共找到 {len(comment_cols)} 个评论内容列")
+
+# 添加回归数据列
+print("\n添加回归数据列...")
+
+# Y (UGC有用性) - 直接复制helpfull列
+print("1. 添加 Y (UGC有用性)")
+if helpfull_col:
+    df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0)
+else:
+    df['Y'] = 0
+
+# X1 (评论数量) - 直接复制帖子评论总数列
+print("2. 添加 X1 (评论数量)")
+if comment_count_col:
+    df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0)
+else:
+    df['X1'] = 0
+
+# 定义函数计算评论指标
+def calculate_comment_metrics(content):
+    """Return (length, complexity, sentiment, richness) for one comment; empty/NaN input gives all zeros."""
+    if pd.isna(content) or str(content) in ['None', 'nan', '']:
+        return 0, 0, 0, 0
+    content = str(content)
+    # X2: comment length = character count after removing ASCII and full-width spaces.
+    length = len(content.replace(' ', '').replace('\u3000', ''))
+    # X3: comment complexity = number of whitespace-separated tokens.
+    complexity = len(content.split())
+    # X5: sentiment by keyword substring match (positive=1, neutral=0, negative=-1).
+    positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like']
+    negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike']
+    # Negative keywords are tested first: '不好' contains '好' and 'dislike' contains 'like',
+    # so testing positives first would score those negative phrases as +1.
+    sentiment = 0
+    lower_content = content.lower()
+    if any(word in lower_content for word in negative_words):
+        sentiment = -1
+    elif any(word in lower_content for word in positive_words):
+        sentiment = 1
+    # X6: information richness: one point each for a digit, a link, an emoji/emoticon (max 3).
+    richness = 0
+    if re.search(r'\d', content):  # contains a digit
+        richness += 1
+    if re.search(r'http[s]?://|www\.', content):  # contains a link
+        richness += 1
+    if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content):  # contains an emoji or emoticon
+        richness += 1
+    return length, complexity, sentiment, richness
+
+# 计算评论相关指标
+print("3. 计算评论相关指标...")
+
+# 初始化列
+df['X2'] = 0.0  # 评论长度
+df['X3'] = 0.0  # 评论复杂度
+df['X5'] = 0.0  # 情感性
+df['X6'] = 0.0  # 信息丰富度
+
+# 逐行计算
+total_rows = len(df)
+print(f"总数据行数: {total_rows}")
+
+for i in range(total_rows):
+    if i % 1000 == 0:
+        print(f"  处理第 {i}/{total_rows} 行...")
+    # Per-row accumulators: one entry per non-empty comment column of row i.
+    lengths = []
+    complexities = []
+    sentiments = []
+    richness = []
+    
+    for col in comment_cols:
+        content = df.iloc[i].get(col, '')  # NOTE(review): df.iloc[i] builds a whole row Series for every cell read
+        length, complexity, sentiment, r = calculate_comment_metrics(content)
+        if length > 0:  # only count comments that actually have content
+            lengths.append(length)
+            complexities.append(complexity)
+            sentiments.append(sentiment)
+            richness.append(r)
+    
+    # Store the per-row averages; rows without comments keep the 0.0 initial value.
+    if lengths:
+        df.loc[i, 'X2'] = sum(lengths) / len(lengths)  # NOTE(review): .loc is label-based, so this relies on the default 0..n-1 index
+        df.loc[i, 'X3'] = sum(complexities) / len(complexities)
+        df.loc[i, 'X5'] = sum(sentiments) / len(sentiments)
+        df.loc[i, 'X6'] = sum(richness) / len(richness)
+
+# X4: 评论可读性 = X2/X3（X3为0时记0，避免报错）
+print("4. 计算 X4 (评论可读性)")
+df['X4'] = df.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1)
+
+# 数据清洗 - 确保所有值都是纯数字，无文本、无空值、无错误
+print("\n5. 数据清洗...")
+regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
+for col in regression_cols:
+    # 转换为数字，错误值转为0
+    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
+    # 替换无穷大
+    df[col] = df[col].replace([float('inf'), float('-inf')], 0)
+
+# 验证数据
+print("\n6. 验证数据...")
+print(f"总行数: {len(df)}")
+print(f"总列数: {len(df.columns)}")
+print(f"\n回归数据列统计:")
+print(df[regression_cols].describe())
+print(f"\n前5行回归数据:")
+print(df[regression_cols].head())
+
+# 检查是否有空值或错误值
+print(f"\n空值检查:")
+for col in regression_cols:
+    null_count = df[col].isnull().sum()
+    print(f"  {col}: {null_count} 个空值")
+
+# 保存为CSV中间文件
+print("\n7. 保存为CSV中间文件...")
+csv_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\temp_regression.csv'
+df.to_csv(csv_file, index=False, encoding='utf-8-sig')
+print(f"CSV文件已保存: {csv_file}")
+print(f"CSV文件大小: {os.path.getsize(csv_file) / 1024:.2f} KB")
+
+# 从CSV读取并保存为Excel
+print("\n8. 转换为Excel文件...")
+df_csv = pd.read_csv(csv_file, encoding='utf-8-sig')
+df_csv.to_excel(output_file, index=False, engine='openpyxl')
+
+# 验证文件
+print("\n9. 验证文件...")
+if os.path.exists(output_file):
+    print(f"文件已成功保存: {output_file}")
+    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
+    # 重新读取检查
+    df_check = pd.read_excel(output_file)
+    print(f"输出文件行数: {len(df_check)}")
+    print(f"输出文件列数: {len(df_check.columns)}")
+    print(f"\n回归数据列: {[col for col in df_check.columns if col in regression_cols]}")
+    
+    # 删除临时CSV文件
+    os.remove(csv_file)
+    print(f"\n临时CSV文件已删除")
+else:
+    print("文件保存失败！")
+
+print()
+print("=" * 60)
+print("  任务完成")
+print("=" * 60)
+print(f"新文件已保存: {output_file}")
+print(f"包含原始数据的所有列以及新增的Y, X1-X6回归数据列")
diff --git a/DataCleaner/process_with_pandas.py b/DataCleaner/process_with_pandas.py
new file mode 100644
index 0000000..5a09d25
--- /dev/null
+++ b/DataCleaner/process_with_pandas.py
@@ -0,0 +1,168 @@
+import os
+import pandas as pd
+import re
+
+# 文件路径
+input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
+output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'
+
+print("========================================")
+print("  使用pandas处理所有数据")
+print("========================================")
+print(f"输入文件: {input_file}")
+print(f"输出文件: {output_file}")
+print()
+
+# 检查文件是否存在
+if not os.path.exists(input_file):
+    print("错误: 输入文件不存在！")
+    exit(1)
+
+print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
+
+# 读取原始数据
+try:
+    print("正在读取原始数据...")
+    df = pd.read_excel(input_file)
+    print(f"成功读取 {len(df)} 行数据")
+    print(f"列名: {list(df.columns)}")
+    
+    # 识别列
+    print("\n识别列...")
+    helpfull_col = None
+    comment_count_col = None
+    comment_cols = []
+    
+    for col in df.columns:
+        col_str = str(col).lower()
+        if 'helpfull' in col_str or 'helpful' in col_str:
+            helpfull_col = col
+            print(f"找到 Y 列 (helpfull): {col}")
+        elif '评论总数' in str(col) or '帖子评论总数' in str(col):
+            comment_count_col = col
+            print(f"找到 X1 列 (评论总数): {col}")
+        elif '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6)):
+            comment_cols.append(col)
+            print(f"找到评论列 {len(comment_cols)}: {col}")
+    
+    print(f"\n共找到 {len(comment_cols)} 个评论列")
+    
+    # 创建回归数据
+    print("\n创建回归数据...")
+    regression_data = pd.DataFrame()
+    
+    # Y (UGC有用性)
+    print("1. 计算 Y (UGC有用性)")
+    if helpfull_col:
+        regression_data['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0)
+    else:
+        regression_data['Y'] = 0
+    
+    # X1 (评论数量)
+    print("2. 计算 X1 (评论数量)")
+    if comment_count_col:
+        regression_data['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0)
+    else:
+        regression_data['X1'] = 0
+    
+    # 定义函数计算评论指标
+    def calculate_comment_metrics(row):
+        """Return four parallel lists (lengths, complexities, sentiments, richness), one entry per non-empty comment in row."""
+        lengths = []
+        complexities = []
+        sentiments = []
+        richness = []
+        for col in comment_cols:
+            content = str(row.get(col, ''))
+            if content and content != 'None' and content != 'nan':
+                # Comment length: character count with ASCII spaces removed.
+                lengths.append(len(content.replace(' ', '')))
+                # Comment complexity: number of whitespace-separated tokens.
+                complexities.append(len(content.split()))
+                # Sentiment by keyword substring match (positive=1, neutral=0, negative=-1).
+                positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent']
+                negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor']
+                # Negative keywords are tested first: '不好' contains '好' and '不满意' contains
+                # '满意', so testing positives first would score those negative phrases as +1.
+                sentiment = 0
+                lower_content = content.lower()
+                if any(word in lower_content for word in negative_words):
+                    sentiment = -1
+                elif any(word in lower_content for word in positive_words):
+                    sentiment = 1
+                sentiments.append(sentiment)
+                # Information richness: one point each for a digit, a link, a symbol/emoticon.
+                r = 0
+                if re.search(r'\d', content):
+                    r += 1
+                if re.search(r'http[s]?://', content):
+                    r += 1
+                if re.search(r'[\u2600-\u27BF]|[:;][-]?[)D]', content):
+                    r += 1
+                richness.append(r)
+        return lengths, complexities, sentiments, richness
+    
+    # 计算评论相关指标
+    print("3. 计算评论相关指标...")
+    comment_metrics = df.apply(calculate_comment_metrics, axis=1)
+    
+    # X2: 评论长度平均值
+    print("4. 计算 X2 (评论长度)")
+    regression_data['X2'] = comment_metrics.apply(lambda x: sum(x[0]) / len(x[0]) if x[0] else 0)
+    
+    # X3: 评论复杂度平均值
+    print("5. 计算 X3 (评论复杂度)")
+    regression_data['X3'] = comment_metrics.apply(lambda x: sum(x[1]) / len(x[1]) if x[1] else 0)
+    
+    # X4: 评论可读性
+    print("6. 计算 X4 (评论可读性)")
+    regression_data['X4'] = regression_data.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1)
+    
+    # X5: 内容情感性平均值
+    print("7. 计算 X5 (内容情感性)")
+    regression_data['X5'] = comment_metrics.apply(lambda x: sum(x[2]) / len(x[2]) if x[2] else 0)
+    
+    # X6: 信息丰富度平均值
+    print("8. 计算 X6 (信息丰富度)")
+    regression_data['X6'] = comment_metrics.apply(lambda x: sum(x[3]) / len(x[3]) if x[3] else 0)
+    
+    # 数据清洗
+    print("\n9. 数据清洗...")
+    for col in regression_data.columns:
+        regression_data[col] = pd.to_numeric(regression_data[col], errors='coerce').fillna(0)
+    
+    # 验证数据
+    print("\n10. 验证数据...")
+    print(f"行数: {len(regression_data)}")
+    print(f"列数: {len(regression_data.columns)}")
+    print(f"列名: {list(regression_data.columns)}")
+    print(f"数据类型:")
+    print(regression_data.dtypes)
+    print(f"\n前5行数据:")
+    print(regression_data.head())
+    
+    # 保存文件
+    print("\n11. 保存文件...")
+    regression_data.to_excel(output_file, index=False)
+    
+    # 验证文件
+    print("\n12. 验证文件...")
+    if os.path.exists(output_file):
+        print(f"文件已成功保存: {output_file}")
+        print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
+        # 重新读取检查
+        df_check = pd.read_excel(output_file)
+        print(f"输出文件行数: {len(df_check)}")
+        print(f"输出文件列数: {len(df_check.columns)}")
+    else:
+        print("文件保存失败！")
+    
+    print()
+    print("========================================")
+    print("  任务完成")
+    print("========================================")
+    
+except Exception as e:
+    print(f"处理文件时出错: {str(e)}")
+    import traceback
+    traceback.print_exc()
diff --git a/DataCleaner/quick_process.py b/DataCleaner/quick_process.py
new file mode 100644
index 0000000..2d6ce03
--- /dev/null
+++ b/DataCleaner/quick_process.py
@@ -0,0 +1,83 @@
+import os
+import pandas as pd
+import re
+
+print("开始处理...")
+
+# 文件路径
+input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
+output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）_回归.xlsx'
+
+# 读取数据
+print("读取数据...")
+df = pd.read_excel(input_file)
+print(f"读取完成: {len(df)} 行")
+
+# Identify source columns; 'helpful' also matches the 'helpfull' spelling, as in the sibling scripts.
+helpfull_col = next((c for c in df.columns if 'helpful' in str(c).lower()), None)
+comment_count_col = next((c for c in df.columns if '评论总数' in str(c)), None)
+comment_cols = [c for c in df.columns if '评论' in str(c) and any(str(i) in str(c) for i in range(1, 6)) and '内容' in str(c)]
+
+print(f"找到列: Y={helpfull_col}, X1={comment_count_col}, 评论列={len(comment_cols)}")
+
+# 添加Y和X1
+df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0) if helpfull_col else 0
+df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0) if comment_count_col else 0
+
+# 计算评论指标
+print("计算评论指标...")
+
+def calc_metrics(content):
+    """Return (length, complexity, sentiment, richness) for one comment; empty/NaN input gives all zeros."""
+    if pd.isna(content) or str(content) in ['None', 'nan', '']:
+        return 0, 0, 0, 0
+    content = str(content)
+    length = len(content.replace(' ', '').replace('\u3000', ''))
+    complexity = len(content.split())
+    pos_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent']
+    neg_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor']
+    # Negative keywords are tested first: '不好' contains '好', so positives-first would score it +1.
+    lowered = content.lower()
+    sentiment = -1 if any(w in lowered for w in neg_words) else (1 if any(w in lowered for w in pos_words) else 0)
+    richness = (1 if re.search(r'\d', content) else 0) + (1 if re.search(r'http[s]?://|www\.', content) else 0) + (1 if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]', content) else 0)
+    return length, complexity, sentiment, richness
+
+# 批量计算
+x2_list, x3_list, x5_list, x6_list = [], [], [], []
+
+for i in range(len(df)):
+    if i % 5000 == 0:
+        print(f"处理 {i}/{len(df)}")
+    
+    lengths, complexities, sentiments, richness = [], [], [], []
+    
+    for col in comment_cols:
+        l, c, s, r = calc_metrics(df.iloc[i].get(col, ''))
+        if l > 0:
+            lengths.append(l)
+            complexities.append(c)
+            sentiments.append(s)
+            richness.append(r)
+    
+    x2_list.append(sum(lengths)/len(lengths) if lengths else 0)
+    x3_list.append(sum(complexities)/len(complexities) if complexities else 0)
+    x5_list.append(sum(sentiments)/len(sentiments) if sentiments else 0)
+    x6_list.append(sum(richness)/len(richness) if richness else 0)
+
+df['X2'] = x2_list
+df['X3'] = x3_list
+df['X5'] = x5_list
+df['X6'] = x6_list
+
+# 计算X4
+df['X4'] = df.apply(lambda r: r['X2']/r['X3'] if r['X3']>0 else 0, axis=1)
+
+# 清洗数据
+for col in ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']:
+    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).replace([float('inf'), float('-inf')], 0)
+
+print("保存文件...")
+df.to_excel(output_file, index=False, engine='openpyxl')
+
+print(f"完成！文件大小: {os.path.getsize(output_file)/1024:.2f} KB")
+print(f"行数: {len(df)}, 列数: {len(df.columns)}")
diff --git a/DataCleaner/read_excel_test.py b/DataCleaner/read_excel_test.py
new file mode 100644
index 0000000..08e509f
--- /dev/null
+++ b/DataCleaner/read_excel_test.py
@@ -0,0 +1,54 @@
+import os
+import openpyxl
+
+# 文件路径
+input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
+
+print("========================================")
+print("  读取Excel测试")
+print("========================================")
+print(f"输入文件: {input_file}")
+print()
+
+# 检查文件是否存在
+if not os.path.exists(input_file):
+    print("错误: 输入文件不存在！")
+    exit(1)
+
+print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
+
+# 读取Excel文件
+try:
+    print("正在读取Excel文件...")
+    wb = openpyxl.load_workbook(input_file)
+    ws = wb.active
+    
+    print(f"工作表名称: {ws.title}")
+    print(f"最大行数: {ws.max_row}")
+    print(f"最大列数: {ws.max_column}")
+    
+    # 读取表头
+    print("\n表头:")
+    headers = []
+    for col in range(1, ws.max_column + 1):
+        header = ws.cell(row=1, column=col).value
+        headers.append(header)
+        print(f"{col}. {header}")
+    
+    # 读取前3行数据
+    print("\n前3行数据:")
+    for row in range(2, min(5, ws.max_row + 1)):
+        row_data = []
+        for col in range(1, min(10, ws.max_column + 1)):
+            value = ws.cell(row=row, column=col).value
+            row_data.append(value)
+        print(f"行 {row}: {row_data}")
+    
+    print("\n========================================")
+    print("  读取完成")
+    print("========================================")
+    
+except Exception as e:
+    print(f"处理文件时出错: {str(e)}")
+    import traceback
+    traceback.print_exc()
diff --git a/DataCleaner/run_with_output.py b/DataCleaner/run_with_output.py
new file mode 100644
index 0000000..6555dc4
--- /dev/null
+++ b/DataCleaner/run_with_output.py
@@ -0,0 +1,216 @@
+import os
+import pandas as pd
+import re
+import sys
+
+# Redirect output to both a file and the screen: a minimal file-like object that fans writes out.
+class Tee:
+    def __init__(self, *files):
+        self.files = files  # every file-like object that should receive the output
+    def write(self, obj):
+        for f in self.files:
+            f.write(obj)
+            f.flush()  # flush each target immediately after every write
+    def flush(self):
+        for f in self.files:
+            f.flush()
+
+log_file = open(r'D:\java\project\process_log.txt', 'w', encoding='utf-8')
+original_stdout = sys.stdout
+sys.stdout = Tee(original_stdout, log_file)
+
+print("========================================")
+print("  在原表中添加回归数据列")
+print("========================================")
+
+# 文件路径
+input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
+output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）_回归.xlsx'
+
+print(f"输入文件: {input_file}")
+print(f"输出文件: {output_file}")
+print()
+
+# 检查文件是否存在
+if not os.path.exists(input_file):
+    print("错误: 输入文件不存在！")
+    sys.stdout = original_stdout
+    log_file.close()
+    exit(1)
+
+print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
+
+# 读取原始数据
+try:
+    print("\n正在读取原始数据...")
+    df = pd.read_excel(input_file)
+    print(f"成功读取 {len(df)} 行数据")
+    print(f"原始列数: {len(df.columns)}")
+    
+    # 识别列
+    print("\n识别列...")
+    helpfull_col = None
+    comment_count_col = None
+    comment_cols = []
+    
+    for col in df.columns:
+        col_str = str(col).lower()
+        if 'helpfull' in col_str or 'helpful' in col_str:
+            helpfull_col = col
+            print(f"找到 Y 列 (helpfull): {col}")
+        elif '评论总数' in str(col) or '帖子评论总数' in str(col):
+            comment_count_col = col
+            print(f"找到 X1 列 (评论总数): {col}")
+        elif '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6)) and '内容' in str(col):
+            comment_cols.append(col)
+            print(f"找到评论列 {len(comment_cols)}: {col}")
+    
+    print(f"\n共找到 {len(comment_cols)} 个评论内容列")
+    
+    # 添加回归数据列
+    print("\n添加回归数据列...")
+    
+    # Y (UGC有用性) - 直接复制helpfull列
+    print("1. 添加 Y (UGC有用性)")
+    if helpfull_col:
+        df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0)
+    else:
+        df['Y'] = 0
+    
+    # X1 (评论数量) - 直接复制帖子评论总数列
+    print("2. 添加 X1 (评论数量)")
+    if comment_count_col:
+        df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0)
+    else:
+        df['X1'] = 0
+    
+    # 定义函数计算评论指标
+    def calculate_comment_metrics(content):
+        """Return (length, complexity, sentiment, richness) for one comment; empty/NaN input gives all zeros."""
+        if pd.isna(content) or str(content) in ['None', 'nan', '']:
+            return 0, 0, 0, 0
+        content = str(content)
+        # X2: comment length = character count after removing ASCII and full-width spaces.
+        length = len(content.replace(' ', '').replace('\u3000', ''))
+        # X3: comment complexity = number of whitespace-separated tokens.
+        complexity = len(content.split())
+        # X5: sentiment by keyword substring match (positive=1, neutral=0, negative=-1).
+        positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like']
+        negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike']
+        # Negative keywords are tested first: '不好' contains '好' and 'dislike' contains 'like',
+        # so testing positives first would score those negative phrases as +1.
+        sentiment = 0
+        lower_content = content.lower()
+        if any(word in lower_content for word in negative_words):
+            sentiment = -1
+        elif any(word in lower_content for word in positive_words):
+            sentiment = 1
+        # X6: information richness: one point each for a digit, a link, an emoji/emoticon (max 3).
+        richness = 0
+        if re.search(r'\d', content):  # contains a digit
+            richness += 1
+        if re.search(r'http[s]?://|www\.', content):  # contains a link
+            richness += 1
+        if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content):  # contains an emoji or emoticon
+            richness += 1
+        return length, complexity, sentiment, richness
+    
+    # 计算评论相关指标
+    print("3. 计算评论相关指标...")
+    
+    # 初始化列
+    df['X2'] = 0.0  # 评论长度
+    df['X3'] = 0.0  # 评论复杂度
+    df['X5'] = 0.0  # 情感性
+    df['X6'] = 0.0  # 信息丰富度
+    
+    # 逐行计算
+    total_rows = len(df)
+    print(f"总数据行数: {total_rows}")
+    
+    for i in range(total_rows):
+        if i % 1000 == 0:
+            print(f"  处理第 {i}/{total_rows} 行...")
+        # Per-row accumulators: one entry per non-empty comment column of row i.
+        lengths = []
+        complexities = []
+        sentiments = []
+        richness = []
+        
+        for col in comment_cols:
+            content = df.iloc[i].get(col, '')  # NOTE(review): df.iloc[i] builds a whole row Series for every cell read
+            length, complexity, sentiment, r = calculate_comment_metrics(content)
+            if length > 0:  # only count comments that actually have content
+                lengths.append(length)
+                complexities.append(complexity)
+                sentiments.append(sentiment)
+                richness.append(r)
+        
+        # Store the per-row averages; rows without comments keep the 0.0 initial value.
+        if lengths:
+            df.loc[i, 'X2'] = sum(lengths) / len(lengths)  # NOTE(review): .loc is label-based, so this relies on the default 0..n-1 index
+            df.loc[i, 'X3'] = sum(complexities) / len(complexities)
+            df.loc[i, 'X5'] = sum(sentiments) / len(sentiments)
+            df.loc[i, 'X6'] = sum(richness) / len(richness)
+    
+    # X4: 评论可读性 = X2/X3（X3为0时记0，避免报错）
+    print("4. 计算 X4 (评论可读性)")
+    df['X4'] = df.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1)
+    
+    # 数据清洗 - 确保所有值都是纯数字，无文本、无空值、无错误
+    print("\n5. 数据清洗...")
+    regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
+    for col in regression_cols:
+        # 转换为数字，错误值转为0
+        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
+        # 替换无穷大
+        df[col] = df[col].replace([float('inf'), float('-inf')], 0)
+    
+    # 验证数据
+    print("\n6. 验证数据...")
+    print(f"总行数: {len(df)}")
+    print(f"总列数: {len(df.columns)}")
+    print(f"\n回归数据列统计:")
+    print(df[regression_cols].describe())
+    print(f"\n前5行回归数据:")
+    print(df[regression_cols].head())
+    
+    # 检查是否有空值或错误值
+    print(f"\n空值检查:")
+    for col in regression_cols:
+        null_count = df[col].isnull().sum()
+        print(f"  {col}: {null_count} 个空值")
+    
+    # 保存文件
+    print("\n7. 保存文件...")
+    print(f"正在保存到: {output_file}")
+    df.to_excel(output_file, index=False, engine='openpyxl')
+    
+    # 验证文件
+    print("\n8. 验证文件...")
+    if os.path.exists(output_file):
+        print(f"文件已成功保存: {output_file}")
+        print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
+        # 重新读取检查
+        df_check = pd.read_excel(output_file)
+        print(f"输出文件行数: {len(df_check)}")
+        print(f"输出文件列数: {len(df_check.columns)}")
+        print(f"\n回归数据列: {[col for col in df_check.columns if col in regression_cols]}")
+    else:
+        print("文件保存失败！")
+    
+    print()
+    print("========================================")
+    print("  任务完成")
+    print("========================================")
+    print(f"新文件已保存: {output_file}")
+    print(f"包含原始数据的所有列以及新增的Y, X1-X6回归数据列")
+    
+except Exception as e:
+    print(f"处理文件时出错: {str(e)}")
+    import traceback
+    traceback.print_exc()
+finally:
+    sys.stdout = original_stdout
+    log_file.close()
+    print("日志已保存到: D:\\java\\project\\process_log.txt")
diff --git a/DataCleaner/simple_add_columns.py b/DataCleaner/simple_add_columns.py
new file mode 100644
index 0000000..fb4663b
--- /dev/null
+++ b/DataCleaner/simple_add_columns.py
@@ -0,0 +1,187 @@
+import os
+import pandas as pd
+import re
+
+print("=" * 60)
+print("  在原表中添加回归数据列")
+print("=" * 60)
+
+# Hard-coded workbook locations: the source sheet and the copy that receives the new columns
+input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
+output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）_回归.xlsx'
+
+print(f"输入文件: {input_file}")
+print(f"输出文件: {output_file}")
+print()
+
+# Stop with exit status 1 when the source workbook is missing
+if not os.path.exists(input_file):
+    print("错误: 输入文件不存在！")
+    raise SystemExit(1)  # exit() is a site-module helper for interactive use and may be absent (python -S)
+
+print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
+
+# Read the workbook into a DataFrame
+print("\n正在读取原始数据...")
+df = pd.read_excel(input_file)
+print(f"成功读取 {len(df)} 行数据")
+print(f"原始列数: {len(df.columns)}")
+
+# Scan the column headers for the three kinds of source columns:
+# the helpfulness score, the comment total, and the comment text columns
+print("\n识别列...")
+helpfull_col = None
+comment_count_col = None
+comment_cols = []
+
+for col in df.columns:
+    header = str(col)
+    # 'helpfull' contains 'helpful', so one case-insensitive test covers both spellings
+    if 'helpful' in header.lower():
+        helpfull_col = col
+        print(f"找到 Y 列 (helpfull): {col}")
+    # '帖子评论总数' contains '评论总数', so one substring test is enough
+    elif '评论总数' in header:
+        comment_count_col = col
+        print(f"找到 X1 列 (评论总数): {col}")
+    # Comment text columns: the header mentions 评论 and 内容 plus a digit from 1 to 5
+    elif '评论' in header and '内容' in header and any(digit in header for digit in '12345'):
+        comment_cols.append(col)
+        print(f"找到评论列 {len(comment_cols)}: {col}")
+
+print(f"\n共找到 {len(comment_cols)} 个评论内容列")
+
+# Append the regression columns to the frame
+print("\n添加回归数据列...")
+
+# Y (UGC helpfulness): numeric copy of the helpfulness column, unparsable cells become 0;
+# a constant 0 when no such column was found
+print("1. 添加 Y (UGC有用性)")
+df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0) if helpfull_col else 0
+
+# X1 (comment count): numeric copy of the comment-total column, unparsable cells become 0;
+# a constant 0 when no such column was found
+print("2. 添加 X1 (评论数量)")
+df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0) if comment_count_col else 0
+
+# 定义函数计算评论指标
+def calculate_comment_metrics(content):
+    """Return (length, complexity, sentiment, richness) for one comment cell.
+
+    Missing or empty cells give (0, 0, 0, 0). Sentiment is 1, 0 or -1 from keyword matching;
+    negative phrases are masked before the positive scan so '不好' / 'dislike' are not read as '好' / 'like'.
+    """
+    if pd.isna(content) or str(content) in ['None', 'nan', '']:
+        return 0, 0, 0, 0
+
+    content = str(content)
+    # X2: character count with ASCII and full-width spaces removed
+    length = len(content.replace(' ', '').replace('\u3000', ''))
+    # X3: number of whitespace-separated tokens
+    complexity = len(content.split())
+    positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like']
+    negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike']
+    lower_content = content.lower()
+    has_negative = any(word in lower_content for word in negative_words)
+    # Blank out negative phrases first: several of them embed a positive keyword
+    masked = lower_content
+    for word in negative_words:
+        masked = masked.replace(word, ' ')
+    # X5: a positive keyword outside any negative phrase wins; otherwise a negative keyword gives -1
+    sentiment = 1 if any(word in masked for word in positive_words) else (-1 if has_negative else 0)
+    # X6: one point each for a digit, a link, and an emoji or ASCII emoticon (0 to 3)
+    richness = int(bool(re.search(r'\d', content)))
+    richness += int(bool(re.search(r'http[s]?://|www\.', content)))
+    richness += int(bool(re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content)))
+    return length, complexity, sentiment, richness
+
+# Average the per-comment metrics of every row into X2, X3, X5 and X6
+print("3. 计算评论相关指标...")
+
+# One list per output column, filled in row order;
+# rows without any non-empty comment contribute 0.0
+x2_values = []  # mean comment length
+x3_values = []  # mean comment complexity
+x5_values = []  # mean sentiment
+x6_values = []  # mean information richness
+
+# Row-by-row pass
+total_rows = len(df)
+print(f"总数据行数: {total_rows}")
+
+for i in range(total_rows):
+    if i % 1000 == 0:
+        print(f"  处理第 {i}/{total_rows} 行...")
+
+    # Build the row Series once per row rather than once per comment column
+    row = df.iloc[i]
+    scored = [calculate_comment_metrics(row.get(col, '')) for col in comment_cols]
+    # Only comments with content (length > 0) take part in the averages
+    scored = [metrics for metrics in scored if metrics[0] > 0]
+    count = len(scored)
+
+    # Column-wise means of the (length, complexity, sentiment, richness) tuples
+    means = [sum(values) / count for values in zip(*scored)] if scored else [0.0, 0.0, 0.0, 0.0]
+    x2_values.append(means[0])
+    x3_values.append(means[1])
+    x5_values.append(means[2])
+    x6_values.append(means[3])
+
+# Whole-column assignment aligned on the frame's own index, in the order X2, X3, X5, X6
+df['X2'] = pd.Series(x2_values, index=df.index, dtype=float)
+df['X3'] = pd.Series(x3_values, index=df.index, dtype=float)
+df['X5'] = pd.Series(x5_values, index=df.index, dtype=float)
+df['X6'] = pd.Series(x6_values, index=df.index, dtype=float)
+
+# X4: readability = X2 / X3 (0 when X3 is 0, which avoids dividing by zero)
+print("4. 计算 X4 (评论可读性)")
+df['X4'] = df.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1)
+
+# Data cleaning: force every regression column to plain finite numbers
+print("\n5. 数据清洗...")
+regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
+for col in regression_cols:
+    # Non-numeric and missing cells become 0 ...
+    numeric = pd.to_numeric(df[col], errors='coerce').fillna(0)
+    # ... and so do positive and negative infinity
+    df[col] = numeric.replace([float('inf'), float('-inf')], 0)
+
+# Print a summary of the result
+print("\n6. 验证数据...")
+print(f"总行数: {len(df)}")
+print(f"总列数: {len(df.columns)}")
+regression_view = df[regression_cols]
+print("\n回归数据列统计:")
+print(regression_view.describe())
+print("\n前5行回归数据:")
+print(regression_view.head())
+
+# Report how many missing values remain in each regression column
+print("\n空值检查:")
+for name, missing in df[regression_cols].isnull().sum().items():
+    print(f"  {name}: {missing} 个空值")
+
+# Write the enlarged table to the output workbook
+print("\n7. 保存文件...")
+print(f"正在保存到: {output_file}")
+df.to_excel(output_file, index=False, engine='openpyxl')
+
+# Confirm the workbook exists and read it back to report its shape
+print("\n8. 验证文件...")
+if not os.path.exists(output_file):
+    print("文件保存失败！")
+else:
+    print(f"文件已成功保存: {output_file}")
+    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
+    written = pd.read_excel(output_file)
+    print(f"输出文件行数: {len(written)}")
+    print(f"输出文件列数: {len(written.columns)}")
+    present = [name for name in written.columns if name in regression_cols]
+    print(f"\n回归数据列: {present}")
+
+print()
+print("=" * 60)
+print("  任务完成")
+print("=" * 60)
+print(f"新文件已保存: {output_file}")
+print("包含原始数据的所有列以及新增的Y, X1-X6回归数据列")
diff --git a/DataCleaner/simple_calculate.py b/DataCleaner/simple_calculate.py
new file mode 100644
index 0000000..3b4161c
--- /dev/null
+++ b/DataCleaner/simple_calculate.py
@@ -0,0 +1,100 @@
+import os
+import openpyxl
+import re
+
+# Hard-coded workbook locations: the raw post data and the regression workbook to fill
+input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
+output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'
+
+print("========================================")
+print("  简单计算UGC回归数据")
+print("========================================")
+print(f"输入文件: {input_file}")
+print(f"输出文件: {output_file}")
+print()
+
+# Both workbooks must already exist; stop with exit status 1 otherwise
+if not os.path.exists(input_file):
+    print("错误: 输入文件不存在！")
+    raise SystemExit(1)  # exit() is a site-module helper for interactive use and may be absent (python -S)
+
+if not os.path.exists(output_file):
+    print("错误: 输出文件不存在！")
+    raise SystemExit(1)
+
+print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
+
+# Copy Y and X1 from the input workbook into the existing output workbook.
+# X2-X6 are written as 0 placeholders; only sheet rows 2 through at most 10 are processed.
+try:
+    print("正在读取输入文件...")
+    wb_input = openpyxl.load_workbook(input_file)
+    ws_input = wb_input.active
+
+    print(f"输入工作表名称: {ws_input.title}")
+    print(f"输入文件最大行数: {ws_input.max_row}")
+    print(f"输入文件最大列数: {ws_input.max_column}")
+
+    # Open the output workbook that receives the values
+    print("\n正在读取输出文件...")
+    wb_output = openpyxl.load_workbook(output_file)
+    ws_output = wb_output.active
+
+    print(f"输出工作表名称: {ws_output.title}")
+
+    # Locate the source columns in the header row; None means the column was not found
+    print("\n识别列...")
+    helpfull_col = None
+    comment_count_col = None
+    for col in range(1, ws_input.max_column + 1):
+        header = ws_input.cell(row=1, column=col).value
+        if header and 'helpfull' in str(header):
+            helpfull_col = col
+            print(f"找到 helpfull 列: {col}")
+        elif header and ('评论总数' in str(header) or '帖子评论总数' in str(header)):
+            comment_count_col = col
+            print(f"找到评论总数列: {col}")
+        elif header and '评论' in str(header):
+            print(f"找到评论列: {col} - {header}")
+
+    # Compute and fill the data rows
+    print("\n计算并填充数据...")
+    max_rows = min(ws_input.max_row, 10)  # test run: stop at sheet row 10
+    print(f"处理前 {max_rows - 1} 行数据")
+
+    for row in range(2, max_rows + 1):
+        print(f"处理行 {row}")
+
+        # Y (UGC helpfulness) goes to output column 1;
+        # an empty or zero source cell and a missing source column both give 0
+        y_value = None
+        if helpfull_col is not None:
+            y_value = ws_input.cell(row=row, column=helpfull_col).value
+        ws_output.cell(row=row, column=1, value=y_value or 0)
+
+        # X1 (comment count) goes to output column 2, with the same fallback to 0
+        x1_value = None
+        if comment_count_col is not None:
+            x1_value = ws_input.cell(row=row, column=comment_count_col).value
+        ws_output.cell(row=row, column=2, value=x1_value or 0)
+
+        # X2-X6 (output columns 3 to 7) are placeholders set to 0 for now
+        for col in range(3, 8):
+            ws_output.cell(row=row, column=col, value=0)
+
+    # Save the output workbook in place
+    print("\n保存文件...")
+    wb_output.save(output_file)
+
+    print(f"文件已成功保存: {output_file}")
+    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
+
+    print()
+    print("========================================")
+    print("  任务完成")
+    print("========================================")
+
+except Exception as e:
+    print(f"处理文件时出错: {str(e)}")
+    import traceback
+    traceback.print_exc()
diff --git a/DataCleaner/simple_copy.py b/DataCleaner/simple_copy.py
new file mode 100644
index 0000000..9077e92
--- /dev/null
+++ b/DataCleaner/simple_copy.py
@@ -0,0 +1,41 @@
+import os
+import shutil
+
+# Hard-coded source workbook and the path of the copy to create
+input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
+output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
+
+print("========================================")
+print("  简单文件复制脚本")
+print("========================================")
+print(f"输入文件: {input_file}")
+print(f"输出文件: {output_file}")
+print()
+
+# Stop with exit status 1 when the source workbook is missing
+if not os.path.exists(input_file):
+    print("错误: 输入文件不存在！")
+    raise SystemExit(1)  # exit() is a site-module helper for interactive use and may be absent (python -S)
+
+print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
+print(f"文件存在: {os.path.exists(input_file)}")
+
+# Copy the workbook (contents and metadata) and report the outcome
+try:
+    print("正在复制文件...")
+    shutil.copy2(input_file, output_file)
+
+    # Check that the destination file is now present
+    if not os.path.exists(output_file):
+        print("错误: 文件复制失败，未找到输出文件")
+    else:
+        print(f"文件已成功复制到: {output_file}")
+        print(f"复制文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
+
+    print()
+    print("========================================")
+    print("  任务完成")
+    print("========================================")
+
+except Exception as e:
+    print(f"处理文件时出错: {e!s}")
diff --git a/DataCleaner/simple_data_test.py b/DataCleaner/simple_data_test.py
new file mode 100644
index 0000000..b45c1b2
--- /dev/null
+++ b/DataCleaner/simple_data_test.py
@@ -0,0 +1,54 @@
+import os
+import pandas as pd
+
+# Hard-coded workbook locations (output_file is only printed by this script)
+input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
+output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'
+
+print("========================================")
+print("  简单数据测试")
+print("========================================")
+print(f"输入文件: {input_file}")
+print(f"输出文件: {output_file}")
+print()
+
+# Stop with exit status 1 when the source workbook is missing
+if not os.path.exists(input_file):
+    print("错误: 输入文件不存在！")
+    raise SystemExit(1)  # exit() is a site-module helper for interactive use and may be absent (python -S)
+
+print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
+
+# Read the source workbook and write a small sample of it to a scratch workbook
+try:
+    print("正在读取原始数据...")
+    df = pd.read_excel(input_file)
+    print(f"成功读取 {len(df)} 行数据")
+    print(f"列名: {list(df.columns)}")
+
+    # Sample file: the first 100 rows with all columns
+    print("\n创建测试文件...")
+    test_data = df.head(100)
+    test_output = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\test_output.xlsx'
+    test_data.to_excel(test_output, index=False)
+
+    # Query size and shape only after confirming the file exists,
+    # so that a missing file reaches the failure message instead of raising in getsize
+    if os.path.exists(test_output):
+        print(f"测试文件已创建: {test_output}")
+        print(f"测试文件大小: {os.path.getsize(test_output) / 1024:.2f} KB")
+        df_test = pd.read_excel(test_output)
+        print(f"测试文件行数: {len(df_test)}")
+        print(f"测试文件列数: {len(df_test.columns)}")
+    else:
+        print("测试文件创建失败！")
+
+    print()
+    print("========================================")
+    print("  测试完成")
+    print("========================================")
+
+except Exception as e:
+    print(f"处理文件时出错: {str(e)}")
+    import traceback
+    traceback.print_exc()
diff --git a/DataCleaner/simple_excel_create.py b/DataCleaner/simple_excel_create.py
new file mode 100644
index 0000000..7538502
--- /dev/null
+++ b/DataCleaner/simple_excel_create.py
@@ -0,0 +1,57 @@
+import os
+import openpyxl
+
+# Hard-coded location of the regression workbook to create
+output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'
+
+print("========================================")
+print("  创建UGC回归数据文件")
+print("========================================")
+print(f"输出文件: {output_file}")
+print()
+
+# Make sure the target directory exists before writing
+output_dir = os.path.dirname(output_file)
+print(f"输出目录: {output_dir}")
+print(f"目录存在: {os.path.exists(output_dir)}")
+
+if not os.path.exists(output_dir):
+    print("正在创建输出目录...")
+    try:
+        os.makedirs(output_dir, exist_ok=True)  # tolerate the directory appearing after the check
+        print("目录创建成功")
+    except OSError as e:
+        print(f"创建目录失败: {e}")
+        raise SystemExit(1)  # exit() is a site-module helper for interactive use and may be absent (python -S)
+
+# Create a workbook holding only the regression header row and save it
+try:
+    print("\n创建新的Excel文件...")
+    wb = openpyxl.Workbook()
+    ws = wb.active
+
+    # Row 1 carries the column names Y, X1 ... X6
+    headers = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
+    for column_index, title in enumerate(headers, start=1):
+        ws.cell(row=1, column=column_index, value=title)
+
+    # Write the workbook to disk
+    print(f"保存文件到: {output_file}")
+    wb.save(output_file)
+
+    # Check that the file is now present
+    if not os.path.exists(output_file):
+        print("错误: 文件创建失败")
+    else:
+        print(f"文件已成功创建: {output_file}")
+        print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
+
+    print()
+    print("========================================")
+    print("  任务完成")
+    print("========================================")
+
+except Exception as e:
+    print(f"处理文件时出错: {e!s}")
+    import traceback
+    traceback.print_exc()
diff --git a/DataCleaner/simple_test.py b/DataCleaner/simple_test.py
new file mode 100644
index 0000000..d1889f5
--- /dev/null
+++ b/DataCleaner/simple_test.py
@@ -0,0 +1,22 @@
+import os
+
+# Smoke test: report whether the raw workbook exists and how large it is
+input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
+
+print("========================================")
+print("  简单测试")
+print("========================================")
+print(f"输入文件: {input_file}")
+print()
+
+# Existence check, with the size in KB when the file is there
+if not os.path.exists(input_file):
+    print("文件不存在！")
+else:
+    print("文件存在！")
+    print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
+
+print()
+print("========================================")
+print("  测试完成")
+print("========================================")
diff --git a/DataCleaner/test_file_access.py b/DataCleaner/test_file_access.py
new file mode 100644
index 0000000..f46f67a
--- /dev/null
+++ b/DataCleaner/test_file_access.py
@@ -0,0 +1,49 @@
+import os
+
+# Paths probed by this script
+input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
+output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'
+
+print("========================================")
+print("  测试文件访问")
+print("========================================")
+print(f"当前目录: {os.getcwd()}")
+print()
+
+# Input workbook: path, existence, and size in KB when present
+print("检查输入文件:")
+print(f"路径: {input_file}")
+input_found = os.path.exists(input_file)
+print(f"存在: {input_found}")
+print(f"大小: {os.path.getsize(input_file) / 1024:.2f} KB" if input_found else "文件不存在！")
+
+# Output workbook: the same report
+print("\n检查输出文件:")
+print(f"路径: {output_file}")
+output_found = os.path.exists(output_file)
+print(f"存在: {output_found}")
+print(f"大小: {os.path.getsize(output_file) / 1024:.2f} KB" if output_found else "文件不存在！")
+
+# Directory that holds the input workbook
+print("\n检查目录:")
+dir_path = os.path.dirname(input_file)
+print(f"目录: {dir_path}")
+dir_found = os.path.exists(dir_path)
+print(f"存在: {dir_found}")
+if dir_found:
+    print("目录内容:")
+    files = os.listdir(dir_path)
+    # Only the first ten entries are listed, each with its size in KB
+    for entry in files[:10]:
+        entry_path = os.path.join(dir_path, entry)
+        size_kb = os.path.getsize(entry_path) / 1024
+        print(f"  {entry}: {size_kb:.2f} KB")
+    # Mention how many entries were left out of the listing
+    remaining = len(files) - 10
+    if remaining > 0:
+        print(f"  ... 还有 {remaining} 个文件")
+
+print()
+print("========================================")
+print("  测试完成")
+print("========================================")