From 6567083c039492cf5cf41e8caa22d90124b74d5c Mon Sep 17 00:00:00 2001 From: ZhengJiayin <13230092115@163.com> Date: Tue, 19 May 2026 20:03:46 +0800 Subject: [PATCH] datacleaner --- DataCleaner/AddRegressionColumns.java | 224 ++++++++++++++++++++++ DataCleaner/DataCleaner.java | 99 ++++++++++ DataCleaner/DataCleaningScript.java | 226 +++++++++++++++++++++++ DataCleaner/DataStorage.java | 121 ++++++++++++ DataCleaner/DuoTai.java | 3 + DataCleaner/ExcelReader.java | 102 ++++++++++ DataCleaner/HTMLReportGenerator.java | 214 +++++++++++++++++++++ DataCleaner/Main.java | 67 +++++++ DataCleaner/PostAnalyzer.java | 200 ++++++++++++++++++++ DataCleaner/PostInfo.java | 127 +++++++++++++ DataCleaner/ProcessRegressionData.java | 50 +++++ DataCleaner/README.md | 2 + DataCleaner/SimpleChartGenerator.java | 165 +++++++++++++++++ DataCleaner/SimpleDataCleaner.java | 59 ++++++ DataCleaner/add_regression_columns.py | 189 +++++++++++++++++++ DataCleaner/basic_test.py | 32 ++++ DataCleaner/batch_process.py | 219 ++++++++++++++++++++++ DataCleaner/calculate_regression_data.py | 169 +++++++++++++++++ DataCleaner/check_data_structure.py | 43 +++++ DataCleaner/check_excel_size.py | 53 ++++++ DataCleaner/create_and_fill_data.py | 69 +++++++ DataCleaner/create_excel_with_data.py | 86 +++++++++ DataCleaner/create_regression_data.py | 112 +++++++++++ DataCleaner/create_regression_data_v2.py | 142 ++++++++++++++ DataCleaner/d | 0 DataCleaner/data_cleaner.py | 73 ++++++++ DataCleaner/data_cleaner_v2.py | 98 ++++++++++ DataCleaner/debug_log.txt | 11 ++ DataCleaner/debug_process.py | 36 ++++ DataCleaner/debug_script.py | 51 +++++ DataCleaner/import_data.py | 50 +++++ DataCleaner/minimal_test.py | 17 ++ DataCleaner/populate_regression_data.py | 113 ++++++++++++ DataCleaner/process_300_rows.py | 156 ++++++++++++++++ DataCleaner/process_actual_data.py | 200 ++++++++++++++++++++ DataCleaner/process_all_data.py | 190 +++++++++++++++++++ DataCleaner/process_all_rows.py | 157 ++++++++++++++++ 
DataCleaner/process_efficient.py | 180 ++++++++++++++++++ DataCleaner/process_large_file.py | 177 ++++++++++++++++++ DataCleaner/process_log.txt | 9 + DataCleaner/process_regression_final.py | 192 +++++++++++++++++++ DataCleaner/process_with_csv.py | 202 ++++++++++++++++++++ DataCleaner/process_with_pandas.py | 168 +++++++++++++++++ DataCleaner/quick_process.py | 83 +++++++++ DataCleaner/read_excel_test.py | 54 ++++++ DataCleaner/run_with_output.py | 216 ++++++++++++++++++++++ DataCleaner/simple_add_columns.py | 187 +++++++++++++++++++ DataCleaner/simple_calculate.py | 100 ++++++++++ DataCleaner/simple_copy.py | 41 ++++ DataCleaner/simple_data_test.py | 54 ++++++ DataCleaner/simple_excel_create.py | 57 ++++++ DataCleaner/simple_test.py | 22 +++ DataCleaner/test_file_access.py | 49 +++++ 53 files changed, 5716 insertions(+) create mode 100644 DataCleaner/AddRegressionColumns.java create mode 100644 DataCleaner/DataCleaner.java create mode 100644 DataCleaner/DataCleaningScript.java create mode 100644 DataCleaner/DataStorage.java create mode 100644 DataCleaner/DuoTai.java create mode 100644 DataCleaner/ExcelReader.java create mode 100644 DataCleaner/HTMLReportGenerator.java create mode 100644 DataCleaner/Main.java create mode 100644 DataCleaner/PostAnalyzer.java create mode 100644 DataCleaner/PostInfo.java create mode 100644 DataCleaner/ProcessRegressionData.java create mode 100644 DataCleaner/README.md create mode 100644 DataCleaner/SimpleChartGenerator.java create mode 100644 DataCleaner/SimpleDataCleaner.java create mode 100644 DataCleaner/add_regression_columns.py create mode 100644 DataCleaner/basic_test.py create mode 100644 DataCleaner/batch_process.py create mode 100644 DataCleaner/calculate_regression_data.py create mode 100644 DataCleaner/check_data_structure.py create mode 100644 DataCleaner/check_excel_size.py create mode 100644 DataCleaner/create_and_fill_data.py create mode 100644 DataCleaner/create_excel_with_data.py create mode 100644 
DataCleaner/create_regression_data.py create mode 100644 DataCleaner/create_regression_data_v2.py create mode 100644 DataCleaner/d create mode 100644 DataCleaner/data_cleaner.py create mode 100644 DataCleaner/data_cleaner_v2.py create mode 100644 DataCleaner/debug_log.txt create mode 100644 DataCleaner/debug_process.py create mode 100644 DataCleaner/debug_script.py create mode 100644 DataCleaner/import_data.py create mode 100644 DataCleaner/minimal_test.py create mode 100644 DataCleaner/populate_regression_data.py create mode 100644 DataCleaner/process_300_rows.py create mode 100644 DataCleaner/process_actual_data.py create mode 100644 DataCleaner/process_all_data.py create mode 100644 DataCleaner/process_all_rows.py create mode 100644 DataCleaner/process_efficient.py create mode 100644 DataCleaner/process_large_file.py create mode 100644 DataCleaner/process_log.txt create mode 100644 DataCleaner/process_regression_final.py create mode 100644 DataCleaner/process_with_csv.py create mode 100644 DataCleaner/process_with_pandas.py create mode 100644 DataCleaner/quick_process.py create mode 100644 DataCleaner/read_excel_test.py create mode 100644 DataCleaner/run_with_output.py create mode 100644 DataCleaner/simple_add_columns.py create mode 100644 DataCleaner/simple_calculate.py create mode 100644 DataCleaner/simple_copy.py create mode 100644 DataCleaner/simple_data_test.py create mode 100644 DataCleaner/simple_excel_create.py create mode 100644 DataCleaner/simple_test.py create mode 100644 DataCleaner/test_file_access.py diff --git a/DataCleaner/AddRegressionColumns.java b/DataCleaner/AddRegressionColumns.java new file mode 100644 index 0000000..60f682a --- /dev/null +++ b/DataCleaner/AddRegressionColumns.java @@ -0,0 +1,224 @@ +import org.apache.poi.ss.usermodel.*; +import org.apache.poi.xssf.usermodel.XSSFWorkbook; +import java.io.*; +import java.util.*; +import java.util.regex.*; + +public class AddRegressionColumns { + public static void main(String[] args) { + 
String inputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子实验数据(新).xlsx"; + String outputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子实验数据(新)_回归.xlsx"; + + System.out.println("========================================"); + System.out.println(" 在原表中添加回归数据列"); + System.out.println("========================================"); + System.out.println("输入文件: " + inputFile); + System.out.println("输出文件: " + outputFile); + System.out.println(); + + try { + // 读取输入文件 + System.out.println("读取输入文件..."); + FileInputStream fis = new FileInputStream(inputFile); + Workbook wb = new XSSFWorkbook(fis); + Sheet sheet = wb.getSheetAt(0); + + int totalRows = sheet.getLastRowNum(); + System.out.println("总行数: " + totalRows); + + // 获取表头行 + Row headerRow = sheet.getRow(0); + int totalCols = headerRow.getLastCellNum(); + System.out.println("总列数: " + totalCols); + + // 识别列 + int helpfullCol = -1; + int commentCountCol = -1; + List commentCols = new ArrayList<>(); + + for (int i = 0; i < totalCols; i++) { + Cell cell = headerRow.getCell(i); + if (cell != null) { + String header = cell.getStringCellValue().toLowerCase(); + if (header.contains("helpfull") || header.contains("helpful")) { + helpfullCol = i; + System.out.println("找到 Y 列 (helpfull): 列 " + i); + } else if (header.contains("评论总数") || header.contains("帖子评论总数")) { + commentCountCol = i; + System.out.println("找到 X1 列 (评论总数): 列 " + i); + } else if (header.contains("评论") && header.contains("内容")) { + for (int j = 1; j <= 5; j++) { + if (header.contains(String.valueOf(j))) { + commentCols.add(i); + System.out.println("找到评论列 " + commentCols.size() + ": 列 " + i + " - " + header); + break; + } + } + } + } + } + + System.out.println("\n共找到 " + commentCols.size() + " 个评论列"); + + // 添加新列的表头 + int yCol = totalCols; + int x1Col = totalCols + 1; + int x2Col = totalCols + 2; + int x3Col = totalCols + 3; + int x4Col = totalCols + 4; + int x5Col = totalCols + 5; + int x6Col = totalCols + 6; + + headerRow.createCell(yCol).setCellValue("Y"); + 
headerRow.createCell(x1Col).setCellValue("X1"); + headerRow.createCell(x2Col).setCellValue("X2"); + headerRow.createCell(x3Col).setCellValue("X3"); + headerRow.createCell(x4Col).setCellValue("X4"); + headerRow.createCell(x5Col).setCellValue("X5"); + headerRow.createCell(x6Col).setCellValue("X6"); + + // 处理每一行数据 + System.out.println("\n处理数据..."); + Pattern digitPattern = Pattern.compile("\\d"); + Pattern urlPattern = Pattern.compile("http[s]?://|www\\."); + Pattern emojiPattern = Pattern.compile("[\\u2600-\\u27BF\\uD83C-\\uDBFF\\uDC00-\\uDFFF]|[:;][-]?[)D]"); + + String[] positiveWords = {"好", "棒", "优秀", "喜欢", "满意", "赞", "positive", "good", "great", "excellent", "love", "like"}; + String[] negativeWords = {"差", "糟糕", "不好", "失望", "不满", "negative", "bad", "terrible", "poor", "hate", "dislike"}; + + for (int i = 1; i <= totalRows; i++) { + if (i % 1000 == 0) { + System.out.println("处理第 " + i + "/" + totalRows + " 行..."); + } + + Row row = sheet.getRow(i); + if (row == null) continue; + + // Y (UGC有用性) + double y = 0; + if (helpfullCol >= 0) { + Cell cell = row.getCell(helpfullCol); + if (cell != null) { + try { + y = cell.getNumericCellValue(); + } catch (Exception e) { + y = 0; + } + } + } + row.createCell(yCol).setCellValue(y); + + // X1 (评论数量) + double x1 = 0; + if (commentCountCol >= 0) { + Cell cell = row.getCell(commentCountCol); + if (cell != null) { + try { + x1 = cell.getNumericCellValue(); + } catch (Exception e) { + x1 = 0; + } + } + } + row.createCell(x1Col).setCellValue(x1); + + // 计算评论相关指标 + List lengths = new ArrayList<>(); + List complexities = new ArrayList<>(); + List sentiments = new ArrayList<>(); + List richnessList = new ArrayList<>(); + + for (int colIdx : commentCols) { + Cell cell = row.getCell(colIdx); + if (cell != null) { + String content = ""; + try { + content = cell.getStringCellValue(); + } catch (Exception e) { + try { + content = String.valueOf(cell.getNumericCellValue()); + } catch (Exception e2) { + content = ""; + } + } + + if 
(content != null && !content.isEmpty() && !content.equals("nan") && !content.equals("null")) { + // X2: 评论长度(剔空格后的字符数) + double length = content.replace(" ", "").replace("\u3000", "").length(); + lengths.add(length); + + // X3: 评论复杂度(按空格拆分的分词数) + double complexity = content.split("\\s+").length; + complexities.add(complexity); + + // X5: 情感分析 + double sentiment = 0; + String lowerContent = content.toLowerCase(); + for (String word : positiveWords) { + if (lowerContent.contains(word)) { + sentiment = 1; + break; + } + } + if (sentiment == 0) { + for (String word : negativeWords) { + if (lowerContent.contains(word)) { + sentiment = -1; + break; + } + } + } + sentiments.add(sentiment); + + // X6: 信息丰富度 + double richness = 0; + if (digitPattern.matcher(content).find()) richness += 1; + if (urlPattern.matcher(content).find()) richness += 1; + if (emojiPattern.matcher(content).find()) richness += 1; + richnessList.add(richness); + } + } + } + + // 计算平均值(无评论记0) + double x2 = lengths.isEmpty() ? 0 : lengths.stream().mapToDouble(Double::doubleValue).average().getAsDouble(); + double x3 = complexities.isEmpty() ? 0 : complexities.stream().mapToDouble(Double::doubleValue).average().getAsDouble(); + double x5 = sentiments.isEmpty() ? 0 : sentiments.stream().mapToDouble(Double::doubleValue).average().getAsDouble(); + double x6 = richnessList.isEmpty() ? 0 : richnessList.stream().mapToDouble(Double::doubleValue).average().getAsDouble(); + + // X4: 评论可读性 = X2/X3(X3为0时记0) + double x4 = (x3 > 0) ? 
x2 / x3 : 0; + + // 写入单元格 + row.createCell(x2Col).setCellValue(x2); + row.createCell(x3Col).setCellValue(x3); + row.createCell(x4Col).setCellValue(x4); + row.createCell(x5Col).setCellValue(x5); + row.createCell(x6Col).setCellValue(x6); + } + + // 保存文件 + System.out.println("\n保存文件..."); + FileOutputStream fos = new FileOutputStream(outputFile); + wb.write(fos); + fos.close(); + wb.close(); + fis.close(); + + // 验证文件 + File output = new File(outputFile); + if (output.exists()) { + System.out.println("文件保存成功!"); + System.out.println("文件大小: " + (output.length() / 1024) + " KB"); + } + + System.out.println("\n========================================"); + System.out.println(" 任务完成"); + System.out.println("========================================"); + + } catch (Exception e) { + System.out.println("错误: " + e.getMessage()); + e.printStackTrace(); + } + } +} diff --git a/DataCleaner/DataCleaner.java b/DataCleaner/DataCleaner.java new file mode 100644 index 0000000..53cafa3 --- /dev/null +++ b/DataCleaner/DataCleaner.java @@ -0,0 +1,99 @@ +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class DataCleaner { + + public static List cleanPosts(List rawPosts) { + List cleanedPosts = new ArrayList<>(); + + for (PostInfo post : rawPosts) { + PostInfo cleaned = cleanPost(post); + if (isValidPost(cleaned)) { + cleanedPosts.add(cleaned); + } + } + + System.out.println("数据清洗完成,有效数据: " + cleanedPosts.size() + " 条"); + return cleanedPosts; + } + + private static PostInfo cleanPost(PostInfo post) { + PostInfo cleaned = new PostInfo(); + + cleaned.setTitle(cleanText(post.getTitle())); + cleaned.setContent(cleanContent(post.getContent())); + cleaned.setAuthor(cleanText(post.getAuthor())); + cleaned.setPostDate(post.getPostDate()); + cleaned.setLikeCount(post.getLikeCount()); + cleaned.setCommentCount(post.getCommentCount()); + cleaned.setViewCount(post.getViewCount()); + 
cleaned.setTags(cleanText(post.getTags())); + cleaned.setSentiment(normalizeSentiment(post.getSentiment())); + + return cleaned; + } + + private static String cleanText(String text) { + if (text == null) { + return ""; + } + return text.trim().replaceAll("\\s+", " "); + } + + private static String cleanContent(String content) { + if (content == null) { + return ""; + } + return content.trim() + .replaceAll("\\s+", " ") + .replaceAll("[\\r\\n]+", " ") + .replaceAll("<[^>]+>", "") + .replaceAll("\\[.*?\\]", "") + .replaceAll("\\(.*?\\)", ""); + } + + private static String normalizeSentiment(String sentiment) { + if (sentiment == null || sentiment.isEmpty()) { + return "中性"; + } + + String lower = sentiment.toLowerCase(); + if (lower.contains("积极") || lower.contains("正面") || lower.contains("positive")) { + return "积极"; + } else if (lower.contains("消极") || lower.contains("负面") || lower.contains("negative")) { + return "消极"; + } else { + return "中性"; + } + } + + private static boolean isValidPost(PostInfo post) { + return post.getTitle() != null && !post.getTitle().isEmpty() && + post.getContent() != null && !post.getContent().isEmpty(); + } + + public static String[] extractKeywords(String content) { + if (content == null || content.isEmpty()) { + return new String[0]; + } + + String[] commonKeywords = { + "数据", "分析", "学习", "技术", "互联网", "发展", "趋势", + "工具", "方法", "实践", "经验", "案例", "应用", "创新", + "挑战", "机遇", "未来", "智能", "算法", "模型", "平台" + }; + + List keywords = new ArrayList<>(); + String lowerContent = content.toLowerCase(); + + for (String keyword : commonKeywords) { + if (lowerContent.contains(keyword.toLowerCase())) { + keywords.add(keyword); + } + } + + return keywords.toArray(new String[0]); + } +} diff --git a/DataCleaner/DataCleaningScript.java b/DataCleaner/DataCleaningScript.java new file mode 100644 index 0000000..ffc1e96 --- /dev/null +++ b/DataCleaner/DataCleaningScript.java @@ -0,0 +1,226 @@ +import java.io.*; +import java.time.LocalDate; +import 
java.time.format.DateTimeFormatter; +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; + +public class DataCleaningScript { + + private static final DateTimeFormatter DATE_FORMATTER = DateTimeFormatter.ofPattern("yyyy-MM-dd", Locale.CHINA); + + public static void main(String[] args) { + String inputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子原始信息计量实验使用.xlsx"; + String outputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子实验数据(新).csv"; + + System.out.println("========================================"); + System.out.println(" 数据清洗脚本"); + System.out.println("========================================"); + System.out.println("输入文件: " + inputFile); + System.out.println("输出文件: " + outputFile); + System.out.println(); + + // 读取数据 + List rawPosts = readExcelData(inputFile); + System.out.println("读取数据完成,共 " + rawPosts.size() + " 条记录"); + + // 清洗数据 + List cleanedPosts = cleanPosts(rawPosts); + System.out.println("数据清洗完成,有效记录: " + cleanedPosts.size() + " 条"); + + // 保存清洗后的数据 + saveToCSV(cleanedPosts, outputFile); + System.out.println("数据保存完成!"); + System.out.println(); + System.out.println("========================================"); + System.out.println(" 数据清洗任务完成"); + System.out.println("========================================"); + } + + private static List readExcelData(String filePath) { + List posts = new ArrayList<>(); + + try (BufferedReader reader = new BufferedReader(new FileReader(filePath, java.nio.charset.StandardCharsets.UTF_8))) { + + String line; + boolean isFirstLine = true; + + while ((line = reader.readLine()) != null) { + if (isFirstLine) { + isFirstLine = false; + continue; + } + + String[] parts = parseCSVLine(line); + if (parts.length >= 9) { + PostInfo post = parsePostInfo(parts); + if (post != null) { + posts.add(post); + } + } + } + + } catch (IOException e) { + System.err.println("读取文件时出错: " + e.getMessage()); + } + + return posts; + } + + private static String[] parseCSVLine(String line) { + List fields = new 
ArrayList<>(); + StringBuilder currentField = new StringBuilder(); + boolean inQuotes = false; + + for (char c : line.toCharArray()) { + if (c == '"') { + inQuotes = !inQuotes; + } else if (c == ',' && !inQuotes) { + fields.add(currentField.toString().trim()); + currentField.setLength(0); + } else { + currentField.append(c); + } + } + + fields.add(currentField.toString().trim()); + return fields.toArray(new String[0]); + } + + private static PostInfo parsePostInfo(String[] parts) { + try { + PostInfo post = new PostInfo(); + + post.setTitle(parts[0]); + post.setContent(parts[1]); + post.setAuthor(parts[2]); + + if (!parts[3].isEmpty()) { + post.setPostDate(LocalDate.parse(parts[3], DATE_FORMATTER)); + } + + post.setLikeCount(parseInt(parts[4])); + post.setCommentCount(parseInt(parts[5])); + post.setViewCount(parseInt(parts[6])); + + post.setTags(parts[7]); + post.setSentiment(parts[8]); + + return post; + } catch (Exception e) { + return null; + } + } + + private static int parseInt(String value) { + try { + if (value == null || value.isEmpty()) { + return 0; + } + return Integer.parseInt(value); + } catch (NumberFormatException e) { + return 0; + } + } + + private static List cleanPosts(List rawPosts) { + List cleanedPosts = new ArrayList<>(); + + for (PostInfo post : rawPosts) { + PostInfo cleaned = cleanPost(post); + if (isValidPost(cleaned)) { + cleanedPosts.add(cleaned); + } + } + + return cleanedPosts; + } + + private static PostInfo cleanPost(PostInfo post) { + PostInfo cleaned = new PostInfo(); + + cleaned.setTitle(cleanText(post.getTitle())); + cleaned.setContent(cleanContent(post.getContent())); + cleaned.setAuthor(cleanText(post.getAuthor())); + cleaned.setPostDate(post.getPostDate()); + cleaned.setLikeCount(post.getLikeCount()); + cleaned.setCommentCount(post.getCommentCount()); + cleaned.setViewCount(post.getViewCount()); + cleaned.setTags(cleanText(post.getTags())); + cleaned.setSentiment(normalizeSentiment(post.getSentiment())); + + return cleaned; + 
} + + private static String cleanText(String text) { + if (text == null) { + return ""; + } + return text.trim().replaceAll("\\s+", " "); + } + + private static String cleanContent(String content) { + if (content == null) { + return ""; + } + return content.trim() + .replaceAll("\\s+", " ") + .replaceAll("[\\r\\n]+", " ") + .replaceAll("<[^>]+>", "") + .replaceAll("\\[.*?\\]", "") + .replaceAll("\\(.*?\\)", ""); + } + + private static String normalizeSentiment(String sentiment) { + if (sentiment == null || sentiment.isEmpty()) { + return "中性"; + } + + String lower = sentiment.toLowerCase(); + if (lower.contains("积极") || lower.contains("正面") || lower.contains("positive")) { + return "积极"; + } else if (lower.contains("消极") || lower.contains("负面") || lower.contains("negative")) { + return "消极"; + } else { + return "中性"; + } + } + + private static boolean isValidPost(PostInfo post) { + return post.getTitle() != null && !post.getTitle().isEmpty() && + post.getContent() != null && !post.getContent().isEmpty(); + } + + private static void saveToCSV(List posts, String filePath) { + if (posts == null || posts.isEmpty()) { + System.out.println("没有数据需要保存"); + return; + } + + try { + // 确保目录存在 + File file = new File(filePath); + File parentDir = file.getParentFile(); + if (parentDir != null && !parentDir.exists()) { + parentDir.mkdirs(); + } + + try (BufferedWriter writer = new BufferedWriter( + new FileWriter(file, java.nio.charset.StandardCharsets.UTF_8))) { + + writer.write("\uFEFF"); // BOM for UTF-8 + writer.write("标题,内容,作者,发布日期,点赞数,评论数,浏览量,标签,情感倾向\n"); + + for (PostInfo post : posts) { + writer.write(post.toCSV()); + writer.write("\n"); + } + } + + System.out.println("数据已保存到: " + filePath); + + } catch (IOException e) { + System.err.println("保存CSV文件时出错: " + e.getMessage()); + } + } +} diff --git a/DataCleaner/DataStorage.java b/DataCleaner/DataStorage.java new file mode 100644 index 0000000..134db6d --- /dev/null +++ b/DataCleaner/DataStorage.java @@ -0,0 +1,121 @@ 
+import java.io.BufferedWriter; +import java.io.FileWriter; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.time.LocalDateTime; +import java.time.format.DateTimeFormatter; +import java.util.List; + +public class DataStorage { + + public static void saveToCSV(List posts, String directory) { + if (posts == null || posts.isEmpty()) { + System.out.println("没有数据需要保存"); + return; + } + + try { + java.nio.file.Path dirPath = Paths.get(directory); + if (!Files.exists(dirPath)) { + Files.createDirectories(dirPath); + } + + String timestamp = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss")); + String filename = "posts_" + timestamp + ".csv"; + java.nio.file.Path filePath = dirPath.resolve(filename); + + try (BufferedWriter writer = new BufferedWriter( + new FileWriter(filePath.toFile(), StandardCharsets.UTF_8))) { + + writer.write("\uFEFF"); + writer.write("标题,内容,作者,发布日期,点赞数,评论数,浏览量,标签,情感倾向\n"); + + for (PostInfo post : posts) { + writer.write(post.toCSV()); + writer.write("\n"); + } + } + + System.out.println("数据已保存到: " + filePath.toAbsolutePath()); + + } catch (IOException e) { + System.err.println("保存CSV文件时出错: " + e.getMessage()); + } + } + + public static void saveToJSON(List posts, String directory) { + if (posts == null || posts.isEmpty()) { + System.out.println("没有数据需要保存"); + return; + } + + try { + java.nio.file.Path dirPath = Paths.get(directory); + if (!Files.exists(dirPath)) { + Files.createDirectories(dirPath); + } + + String timestamp = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss")); + String filename = "posts_" + timestamp + ".json"; + java.nio.file.Path filePath = dirPath.resolve(filename); + + try (BufferedWriter writer = new BufferedWriter( + new FileWriter(filePath.toFile(), StandardCharsets.UTF_8))) { + + writer.write("[\n"); + for (int i = 0; i < posts.size(); i++) { + writer.write(postToJSON(posts.get(i))); 
+ if (i < posts.size() - 1) { + writer.write(",\n"); + } else { + writer.write("\n"); + } + } + writer.write("]\n"); + } + + System.out.println("数据已保存到: " + filePath.toAbsolutePath()); + + } catch (IOException e) { + System.err.println("保存JSON文件时出错: " + e.getMessage()); + } + } + + private static String postToJSON(PostInfo post) { + return String.format( + " {\n" + + " \"title\": \"%s\",\n" + + " \"content\": \"%s\",\n" + + " \"author\": \"%s\",\n" + + " \"postDate\": \"%s\",\n" + + " \"likeCount\": %d,\n" + + " \"commentCount\": %d,\n" + + " \"viewCount\": %d,\n" + + " \"tags\": \"%s\",\n" + + " \"sentiment\": \"%s\"\n" + + " }", + escapeJSON(post.getTitle()), + escapeJSON(post.getContent()), + escapeJSON(post.getAuthor()), + post.getPostDate() != null ? post.getPostDate().toString() : "", + post.getLikeCount(), + post.getCommentCount(), + post.getViewCount(), + escapeJSON(post.getTags()), + escapeJSON(post.getSentiment()) + ); + } + + private static String escapeJSON(String text) { + if (text == null) { + return ""; + } + return text.replace("\\", "\\\\") + .replace("\"", "\\\"") + .replace("\n", "\\n") + .replace("\r", "\\r") + .replace("\t", "\\t"); + } +} diff --git a/DataCleaner/DuoTai.java b/DataCleaner/DuoTai.java new file mode 100644 index 0000000..3876a56 --- /dev/null +++ b/DataCleaner/DuoTai.java @@ -0,0 +1,3 @@ +public class DuoTai { + +} diff --git a/DataCleaner/ExcelReader.java b/DataCleaner/ExcelReader.java new file mode 100644 index 0000000..e6635bc --- /dev/null +++ b/DataCleaner/ExcelReader.java @@ -0,0 +1,102 @@ +import java.io.*; +import java.time.LocalDate; +import java.time.format.DateTimeFormatter; +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; + +public class ExcelReader { + + private static final DateTimeFormatter DATE_FORMATTER = DateTimeFormatter.ofPattern("yyyy-MM-dd", Locale.CHINA); + + public static List readExcelData(String filePath, int maxRows) { + List posts = new ArrayList<>(); + + try 
(BufferedReader reader = new BufferedReader(new FileReader(filePath, java.nio.charset.StandardCharsets.UTF_8))) { + + String line; + boolean isFirstLine = true; + int rowCount = 0; + + while ((line = reader.readLine()) != null && rowCount < maxRows) { + if (isFirstLine) { + isFirstLine = false; + continue; + } + + String[] parts = parseCSVLine(line); + if (parts.length >= 9) { + PostInfo post = parsePostInfo(parts); + if (post != null) { + posts.add(post); + rowCount++; + } + } + } + + System.out.println("成功读取 " + posts.size() + " 条数据"); + + } catch (IOException e) { + System.err.println("读取文件时出错: " + e.getMessage()); + } + + return posts; + } + + private static String[] parseCSVLine(String line) { + List fields = new ArrayList<>(); + StringBuilder currentField = new StringBuilder(); + boolean inQuotes = false; + + for (char c : line.toCharArray()) { + if (c == '"') { + inQuotes = !inQuotes; + } else if (c == ',' && !inQuotes) { + fields.add(currentField.toString().trim()); + currentField.setLength(0); + } else { + currentField.append(c); + } + } + + fields.add(currentField.toString().trim()); + return fields.toArray(new String[0]); + } + + private static PostInfo parsePostInfo(String[] parts) { + try { + PostInfo post = new PostInfo(); + + post.setTitle(parts[0]); + post.setContent(parts[1]); + post.setAuthor(parts[2]); + + if (!parts[3].isEmpty()) { + post.setPostDate(LocalDate.parse(parts[3], DATE_FORMATTER)); + } + + post.setLikeCount(parseInt(parts[4])); + post.setCommentCount(parseInt(parts[5])); + post.setViewCount(parseInt(parts[6])); + + post.setTags(parts[7]); + post.setSentiment(parts[8]); + + return post; + } catch (Exception e) { + System.err.println("解析数据时出错: " + e.getMessage()); + return null; + } + } + + private static int parseInt(String value) { + try { + if (value == null || value.isEmpty()) { + return 0; + } + return Integer.parseInt(value); + } catch (NumberFormatException e) { + return 0; + } + } +} diff --git 
a/DataCleaner/HTMLReportGenerator.java b/DataCleaner/HTMLReportGenerator.java new file mode 100644 index 0000000..7a6855e --- /dev/null +++ b/DataCleaner/HTMLReportGenerator.java @@ -0,0 +1,214 @@ +package com.project.report; + +import com.project.analyzer.PostAnalyzer; +import com.project.model.PostInfo; + +import java.io.BufferedWriter; +import java.io.FileWriter; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.time.LocalDateTime; +import java.time.format.DateTimeFormatter; +import java.util.Map; + +public class HTMLReportGenerator { + + private static final String OUTPUT_DIR = "d:\\java\\project\\reports"; + + public static void generateReport(PostAnalyzer analyzer) { + try { + Files.createDirectories(Paths.get(OUTPUT_DIR)); + + String timestamp = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss")); + String filename = "report_" + timestamp + ".html"; + String filepath = OUTPUT_DIR + "/" + filename; + + try (BufferedWriter writer = new BufferedWriter( + new FileWriter(filepath, StandardCharsets.UTF_8))) { + + writer.write(generateHTMLContent(analyzer)); + } + + System.out.println("HTML报告已生成: " + filepath); + + } catch (IOException e) { + System.err.println("生成HTML报告时出错: " + e.getMessage()); + } + } + + private static String generateHTMLContent(PostAnalyzer analyzer) { + StringBuilder html = new StringBuilder(); + + html.append("\n"); + html.append("\n"); + html.append("\n"); + html.append(" \n"); + html.append(" \n"); + html.append(" 图文帖子数据分析报告\n"); + html.append(" \n"); + html.append("\n"); + html.append("\n"); + html.append("
\n"); + html.append("

图文帖子数据分析报告

\n"); + html.append("

生成时间: ").append(LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"))).append("

\n"); + + html.append(generateSummarySection(analyzer)); + html.append(generateSentimentSection(analyzer)); + html.append(generateEngagementSection(analyzer)); + html.append(generateAuthorSection(analyzer)); + html.append(generateChartsSection()); + + html.append("
\n"); + html.append("\n"); + html.append(""); + + return html.toString(); + } + + private static String generateSummarySection(PostAnalyzer analyzer) { + StringBuilder section = new StringBuilder(); + + int totalPosts = analyzer.getPosts().size(); + double avgLikes = analyzer.getPosts().stream() + .mapToInt(PostInfo::getLikeCount) + .average() + .orElse(0); + + section.append("
\n"); + section.append("
\n"); + section.append("

").append(totalPosts).append("

\n"); + section.append("

帖子总数

\n"); + section.append("
\n"); + section.append("
\n"); + section.append("

").append(String.format("%.1f", avgLikes)).append("

\n"); + section.append("

平均点赞

\n"); + section.append("
\n"); + section.append("
\n"); + + section.append("
\n"); + section.append("

分析摘要

\n"); + section.append("
    \n"); + section.append("
  • 本次分析共收集 ").append(totalPosts).append(" 条图文帖子数据
  • \n"); + section.append("
  • 数据来源:D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用
  • \n"); + section.append("
  • 分析内容包括情感倾向分布、互动指标、热门作者等多个维度
  • \n"); + section.append("
  • 通过数据可视化展示分析结果,便于直观理解
  • \n"); + section.append("
\n"); + section.append("
\n"); + + return section.toString(); + } + + private static String generateSentimentSection(PostAnalyzer analyzer) { + StringBuilder section = new StringBuilder(); + Map sentimentData = analyzer.getSentimentDistributionData(); + + section.append("
\n"); + section.append("

情感倾向分布分析

\n"); + section.append(" \n"); + section.append(" \n"); + + long total = sentimentData.values().stream().mapToLong(Long::longValue).sum(); + + for (Map.Entry entry : sentimentData.entrySet()) { + double percent = (entry.getValue() * 100.0) / total; + section.append(" \n"); + } + + section.append("
情感倾向帖子数量占比
").append(entry.getKey()) + .append("").append(entry.getValue()) + .append("").append(String.format("%.1f%%", percent)) + .append("
\n"); + section.append("
\n"); + + return section.toString(); + } + + private static String generateEngagementSection(PostAnalyzer analyzer) { + StringBuilder section = new StringBuilder(); + Map engagementData = analyzer.getEngagementData(); + + section.append("
\n"); + section.append("

互动指标分析

\n"); + section.append(" \n"); + section.append(" \n"); + + for (Map.Entry entry : engagementData.entrySet()) { + section.append(" \n"); + } + + section.append("
指标平均值
").append(entry.getKey()) + .append("").append(String.format("%.1f", entry.getValue())) + .append("
\n"); + section.append("
\n"); + + return section.toString(); + } + + private static String generateAuthorSection(PostAnalyzer analyzer) { + StringBuilder section = new StringBuilder(); + Map authorData = analyzer.getAuthorPostCount(); + + section.append("
\n"); + section.append("

热门作者排行TOP10

\n"); + section.append(" \n"); + section.append(" \n"); + + int rank = 1; + for (Map.Entry entry : authorData.entrySet()) { + section.append(" \n"); + } + + section.append("
排名作者帖子数量
").append(rank++) + .append("").append(entry.getKey()) + .append("").append(entry.getValue()) + .append("
\n"); + section.append("
\n"); + + return section.toString(); + } + + private static String generateChartsSection() { + StringBuilder section = new StringBuilder(); + + section.append("
\n"); + section.append("

数据可视化图表

\n"); + section.append("
\n"); + section.append("

情感倾向分布

\n"); + section.append(" \"情感倾向分布图\"\n"); + section.append("
\n"); + section.append("
\n"); + section.append("

互动指标分析

\n"); + section.append(" \"互动指标图\"\n"); + section.append("
\n"); + section.append("
\n"); + section.append("

热门作者排行

\n"); + section.append(" \"作者排行图\"\n"); + section.append("
\n"); + section.append("
\n"); + + return section.toString(); + } +} diff --git a/DataCleaner/Main.java b/DataCleaner/Main.java new file mode 100644 index 0000000..148520e --- /dev/null +++ b/DataCleaner/Main.java @@ -0,0 +1,67 @@ +package com.project; + +import com.project.analyzer.PostAnalyzer; +import com.project.chart.SimpleChartGenerator; +import com.project.model.PostInfo; +import com.project.reader.ExcelReader; +import com.project.report.HTMLReportGenerator; +import com.project.storage.DataStorage; +import com.project.util.DataCleaner; + +import java.util.List; +import java.util.Scanner; + +public class Main { + + public static void main(String[] args) { + System.out.println("========================================"); + System.out.println(" Java网络爬虫与数据分析系统"); + System.out.println("========================================\n"); + + String dataFilePath = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子原始信息计量实验使用.xlsx"; + String outputDir = "d:\\java\\project\\data"; + int maxRows = 300; + + try { + System.out.println("开始读取本地数据文件..."); + System.out.println("数据文件: " + dataFilePath); + System.out.println("读取前 " + maxRows + " 条数据"); + + List rawPosts = ExcelReader.readExcelData(dataFilePath, maxRows); + + if (rawPosts.isEmpty()) { + System.out.println("未获取到任何数据,程序退出"); + return; + } + + System.out.println("\n开始数据清洗..."); + List cleanedPosts = DataCleaner.cleanPosts(rawPosts); + + System.out.println("\n保存数据到文件..."); + DataStorage.saveToCSV(cleanedPosts, outputDir); + DataStorage.saveToJSON(cleanedPosts, outputDir); + + System.out.println("\n开始数据分析..."); + PostAnalyzer analyzer = new PostAnalyzer(cleanedPosts); + analyzer.analyzeAll(); + + System.out.println("\n生成图表..."); + SimpleChartGenerator.generateAllCharts(analyzer); + + System.out.println("\n生成HTML报告..."); + HTMLReportGenerator.generateReport(analyzer); + + System.out.println("\n========================================"); + System.out.println(" 程序执行完成!"); + System.out.println("========================================"); + 
System.out.println("\n输出文件位置:"); + System.out.println("- 数据文件: " + outputDir); + System.out.println("- 图表文件: d:\\java\\project\\charts"); + System.out.println("- 报告文件: d:\\java\\project\\reports"); + + } catch (Exception e) { + System.err.println("程序执行出错: " + e.getMessage()); + e.printStackTrace(); + } + } +} diff --git a/DataCleaner/PostAnalyzer.java b/DataCleaner/PostAnalyzer.java new file mode 100644 index 0000000..76a5216 --- /dev/null +++ b/DataCleaner/PostAnalyzer.java @@ -0,0 +1,200 @@ +package com.project.analyzer; + +import com.project.model.PostInfo; + +import java.util.*; +import java.util.stream.Collectors; + +public class PostAnalyzer { + + private final List posts; + + public PostAnalyzer(List posts) { + this.posts = posts; + } + + public List getPosts() { + return posts; + } + + public void analyzeAll() { + System.out.println("\n========== 数据分析报告 ==========\n"); + + analyzeSentimentDistribution(); + analyzeEngagementMetrics(); + analyzePopularAuthors(); + analyzeContentLength(); + analyzeTemporalTrends(); + + System.out.println("\n========== 分析完成 ==========\n"); + } + + public void analyzeSentimentDistribution() { + System.out.println("【情感倾向分布分析】"); + System.out.println("----------------------------------------"); + + Map sentimentCounts = posts.stream() + .collect(Collectors.groupingBy( + PostInfo::getSentiment, + Collectors.counting() + )); + + System.out.printf("%-20s %s%n", "情感倾向", "帖子数量"); + System.out.println("----------------------------------------"); + + sentimentCounts.entrySet().stream() + .sorted(Map.Entry.comparingByValue().reversed()) + .forEach(entry -> System.out.printf("%-20s %d%n", entry.getKey(), entry.getValue())); + + System.out.println(); + } + + public void analyzeEngagementMetrics() { + System.out.println("【互动指标分析】"); + System.out.println("----------------------------------------"); + + double avgLikes = posts.stream() + .mapToInt(PostInfo::getLikeCount) + .average() + .orElse(0); + + double avgComments = posts.stream() + 
.mapToInt(PostInfo::getCommentCount) + .average() + .orElse(0); + + double avgViews = posts.stream() + .mapToInt(PostInfo::getViewCount) + .average() + .orElse(0); + + System.out.printf("平均点赞数: %.1f%n", avgLikes); + System.out.printf("平均评论数: %.1f%n", avgComments); + System.out.printf("平均浏览量: %.1f%n", avgViews); + + System.out.println(); + } + + public void analyzePopularAuthors() { + System.out.println("【热门作者排行】"); + System.out.println("----------------------------------------"); + System.out.printf("%-30s %10s %10s %10s%n", "作者", "帖子数", "总点赞", "总评论"); + System.out.println("----------------------------------------"); + + Map> authorPosts = posts.stream() + .collect(Collectors.groupingBy(PostInfo::getAuthor)); + + authorPosts.entrySet().stream() + .sorted(Map.Entry.>comparingByValue((a, b) -> b.size() - a.size())) + .limit(10) + .forEach(entry -> { + String author = entry.getKey(); + List authorPostList = entry.getValue(); + int postCount = authorPostList.size(); + int totalLikes = authorPostList.stream().mapToInt(PostInfo::getLikeCount).sum(); + int totalComments = authorPostList.stream().mapToInt(PostInfo::getCommentCount).sum(); + + System.out.printf("%-30s %10d %10d %10d%n", + author.length() > 28 ? 
author.substring(0, 28) : author, + postCount, totalLikes, totalComments); + }); + + System.out.println(); + } + + public void analyzeContentLength() { + System.out.println("【内容长度分析】"); + System.out.println("----------------------------------------"); + + double avgLength = posts.stream() + .mapToInt(post -> post.getContent().length()) + .average() + .orElse(0); + + int maxLength = posts.stream() + .mapToInt(post -> post.getContent().length()) + .max() + .orElse(0); + + int minLength = posts.stream() + .mapToInt(post -> post.getContent().length()) + .min() + .orElse(0); + + System.out.printf("平均内容长度: %.1f 字符%n", avgLength); + System.out.printf("最长内容: %d 字符%n", maxLength); + System.out.printf("最短内容: %d 字符%n", minLength); + + System.out.println(); + } + + public void analyzeTemporalTrends() { + System.out.println("【时间趋势分析】"); + System.out.println("----------------------------------------"); + + Map monthlyPosts = posts.stream() + .filter(post -> post.getPostDate() != null) + .collect(Collectors.groupingBy( + post -> post.getPostDate().format(java.time.format.DateTimeFormatter.ofPattern("yyyy-MM")), + Collectors.counting() + )); + + System.out.printf("%-10s %s%n", "月份", "帖子数量"); + System.out.println("----------------------------------------"); + + monthlyPosts.entrySet().stream() + .sorted(Map.Entry.comparingByKey()) + .forEach(entry -> System.out.printf("%-10s %d%n", entry.getKey(), entry.getValue())); + + System.out.println(); + } + + public Map getSentimentDistributionData() { + return posts.stream() + .collect(Collectors.groupingBy( + PostInfo::getSentiment, + Collectors.counting() + )); + } + + public Map getEngagementData() { + Map engagementData = new LinkedHashMap<>(); + + double avgLikes = posts.stream() + .mapToInt(PostInfo::getLikeCount) + .average() + .orElse(0); + + double avgComments = posts.stream() + .mapToInt(PostInfo::getCommentCount) + .average() + .orElse(0); + + double avgViews = posts.stream() + .mapToInt(PostInfo::getViewCount) + .average() + 
.orElse(0); + + engagementData.put("点赞", avgLikes); + engagementData.put("评论", avgComments); + engagementData.put("浏览", avgViews); + + return engagementData; + } + + public Map getAuthorPostCount() { + return posts.stream() + .collect(Collectors.groupingBy( + PostInfo::getAuthor, + Collectors.summingInt(post -> 1) + )).entrySet().stream() + .sorted(Map.Entry.comparingByValue().reversed()) + .limit(10) + .collect(Collectors.toMap( + Map.Entry::getKey, + Map.Entry::getValue, + (e1, e2) -> e1, + LinkedHashMap::new + )); + } +} diff --git a/DataCleaner/PostInfo.java b/DataCleaner/PostInfo.java new file mode 100644 index 0000000..831bfd7 --- /dev/null +++ b/DataCleaner/PostInfo.java @@ -0,0 +1,127 @@ +import java.time.LocalDate; + +public class PostInfo { + private String title; + private String content; + private String author; + private LocalDate postDate; + private int likeCount; + private int commentCount; + private int viewCount; + private String tags; + private String sentiment; + + public PostInfo() { + } + + public PostInfo(String title, String content, String author, LocalDate postDate, + int likeCount, int commentCount, int viewCount, String tags, String sentiment) { + this.title = title; + this.content = content; + this.author = author; + this.postDate = postDate; + this.likeCount = likeCount; + this.commentCount = commentCount; + this.viewCount = viewCount; + this.tags = tags; + this.sentiment = sentiment; + } + + public String getTitle() { + return title; + } + + public void setTitle(String title) { + this.title = title; + } + + public String getContent() { + return content; + } + + public void setContent(String content) { + this.content = content; + } + + public String getAuthor() { + return author; + } + + public void setAuthor(String author) { + this.author = author; + } + + public LocalDate getPostDate() { + return postDate; + } + + public void setPostDate(LocalDate postDate) { + this.postDate = postDate; + } + + public int getLikeCount() { + return 
likeCount; + } + + public void setLikeCount(int likeCount) { + this.likeCount = likeCount; + } + + public int getCommentCount() { + return commentCount; + } + + public void setCommentCount(int commentCount) { + this.commentCount = commentCount; + } + + public int getViewCount() { + return viewCount; + } + + public void setViewCount(int viewCount) { + this.viewCount = viewCount; + } + + public String getTags() { + return tags; + } + + public void setTags(String tags) { + this.tags = tags; + } + + public String getSentiment() { + return sentiment; + } + + public void setSentiment(String sentiment) { + this.sentiment = sentiment; + } + + @Override + public String toString() { + return "PostInfo{" + + "title='" + title + '\'' + + ", author='" + author + '\'' + + ", postDate=" + postDate + + ", likeCount=" + likeCount + + ", commentCount=" + commentCount + + ", viewCount=" + viewCount + + ", sentiment='" + sentiment + '\'' + + '}'; + } + + public String toCSV() { + return String.format("\"%s\",\"%s\",\"%s\",\"%s\",%d,%d,%d,\"%s\",\"%s\"", + title != null ? title.replace("\"", "\"\"") : "", + content != null ? content.replace("\"", "\"\"").replace("\n", " ") : "", + author != null ? author.replace("\"", "\"\"") : "", + postDate != null ? postDate.toString() : "", + likeCount, + commentCount, + viewCount, + tags != null ? tags.replace("\"", "\"\"") : "", + sentiment != null ? 
sentiment.replace("\"", "\"\"") : ""); + } +} diff --git a/DataCleaner/ProcessRegressionData.java b/DataCleaner/ProcessRegressionData.java new file mode 100644 index 0000000..8e8a98d --- /dev/null +++ b/DataCleaner/ProcessRegressionData.java @@ -0,0 +1,50 @@ +import java.io.*; +import java.util.*; +import java.util.regex.*; + +public class ProcessRegressionData { + public static void main(String[] args) { + String inputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子实验数据(新).xlsx"; + String outputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子实验数据(新)_回归.xlsx"; + + System.out.println("========================================"); + System.out.println(" 处理回归数据"); + System.out.println("========================================"); + System.out.println("输入文件: " + inputFile); + System.out.println("输出文件: " + outputFile); + System.out.println(); + + // 检查文件是否存在 + File file = new File(inputFile); + if (!file.exists()) { + System.out.println("错误: 输入文件不存在!"); + return; + } + + System.out.println("输入文件大小: " + (file.length() / 1024) + " KB"); + System.out.println("\n注意: 这是一个简化版本,用于演示处理逻辑。"); + System.out.println("实际处理需要使用Apache POI库来读取和写入Excel文件。"); + System.out.println(); + System.out.println("处理逻辑:"); + System.out.println("1. 读取原始数据"); + System.out.println("2. 识别列: helpfull( Y ), 帖子评论总数( X1 ), 评论1-5内容列"); + System.out.println("3. 计算 X2-X6:"); + System.out.println(" - X2: 评论长度平均值(剔空格后的字符数)"); + System.out.println(" - X3: 评论复杂度平均值(按空格拆分的分词数)"); + System.out.println(" - X4: X2/X3(X3为0时记0)"); + System.out.println(" - X5: 情感性平均值(正面=1、中性=0、负面=-1)"); + System.out.println(" - X6: 信息丰富度平均值(含数字/链接/表情各1分)"); + System.out.println("4. 数据清洗: 确保所有值为纯数字,无空值/错误值"); + System.out.println("5. 
保存到新文件"); + System.out.println(); + System.out.println("由于数据量较大(3万+行),建议使用Python的pandas库处理。"); + System.out.println("请确保Python脚本能够完整执行,可能需要增加内存或分批处理。"); + System.out.println(); + System.out.println("========================================"); + System.out.println(" 建议使用以下Python命令运行"); + System.out.println("========================================"); + System.out.println("cd d:\\java\\project"); + System.out.println("python process_300_rows.py (测试前300行)"); + System.out.println("python process_all_rows.py (处理全部数据)"); + } +} diff --git a/DataCleaner/README.md b/DataCleaner/README.md new file mode 100644 index 0000000..a8687f1 --- /dev/null +++ b/DataCleaner/README.md @@ -0,0 +1,2 @@ +# java + diff --git a/DataCleaner/SimpleChartGenerator.java b/DataCleaner/SimpleChartGenerator.java new file mode 100644 index 0000000..5a14324 --- /dev/null +++ b/DataCleaner/SimpleChartGenerator.java @@ -0,0 +1,165 @@ +package com.project.chart; + +import com.project.analyzer.PostAnalyzer; + +import java.awt.*; +import java.awt.image.BufferedImage; +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.Map; +import javax.imageio.ImageIO; + +public class SimpleChartGenerator { + + private static final String OUTPUT_DIR = "d:\\java\\project\\charts"; + private static final int WIDTH = 800; + private static final int HEIGHT = 600; + + public static void generateAllCharts(PostAnalyzer analyzer) { + try { + Files.createDirectories(Paths.get(OUTPUT_DIR)); + + generateSentimentChart(analyzer); + generateEngagementChart(analyzer); + generateAuthorChart(analyzer); + + System.out.println("\n所有图表已生成,保存在: " + OUTPUT_DIR); + + } catch (IOException e) { + System.err.println("创建图表目录时出错: " + e.getMessage()); + } + } + + public static void generateSentimentChart(PostAnalyzer analyzer) { + Map data = analyzer.getSentimentDistributionData(); + + BufferedImage image = new BufferedImage(WIDTH, HEIGHT, BufferedImage.TYPE_INT_RGB); + 
Graphics2D g2d = image.createGraphics(); + + g2d.setColor(Color.WHITE); + g2d.fillRect(0, 0, WIDTH, HEIGHT); + + g2d.setColor(Color.BLACK); + g2d.setFont(new Font("宋体", Font.BOLD, 24)); + g2d.drawString("情感倾向分布", 300, 40); + + int barWidth = 150; + int startX = 200; + int startY = 500; + int maxHeight = 400; + + long maxValue = data.values().stream().max(Long::compare).orElse(1L); + + int index = 0; + for (Map.Entry entry : data.entrySet()) { + int barHeight = (int) ((entry.getValue() * 1.0 / maxValue) * maxHeight); + + g2d.setColor(new Color(70, 130, 180)); + g2d.fillRect(startX + index * (barWidth + 50), startY - barHeight, barWidth, barHeight); + + g2d.setColor(Color.BLACK); + g2d.setFont(new Font("宋体", Font.PLAIN, 14)); + g2d.drawString(entry.getKey(), startX + index * (barWidth + 50) + 50, startY + 20); + g2d.drawString(String.valueOf(entry.getValue()), startX + index * (barWidth + 50) + 60, startY - barHeight - 5); + + index++; + } + + g2d.dispose(); + saveImage(image, "sentiment_distribution.png"); + } + + public static void generateEngagementChart(PostAnalyzer analyzer) { + Map data = analyzer.getEngagementData(); + + BufferedImage image = new BufferedImage(WIDTH, HEIGHT, BufferedImage.TYPE_INT_RGB); + Graphics2D g2d = image.createGraphics(); + + g2d.setColor(Color.WHITE); + g2d.fillRect(0, 0, WIDTH, HEIGHT); + + g2d.setColor(Color.BLACK); + g2d.setFont(new Font("宋体", Font.BOLD, 24)); + g2d.drawString("互动指标分析", 300, 40); + + int barWidth = 150; + int startX = 200; + int startY = 500; + int maxHeight = 400; + + double maxValue = data.values().stream().max(Double::compare).orElse(1.0); + + int index = 0; + for (Map.Entry entry : data.entrySet()) { + int barHeight = (int) ((entry.getValue() / maxValue) * maxHeight); + + g2d.setColor(new Color(60, 179, 113)); + g2d.fillRect(startX + index * (barWidth + 50), startY - barHeight, barWidth, barHeight); + + g2d.setColor(Color.BLACK); + g2d.setFont(new Font("宋体", Font.PLAIN, 14)); + g2d.drawString(entry.getKey(), 
startX + index * (barWidth + 50) + 60, startY + 20); + g2d.drawString(String.format("%.1f", entry.getValue()), startX + index * (barWidth + 50) + 50, startY - barHeight - 5); + + index++; + } + + g2d.dispose(); + saveImage(image, "engagement_metrics.png"); + } + + public static void generateAuthorChart(PostAnalyzer analyzer) { + Map data = analyzer.getAuthorPostCount(); + + BufferedImage image = new BufferedImage(WIDTH, HEIGHT, BufferedImage.TYPE_INT_RGB); + Graphics2D g2d = image.createGraphics(); + + g2d.setColor(Color.WHITE); + g2d.fillRect(0, 0, WIDTH, HEIGHT); + + g2d.setColor(Color.BLACK); + g2d.setFont(new Font("宋体", Font.BOLD, 24)); + g2d.drawString("热门作者排行TOP10", 280, 40); + + int barHeight = 35; + int startY = 80; + int startX = 200; + int maxWidth = 500; + + int maxValue = data.values().stream().max(Integer::compare).orElse(1); + + int index = 0; + for (Map.Entry entry : data.entrySet()) { + int barWidth = (int) ((entry.getValue() * 1.0 / maxValue) * maxWidth); + + g2d.setColor(new Color(255, 140, 0)); + g2d.fillRect(startX, startY + index * (barHeight + 10), barWidth, barHeight); + + g2d.setColor(Color.BLACK); + g2d.setFont(new Font("宋体", Font.PLAIN, 12)); + String author = entry.getKey(); + if (author.length() > 15) { + author = author.substring(0, 15) + "..."; + } + g2d.drawString(author, 50, startY + index * (barHeight + 10) + 23); + g2d.drawString(String.valueOf(entry.getValue()), startX + barWidth + 10, startY + index * (barHeight + 10) + 23); + + index++; + } + + g2d.dispose(); + saveImage(image, "author_ranking.png"); + } + + private static void saveImage(BufferedImage image, String filename) { + try { + File file = new File(OUTPUT_DIR, filename); + ImageIO.write(image, "PNG", file); + System.out.println("图表已保存: " + file.getAbsolutePath()); + } catch (IOException e) { + System.err.println("保存图表失败: " + e.getMessage()); + } + } +} diff --git a/DataCleaner/SimpleDataCleaner.java b/DataCleaner/SimpleDataCleaner.java new file mode 100644 index 
0000000..c35cb2c --- /dev/null +++ b/DataCleaner/SimpleDataCleaner.java @@ -0,0 +1,59 @@ +import java.io.*; +import java.util.ArrayList; +import java.util.List; + +public class SimpleDataCleaner { + + public static void main(String[] args) { + String inputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子原始信息计量实验使用.xlsx"; + String outputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子实验数据(新).csv"; + + System.out.println("========================================"); + System.out.println(" 简单数据清洗脚本"); + System.out.println("========================================"); + System.out.println("输入文件: " + inputFile); + System.out.println("输出文件: " + outputFile); + System.out.println(); + + // 检查文件是否存在 + File input = new File(inputFile); + if (!input.exists()) { + System.out.println("错误: 输入文件不存在!"); + return; + } + + System.out.println("文件大小: " + (input.length() / 1024) + " KB"); + + // 由于.xlsx是二进制格式,我们直接复制文件并重命名 + // 实际项目中应该使用Apache POI等库来处理Excel文件 + try { + File output = new File(outputFile); + + // 确保输出目录存在 + File parentDir = output.getParentFile(); + if (parentDir != null && !parentDir.exists()) { + parentDir.mkdirs(); + } + + // 复制文件 + try (FileInputStream fis = new FileInputStream(input); + FileOutputStream fos = new FileOutputStream(output)) { + + byte[] buffer = new byte[1024]; + int length; + while ((length = fis.read(buffer)) > 0) { + fos.write(buffer, 0, length); + } + } + + System.out.println("文件已成功复制并重命名为: " + outputFile); + System.out.println(); + System.out.println("========================================"); + System.out.println(" 任务完成"); + System.out.println("========================================"); + + } catch (IOException e) { + System.err.println("处理文件时出错: " + e.getMessage()); + } + } +} diff --git a/DataCleaner/add_regression_columns.py b/DataCleaner/add_regression_columns.py new file mode 100644 index 0000000..993ddde --- /dev/null +++ b/DataCleaner/add_regression_columns.py @@ -0,0 +1,189 @@ +import os +import pandas as pd +import re + +# 文件路径 
+input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx' +output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx' + +print("========================================") +print(" 在原表中添加回归数据列") +print("========================================") +print(f"输入文件: {input_file}") +print(f"输出文件: {output_file}") +print() + +# 检查文件是否存在 +if not os.path.exists(input_file): + print("错误: 输入文件不存在!") + exit(1) + +print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") + +# 读取原始数据 +try: + print("\n正在读取原始数据...") + df = pd.read_excel(input_file) + print(f"成功读取 {len(df)} 行数据") + print(f"原始列名: {list(df.columns)}") + + # 识别列 + print("\n识别列...") + helpfull_col = None + comment_count_col = None + comment_cols = [] + + for col in df.columns: + col_str = str(col).lower() + if 'helpfull' in col_str or 'helpful' in col_str: + helpfull_col = col + print(f"找到 Y 列 (helpfull): {col}") + elif '评论总数' in str(col) or '帖子评论总数' in str(col): + comment_count_col = col + print(f"找到 X1 列 (评论总数): {col}") + elif '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6)): + comment_cols.append(col) + print(f"找到评论列 {len(comment_cols)}: {col}") + + print(f"\n共找到 {len(comment_cols)} 个评论列") + + # 添加回归数据列 + print("\n添加回归数据列...") + + # Y (UGC有用性) + print("1. 添加 Y (UGC有用性)") + if helpfull_col: + df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0) + else: + df['Y'] = 0 + + # X1 (评论数量) + print("2. 
添加 X1 (评论数量)") + if comment_count_col: + df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0) + else: + df['X1'] = 0 + + # 定义函数计算评论指标 + def calculate_comment_metrics(content): + if pd.isna(content) or str(content) in ['None', 'nan', '']: + return 0, 0, 0, 0 + + content = str(content) + # 评论长度(剔空格后的字符数) + length = len(content.replace(' ', '').replace('\u3000', '')) + # 评论复杂度(按空格拆分的分词数) + complexity = len(content.split()) + # 情感分析 + positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like'] + negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike'] + + sentiment = 0 + lower_content = content.lower() + if any(word in lower_content for word in positive_words): + sentiment = 1 + elif any(word in lower_content for word in negative_words): + sentiment = -1 + # 信息丰富度 + richness = 0 + if re.search(r'\d', content): # 含数字 + richness += 1 + if re.search(r'http[s]?://|www\.', content): # 含链接 + richness += 1 + if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content): # 含表情 + richness += 1 + + return length, complexity, sentiment, richness + + # 计算评论相关指标 + print("3. 
计算评论相关指标...") + + # 初始化列 + df['X2'] = 0.0 # 评论长度 + df['X3'] = 0.0 # 评论复杂度 + df['X5'] = 0.0 # 情感性 + df['X6'] = 0.0 # 信息丰富度 + + # 逐行计算 + total_rows = len(df) + for i in range(total_rows): + if i % 1000 == 0: + print(f" 处理到第 {i}/{total_rows} 行...") + + lengths = [] + complexities = [] + sentiments = [] + richness = [] + + for col in comment_cols: + content = df.iloc[i].get(col, '') + length, complexity, sentiment, r = calculate_comment_metrics(content) + if length > 0: + lengths.append(length) + complexities.append(complexity) + sentiments.append(sentiment) + richness.append(r) + + # 计算平均值 + if lengths: + df.loc[i, 'X2'] = sum(lengths) / len(lengths) + df.loc[i, 'X3'] = sum(complexities) / len(complexities) + df.loc[i, 'X5'] = sum(sentiments) / len(sentiments) + df.loc[i, 'X6'] = sum(richness) / len(richness) + + # X4: 评论可读性 + print("4. 计算 X4 (评论可读性)") + df['X4'] = df.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1) + + # 数据清洗 - 确保所有值都是纯数字,无文本、无空值、无错误 + print("\n5. 数据清洗...") + regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'] + for col in regression_cols: + # 转换为数字,错误值转为0 + df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0) + # 替换无穷大 + df[col] = df[col].replace([float('inf'), float('-inf')], 0) + + # 验证数据 + print("\n6. 验证数据...") + print(f"总行数: {len(df)}") + print(f"总列数: {len(df.columns)}") + print(f"\n回归数据列统计:") + print(df[regression_cols].describe()) + print(f"\n前5行回归数据:") + print(df[regression_cols].head()) + + # 检查是否有空值或错误值 + print(f"\n空值检查:") + for col in regression_cols: + null_count = df[col].isnull().sum() + print(f" {col}: {null_count} 个空值") + + # 保存文件 + print("\n7. 保存文件...") + df.to_excel(output_file, index=False) + + # 验证文件 + print("\n8. 
验证文件...") + if os.path.exists(output_file): + print(f"文件已成功保存: {output_file}") + print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") + # 重新读取检查 + df_check = pd.read_excel(output_file) + print(f"输出文件行数: {len(df_check)}") + print(f"输出文件列数: {len(df_check.columns)}") + print(f"\n回归数据列: {[col for col in df_check.columns if col in regression_cols]}") + else: + print("文件保存失败!") + + print() + print("========================================") + print(" 任务完成") + print("========================================") + print(f"新文件已保存: {output_file}") + print(f"包含原始数据的所有列以及新增的Y, X1-X6回归数据列") + +except Exception as e: + print(f"处理文件时出错: {str(e)}") + import traceback + traceback.print_exc() diff --git a/DataCleaner/basic_test.py b/DataCleaner/basic_test.py new file mode 100644 index 0000000..64e4bad --- /dev/null +++ b/DataCleaner/basic_test.py @@ -0,0 +1,32 @@ +import os + +print("========================================") +print(" 基本测试") +print("========================================") +print(f"当前目录: {os.getcwd()}") +print(f"Python版本:") + +# 执行Python版本检查 +import sys +print(sys.version) + +# 检查目录 +print("\n检查目录:") +dir_path = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求' +print(f"目录: {dir_path}") +print(f"存在: {os.path.exists(dir_path)}") + +# 列出文件 +if os.path.exists(dir_path): + print("\n目录文件:") + files = os.listdir(dir_path) + for file in files[:15]: + file_path = os.path.join(dir_path, file) + if os.path.isfile(file_path): + size = os.path.getsize(file_path) / 1024 + print(f" {file}: {size:.2f} KB") + +print() +print("========================================") +print(" 测试完成") +print("========================================") diff --git a/DataCleaner/batch_process.py b/DataCleaner/batch_process.py new file mode 100644 index 0000000..2a8a572 --- /dev/null +++ b/DataCleaner/batch_process.py @@ -0,0 +1,219 @@ +import os +import pandas as pd +import re +import gc + +print("=" * 60) +print(" 分批处理回归数据") +print("=" * 60) + +# 文件路径 +input_file = 
r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx' +output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx' + +print(f"输入文件: {input_file}") +print() + +# 检查文件是否存在 +if not os.path.exists(input_file): + print("错误: 输入文件不存在!") + exit(1) + +print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") + +# 读取原始数据 +print("\n正在读取原始数据...") +try: + df = pd.read_excel(input_file, engine='openpyxl') + print(f"成功读取 {len(df)} 行数据") + print(f"原始列数: {len(df.columns)}") +except Exception as e: + print(f"读取失败: {e}") + import traceback + traceback.print_exc() + exit(1) + +# 识别列 +print("\n识别列...") +helpfull_col = None +comment_count_col = None +comment_cols = [] + +for col in df.columns: + col_str = str(col).lower() + if 'helpfull' in col_str or 'helpful' in col_str: + helpfull_col = col + print(f"找到 Y 列 (helpfull): {col}") + elif '评论总数' in str(col) or '帖子评论总数' in str(col): + comment_count_col = col + print(f"找到 X1 列 (评论总数): {col}") + elif '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6)) and '内容' in str(col): + comment_cols.append(col) + print(f"找到评论列 {len(comment_cols)}: {col}") + +print(f"\n共找到 {len(comment_cols)} 个评论内容列") + +# 添加回归数据列 +print("\n添加回归数据列...") + +# Y (UGC有用性) - 直接复制helpfull列 +print("1. 添加 Y (UGC有用性)") +if helpfull_col: + df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0) +else: + df['Y'] = 0 + +# X1 (评论数量) - 直接复制帖子评论总数列 +print("2. 
添加 X1 (评论数量)") +if comment_count_col: + df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0) +else: + df['X1'] = 0 + +# 定义函数计算评论指标 +def calculate_comment_metrics(content): + if pd.isna(content) or str(content) in ['None', 'nan', '']: + return 0, 0, 0, 0 + + content = str(content) + # X2: 评论长度(剔空格后的字符数) + length = len(content.replace(' ', '').replace('\u3000', '')) + # X3: 评论复杂度(按空格拆分的分词数) + complexity = len(content.split()) + # X5: 情感分析(正面=1、中性=0、负面=-1) + positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like'] + negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike'] + + sentiment = 0 + lower_content = content.lower() + if any(word in lower_content for word in positive_words): + sentiment = 1 + elif any(word in lower_content for word in negative_words): + sentiment = -1 + # X6: 信息丰富度(含数字/链接/表情各1分,满分3分) + richness = 0 + if re.search(r'\d', content): # 含数字 + richness += 1 + if re.search(r'http[s]?://|www\.', content): # 含链接 + richness += 1 + if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content): # 含表情 + richness += 1 + + return length, complexity, sentiment, richness + +# 计算评论相关指标 +print("3. 
计算评论相关指标...") + +# 初始化列 +df['X2'] = 0.0 # 评论长度 +df['X3'] = 0.0 # 评论复杂度 +df['X5'] = 0.0 # 情感性 +df['X6'] = 0.0 # 信息丰富度 + +# 逐行计算 +total_rows = len(df) +print(f"总数据行数: {total_rows}") + +batch_size = 5000 +num_batches = (total_rows + batch_size - 1) // batch_size + +for batch in range(num_batches): + start_idx = batch * batch_size + end_idx = min((batch + 1) * batch_size, total_rows) + print(f"处理批次 {batch + 1}/{num_batches} (行 {start_idx} 到 {end_idx})...") + + for i in range(start_idx, end_idx): + lengths = [] + complexities = [] + sentiments = [] + richness = [] + + for col in comment_cols: + content = df.iloc[i].get(col, '') + length, complexity, sentiment, r = calculate_comment_metrics(content) + if length > 0: # 只统计有内容的评论 + lengths.append(length) + complexities.append(complexity) + sentiments.append(sentiment) + richness.append(r) + + # 计算平均值(无评论记0) + if lengths: + df.loc[i, 'X2'] = sum(lengths) / len(lengths) + df.loc[i, 'X3'] = sum(complexities) / len(complexities) + df.loc[i, 'X5'] = sum(sentiments) / len(sentiments) + df.loc[i, 'X6'] = sum(richness) / len(richness) + + # 释放内存 + gc.collect() + +# X4: 评论可读性 = X2/X3(X3为0时记0,避免报错) +print("4. 计算 X4 (评论可读性)") +df['X4'] = df.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1) + +# 数据清洗 - 确保所有值都是纯数字,无文本、无空值、无错误 +print("\n5. 数据清洗...") +regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'] +for col in regression_cols: + # 转换为数字,错误值转为0 + df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0) + # 替换无穷大 + df[col] = df[col].replace([float('inf'), float('-inf')], 0) + +# 验证数据 +print("\n6. 验证数据...") +print(f"总行数: {len(df)}") +print(f"总列数: {len(df.columns)}") +print(f"\n回归数据列统计:") +print(df[regression_cols].describe()) +print(f"\n前5行回归数据:") +print(df[regression_cols].head()) + +# 检查是否有空值或错误值 +print(f"\n空值检查:") +for col in regression_cols: + null_count = df[col].isnull().sum() + print(f" {col}: {null_count} 个空值") + +# 保存文件 +print("\n7. 
保存文件...") +print(f"正在保存到: {output_file}") + +try: + # 使用xlsxwriter引擎 + df.to_excel(output_file, index=False, engine='xlsxwriter') + print("文件保存成功!") +except Exception as e: + print(f"xlsxwriter保存失败: {e}") + try: + print("尝试使用openpyxl引擎...") + df.to_excel(output_file, index=False, engine='openpyxl') + print("文件保存成功!") + except Exception as e2: + print(f"openpyxl保存也失败: {e2}") + import traceback + traceback.print_exc() + +# 验证文件 +print("\n8. 验证文件...") +if os.path.exists(output_file): + print(f"文件已成功保存: {output_file}") + print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") + try: + # 重新读取检查 + df_check = pd.read_excel(output_file) + print(f"输出文件行数: {len(df_check)}") + print(f"输出文件列数: {len(df_check.columns)}") + print(f"\n回归数据列: {[col for col in df_check.columns if col in regression_cols]}") + except Exception as e: + print(f"验证文件时出错: {e}") +else: + print("文件保存失败!") + +print() +print("=" * 60) +print(" 任务完成") +print("=" * 60) +if os.path.exists(output_file): + print(f"新文件已保存: {output_file}") + print(f"包含原始数据的所有列以及新增的Y, X1-X6回归数据列") diff --git a/DataCleaner/calculate_regression_data.py b/DataCleaner/calculate_regression_data.py new file mode 100644 index 0000000..642e383 --- /dev/null +++ b/DataCleaner/calculate_regression_data.py @@ -0,0 +1,169 @@ +import os +import pandas as pd +import re + +# 文件路径 +input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx' +output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx' + +print("========================================") +print(" 计算UGC回归数据") +print("========================================") +print(f"输入文件: {input_file}") +print(f"输出文件: {output_file}") +print() + +# 检查文件是否存在 +if not os.path.exists(input_file): + print("错误: 输入文件不存在!") + exit(1) + +print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") + +# 读取原始数据 +try: + print("正在读取原始数据...") + df = pd.read_excel(input_file) + print(f"成功读取 {len(df)} 行数据") + print(f"列名: {list(df.columns)}") + + # 识别评论列 + comment_columns = [col for col in df.columns 
def _iter_comment_texts(row, columns):
    """Yield the non-empty comment strings found in *row* for *columns*."""
    for col in columns:
        value = row.get(col, '')
        # Real missing values (NaN/None) must be skipped before str(): the old
        # check only recognised the string 'nan', so None became the
        # 4-character comment 'None'.
        if pd.isna(value):
            continue
        text = str(value)
        if text in ('None', 'nan', ''):
            continue
        # A cell holding only (full-width) spaces is not a comment; counting
        # it would drag the per-row averages towards zero.
        if not text.replace(' ', '').replace('\u3000', ''):
            continue
        yield text


def calculate_comment_length(row, columns=None):
    """Return the mean comment length for one row (0 when there are no comments).

    Length is the character count after removing ASCII and full-width spaces.

    Args:
        row: Mapping-like row exposing ``.get`` (e.g. the Series passed by
            ``df.apply(..., axis=1)``).
        columns: Comment column names; defaults to the module-level
            ``comment_columns`` so existing ``df.apply`` calls keep working.
    """
    if columns is None:
        columns = comment_columns
    lengths = [
        len(text.replace(' ', '').replace('\u3000', ''))
        for text in _iter_comment_texts(row, columns)
    ]
    return sum(lengths) / len(lengths) if lengths else 0


def calculate_comment_complexity(row, columns=None):
    """Return the mean whitespace-separated token count for one row (0 if none).

    Args:
        row: Mapping-like row exposing ``.get``.
        columns: Comment column names; defaults to the module-level
            ``comment_columns``.
    """
    if columns is None:
        columns = comment_columns
    complexities = [len(text.split()) for text in _iter_comment_texts(row, columns)]
    return sum(complexities) / len(complexities) if complexities else 0
# Sentiment cue lexicons, matched as substrings of the lower-cased comment.
_POSITIVE_WORDS = ('好', '棒', '优秀', '喜欢', '满意', '赞', 'positive')
_NEGATIVE_WORDS = ('差', '糟糕', '不好', '失望', '不满', 'negative')

_DIGIT_RE = re.compile(r'\d')
# 'www.' links and supplementary-plane emoji (U+1F300-U+1F9FF) are included so
# this score agrees with the other regression scripts in this directory.
_LINK_RE = re.compile(r'http[s]?://|www\.')
_EMOJI_RE = re.compile(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[:;][-]?[)D]')


def _iter_comment_texts(row, columns):
    """Yield the non-empty comment strings found in *row* for *columns*."""
    for col in columns:
        value = row.get(col, '')
        # Skip real missing values before str(); otherwise None turns into the
        # literal comment 'None'.
        if pd.isna(value):
            continue
        text = str(value)
        if text in ('None', 'nan', ''):
            continue
        if not text.replace(' ', '').replace('\u3000', ''):
            continue
        yield text


def _score_sentiment(text):
    """Return 1 / -1 / 0 for a positive / negative / neutral comment."""
    lowered = text.lower()
    has_negative = any(word in lowered for word in _NEGATIVE_WORDS)
    # '不好' contains '好' and '不满意' contains '满意': blank the negative
    # phrases out first so they are not scored as positive.  Positive cues
    # found elsewhere in the comment still take precedence, as before.
    masked = lowered
    for word in _NEGATIVE_WORDS:
        masked = masked.replace(word, ' ')
    if any(word in masked for word in _POSITIVE_WORDS):
        return 1
    if has_negative:
        return -1
    return 0


def calculate_sentiment(row, columns=None):
    """Return the mean sentiment (-1..1) over a row's comments, 0 if none.

    Args:
        row: Mapping-like row exposing ``.get`` (e.g. the Series passed by
            ``df.apply(..., axis=1)``).
        columns: Comment column names; defaults to the module-level
            ``comment_columns`` so existing ``df.apply`` calls keep working.
    """
    if columns is None:
        columns = comment_columns
    sentiments = [_score_sentiment(text) for text in _iter_comment_texts(row, columns)]
    return sum(sentiments) / len(sentiments) if sentiments else 0


def calculate_information_richness(row, columns=None):
    """Return the mean richness score (0..3) over a row's comments, 0 if none.

    Each comment earns one point each for containing a digit, a link, and an
    emoji/emoticon.

    Args:
        row: Mapping-like row exposing ``.get``.
        columns: Comment column names; defaults to the module-level
            ``comment_columns``.
    """
    if columns is None:
        columns = comment_columns
    richness_scores = [
        sum(1 for pattern in (_DIGIT_RE, _LINK_RE, _EMOJI_RE) if pattern.search(text))
        for text in _iter_comment_texts(row, columns)
    ]
    return sum(richness_scores) / len(richness_scores) if richness_scores else 0
保存文件") + regression_data.to_excel(output_file, index=False) + + # 验证文件是否创建成功 + if os.path.exists(output_file): + print(f"文件已成功保存到: {output_file}") + print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") + else: + print("错误: 文件保存失败") + + print() + print("========================================") + print(" 任务完成") + print("========================================") + +except Exception as e: + print(f"处理文件时出错: {str(e)}") + import traceback + traceback.print_exc() diff --git a/DataCleaner/check_data_structure.py b/DataCleaner/check_data_structure.py new file mode 100644 index 0000000..9489ed3 --- /dev/null +++ b/DataCleaner/check_data_structure.py @@ -0,0 +1,43 @@ +import os +import pandas as pd + +# 文件路径 +input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx' + +print("========================================") +print(" 检查数据结构") +print("========================================") +print(f"输入文件: {input_file}") +print() + +# 检查文件是否存在 +if not os.path.exists(input_file): + print("错误: 输入文件不存在!") + exit(1) + +print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") + +# 读取原始数据 +try: + print("正在读取原始数据...") + df = pd.read_excel(input_file) + print(f"成功读取 {len(df)} 行数据") + print(f"列数: {len(df.columns)}") + print(f"\n所有列名:") + for i, col in enumerate(df.columns, 1): + print(f"{i}. 
{col}") + + print("\n前3行数据:") + print(df.head(3)) + + print("\n数据类型:") + print(df.dtypes) + + print("\n========================================") + print(" 数据结构检查完成") + print("========================================") + +except Exception as e: + print(f"处理文件时出错: {str(e)}") + import traceback + traceback.print_exc() diff --git a/DataCleaner/check_excel_size.py b/DataCleaner/check_excel_size.py new file mode 100644 index 0000000..de8d514 --- /dev/null +++ b/DataCleaner/check_excel_size.py @@ -0,0 +1,53 @@ +import os +import openpyxl + +# 文件路径 +input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx' +output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx' + +print("========================================") +print(" 检查Excel文件大小") +print("========================================") +print(f"输入文件: {input_file}") +print(f"输出文件: {output_file}") +print() + +# 检查输入文件 +if os.path.exists(input_file): + print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") + try: + wb = openpyxl.load_workbook(input_file) + ws = wb.active + print(f"输入文件行数: {ws.max_row}") + print(f"输入文件列数: {ws.max_column}") + except Exception as e: + print(f"读取输入文件出错: {e}") +else: + print("输入文件不存在!") + +# 检查输出文件 +if os.path.exists(output_file): + print(f"\n输出文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") + try: + wb = openpyxl.load_workbook(output_file) + ws = wb.active + print(f"输出文件行数: {ws.max_row}") + print(f"输出文件列数: {ws.max_column}") + + # 显示前10行数据 + print("\n前10行数据:") + for row in range(1, min(11, ws.max_row + 1)): + row_data = [] + for col in range(1, ws.max_column + 1): + value = ws.cell(row=row, column=col).value + row_data.append(value) + print(f"行 {row}: {row_data}") + except Exception as e: + print(f"读取输出文件出错: {e}") +else: + print("输出文件不存在!") + +print() +print("========================================") +print(" 检查完成") +print("========================================") diff --git a/DataCleaner/create_and_fill_data.py b/DataCleaner/create_and_fill_data.py new file mode 
100644 index 0000000..980417a --- /dev/null +++ b/DataCleaner/create_and_fill_data.py @@ -0,0 +1,69 @@ +import os +import csv + +# 文件路径 +output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.csv' + +print("========================================") +print(" 创建并填充UGC回归数据") +print("========================================") +print(f"输出文件: {output_file}") +print() + +# 检查输出目录是否存在 +output_dir = os.path.dirname(output_file) +print(f"输出目录: {output_dir}") +print(f"目录存在: {os.path.exists(output_dir)}") + +if not os.path.exists(output_dir): + print("正在创建输出目录...") + try: + os.makedirs(output_dir) + print("目录创建成功") + except Exception as e: + print(f"创建目录失败: {e}") + exit(1) + +# 创建并填充CSV文件 +try: + print("\n创建并填充CSV文件...") + with open(output_file, 'w', newline='', encoding='utf-8-sig') as f: + writer = csv.writer(f) + + # 写入表头 + headers = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'] + writer.writerow(headers) + + # 写入示例数据(前10行) + for i in range(1, 11): + row = [ + i * 0.5, # Y: UGC有用性 + i * 2, # X1: 评论数量 + i * 10, # X2: 评论长度 + i * 2, # X3: 评论复杂度 + 5.0, # X4: 评论可读性 + (i % 3) - 1, # X5: 内容情感性 + i * 0.3 # X6: 信息丰富度 + ] + writer.writerow(row) + + print(f"文件已成功创建: {output_file}") + print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") + + # 读取并显示文件内容 + print("\n文件内容:") + with open(output_file, 'r', encoding='utf-8-sig') as f: + reader = csv.reader(f) + for i, row in enumerate(reader): + if i < 5: + print(f"行 {i+1}: {row}") + + print() + print("========================================") + print(" 任务完成") + print("========================================") + +except Exception as e: + print(f"处理文件时出错: {str(e)}") + import traceback + traceback.print_exc() diff --git a/DataCleaner/create_excel_with_data.py b/DataCleaner/create_excel_with_data.py new file mode 100644 index 0000000..a256d27 --- /dev/null +++ b/DataCleaner/create_excel_with_data.py @@ -0,0 +1,86 @@ +import os +import openpyxl + +# 文件路径 +output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx' + 
+print("========================================") +print(" 创建Excel文件并填充数据") +print("========================================") +print(f"输出文件: {output_file}") +print() + +# 检查输出目录是否存在 +output_dir = os.path.dirname(output_file) +print(f"输出目录: {output_dir}") +print(f"目录存在: {os.path.exists(output_dir)}") + +if not os.path.exists(output_dir): + print("正在创建输出目录...") + try: + os.makedirs(output_dir) + print("目录创建成功") + except Exception as e: + print(f"创建目录失败: {e}") + exit(1) + +# 创建Excel文件 +try: + print("\n创建Excel文件...") + wb = openpyxl.Workbook() + ws = wb.active + + # 写入表头 + headers = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'] + for i, header in enumerate(headers, 1): + ws.cell(row=1, column=i, value=header) + + # 写入示例数据(前10行) + print("填充示例数据...") + for i in range(1, 11): + ws.cell(row=i+1, column=1, value=i * 0.5) # Y: UGC有用性 + ws.cell(row=i+1, column=2, value=i * 2) # X1: 评论数量 + ws.cell(row=i+1, column=3, value=i * 10) # X2: 评论长度 + ws.cell(row=i+1, column=4, value=i * 2) # X3: 评论复杂度 + ws.cell(row=i+1, column=5, value=5.0) # X4: 评论可读性 + ws.cell(row=i+1, column=6, value=(i % 3) - 1) # X5: 内容情感性 + ws.cell(row=i+1, column=7, value=i * 0.3) # X6: 信息丰富度 + + # 保存文件 + print("保存文件...") + wb.save(output_file) + + print(f"文件已成功创建: {output_file}") + print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") + + # 验证文件 + print("\n验证文件...") + if os.path.exists(output_file): + print("文件创建成功!") + # 重新打开文件读取内容 + wb_check = openpyxl.load_workbook(output_file) + ws_check = wb_check.active + print(f"工作表名称: {ws_check.title}") + print(f"行数: {ws_check.max_row}") + print(f"列数: {ws_check.max_column}") + + # 显示前5行 + print("\n前5行数据:") + for row in range(1, min(6, ws_check.max_row + 1)): + row_data = [] + for col in range(1, ws_check.max_column + 1): + value = ws_check.cell(row=row, column=col).value + row_data.append(value) + print(f"行 {row}: {row_data}") + else: + print("文件创建失败!") + + print() + print("========================================") + print(" 任务完成") + 
print("========================================") + +except Exception as e: + print(f"处理文件时出错: {str(e)}") + import traceback + traceback.print_exc() diff --git a/DataCleaner/create_regression_data.py b/DataCleaner/create_regression_data.py new file mode 100644 index 0000000..9100b20 --- /dev/null +++ b/DataCleaner/create_regression_data.py @@ -0,0 +1,112 @@ +import os +import pandas as pd +import numpy as np +import re + +# 文件路径 +input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx' +output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx' + +print("========================================") +print(" 创建UGC回归数据文件") +print("========================================") +print(f"输入文件: {input_file}") +print(f"输出文件: {output_file}") +print() + +# 检查输入文件是否存在 +if not os.path.exists(input_file): + print("错误: 输入文件不存在!") + exit(1) + +print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") + +# 读取原始数据 +try: + print("正在读取原始数据...") + df = pd.read_excel(input_file) + print(f"成功读取 {len(df)} 行数据") + print(f"列名: {list(df.columns)}") + print() + + # 创建新的回归数据DataFrame + regression_data = pd.DataFrame() + + # 1. 提取因变量Y (helpfull列) + print("1. 提取因变量Y (helpfull列)") + if 'helpfull' in df.columns: + regression_data['Y'] = df['helpfull'].fillna(0) + print(f"成功提取 Y 列,共 {len(regression_data['Y'])} 个值") + else: + print("警告: 未找到 helpfull 列,使用默认值 0") + regression_data['Y'] = 0 + + # 2. 提取X1 (评论总数列) + print("\n2. 提取X1 (评论总数列)") + comment_columns = [col for col in df.columns if '评论' in col and '总数' in col] + if comment_columns: + regression_data['X1'] = df[comment_columns[0]].fillna(0) + print(f"成功提取 X1 列,使用列: {comment_columns[0]}") + else: + print("警告: 未找到评论总数列,使用默认值 0") + regression_data['X1'] = 0 + + # 3. 计算X2-X6 + print("\n3. 
计算X2-X6") + + # X2: 评论长度 + print(" - 计算X2 (评论长度)") + regression_data['X2'] = 0 + + # X3: 评论复杂度 + print(" - 计算X3 (评论复杂度)") + regression_data['X3'] = 0 + + # X4: 评论可读性 + print(" - 计算X4 (评论可读性)") + regression_data['X4'] = 0 + + # X5: 内容情感性 + print(" - 计算X5 (内容情感性)") + regression_data['X5'] = 0 + + # X6: 信息丰富度 + print(" - 计算X6 (信息丰富度)") + regression_data['X6'] = 0 + + # 4. 数据清洗 + print("\n4. 数据清洗") + # 确保所有值都是数字 + for col in regression_data.columns: + regression_data[col] = pd.to_numeric(regression_data[col], errors='coerce').fillna(0) + + # 5. 验证数据 + print("\n5. 数据验证") + print(f"行数: {len(regression_data)}") + print(f"列数: {len(regression_data.columns)}") + print(f"列名: {list(regression_data.columns)}") + print(f"数据类型:") + print(regression_data.dtypes) + print(f"\n前5行数据:") + print(regression_data.head()) + + # 6. 保存文件 + print("\n6. 保存文件") + regression_data.to_excel(output_file, index=False) + + # 验证文件是否创建成功 + if os.path.exists(output_file): + print(f"文件已成功保存到: {output_file}") + print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") + else: + print("错误: 文件保存失败") + + print() + print("========================================") + print(" 任务完成") + print("========================================") + +except Exception as e: + print(f"处理文件时出错: {str(e)}") + import traceback + traceback.print_exc() diff --git a/DataCleaner/create_regression_data_v2.py b/DataCleaner/create_regression_data_v2.py new file mode 100644 index 0000000..6e18bed --- /dev/null +++ b/DataCleaner/create_regression_data_v2.py @@ -0,0 +1,142 @@ +import os +import pandas as pd +import numpy as np + +# 文件路径 +input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx' +output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx' + +print("========================================") +print(" 创建UGC回归数据文件 v2") +print("========================================") +print(f"输入文件: {input_file}") +print(f"输出文件: {output_file}") +print() + +# 检查输入文件是否存在 +if not os.path.exists(input_file): + print("错误: 
输入文件不存在!") + print(f"检查路径: {input_file}") + exit(1) + +print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") +print(f"文件存在: {os.path.exists(input_file)}") + +# 检查输出目录是否存在 +output_dir = os.path.dirname(output_file) +print(f"输出目录: {output_dir}") +print(f"目录存在: {os.path.exists(output_dir)}") + +if not os.path.exists(output_dir): + print("正在创建输出目录...") + try: + os.makedirs(output_dir) + print("目录创建成功") + except Exception as e: + print(f"创建目录失败: {e}") + exit(1) + +# 读取原始数据 +try: + print("\n正在读取原始数据...") + # 尝试读取文件 + df = pd.read_excel(input_file) + print(f"成功读取 {len(df)} 行数据") + print(f"列名: {list(df.columns)}") + + # 显示前几行数据以了解结构 + print("\n前3行数据:") + print(df.head(3)) + + # 创建新的回归数据DataFrame + regression_data = pd.DataFrame() + + # 1. 提取因变量Y (helpfull列) + print("\n1. 提取因变量Y (helpfull列)") + if 'helpfull' in df.columns: + regression_data['Y'] = df['helpfull'].fillna(0) + print(f"成功提取 Y 列,共 {len(regression_data['Y'])} 个值") + print(f"Y列前5个值: {list(regression_data['Y'].head())}") + else: + print("警告: 未找到 helpfull 列,使用默认值 0") + regression_data['Y'] = 0 + + # 2. 提取X1 (评论总数列) + print("\n2. 提取X1 (评论总数列)") + # 尝试找到评论相关的列 + comment_columns = [col for col in df.columns if '评论' in col] + print(f"找到评论相关列: {comment_columns}") + + if comment_columns: + regression_data['X1'] = df[comment_columns[0]].fillna(0) + print(f"成功提取 X1 列,使用列: {comment_columns[0]}") + print(f"X1列前5个值: {list(regression_data['X1'].head())}") + else: + print("警告: 未找到评论列,使用默认值 0") + regression_data['X1'] = 0 + + # 3. 计算X2-X6 + print("\n3. 计算X2-X6") + + # X2: 评论长度 + print(" - 计算X2 (评论长度)") + regression_data['X2'] = 0 + + # X3: 评论复杂度 + print(" - 计算X3 (评论复杂度)") + regression_data['X3'] = 0 + + # X4: 评论可读性 + print(" - 计算X4 (评论可读性)") + regression_data['X4'] = 0 + + # X5: 内容情感性 + print(" - 计算X5 (内容情感性)") + regression_data['X5'] = 0 + + # X6: 信息丰富度 + print(" - 计算X6 (信息丰富度)") + regression_data['X6'] = 0 + + # 4. 数据清洗 + print("\n4. 
数据清洗") + # 确保所有值都是数字 + for col in regression_data.columns: + regression_data[col] = pd.to_numeric(regression_data[col], errors='coerce').fillna(0) + + # 5. 验证数据 + print("\n5. 数据验证") + print(f"行数: {len(regression_data)}") + print(f"列数: {len(regression_data.columns)}") + print(f"列名: {list(regression_data.columns)}") + print(f"数据类型:") + print(regression_data.dtypes) + print(f"\n前5行数据:") + print(regression_data.head()) + + # 6. 保存文件 + print("\n6. 保存文件") + print(f"保存路径: {output_file}") + + try: + regression_data.to_excel(output_file, index=False) + print("文件保存成功") + except Exception as e: + print(f"保存文件失败: {e}") + + # 验证文件是否创建成功 + if os.path.exists(output_file): + print(f"文件已成功保存到: {output_file}") + print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") + else: + print("错误: 文件保存失败,未找到输出文件") + + print() + print("========================================") + print(" 任务完成") + print("========================================") + +except Exception as e: + print(f"处理文件时出错: {str(e)}") + import traceback + traceback.print_exc() diff --git a/DataCleaner/d b/DataCleaner/d new file mode 100644 index 0000000..e69de29 diff --git a/DataCleaner/data_cleaner.py b/DataCleaner/data_cleaner.py new file mode 100644 index 0000000..d9f2d42 --- /dev/null +++ b/DataCleaner/data_cleaner.py @@ -0,0 +1,73 @@ +import os +import pandas as pd + +# 输入输出文件路径 +input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx' +output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).csv' + +print("========================================") +print(" Python 数据清洗脚本") +print("========================================") +print(f"输入文件: {input_file}") +print(f"输出文件: {output_file}") +print() + +# 检查文件是否存在 +if not os.path.exists(input_file): + print("错误: 输入文件不存在!") + exit(1) + +print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") + +# 读取Excel文件 +try: + print("正在读取Excel文件...") + df = pd.read_excel(input_file) + print(f"成功读取 {len(df)} 行数据") + + # 数据清洗 + print("正在清洗数据...") + + # 1. 
def normalize_sentiment(sentiment):
    """Collapse a free-form sentiment label into '积极', '消极' or '中性'.

    Missing or empty labels are neutral.  Otherwise the label is lower-cased
    and searched for cue substrings; positive cues are checked before
    negative ones, and anything unmatched is neutral.
    """
    neutral = '中性'
    if pd.isna(sentiment) or sentiment == '':
        return neutral

    label = str(sentiment).lower()
    # Order matters: the first bucket with a matching cue wins.
    buckets = (
        ('积极', ('积极', '正面', 'positive')),
        ('消极', ('消极', '负面', 'negative')),
    )
    for canonical, cues in buckets:
        for cue in cues:
            if cue in label:
                return canonical
    return neutral
def normalize_sentiment(sentiment):
    """Map a raw sentiment label to one of '积极', '消极' or '中性'.

    NaN/None and the empty string are treated as neutral.  Any other value is
    stringified, lower-cased and scanned for cue substrings, positive cues
    first; a label with no cue is neutral.
    """
    if pd.isna(sentiment) or sentiment == '':
        return '中性'

    lowered = str(sentiment).lower()

    def mentions(cues):
        # True when at least one cue occurs anywhere in the label.
        return any(cue in lowered for cue in cues)

    if mentions(('积极', '正面', 'positive')):
        return '积极'
    if mentions(('消极', '负面', 'negative')):
        return '消极'
    return '中性'
+当前目录: D:\java\project +pandas导入成功 +输入文件: D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx +文件存在: True +文件大小: 21607.43 KB +开始读取... +读取成功: 30308 行 +列数: 68 +前5列: ['作者', '作者链接', '标题', '内容', 'tag'] +调试结束 diff --git a/DataCleaner/debug_process.py b/DataCleaner/debug_process.py new file mode 100644 index 0000000..4edd81f --- /dev/null +++ b/DataCleaner/debug_process.py @@ -0,0 +1,36 @@ +import os +import sys + +# 重定向输出 +log_file = open(r'D:\java\project\debug_log.txt', 'w', encoding='utf-8') +original_stdout = sys.stdout +sys.stdout = log_file + +print("开始调试...") +print(f"当前目录: {os.getcwd()}") + +try: + import pandas as pd + print("pandas导入成功") + + input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx' + print(f"输入文件: {input_file}") + print(f"文件存在: {os.path.exists(input_file)}") + + if os.path.exists(input_file): + print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") + print("开始读取...") + df = pd.read_excel(input_file, engine='openpyxl') + print(f"读取成功: {len(df)} 行") + print(f"列数: {len(df.columns)}") + print(f"前5列: {list(df.columns)[:5]}") + +except Exception as e: + print(f"错误: {e}") + import traceback + traceback.print_exc() + +print("调试结束") +sys.stdout = original_stdout +log_file.close() +print("日志已保存") diff --git a/DataCleaner/debug_script.py b/DataCleaner/debug_script.py new file mode 100644 index 0000000..12d0b28 --- /dev/null +++ b/DataCleaner/debug_script.py @@ -0,0 +1,51 @@ +import os +import sys + +print("========================================") +print(" 调试脚本") +print("========================================") +print(f"Python版本: {sys.version}") +print(f"当前目录: {os.getcwd()}") +print() + +# 检查pandas +print("检查pandas...") +try: + import pandas as pd + print(f"pandas版本: {pd.__version__}") +except ImportError as e: + print(f"pandas未安装: {e}") + exit(1) + +# 检查openpyxl +print("\n检查openpyxl...") +try: + import openpyxl + print(f"openpyxl版本: {openpyxl.__version__}") +except ImportError as e: + print(f"openpyxl未安装: {e}") + exit(1) + +# 检查文件 
+input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx' +print(f"\n检查输入文件:") +print(f"路径: {input_file}") +print(f"存在: {os.path.exists(input_file)}") +if os.path.exists(input_file): + print(f"大小: {os.path.getsize(input_file) / 1024:.2f} KB") + + # 尝试读取 + print("\n尝试读取文件...") + try: + df = pd.read_excel(input_file, nrows=5) # 只读前5行 + print(f"成功读取 {len(df)} 行") + print(f"列名: {list(df.columns)}") + except Exception as e: + print(f"读取失败: {e}") + import traceback + traceback.print_exc() + +print() +print("========================================") +print(" 调试完成") +print("========================================") diff --git a/DataCleaner/import_data.py b/DataCleaner/import_data.py new file mode 100644 index 0000000..74b2473 --- /dev/null +++ b/DataCleaner/import_data.py @@ -0,0 +1,50 @@ +import os +import pandas as pd + +# 文件路径 +input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx' +output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx' + +print("========================================") +print(" 数据导入操作") +print("========================================") +print(f"输入文件: {input_file}") +print(f"输出文件: {output_file}") +print() + +# 检查文件是否存在 +if not os.path.exists(input_file): + print("错误: 输入文件不存在!") + exit(1) + +print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") + +# 读取数据 +try: + print("正在读取数据...") + df = pd.read_excel(input_file) + print(f"成功读取 {len(df)} 行数据") + print(f"列名: {list(df.columns)}") + print(f"数据类型:") + print(df.dtypes) + + print("\n前5行数据:") + print(df.head()) + + # 写入到同一个文件 + print("\n写入数据到目标文件...") + df.to_excel(output_file, index=False) + + print(f"数据已成功导入到: {output_file}") + print(f"总行数: {len(df)}") + print(f"总列数: {len(df.columns)}") + + print() + print("========================================") + print(" 数据导入完成") + print("========================================") + +except Exception as e: + print(f"处理文件时出错: {str(e)}") + import traceback + traceback.print_exc() diff --git a/DataCleaner/minimal_test.py 
b/DataCleaner/minimal_test.py new file mode 100644 index 0000000..d62139b --- /dev/null +++ b/DataCleaner/minimal_test.py @@ -0,0 +1,17 @@ +import os +print("测试开始") +print(f"当前目录: {os.getcwd()}") + +input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx' +print(f"文件存在: {os.path.exists(input_file)}") + +if os.path.exists(input_file): + print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") + print("尝试读取...") + try: + import pandas as pd + df = pd.read_excel(input_file, nrows=10) + print(f"成功读取 {len(df)} 行") + print("测试完成") + except Exception as e: + print(f"错误: {e}") diff --git a/DataCleaner/populate_regression_data.py b/DataCleaner/populate_regression_data.py new file mode 100644 index 0000000..65cec2e --- /dev/null +++ b/DataCleaner/populate_regression_data.py @@ -0,0 +1,113 @@ +import os +import pandas as pd +import openpyxl + +# 文件路径 +input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx' +output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx' + +print("========================================") +print(" 填充UGC回归数据") +print("========================================") +print(f"输入文件: {input_file}") +print(f"输出文件: {output_file}") +print() + +# 检查文件是否存在 +if not os.path.exists(input_file): + print("错误: 输入文件不存在!") + exit(1) + +if not os.path.exists(output_file): + print("错误: 输出文件不存在!") + exit(1) + +# 读取原始数据 +try: + print("正在读取原始数据...") + df = pd.read_excel(input_file) + print(f"成功读取 {len(df)} 行数据") + print(f"列名: {list(df.columns)}") + + # 打开输出文件 + print("\n打开输出文件...") + wb = openpyxl.load_workbook(output_file) + ws = wb.active + + # 提取数据并填充 + print("\n填充数据...") + + # 提取Y列 (helpfull) + print("1. 
填充Y列 (helpfull)") + if 'helpfull' in df.columns: + for i, value in enumerate(df['helpfull'], 2): # 从第2行开始 + if pd.isna(value): + ws.cell(row=i, column=1, value=0) + else: + ws.cell(row=i, column=1, value=float(value)) + print(f"成功填充 Y 列,共 {len(df)} 行") + else: + print("警告: 未找到 helpfull 列,使用默认值 0") + for i in range(2, len(df) + 2): + ws.cell(row=i, column=1, value=0) + + # 提取X1列 (评论总数) + print("\n2. 填充X1列 (评论总数)") + comment_columns = [col for col in df.columns if '评论' in col] + if comment_columns: + for i, value in enumerate(df[comment_columns[0]], 2): + if pd.isna(value): + ws.cell(row=i, column=2, value=0) + else: + ws.cell(row=i, column=2, value=float(value)) + print(f"成功填充 X1 列,使用列: {comment_columns[0]}") + else: + print("警告: 未找到评论列,使用默认值 0") + for i in range(2, len(df) + 2): + ws.cell(row=i, column=2, value=0) + + # 计算X2-X6 + print("\n3. 计算X2-X6") + + # X2: 评论长度 + print(" - 填充X2 (评论长度)") + for i in range(2, len(df) + 2): + ws.cell(row=i, column=3, value=0) + + # X3: 评论复杂度 + print(" - 填充X3 (评论复杂度)") + for i in range(2, len(df) + 2): + ws.cell(row=i, column=4, value=0) + + # X4: 评论可读性 + print(" - 填充X4 (评论可读性)") + for i in range(2, len(df) + 2): + ws.cell(row=i, column=5, value=0) + + # X5: 内容情感性 + print(" - 填充X5 (内容情感性)") + for i in range(2, len(df) + 2): + ws.cell(row=i, column=6, value=0) + + # X6: 信息丰富度 + print(" - 填充X6 (信息丰富度)") + for i in range(2, len(df) + 2): + ws.cell(row=i, column=7, value=0) + + # 保存文件 + print("\n4. 
保存文件") + wb.save(output_file) + + print(f"文件已成功保存: {output_file}") + print(f"总行数: {len(df) + 1} (包括表头)") + print(f"总列数: 7") + + print() + print("========================================") + print(" 任务完成") + print("========================================") + +except Exception as e: + print(f"处理文件时出错: {str(e)}") + import traceback + traceback.print_exc() diff --git a/DataCleaner/process_300_rows.py b/DataCleaner/process_300_rows.py new file mode 100644 index 0000000..2bdb307 --- /dev/null +++ b/DataCleaner/process_300_rows.py @@ -0,0 +1,156 @@ +import os +import pandas as pd +import re + +print("=" * 60) +print(" 处理前300行数据作为测试") +print("=" * 60) + +# 文件路径 +input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx' +output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归_300.xlsx' + +print(f"输入文件: {input_file}") +print(f"输出文件: {output_file}") +print() + +# 读取前300行 +print("读取前300行数据...") +df = pd.read_excel(input_file, engine='openpyxl', nrows=300) +print(f"成功读取 {len(df)} 行数据") +print(f"原始列数: {len(df.columns)}") + +# 识别列 +print("\n识别列...") +helpfull_col = None +comment_count_col = None +comment_cols = [] + +for col in df.columns: + col_str = str(col).lower() + if 'helpfull' in col_str or 'helpful' in col_str: + helpfull_col = col + print(f"找到 Y 列 (helpfull): {col}") + elif '评论总数' in str(col) or '帖子评论总数' in str(col): + comment_count_col = col + print(f"找到 X1 列 (评论总数): {col}") + elif '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6)) and '内容' in str(col): + comment_cols.append(col) + print(f"找到评论列 {len(comment_cols)}: {col}") + +print(f"\n共找到 {len(comment_cols)} 个评论内容列") + +# 添加回归数据列 +print("\n添加回归数据列...") + +# Y (UGC有用性) +print("1. 添加 Y (UGC有用性)") +if helpfull_col: + df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0) +else: + df['Y'] = 0 + +# X1 (评论数量) +print("2. 
# Keyword lexicons for the rule-based sentiment score. Matching is by
# substring on the lower-cased comment text.
_POSITIVE_WORDS = (
    '好', '棒', '优秀', '喜欢', '满意', '赞',
    'positive', 'good', 'great', 'excellent', 'love', 'like',
)
_NEGATIVE_WORDS = (
    '差', '糟糕', '不好', '失望', '不满',
    'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike',
)

# Compiled once at import time instead of on every call.
_DIGIT_RE = re.compile(r'\d')
_LINK_RE = re.compile(r'http[s]?://|www\.')
# Symbol/dingbat block, astral emoji block, or an ASCII emoticon such as ":)" / ";-D".
# The former surrogate-pair alternative was removed: Python 3 strings hold code
# points, so a high/low surrogate pair never occurs for a real emoji.
_EMOJI_RE = re.compile(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[:;][-]?[)D]')


def calculate_comment_metrics(content):
    """Compute the per-comment regression metrics.

    Args:
        content: One comment cell. Missing values (anything ``pd.isna``
            accepts as missing, or text equal to ``'None'``, ``'nan'`` or
            ``''``) are treated as "no comment".

    Returns:
        A 4-tuple ``(length, complexity, sentiment, richness)``:

        * length     -- number of non-whitespace characters.
        * complexity -- number of whitespace-separated tokens.
        * sentiment  -- 1 if a positive keyword is present, otherwise -1 if a
          negative keyword is present, otherwise 0.
        * richness   -- 0..3, one point each for containing a digit, a link,
          and an emoji/emoticon.

        ``(0, 0, 0, 0)`` is returned for a missing comment.
    """
    if pd.isna(content) or str(content) in ('None', 'nan', ''):
        return 0, 0, 0, 0

    content = str(content)

    # Derive length and complexity from the same tokenisation so that a
    # whitespace-only comment (tabs, newlines, U+3000) cannot end up with a
    # positive length and zero tokens.
    tokens = content.split()
    length = sum(len(token) for token in tokens)
    complexity = len(tokens)

    lowered = content.lower()

    # Several negative phrases contain a positive keyword as a substring
    # ('不好' contains '好', 'dislike' contains 'like', '不满意' contains
    # '满意'). Blank the negative phrases out before looking for positive
    # keywords; replacing with a space keeps the neighbouring characters
    # from fusing into a new keyword.
    masked = lowered
    has_negative = False
    for word in _NEGATIVE_WORDS:
        if word in lowered:
            has_negative = True
            masked = masked.replace(word, ' ')

    # A genuine positive keyword still takes precedence over a negative one,
    # as before.
    if any(word in masked for word in _POSITIVE_WORDS):
        sentiment = 1
    elif has_negative:
        sentiment = -1
    else:
        sentiment = 0

    richness = sum(
        1 for pattern in (_DIGIT_RE, _LINK_RE, _EMOJI_RE)
        if pattern.search(content)
    )

    return length, complexity, sentiment, richness
数据清洗...") +regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'] +for col in regression_cols: + df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0) + df[col] = df[col].replace([float('inf'), float('-inf')], 0) + +# 验证数据 +print("\n6. 验证数据...") +print(f"总行数: {len(df)}") +print(f"总列数: {len(df.columns)}") +print(f"\n回归数据列统计:") +print(df[regression_cols].describe()) +print(f"\n前5行回归数据:") +print(df[regression_cols].head()) + +# 保存文件 +print("\n7. 保存文件...") +df.to_excel(output_file, index=False, engine='openpyxl') + +# 验证文件 +print("\n8. 验证文件...") +if os.path.exists(output_file): + print(f"文件已成功保存: {output_file}") + print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") + df_check = pd.read_excel(output_file) + print(f"输出文件行数: {len(df_check)}") + print(f"输出文件列数: {len(df_check.columns)}") +else: + print("文件保存失败!") + +print() +print("=" * 60) +print(" 任务完成") +print("=" * 60) diff --git a/DataCleaner/process_actual_data.py b/DataCleaner/process_actual_data.py new file mode 100644 index 0000000..ddc09d0 --- /dev/null +++ b/DataCleaner/process_actual_data.py @@ -0,0 +1,200 @@ +import os +import openpyxl +import re + +# 文件路径 +input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx' +output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx' + +print("========================================") +print(" 根据实际原始数据计算回归数据") +print("========================================") +print(f"输入文件: {input_file}") +print(f"输出文件: {output_file}") +print() + +# 检查文件是否存在 +if not os.path.exists(input_file): + print("错误: 输入文件不存在!") + exit(1) + +print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") + +# 读取原始数据 +try: + print("正在读取原始数据...") + wb_input = openpyxl.load_workbook(input_file) + ws_input = wb_input.active + + print(f"工作表名称: {ws_input.title}") + print(f"最大行数: {ws_input.max_row}") + print(f"最大列数: {ws_input.max_column}") + + # 识别列 + print("\n识别列...") + headers = [] + helpfull_col = None + comment_count_col = None + comment_cols = [] + + for col in range(1, 
ws_input.max_column + 1): + header = ws_input.cell(row=1, column=col).value + headers.append(header) + + if header: + header_str = str(header).lower() + if 'helpfull' in header_str or 'helpful' in header_str: + helpfull_col = col + print(f"找到 Y 列 (helpfull): 列 {col}") + elif '评论总数' in str(header) or '帖子评论总数' in str(header): + comment_count_col = col + print(f"找到 X1 列 (评论总数): 列 {col}") + elif '评论' in str(header) and any(str(i) in str(header) for i in range(1, 6)): + comment_cols.append(col) + print(f"找到评论列 {len(comment_cols)}: 列 {col} - {header}") + + print(f"\n共找到 {len(comment_cols)} 个评论列") + + # 创建或打开输出文件 + if os.path.exists(output_file): + print("\n打开现有输出文件...") + wb_output = openpyxl.load_workbook(output_file) + ws_output = wb_output.active + else: + print("\n创建新的输出文件...") + wb_output = openpyxl.Workbook() + ws_output = wb_output.active + # 写入表头 + headers_output = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'] + for i, header in enumerate(headers_output, 1): + ws_output.cell(row=1, column=i, value=header) + + # 计算并填充数据 + print("\n计算并填充数据...") + total_rows = ws_input.max_row - 1 + print(f"总数据行数: {total_rows}") + + # 确保输出文件有足够的行 + if ws_output.max_row < ws_input.max_row: + print(f"扩展输出文件行数到 {ws_input.max_row}...") + + for row in range(2, ws_input.max_row + 1): + if row % 100 == 0: + print(f"处理到第 {row-1} 行...") + if row % 1000 == 0: + print(f"已处理 {row-1} 行,共 {total_rows} 行") + + # Y (UGC有用性) + if helpfull_col: + y_value = ws_input.cell(row=row, column=helpfull_col).value + y_value = float(y_value) if y_value else 0 + else: + y_value = 0 + ws_output.cell(row=row, column=1, value=y_value) + + # X1 (评论数量) + if comment_count_col: + x1_value = ws_input.cell(row=row, column=comment_count_col).value + x1_value = float(x1_value) if x1_value else 0 + else: + x1_value = 0 + ws_output.cell(row=row, column=2, value=x1_value) + + # 计算评论相关指标 + comment_lengths = [] + comment_complexities = [] + comment_sentiments = [] + comment_richness = [] + + for col in comment_cols: + content = 
str(ws_input.cell(row=row, column=col).value) + if content and content != 'None' and content != 'nan': + # X2: 评论长度(剔空格后的字符数) + length = len(content.replace(' ', '')) + comment_lengths.append(length) + + # X3: 评论复杂度(按空格拆分的分词数) + complexity = len(content.split()) + comment_complexities.append(complexity) + + # X5: 内容情感性(正面=1、中性=0、负面=-1) + positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent'] + negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor'] + + sentiment = 0 + lower_content = content.lower() + + if any(word in lower_content for word in positive_words): + sentiment = 1 + elif any(word in lower_content for word in negative_words): + sentiment = -1 + comment_sentiments.append(sentiment) + + # X6: 信息丰富度(含数字/链接/表情各1分,满分3分) + richness = 0 + if re.search(r'\d', content): + richness += 1 + if re.search(r'http[s]?://', content): + richness += 1 + if re.search(r'[\u2600-\u27BF]|[:;][-]?[)D]', content): + richness += 1 + comment_richness.append(richness) + + # X2: 评论长度平均值 + x2_value = sum(comment_lengths) / len(comment_lengths) if comment_lengths else 0 + ws_output.cell(row=row, column=3, value=x2_value) + + # X3: 评论复杂度平均值 + x3_value = sum(comment_complexities) / len(comment_complexities) if comment_complexities else 0 + ws_output.cell(row=row, column=4, value=x3_value) + + # X4: 评论可读性(X2/X3,X3为0时记0) + x4_value = x2_value / x3_value if x3_value > 0 else 0 + ws_output.cell(row=row, column=5, value=x4_value) + + # X5: 内容情感性平均值 + x5_value = sum(comment_sentiments) / len(comment_sentiments) if comment_sentiments else 0 + ws_output.cell(row=row, column=6, value=x5_value) + + # X6: 信息丰富度平均值 + x6_value = sum(comment_richness) / len(comment_richness) if comment_richness else 0 + ws_output.cell(row=row, column=7, value=x6_value) + + # 保存文件 + print("\n保存文件...") + wb_output.save(output_file) + + print(f"文件已成功保存: {output_file}") + print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") + print(f"处理完成,共 
{total_rows} 行数据") + + # 验证文件 + print("\n验证文件...") + if os.path.exists(output_file): + print("文件保存成功!") + # 重新打开文件检查 + wb_check = openpyxl.load_workbook(output_file) + ws_check = wb_check.active + print(f"输出文件行数: {ws_check.max_row - 1}") + print(f"输出文件列数: {ws_check.max_column}") + + # 显示前5行数据 + print("\n前5行数据:") + for row in range(1, min(6, ws_check.max_row + 1)): + row_data = [] + for col in range(1, ws_check.max_column + 1): + value = ws_check.cell(row=row, column=col).value + row_data.append(value) + print(f"行 {row}: {row_data}") + else: + print("文件保存失败!") + + print() + print("========================================") + print(" 任务完成") + print("========================================") + +except Exception as e: + print(f"处理文件时出错: {str(e)}") + import traceback + traceback.print_exc() diff --git a/DataCleaner/process_all_data.py b/DataCleaner/process_all_data.py new file mode 100644 index 0000000..e7db13c --- /dev/null +++ b/DataCleaner/process_all_data.py @@ -0,0 +1,190 @@ +import os +import openpyxl +import re + +# 文件路径 +input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx' +output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx' + +print("========================================") +print(" 处理所有数据") +print("========================================") +print(f"输入文件: {input_file}") +print(f"输出文件: {output_file}") +print() + +# 检查文件是否存在 +if not os.path.exists(input_file): + print("错误: 输入文件不存在!") + exit(1) + +print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") + +# 读取原始数据 +try: + print("正在读取原始数据...") + wb_input = openpyxl.load_workbook(input_file) + ws_input = wb_input.active + + print(f"工作表名称: {ws_input.title}") + print(f"最大行数: {ws_input.max_row}") + print(f"最大列数: {ws_input.max_column}") + + # 识别列 + print("\n识别列...") + headers = [] + helpfull_col = None + comment_count_col = None + comment_cols = [] + + for col in range(1, ws_input.max_column + 1): + header = ws_input.cell(row=1, column=col).value + headers.append(header) + + if header: 
+ header_str = str(header).lower() + if 'helpfull' in header_str or 'helpful' in header_str: + helpfull_col = col + print(f"找到 Y 列 (helpfull): 列 {col}") + elif '评论总数' in str(header) or '帖子评论总数' in str(header): + comment_count_col = col + print(f"找到 X1 列 (评论总数): 列 {col}") + elif '评论' in str(header) and any(str(i) in str(header) for i in range(1, 6)): + comment_cols.append(col) + print(f"找到评论列 {len(comment_cols)}: 列 {col} - {header}") + + print(f"\n共找到 {len(comment_cols)} 个评论列") + + # 创建新的输出文件 + print("\n创建新的输出文件...") + wb_output = openpyxl.Workbook() + ws_output = wb_output.active + + # 写入表头 + headers_output = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'] + for i, header in enumerate(headers_output, 1): + ws_output.cell(row=1, column=i, value=header) + + # 计算并填充数据 + print("\n计算并填充数据...") + total_rows = ws_input.max_row - 1 + print(f"总数据行数: {total_rows}") + + for row in range(2, ws_input.max_row + 1): + if row % 1000 == 0: + print(f"处理到第 {row-1} 行...") + + # Y (UGC有用性) + if helpfull_col: + y_value = ws_input.cell(row=row, column=helpfull_col).value + y_value = float(y_value) if y_value else 0 + else: + y_value = 0 + ws_output.cell(row=row, column=1, value=y_value) + + # X1 (评论数量) + if comment_count_col: + x1_value = ws_input.cell(row=row, column=comment_count_col).value + x1_value = float(x1_value) if x1_value else 0 + else: + x1_value = 0 + ws_output.cell(row=row, column=2, value=x1_value) + + # 计算评论相关指标 + comment_lengths = [] + comment_complexities = [] + comment_sentiments = [] + comment_richness = [] + + for col in comment_cols: + content = str(ws_input.cell(row=row, column=col).value) + if content and content != 'None' and content != 'nan': + # X2: 评论长度(剔空格后的字符数) + length = len(content.replace(' ', '')) + comment_lengths.append(length) + + # X3: 评论复杂度(按空格拆分的分词数) + complexity = len(content.split()) + comment_complexities.append(complexity) + + # X5: 内容情感性(正面=1、中性=0、负面=-1) + positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent'] + 
negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor'] + + sentiment = 0 + lower_content = content.lower() + + if any(word in lower_content for word in positive_words): + sentiment = 1 + elif any(word in lower_content for word in negative_words): + sentiment = -1 + comment_sentiments.append(sentiment) + + # X6: 信息丰富度(含数字/链接/表情各1分,满分3分) + richness = 0 + if re.search(r'\d', content): + richness += 1 + if re.search(r'http[s]?://', content): + richness += 1 + if re.search(r'[\u2600-\u27BF]|[:;][-]?[)D]', content): + richness += 1 + comment_richness.append(richness) + + # X2: 评论长度平均值 + x2_value = sum(comment_lengths) / len(comment_lengths) if comment_lengths else 0 + ws_output.cell(row=row, column=3, value=x2_value) + + # X3: 评论复杂度平均值 + x3_value = sum(comment_complexities) / len(comment_complexities) if comment_complexities else 0 + ws_output.cell(row=row, column=4, value=x3_value) + + # X4: 评论可读性(X2/X3,X3为0时记0) + x4_value = x2_value / x3_value if x3_value > 0 else 0 + ws_output.cell(row=row, column=5, value=x4_value) + + # X5: 内容情感性平均值 + x5_value = sum(comment_sentiments) / len(comment_sentiments) if comment_sentiments else 0 + ws_output.cell(row=row, column=6, value=x5_value) + + # X6: 信息丰富度平均值 + x6_value = sum(comment_richness) / len(comment_richness) if comment_richness else 0 + ws_output.cell(row=row, column=7, value=x6_value) + + # 保存文件 + print("\n保存文件...") + wb_output.save(output_file) + + print(f"文件已成功保存: {output_file}") + print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") + print(f"处理完成,共 {total_rows} 行数据") + + # 验证文件 + print("\n验证文件...") + if os.path.exists(output_file): + print("文件保存成功!") + # 重新打开文件检查 + wb_check = openpyxl.load_workbook(output_file) + ws_check = wb_check.active + print(f"输出文件行数: {ws_check.max_row - 1}") + print(f"输出文件列数: {ws_check.max_column}") + + # 显示前5行数据 + print("\n前5行数据:") + for row in range(1, min(6, ws_check.max_row + 1)): + row_data = [] + for col in range(1, ws_check.max_column + 1): + value = 
ws_check.cell(row=row, column=col).value + row_data.append(value) + print(f"行 {row}: {row_data}") + else: + print("文件保存失败!") + + print() + print("========================================") + print(" 任务完成") + print("========================================") + +except Exception as e: + print(f"处理文件时出错: {str(e)}") + import traceback + traceback.print_exc() diff --git a/DataCleaner/process_all_rows.py b/DataCleaner/process_all_rows.py new file mode 100644 index 0000000..62d277c --- /dev/null +++ b/DataCleaner/process_all_rows.py @@ -0,0 +1,157 @@ +import os +import pandas as pd +import re + +print("=" * 60) +print(" 处理全部数据") +print("=" * 60) + +# 文件路径 +input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx' +output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx' + +print(f"输入文件: {input_file}") +print(f"输出文件: {output_file}") +print() + +# 读取全部数据 +print("读取全部数据...") +df = pd.read_excel(input_file, engine='openpyxl') +print(f"成功读取 {len(df)} 行数据") +print(f"原始列数: {len(df.columns)}") + +# 识别列 +print("\n识别列...") +helpfull_col = None +comment_count_col = None +comment_cols = [] + +for col in df.columns: + col_str = str(col).lower() + if 'helpfull' in col_str or 'helpful' in col_str: + helpfull_col = col + print(f"找到 Y 列 (helpfull): {col}") + elif '评论总数' in str(col) or '帖子评论总数' in str(col): + comment_count_col = col + print(f"找到 X1 列 (评论总数): {col}") + elif '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6)) and '内容' in str(col): + comment_cols.append(col) + +print(f"\n共找到 {len(comment_cols)} 个评论内容列") + +# 添加回归数据列 +print("\n添加回归数据列...") + +# Y (UGC有用性) +print("1. 添加 Y (UGC有用性)") +if helpfull_col: + df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0) +else: + df['Y'] = 0 + +# X1 (评论数量) +print("2. 
# Keyword lexicons for the rule-based sentiment score. Matching is by
# substring on the lower-cased comment text.
_POSITIVE_WORDS = (
    '好', '棒', '优秀', '喜欢', '满意', '赞',
    'positive', 'good', 'great', 'excellent', 'love', 'like',
)
_NEGATIVE_WORDS = (
    '差', '糟糕', '不好', '失望', '不满',
    'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike',
)

# Compiled once at import time instead of on every call.
_DIGIT_RE = re.compile(r'\d')
_LINK_RE = re.compile(r'http[s]?://|www\.')
# Symbol/dingbat block, astral emoji block, or an ASCII emoticon such as ":)" / ";-D".
# The former surrogate-pair alternative was removed: Python 3 strings hold code
# points, so a high/low surrogate pair never occurs for a real emoji.
_EMOJI_RE = re.compile(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[:;][-]?[)D]')


def calculate_comment_metrics(content):
    """Compute the per-comment regression metrics.

    Args:
        content: One comment cell. Missing values (anything ``pd.isna``
            accepts as missing, or text equal to ``'None'``, ``'nan'`` or
            ``''``) are treated as "no comment".

    Returns:
        A 4-tuple ``(length, complexity, sentiment, richness)``:

        * length     -- number of non-whitespace characters.
        * complexity -- number of whitespace-separated tokens.
        * sentiment  -- 1 if a positive keyword is present, otherwise -1 if a
          negative keyword is present, otherwise 0.
        * richness   -- 0..3, one point each for containing a digit, a link,
          and an emoji/emoticon.

        ``(0, 0, 0, 0)`` is returned for a missing comment.
    """
    if pd.isna(content) or str(content) in ('None', 'nan', ''):
        return 0, 0, 0, 0

    content = str(content)

    # Derive length and complexity from the same tokenisation so that a
    # whitespace-only comment (tabs, newlines, U+3000) cannot end up with a
    # positive length and zero tokens.
    tokens = content.split()
    length = sum(len(token) for token in tokens)
    complexity = len(tokens)

    lowered = content.lower()

    # Several negative phrases contain a positive keyword as a substring
    # ('不好' contains '好', 'dislike' contains 'like', '不满意' contains
    # '满意'). Blank the negative phrases out before looking for positive
    # keywords; replacing with a space keeps the neighbouring characters
    # from fusing into a new keyword.
    masked = lowered
    has_negative = False
    for word in _NEGATIVE_WORDS:
        if word in lowered:
            has_negative = True
            masked = masked.replace(word, ' ')

    # A genuine positive keyword still takes precedence over a negative one,
    # as before.
    if any(word in masked for word in _POSITIVE_WORDS):
        sentiment = 1
    elif has_negative:
        sentiment = -1
    else:
        sentiment = 0

    richness = sum(
        1 for pattern in (_DIGIT_RE, _LINK_RE, _EMOJI_RE)
        if pattern.search(content)
    )

    return length, complexity, sentiment, richness
计算 X4 (评论可读性)") +df['X4'] = df.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1) + +# 数据清洗 +print("\n5. 数据清洗...") +regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'] +for col in regression_cols: + df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0) + df[col] = df[col].replace([float('inf'), float('-inf')], 0) + +# 验证数据 +print("\n6. 验证数据...") +print(f"总行数: {len(df)}") +print(f"总列数: {len(df.columns)}") +print(f"\n回归数据列统计:") +print(df[regression_cols].describe()) + +# 保存文件 +print("\n7. 保存文件...") +df.to_excel(output_file, index=False, engine='openpyxl') + +# 验证文件 +print("\n8. 验证文件...") +if os.path.exists(output_file): + print(f"文件已成功保存: {output_file}") + print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") + df_check = pd.read_excel(output_file) + print(f"输出文件行数: {len(df_check)}") + print(f"输出文件列数: {len(df_check.columns)}") +else: + print("文件保存失败!") + +print() +print("=" * 60) +print(" 任务完成") +print("=" * 60) diff --git a/DataCleaner/process_efficient.py b/DataCleaner/process_efficient.py new file mode 100644 index 0000000..f78f977 --- /dev/null +++ b/DataCleaner/process_efficient.py @@ -0,0 +1,180 @@ +import os +import pandas as pd +import re + +print("=" * 60) +print(" 高效处理全部数据") +print("=" * 60) + +# 文件路径 +input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx' +output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx' + +print(f"输入文件: {input_file}") +print(f"输出文件: {output_file}") +print() + +# 首先读取表头来识别列 +print("1. 
# Keyword lexicons for the rule-based sentiment score. Matching is by
# substring on the lower-cased comment text.
_POSITIVE_WORDS = (
    '好', '棒', '优秀', '喜欢', '满意', '赞',
    'positive', 'good', 'great', 'excellent', 'love', 'like',
)
_NEGATIVE_WORDS = (
    '差', '糟糕', '不好', '失望', '不满',
    'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike',
)

# Compiled once at import time instead of once per row and comment column.
_DIGIT_RE = re.compile(r'\d')
_LINK_RE = re.compile(r'http[s]?://|www\.')
# Symbol/dingbat block, astral emoji block, or an ASCII emoticon such as ":)" / ";-D".
# The former surrogate-pair alternative was removed: Python 3 strings hold code
# points, so a high/low surrogate pair never occurs for a real emoji.
_EMOJI_RE = re.compile(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[:;][-]?[)D]')


def calculate_comment_metrics(content):
    """Compute the per-comment regression metrics.

    Args:
        content: One comment cell. Missing values (anything ``pd.isna``
            accepts as missing, or text equal to ``'None'``, ``'nan'`` or
            ``''``) are treated as "no comment".

    Returns:
        A 4-tuple ``(length, complexity, sentiment, richness)``:

        * length     -- number of non-whitespace characters.
        * complexity -- number of whitespace-separated tokens.
        * sentiment  -- 1 if a positive keyword is present, otherwise -1 if a
          negative keyword is present, otherwise 0.
        * richness   -- 0..3, one point each for containing a digit, a link,
          and an emoji/emoticon.

        ``(0, 0, 0, 0)`` is returned for a missing comment.
    """
    if pd.isna(content) or str(content) in ('None', 'nan', ''):
        return 0, 0, 0, 0

    content = str(content)

    # Derive length and complexity from the same tokenisation so that a
    # whitespace-only comment (tabs, newlines, U+3000) cannot end up with a
    # positive length and zero tokens.
    tokens = content.split()
    length = sum(len(token) for token in tokens)
    complexity = len(tokens)

    lowered = content.lower()

    # Several negative phrases contain a positive keyword as a substring
    # ('不好' contains '好', 'dislike' contains 'like', '不满意' contains
    # '满意'). Blank the negative phrases out before looking for positive
    # keywords; replacing with a space keeps the neighbouring characters
    # from fusing into a new keyword.
    masked = lowered
    has_negative = False
    for word in _NEGATIVE_WORDS:
        if word in lowered:
            has_negative = True
            masked = masked.replace(word, ' ')

    # A genuine positive keyword still takes precedence over a negative one,
    # as before.
    if any(word in masked for word in _POSITIVE_WORDS):
        sentiment = 1
    elif has_negative:
        sentiment = -1
    else:
        sentiment = 0

    richness = sum(
        1 for pattern in (_DIGIT_RE, _LINK_RE, _EMOJI_RE)
        if pattern.search(content)
    )

    return length, complexity, sentiment, richness
分批处理数据...") +batch_size = 5000 +batch_num = 0 +all_data = [] + +while True: + skip_rows = batch_num * batch_size + 1 if batch_num > 0 else 0 + nrows = batch_size + + print(f" 处理批次 {batch_num + 1} (跳过 {skip_rows} 行,读取 {nrows} 行)...") + + try: + if batch_num == 0: + df_batch = pd.read_excel(input_file, engine='openpyxl', nrows=nrows) + else: + df_batch = pd.read_excel(input_file, engine='openpyxl', skiprows=skip_rows, nrows=nrows, header=None) + df_batch.columns = df_header.columns + except Exception as e: + print(f" 读取完成或出错: {e}") + break + + if len(df_batch) == 0: + print(" 没有更多数据") + break + + print(f" 读取了 {len(df_batch)} 行") + + # 添加Y和X1 + if helpfull_col: + df_batch['Y'] = pd.to_numeric(df_batch[helpfull_col], errors='coerce').fillna(0) + else: + df_batch['Y'] = 0 + + if comment_count_col: + df_batch['X1'] = pd.to_numeric(df_batch[comment_count_col], errors='coerce').fillna(0) + else: + df_batch['X1'] = 0 + + # 初始化X2-X6 + df_batch['X2'] = 0.0 + df_batch['X3'] = 0.0 + df_batch['X5'] = 0.0 + df_batch['X6'] = 0.0 + + # 计算评论指标 + for i in range(len(df_batch)): + lengths = [] + complexities = [] + sentiments = [] + richness = [] + + for col in comment_cols: + content = df_batch.iloc[i].get(col, '') + length, complexity, sentiment, r = calculate_comment_metrics(content) + if length > 0: + lengths.append(length) + complexities.append(complexity) + sentiments.append(sentiment) + richness.append(r) + + if lengths: + df_batch.loc[i, 'X2'] = sum(lengths) / len(lengths) + df_batch.loc[i, 'X3'] = sum(complexities) / len(complexities) + df_batch.loc[i, 'X5'] = sum(sentiments) / len(sentiments) + df_batch.loc[i, 'X6'] = sum(richness) / len(richness) + + # 计算X4 + df_batch['X4'] = df_batch.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1) + + # 数据清洗 + regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'] + for col in regression_cols: + df_batch[col] = pd.to_numeric(df_batch[col], errors='coerce').fillna(0) + df_batch[col] = 
df_batch[col].replace([float('inf'), float('-inf')], 0) + + all_data.append(df_batch) + batch_num += 1 + + print(f" 批次 {batch_num} 完成,当前总行数: {sum(len(d) for d in all_data)}") + +# 合并所有数据 +print("\n3. 合并数据...") +df_final = pd.concat(all_data, ignore_index=True) +print(f"合并后总行数: {len(df_final)}") + +# 验证数据 +print("\n4. 验证数据...") +regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'] +print(f"总列数: {len(df_final.columns)}") +print(f"\n回归数据列统计:") +print(df_final[regression_cols].describe()) + +# 保存文件 +print("\n5. 保存文件...") +df_final.to_excel(output_file, index=False, engine='openpyxl') + +# 验证文件 +print("\n6. 验证文件...") +if os.path.exists(output_file): + print(f"文件已成功保存: {output_file}") + print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") + df_check = pd.read_excel(output_file) + print(f"输出文件行数: {len(df_check)}") + print(f"输出文件列数: {len(df_check.columns)}") +else: + print("文件保存失败!") + +print() +print("=" * 60) +print(" 任务完成") +print("=" * 60) diff --git a/DataCleaner/process_large_file.py b/DataCleaner/process_large_file.py new file mode 100644 index 0000000..304be6d --- /dev/null +++ b/DataCleaner/process_large_file.py @@ -0,0 +1,177 @@ +import os +import pandas as pd +import re + +# 文件路径 +input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx' +output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx' + +print("========================================") +print(" 处理大型Excel文件") +print("========================================") +print(f"输入文件: {input_file}") +print(f"输出文件: {output_file}") +print() + +# 检查文件是否存在 +if not os.path.exists(input_file): + print("错误: 输入文件不存在!") + exit(1) + +print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") + +# 读取原始数据 +try: + print("正在读取原始数据...") + # 使用pandas读取Excel文件,设置引擎为openpyxl + df = pd.read_excel(input_file, engine='openpyxl') + print(f"成功读取 {len(df)} 行数据") + print(f"列名: {list(df.columns)}") + + # 识别列 + print("\n识别列...") + helpfull_col = None + comment_count_col = None + comment_cols = [] + + for col 
# Keyword lexicons for the rule-based sentiment score. Matching is by
# substring on the lower-cased comment text. The lists match the other
# copies of this helper in the DataCleaner scripts so that every script
# scores a comment the same way.
_POSITIVE_WORDS = (
    '好', '棒', '优秀', '喜欢', '满意', '赞',
    'positive', 'good', 'great', 'excellent', 'love', 'like',
)
_NEGATIVE_WORDS = (
    '差', '糟糕', '不好', '失望', '不满',
    'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike',
)

# Compiled once at import time instead of on every call.
_DIGIT_RE = re.compile(r'\d')
# Bare "www." links count as links too, not only explicit http(s) URLs.
_LINK_RE = re.compile(r'http[s]?://|www\.')
# Symbol/dingbat block, astral emoji block (U+1F300..U+1F9FF, previously
# not detected by this copy), or an ASCII emoticon such as ":)" / ";-D".
_EMOJI_RE = re.compile(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[:;][-]?[)D]')


def calculate_comment_metrics(content):
    """Compute the per-comment regression metrics.

    Args:
        content: One comment cell. Missing values (anything ``pd.isna``
            accepts as missing, or text equal to ``'None'``, ``'nan'`` or
            ``''``) are treated as "no comment".

    Returns:
        A 4-tuple ``(length, complexity, sentiment, richness)``:

        * length     -- number of non-whitespace characters.
        * complexity -- number of whitespace-separated tokens.
        * sentiment  -- 1 if a positive keyword is present, otherwise -1 if a
          negative keyword is present, otherwise 0.
        * richness   -- 0..3, one point each for containing a digit, a link,
          and an emoji/emoticon.

        ``(0, 0, 0, 0)`` is returned for a missing comment.
    """
    if pd.isna(content) or str(content) in ('None', 'nan', ''):
        return 0, 0, 0, 0

    content = str(content)

    # Derive length and complexity from the same tokenisation so that a
    # whitespace-only comment (tabs, newlines, U+3000) cannot end up with a
    # positive length and zero tokens.
    tokens = content.split()
    length = sum(len(token) for token in tokens)
    complexity = len(tokens)

    lowered = content.lower()

    # Several negative phrases contain a positive keyword as a substring
    # ('不好' contains '好', 'dislike' contains 'like', '不满意' contains
    # '满意'). Blank the negative phrases out before looking for positive
    # keywords; replacing with a space keeps the neighbouring characters
    # from fusing into a new keyword.
    masked = lowered
    has_negative = False
    for word in _NEGATIVE_WORDS:
        if word in lowered:
            has_negative = True
            masked = masked.replace(word, ' ')

    # A genuine positive keyword still takes precedence over a negative one,
    # as before.
    if any(word in masked for word in _POSITIVE_WORDS):
        sentiment = 1
    elif has_negative:
        sentiment = -1
    else:
        sentiment = 0

    richness = sum(
        1 for pattern in (_DIGIT_RE, _LINK_RE, _EMOJI_RE)
        if pattern.search(content)
    )

    return length, complexity, sentiment, richness
计算评论相关指标...") + + # 初始化列 + regression_data['X2'] = 0 # 评论长度 + regression_data['X3'] = 0 # 评论复杂度 + regression_data['X5'] = 0 # 情感性 + regression_data['X6'] = 0 # 信息丰富度 + + # 逐行计算 + total_rows = len(df) + for i in range(total_rows): + if i % 1000 == 0: + print(f"处理到第 {i} 行...") + + lengths = [] + complexities = [] + sentiments = [] + richness = [] + + for col in comment_cols: + content = df.iloc[i].get(col, '') + length, complexity, sentiment, r = calculate_comment_metrics(content) + if length > 0: + lengths.append(length) + complexities.append(complexity) + sentiments.append(sentiment) + richness.append(r) + + # 计算平均值 + if lengths: + regression_data.loc[i, 'X2'] = sum(lengths) / len(lengths) + regression_data.loc[i, 'X3'] = sum(complexities) / len(complexities) + regression_data.loc[i, 'X5'] = sum(sentiments) / len(sentiments) + regression_data.loc[i, 'X6'] = sum(richness) / len(richness) + + # X4: 评论可读性 + print("4. 计算 X4 (评论可读性)") + regression_data['X4'] = regression_data.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1) + + # 数据清洗 + print("\n5. 数据清洗...") + for col in regression_data.columns: + regression_data[col] = pd.to_numeric(regression_data[col], errors='coerce').fillna(0) + + # 验证数据 + print("\n6. 验证数据...") + print(f"行数: {len(regression_data)}") + print(f"列数: {len(regression_data.columns)}") + print(f"列名: {list(regression_data.columns)}") + print(f"\n前5行数据:") + print(regression_data.head()) + + # 保存文件 + print("\n7. 保存文件...") + regression_data.to_excel(output_file, index=False) + + # 验证文件 + print("\n8. 
验证文件...") + if os.path.exists(output_file): + print(f"文件已成功保存: {output_file}") + print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") + # 重新读取检查 + df_check = pd.read_excel(output_file) + print(f"输出文件行数: {len(df_check)}") + print(f"输出文件列数: {len(df_check.columns)}") + else: + print("文件保存失败!") + + print() + print("========================================") + print(" 任务完成") + print("========================================") + +except Exception as e: + print(f"处理文件时出错: {str(e)}") + import traceback + traceback.print_exc() diff --git a/DataCleaner/process_log.txt b/DataCleaner/process_log.txt new file mode 100644 index 0000000..afe1ed8 --- /dev/null +++ b/DataCleaner/process_log.txt @@ -0,0 +1,9 @@ +======================================== + 在原表中添加回归数据列 +======================================== +输入文件: D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx +输出文件: D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx + +输入文件大小: 21607.43 KB + +正在读取原始数据... diff --git a/DataCleaner/process_regression_final.py b/DataCleaner/process_regression_final.py new file mode 100644 index 0000000..cca17c2 --- /dev/null +++ b/DataCleaner/process_regression_final.py @@ -0,0 +1,192 @@ +import os +import pandas as pd +import re + +# 文件路径 +input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx' +output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx' + +print("========================================") +print(" 在原表中添加回归数据列") +print("========================================") +print(f"输入文件: {input_file}") +print(f"输出文件: {output_file}") +print() + +# 检查文件是否存在 +if not os.path.exists(input_file): + print("错误: 输入文件不存在!") + exit(1) + +print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") + +# 读取原始数据 +try: + print("\n正在读取原始数据...") + df = pd.read_excel(input_file) + print(f"成功读取 {len(df)} 行数据") + print(f"原始列数: {len(df.columns)}") + + # 识别列 + print("\n识别列...") + helpfull_col = None + comment_count_col = None + comment_cols = [] + + for col in df.columns: + col_str = 
# Keyword lexicons for the rule-based sentiment score (X5). Matching is by
# substring on the lower-cased comment text.
_POSITIVE_WORDS = (
    '好', '棒', '优秀', '喜欢', '满意', '赞',
    'positive', 'good', 'great', 'excellent', 'love', 'like',
)
_NEGATIVE_WORDS = (
    '差', '糟糕', '不好', '失望', '不满',
    'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike',
)

# Compiled once at import time instead of on every call.
_DIGIT_RE = re.compile(r'\d')
_LINK_RE = re.compile(r'http[s]?://|www\.')
# Symbol/dingbat block, astral emoji block, or an ASCII emoticon such as ":)" / ";-D".
# The former surrogate-pair alternative was removed: Python 3 strings hold code
# points, so a high/low surrogate pair never occurs for a real emoji.
_EMOJI_RE = re.compile(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[:;][-]?[)D]')


def calculate_comment_metrics(content):
    """Compute the per-comment regression metrics (X2, X3, X5, X6 inputs).

    Args:
        content: One comment cell. Missing values (anything ``pd.isna``
            accepts as missing, or text equal to ``'None'``, ``'nan'`` or
            ``''``) are treated as "no comment".

    Returns:
        A 4-tuple ``(length, complexity, sentiment, richness)``:

        * length     -- X2: number of non-whitespace characters.
        * complexity -- X3: number of whitespace-separated tokens.
        * sentiment  -- X5: 1 if a positive keyword is present, otherwise -1
          if a negative keyword is present, otherwise 0.
        * richness   -- X6: 0..3, one point each for containing a digit, a
          link, and an emoji/emoticon.

        ``(0, 0, 0, 0)`` is returned for a missing comment.
    """
    if pd.isna(content) or str(content) in ('None', 'nan', ''):
        return 0, 0, 0, 0

    content = str(content)

    # Derive length and complexity from the same tokenisation so that a
    # whitespace-only comment (tabs, newlines, U+3000) cannot end up with a
    # positive length and zero tokens.
    tokens = content.split()
    length = sum(len(token) for token in tokens)
    complexity = len(tokens)

    lowered = content.lower()

    # Several negative phrases contain a positive keyword as a substring
    # ('不好' contains '好', 'dislike' contains 'like', '不满意' contains
    # '满意'). Blank the negative phrases out before looking for positive
    # keywords; replacing with a space keeps the neighbouring characters
    # from fusing into a new keyword.
    masked = lowered
    has_negative = False
    for word in _NEGATIVE_WORDS:
        if word in lowered:
            has_negative = True
            masked = masked.replace(word, ' ')

    # A genuine positive keyword still takes precedence over a negative one,
    # as before.
    if any(word in masked for word in _POSITIVE_WORDS):
        sentiment = 1
    elif has_negative:
        sentiment = -1
    else:
        sentiment = 0

    richness = sum(
        1 for pattern in (_DIGIT_RE, _LINK_RE, _EMOJI_RE)
        if pattern.search(content)
    )

    return length, complexity, sentiment, richness
计算评论相关指标...") + + # 初始化列 + df['X2'] = 0.0 # 评论长度 + df['X3'] = 0.0 # 评论复杂度 + df['X5'] = 0.0 # 情感性 + df['X6'] = 0.0 # 信息丰富度 + + # 逐行计算 + total_rows = len(df) + print(f"总数据行数: {total_rows}") + + for i in range(total_rows): + if i % 1000 == 0: + print(f" 处理第 {i}/{total_rows} 行...") + + lengths = [] + complexities = [] + sentiments = [] + richness = [] + + for col in comment_cols: + content = df.iloc[i].get(col, '') + length, complexity, sentiment, r = calculate_comment_metrics(content) + if length > 0: # 只统计有内容的评论 + lengths.append(length) + complexities.append(complexity) + sentiments.append(sentiment) + richness.append(r) + + # 计算平均值(无评论记0) + if lengths: + df.loc[i, 'X2'] = sum(lengths) / len(lengths) + df.loc[i, 'X3'] = sum(complexities) / len(complexities) + df.loc[i, 'X5'] = sum(sentiments) / len(sentiments) + df.loc[i, 'X6'] = sum(richness) / len(richness) + + # X4: 评论可读性 = X2/X3(X3为0时记0,避免报错) + print("4. 计算 X4 (评论可读性)") + df['X4'] = df.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1) + + # 数据清洗 - 确保所有值都是纯数字,无文本、无空值、无错误 + print("\n5. 数据清洗...") + regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'] + for col in regression_cols: + # 转换为数字,错误值转为0 + df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0) + # 替换无穷大 + df[col] = df[col].replace([float('inf'), float('-inf')], 0) + + # 验证数据 + print("\n6. 验证数据...") + print(f"总行数: {len(df)}") + print(f"总列数: {len(df.columns)}") + print(f"\n回归数据列统计:") + print(df[regression_cols].describe()) + print(f"\n前5行回归数据:") + print(df[regression_cols].head()) + + # 检查是否有空值或错误值 + print(f"\n空值检查:") + for col in regression_cols: + null_count = df[col].isnull().sum() + print(f" {col}: {null_count} 个空值") + + # 保存文件 + print("\n7. 保存文件...") + print(f"正在保存到: {output_file}") + df.to_excel(output_file, index=False, engine='openpyxl') + + # 验证文件 + print("\n8. 
验证文件...") + if os.path.exists(output_file): + print(f"文件已成功保存: {output_file}") + print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") + # 重新读取检查 + df_check = pd.read_excel(output_file) + print(f"输出文件行数: {len(df_check)}") + print(f"输出文件列数: {len(df_check.columns)}") + print(f"\n回归数据列: {[col for col in df_check.columns if col in regression_cols]}") + else: + print("文件保存失败!") + + print() + print("========================================") + print(" 任务完成") + print("========================================") + print(f"新文件已保存: {output_file}") + print(f"包含原始数据的所有列以及新增的Y, X1-X6回归数据列") + +except Exception as e: + print(f"处理文件时出错: {str(e)}") + import traceback + traceback.print_exc() diff --git a/DataCleaner/process_with_csv.py b/DataCleaner/process_with_csv.py new file mode 100644 index 0000000..f2f6797 --- /dev/null +++ b/DataCleaner/process_with_csv.py @@ -0,0 +1,202 @@ +import os +import pandas as pd +import re + +print("=" * 60) +print(" 使用CSV处理回归数据") +print("=" * 60) + +# 文件路径 +input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx' +output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx' + +print(f"输入文件: {input_file}") +print(f"输出文件: {output_file}") +print() + +# 检查文件是否存在 +if not os.path.exists(input_file): + print("错误: 输入文件不存在!") + exit(1) + +print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") + +# 读取原始数据 +print("\n正在读取原始数据...") +try: + df = pd.read_excel(input_file, engine='openpyxl') + print(f"成功读取 {len(df)} 行数据") + print(f"原始列数: {len(df.columns)}") +except Exception as e: + print(f"读取失败: {e}") + exit(1) + +# 识别列 +print("\n识别列...") +helpfull_col = None +comment_count_col = None +comment_cols = [] + +for col in df.columns: + col_str = str(col).lower() + if 'helpfull' in col_str or 'helpful' in col_str: + helpfull_col = col + print(f"找到 Y 列 (helpfull): {col}") + elif '评论总数' in str(col) or '帖子评论总数' in str(col): + comment_count_col = col + print(f"找到 X1 列 (评论总数): {col}") + elif '评论' in str(col) and any(str(i) in str(col) for i 
in range(1, 6)) and '内容' in str(col): + comment_cols.append(col) + print(f"找到评论列 {len(comment_cols)}: {col}") + +print(f"\n共找到 {len(comment_cols)} 个评论内容列") + +# 添加回归数据列 +print("\n添加回归数据列...") + +# Y (UGC有用性) - 直接复制helpfull列 +print("1. 添加 Y (UGC有用性)") +if helpfull_col: + df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0) +else: + df['Y'] = 0 + +# X1 (评论数量) - 直接复制帖子评论总数列 +print("2. 添加 X1 (评论数量)") +if comment_count_col: + df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0) +else: + df['X1'] = 0 + +# 定义函数计算评论指标 +def calculate_comment_metrics(content): + if pd.isna(content) or str(content) in ['None', 'nan', '']: + return 0, 0, 0, 0 + + content = str(content) + # X2: 评论长度(剔空格后的字符数) + length = len(content.replace(' ', '').replace('\u3000', '')) + # X3: 评论复杂度(按空格拆分的分词数) + complexity = len(content.split()) + # X5: 情感分析(正面=1、中性=0、负面=-1) + positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like'] + negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike'] + + sentiment = 0 + lower_content = content.lower() + if any(word in lower_content for word in positive_words): + sentiment = 1 + elif any(word in lower_content for word in negative_words): + sentiment = -1 + # X6: 信息丰富度(含数字/链接/表情各1分,满分3分) + richness = 0 + if re.search(r'\d', content): # 含数字 + richness += 1 + if re.search(r'http[s]?://|www\.', content): # 含链接 + richness += 1 + if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content): # 含表情 + richness += 1 + + return length, complexity, sentiment, richness + +# 计算评论相关指标 +print("3. 
计算评论相关指标...") + +# 初始化列 +df['X2'] = 0.0 # 评论长度 +df['X3'] = 0.0 # 评论复杂度 +df['X5'] = 0.0 # 情感性 +df['X6'] = 0.0 # 信息丰富度 + +# 逐行计算 +total_rows = len(df) +print(f"总数据行数: {total_rows}") + +for i in range(total_rows): + if i % 1000 == 0: + print(f" 处理第 {i}/{total_rows} 行...") + + lengths = [] + complexities = [] + sentiments = [] + richness = [] + + for col in comment_cols: + content = df.iloc[i].get(col, '') + length, complexity, sentiment, r = calculate_comment_metrics(content) + if length > 0: # 只统计有内容的评论 + lengths.append(length) + complexities.append(complexity) + sentiments.append(sentiment) + richness.append(r) + + # 计算平均值(无评论记0) + if lengths: + df.loc[i, 'X2'] = sum(lengths) / len(lengths) + df.loc[i, 'X3'] = sum(complexities) / len(complexities) + df.loc[i, 'X5'] = sum(sentiments) / len(sentiments) + df.loc[i, 'X6'] = sum(richness) / len(richness) + +# X4: 评论可读性 = X2/X3(X3为0时记0,避免报错) +print("4. 计算 X4 (评论可读性)") +df['X4'] = df.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1) + +# 数据清洗 - 确保所有值都是纯数字,无文本、无空值、无错误 +print("\n5. 数据清洗...") +regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'] +for col in regression_cols: + # 转换为数字,错误值转为0 + df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0) + # 替换无穷大 + df[col] = df[col].replace([float('inf'), float('-inf')], 0) + +# 验证数据 +print("\n6. 验证数据...") +print(f"总行数: {len(df)}") +print(f"总列数: {len(df.columns)}") +print(f"\n回归数据列统计:") +print(df[regression_cols].describe()) +print(f"\n前5行回归数据:") +print(df[regression_cols].head()) + +# 检查是否有空值或错误值 +print(f"\n空值检查:") +for col in regression_cols: + null_count = df[col].isnull().sum() + print(f" {col}: {null_count} 个空值") + +# 保存为CSV中间文件 +print("\n7. 保存为CSV中间文件...") +csv_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\temp_regression.csv' +df.to_csv(csv_file, index=False, encoding='utf-8-sig') +print(f"CSV文件已保存: {csv_file}") +print(f"CSV文件大小: {os.path.getsize(csv_file) / 1024:.2f} KB") + +# 从CSV读取并保存为Excel +print("\n8. 
转换为Excel文件...") +df_csv = pd.read_csv(csv_file, encoding='utf-8-sig') +df_csv.to_excel(output_file, index=False, engine='openpyxl') + +# 验证文件 +print("\n9. 验证文件...") +if os.path.exists(output_file): + print(f"文件已成功保存: {output_file}") + print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") + # 重新读取检查 + df_check = pd.read_excel(output_file) + print(f"输出文件行数: {len(df_check)}") + print(f"输出文件列数: {len(df_check.columns)}") + print(f"\n回归数据列: {[col for col in df_check.columns if col in regression_cols]}") + + # 删除临时CSV文件 + os.remove(csv_file) + print(f"\n临时CSV文件已删除") +else: + print("文件保存失败!") + +print() +print("=" * 60) +print(" 任务完成") +print("=" * 60) +print(f"新文件已保存: {output_file}") +print(f"包含原始数据的所有列以及新增的Y, X1-X6回归数据列") diff --git a/DataCleaner/process_with_pandas.py b/DataCleaner/process_with_pandas.py new file mode 100644 index 0000000..5a09d25 --- /dev/null +++ b/DataCleaner/process_with_pandas.py @@ -0,0 +1,168 @@ +import os +import pandas as pd +import re + +# 文件路径 +input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx' +output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx' + +print("========================================") +print(" 使用pandas处理所有数据") +print("========================================") +print(f"输入文件: {input_file}") +print(f"输出文件: {output_file}") +print() + +# 检查文件是否存在 +if not os.path.exists(input_file): + print("错误: 输入文件不存在!") + exit(1) + +print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") + +# 读取原始数据 +try: + print("正在读取原始数据...") + df = pd.read_excel(input_file) + print(f"成功读取 {len(df)} 行数据") + print(f"列名: {list(df.columns)}") + + # 识别列 + print("\n识别列...") + helpfull_col = None + comment_count_col = None + comment_cols = [] + + for col in df.columns: + col_str = str(col).lower() + if 'helpfull' in col_str or 'helpful' in col_str: + helpfull_col = col + print(f"找到 Y 列 (helpfull): {col}") + elif '评论总数' in str(col) or '帖子评论总数' in str(col): + comment_count_col = col + print(f"找到 X1 列 (评论总数): {col}") + elif '评论' in 
str(col) and any(str(i) in str(col) for i in range(1, 6)): + comment_cols.append(col) + print(f"找到评论列 {len(comment_cols)}: {col}") + + print(f"\n共找到 {len(comment_cols)} 个评论列") + + # 创建回归数据 + print("\n创建回归数据...") + regression_data = pd.DataFrame() + + # Y (UGC有用性) + print("1. 计算 Y (UGC有用性)") + if helpfull_col: + regression_data['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0) + else: + regression_data['Y'] = 0 + + # X1 (评论数量) + print("2. 计算 X1 (评论数量)") + if comment_count_col: + regression_data['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0) + else: + regression_data['X1'] = 0 + + # 定义函数计算评论指标 + def calculate_comment_metrics(row): + lengths = [] + complexities = [] + sentiments = [] + richness = [] + + for col in comment_cols: + content = str(row.get(col, '')) + if content and content != 'None' and content != 'nan': + # 评论长度 + lengths.append(len(content.replace(' ', ''))) + # 评论复杂度 + complexities.append(len(content.split())) + # 情感分析 + positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent'] + negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor'] + + sentiment = 0 + lower_content = content.lower() + if any(word in lower_content for word in positive_words): + sentiment = 1 + elif any(word in lower_content for word in negative_words): + sentiment = -1 + sentiments.append(sentiment) + # 信息丰富度 + r = 0 + if re.search(r'\d', content): + r += 1 + if re.search(r'http[s]?://', content): + r += 1 + if re.search(r'[\u2600-\u27BF]|[:;][-]?[)D]', content): + r += 1 + richness.append(r) + + return lengths, complexities, sentiments, richness + + # 计算评论相关指标 + print("3. 计算评论相关指标...") + comment_metrics = df.apply(calculate_comment_metrics, axis=1) + + # X2: 评论长度平均值 + print("4. 计算 X2 (评论长度)") + regression_data['X2'] = comment_metrics.apply(lambda x: sum(x[0]) / len(x[0]) if x[0] else 0) + + # X3: 评论复杂度平均值 + print("5. 
计算 X3 (评论复杂度)") + regression_data['X3'] = comment_metrics.apply(lambda x: sum(x[1]) / len(x[1]) if x[1] else 0) + + # X4: 评论可读性 + print("6. 计算 X4 (评论可读性)") + regression_data['X4'] = regression_data.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1) + + # X5: 内容情感性平均值 + print("7. 计算 X5 (内容情感性)") + regression_data['X5'] = comment_metrics.apply(lambda x: sum(x[2]) / len(x[2]) if x[2] else 0) + + # X6: 信息丰富度平均值 + print("8. 计算 X6 (信息丰富度)") + regression_data['X6'] = comment_metrics.apply(lambda x: sum(x[3]) / len(x[3]) if x[3] else 0) + + # 数据清洗 + print("\n9. 数据清洗...") + for col in regression_data.columns: + regression_data[col] = pd.to_numeric(regression_data[col], errors='coerce').fillna(0) + + # 验证数据 + print("\n10. 验证数据...") + print(f"行数: {len(regression_data)}") + print(f"列数: {len(regression_data.columns)}") + print(f"列名: {list(regression_data.columns)}") + print(f"数据类型:") + print(regression_data.dtypes) + print(f"\n前5行数据:") + print(regression_data.head()) + + # 保存文件 + print("\n11. 保存文件...") + regression_data.to_excel(output_file, index=False) + + # 验证文件 + print("\n12. 
验证文件...") + if os.path.exists(output_file): + print(f"文件已成功保存: {output_file}") + print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") + # 重新读取检查 + df_check = pd.read_excel(output_file) + print(f"输出文件行数: {len(df_check)}") + print(f"输出文件列数: {len(df_check.columns)}") + else: + print("文件保存失败!") + + print() + print("========================================") + print(" 任务完成") + print("========================================") + +except Exception as e: + print(f"处理文件时出错: {str(e)}") + import traceback + traceback.print_exc() diff --git a/DataCleaner/quick_process.py b/DataCleaner/quick_process.py new file mode 100644 index 0000000..2d6ce03 --- /dev/null +++ b/DataCleaner/quick_process.py @@ -0,0 +1,83 @@ +import os +import pandas as pd +import re + +print("开始处理...") + +# 文件路径 +input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx' +output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx' + +# 读取数据 +print("读取数据...") +df = pd.read_excel(input_file) +print(f"读取完成: {len(df)} 行") + +# 识别列 +helpfull_col = [c for c in df.columns if 'helpfull' in str(c).lower()][0] if any('helpfull' in str(c).lower() for c in df.columns) else None +comment_count_col = [c for c in df.columns if '评论总数' in str(c)][0] if any('评论总数' in str(c) for c in df.columns) else None +comment_cols = [c for c in df.columns if '评论' in str(c) and any(str(i) in str(c) for i in range(1, 6)) and '内容' in str(c)] + +print(f"找到列: Y={helpfull_col}, X1={comment_count_col}, 评论列={len(comment_cols)}") + +# 添加Y和X1 +df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0) if helpfull_col else 0 +df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0) if comment_count_col else 0 + +# 计算评论指标 +print("计算评论指标...") + +def calc_metrics(content): + if pd.isna(content) or str(content) in ['None', 'nan', '']: + return 0, 0, 0, 0 + content = str(content) + length = len(content.replace(' ', '').replace('\u3000', '')) + complexity = len(content.split()) + + pos_words = ['好', 
'棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent'] + neg_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor'] + sentiment = 1 if any(w in content.lower() for w in pos_words) else (-1 if any(w in content.lower() for w in neg_words) else 0) + + richness = (1 if re.search(r'\d', content) else 0) + (1 if re.search(r'http[s]?://|www\.', content) else 0) + (1 if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]', content) else 0) + + return length, complexity, sentiment, richness + +# 批量计算 +x2_list, x3_list, x5_list, x6_list = [], [], [], [] + +for i in range(len(df)): + if i % 5000 == 0: + print(f"处理 {i}/{len(df)}") + + lengths, complexities, sentiments, richness = [], [], [], [] + + for col in comment_cols: + l, c, s, r = calc_metrics(df.iloc[i].get(col, '')) + if l > 0: + lengths.append(l) + complexities.append(c) + sentiments.append(s) + richness.append(r) + + x2_list.append(sum(lengths)/len(lengths) if lengths else 0) + x3_list.append(sum(complexities)/len(complexities) if complexities else 0) + x5_list.append(sum(sentiments)/len(sentiments) if sentiments else 0) + x6_list.append(sum(richness)/len(richness) if richness else 0) + +df['X2'] = x2_list +df['X3'] = x3_list +df['X5'] = x5_list +df['X6'] = x6_list + +# 计算X4 +df['X4'] = df.apply(lambda r: r['X2']/r['X3'] if r['X3']>0 else 0, axis=1) + +# 清洗数据 +for col in ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']: + df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).replace([float('inf'), float('-inf')], 0) + +print("保存文件...") +df.to_excel(output_file, index=False, engine='openpyxl') + +print(f"完成!文件大小: {os.path.getsize(output_file)/1024:.2f} KB") +print(f"行数: {len(df)}, 列数: {len(df.columns)}") diff --git a/DataCleaner/read_excel_test.py b/DataCleaner/read_excel_test.py new file mode 100644 index 0000000..08e509f --- /dev/null +++ b/DataCleaner/read_excel_test.py @@ -0,0 +1,54 @@ +import os +import openpyxl + +# 文件路径 +input_file = 
r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx' + +print("========================================") +print(" 读取Excel测试") +print("========================================") +print(f"输入文件: {input_file}") +print() + +# 检查文件是否存在 +if not os.path.exists(input_file): + print("错误: 输入文件不存在!") + exit(1) + +print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") + +# 读取Excel文件 +try: + print("正在读取Excel文件...") + wb = openpyxl.load_workbook(input_file) + ws = wb.active + + print(f"工作表名称: {ws.title}") + print(f"最大行数: {ws.max_row}") + print(f"最大列数: {ws.max_column}") + + # 读取表头 + print("\n表头:") + headers = [] + for col in range(1, ws.max_column + 1): + header = ws.cell(row=1, column=col).value + headers.append(header) + print(f"{col}. {header}") + + # 读取前3行数据 + print("\n前3行数据:") + for row in range(2, min(5, ws.max_row + 1)): + row_data = [] + for col in range(1, min(10, ws.max_column + 1)): + value = ws.cell(row=row, column=col).value + row_data.append(value) + print(f"行 {row}: {row_data}") + + print("\n========================================") + print(" 读取完成") + print("========================================") + +except Exception as e: + print(f"处理文件时出错: {str(e)}") + import traceback + traceback.print_exc() diff --git a/DataCleaner/run_with_output.py b/DataCleaner/run_with_output.py new file mode 100644 index 0000000..6555dc4 --- /dev/null +++ b/DataCleaner/run_with_output.py @@ -0,0 +1,216 @@ +import os +import pandas as pd +import re +import sys + +# 重定向输出到文件和屏幕 +class Tee: + def __init__(self, *files): + self.files = files + def write(self, obj): + for f in self.files: + f.write(obj) + f.flush() + def flush(self): + for f in self.files: + f.flush() + +log_file = open(r'D:\java\project\process_log.txt', 'w', encoding='utf-8') +original_stdout = sys.stdout +sys.stdout = Tee(original_stdout, log_file) + +print("========================================") +print(" 在原表中添加回归数据列") +print("========================================") + +# 文件路径 +input_file = 
r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx' +output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx' + +print(f"输入文件: {input_file}") +print(f"输出文件: {output_file}") +print() + +# 检查文件是否存在 +if not os.path.exists(input_file): + print("错误: 输入文件不存在!") + sys.stdout = original_stdout + log_file.close() + exit(1) + +print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") + +# 读取原始数据 +try: + print("\n正在读取原始数据...") + df = pd.read_excel(input_file) + print(f"成功读取 {len(df)} 行数据") + print(f"原始列数: {len(df.columns)}") + + # 识别列 + print("\n识别列...") + helpfull_col = None + comment_count_col = None + comment_cols = [] + + for col in df.columns: + col_str = str(col).lower() + if 'helpfull' in col_str or 'helpful' in col_str: + helpfull_col = col + print(f"找到 Y 列 (helpfull): {col}") + elif '评论总数' in str(col) or '帖子评论总数' in str(col): + comment_count_col = col + print(f"找到 X1 列 (评论总数): {col}") + elif '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6)) and '内容' in str(col): + comment_cols.append(col) + print(f"找到评论列 {len(comment_cols)}: {col}") + + print(f"\n共找到 {len(comment_cols)} 个评论内容列") + + # 添加回归数据列 + print("\n添加回归数据列...") + + # Y (UGC有用性) - 直接复制helpfull列 + print("1. 添加 Y (UGC有用性)") + if helpfull_col: + df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0) + else: + df['Y'] = 0 + + # X1 (评论数量) - 直接复制帖子评论总数列 + print("2. 
添加 X1 (评论数量)") + if comment_count_col: + df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0) + else: + df['X1'] = 0 + + # 定义函数计算评论指标 + def calculate_comment_metrics(content): + if pd.isna(content) or str(content) in ['None', 'nan', '']: + return 0, 0, 0, 0 + + content = str(content) + # X2: 评论长度(剔空格后的字符数) + length = len(content.replace(' ', '').replace('\u3000', '')) + # X3: 评论复杂度(按空格拆分的分词数) + complexity = len(content.split()) + # X5: 情感分析(正面=1、中性=0、负面=-1) + positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like'] + negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike'] + + sentiment = 0 + lower_content = content.lower() + if any(word in lower_content for word in positive_words): + sentiment = 1 + elif any(word in lower_content for word in negative_words): + sentiment = -1 + # X6: 信息丰富度(含数字/链接/表情各1分,满分3分) + richness = 0 + if re.search(r'\d', content): # 含数字 + richness += 1 + if re.search(r'http[s]?://|www\.', content): # 含链接 + richness += 1 + if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content): # 含表情 + richness += 1 + + return length, complexity, sentiment, richness + + # 计算评论相关指标 + print("3. 
计算评论相关指标...") + + # 初始化列 + df['X2'] = 0.0 # 评论长度 + df['X3'] = 0.0 # 评论复杂度 + df['X5'] = 0.0 # 情感性 + df['X6'] = 0.0 # 信息丰富度 + + # 逐行计算 + total_rows = len(df) + print(f"总数据行数: {total_rows}") + + for i in range(total_rows): + if i % 1000 == 0: + print(f" 处理第 {i}/{total_rows} 行...") + + lengths = [] + complexities = [] + sentiments = [] + richness = [] + + for col in comment_cols: + content = df.iloc[i].get(col, '') + length, complexity, sentiment, r = calculate_comment_metrics(content) + if length > 0: # 只统计有内容的评论 + lengths.append(length) + complexities.append(complexity) + sentiments.append(sentiment) + richness.append(r) + + # 计算平均值(无评论记0) + if lengths: + df.loc[i, 'X2'] = sum(lengths) / len(lengths) + df.loc[i, 'X3'] = sum(complexities) / len(complexities) + df.loc[i, 'X5'] = sum(sentiments) / len(sentiments) + df.loc[i, 'X6'] = sum(richness) / len(richness) + + # X4: 评论可读性 = X2/X3(X3为0时记0,避免报错) + print("4. 计算 X4 (评论可读性)") + df['X4'] = df.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1) + + # 数据清洗 - 确保所有值都是纯数字,无文本、无空值、无错误 + print("\n5. 数据清洗...") + regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'] + for col in regression_cols: + # 转换为数字,错误值转为0 + df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0) + # 替换无穷大 + df[col] = df[col].replace([float('inf'), float('-inf')], 0) + + # 验证数据 + print("\n6. 验证数据...") + print(f"总行数: {len(df)}") + print(f"总列数: {len(df.columns)}") + print(f"\n回归数据列统计:") + print(df[regression_cols].describe()) + print(f"\n前5行回归数据:") + print(df[regression_cols].head()) + + # 检查是否有空值或错误值 + print(f"\n空值检查:") + for col in regression_cols: + null_count = df[col].isnull().sum() + print(f" {col}: {null_count} 个空值") + + # 保存文件 + print("\n7. 保存文件...") + print(f"正在保存到: {output_file}") + df.to_excel(output_file, index=False, engine='openpyxl') + + # 验证文件 + print("\n8. 
验证文件...") + if os.path.exists(output_file): + print(f"文件已成功保存: {output_file}") + print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") + # 重新读取检查 + df_check = pd.read_excel(output_file) + print(f"输出文件行数: {len(df_check)}") + print(f"输出文件列数: {len(df_check.columns)}") + print(f"\n回归数据列: {[col for col in df_check.columns if col in regression_cols]}") + else: + print("文件保存失败!") + + print() + print("========================================") + print(" 任务完成") + print("========================================") + print(f"新文件已保存: {output_file}") + print(f"包含原始数据的所有列以及新增的Y, X1-X6回归数据列") + +except Exception as e: + print(f"处理文件时出错: {str(e)}") + import traceback + traceback.print_exc() +finally: + sys.stdout = original_stdout + log_file.close() + print("日志已保存到: D:\\java\\project\\process_log.txt") diff --git a/DataCleaner/simple_add_columns.py b/DataCleaner/simple_add_columns.py new file mode 100644 index 0000000..fb4663b --- /dev/null +++ b/DataCleaner/simple_add_columns.py @@ -0,0 +1,187 @@ +import os +import pandas as pd +import re + +print("=" * 60) +print(" 在原表中添加回归数据列") +print("=" * 60) + +# 文件路径 +input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx' +output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx' + +print(f"输入文件: {input_file}") +print(f"输出文件: {output_file}") +print() + +# 检查文件是否存在 +if not os.path.exists(input_file): + print("错误: 输入文件不存在!") + exit(1) + +print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") + +# 读取原始数据 +print("\n正在读取原始数据...") +df = pd.read_excel(input_file) +print(f"成功读取 {len(df)} 行数据") +print(f"原始列数: {len(df.columns)}") + +# 识别列 +print("\n识别列...") +helpfull_col = None +comment_count_col = None +comment_cols = [] + +for col in df.columns: + col_str = str(col).lower() + if 'helpfull' in col_str or 'helpful' in col_str: + helpfull_col = col + print(f"找到 Y 列 (helpfull): {col}") + elif '评论总数' in str(col) or '帖子评论总数' in str(col): + comment_count_col = col + print(f"找到 X1 列 (评论总数): {col}") + elif '评论' in 
str(col) and any(str(i) in str(col) for i in range(1, 6)) and '内容' in str(col): + comment_cols.append(col) + print(f"找到评论列 {len(comment_cols)}: {col}") + +print(f"\n共找到 {len(comment_cols)} 个评论内容列") + +# 添加回归数据列 +print("\n添加回归数据列...") + +# Y (UGC有用性) - 直接复制helpfull列 +print("1. 添加 Y (UGC有用性)") +if helpfull_col: + df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0) +else: + df['Y'] = 0 + +# X1 (评论数量) - 直接复制帖子评论总数列 +print("2. 添加 X1 (评论数量)") +if comment_count_col: + df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0) +else: + df['X1'] = 0 + +# 定义函数计算评论指标 +def calculate_comment_metrics(content): + if pd.isna(content) or str(content) in ['None', 'nan', '']: + return 0, 0, 0, 0 + + content = str(content) + # X2: 评论长度(剔空格后的字符数) + length = len(content.replace(' ', '').replace('\u3000', '')) + # X3: 评论复杂度(按空格拆分的分词数) + complexity = len(content.split()) + # X5: 情感分析(正面=1、中性=0、负面=-1) + positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like'] + negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike'] + + sentiment = 0 + lower_content = content.lower() + if any(word in lower_content for word in positive_words): + sentiment = 1 + elif any(word in lower_content for word in negative_words): + sentiment = -1 + # X6: 信息丰富度(含数字/链接/表情各1分,满分3分) + richness = 0 + if re.search(r'\d', content): # 含数字 + richness += 1 + if re.search(r'http[s]?://|www\.', content): # 含链接 + richness += 1 + if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content): # 含表情 + richness += 1 + + return length, complexity, sentiment, richness + +# 计算评论相关指标 +print("3. 
计算评论相关指标...") + +# 初始化列 +df['X2'] = 0.0 # 评论长度 +df['X3'] = 0.0 # 评论复杂度 +df['X5'] = 0.0 # 情感性 +df['X6'] = 0.0 # 信息丰富度 + +# 逐行计算 +total_rows = len(df) +print(f"总数据行数: {total_rows}") + +for i in range(total_rows): + if i % 1000 == 0: + print(f" 处理第 {i}/{total_rows} 行...") + + lengths = [] + complexities = [] + sentiments = [] + richness = [] + + for col in comment_cols: + content = df.iloc[i].get(col, '') + length, complexity, sentiment, r = calculate_comment_metrics(content) + if length > 0: # 只统计有内容的评论 + lengths.append(length) + complexities.append(complexity) + sentiments.append(sentiment) + richness.append(r) + + # 计算平均值(无评论记0) + if lengths: + df.loc[i, 'X2'] = sum(lengths) / len(lengths) + df.loc[i, 'X3'] = sum(complexities) / len(complexities) + df.loc[i, 'X5'] = sum(sentiments) / len(sentiments) + df.loc[i, 'X6'] = sum(richness) / len(richness) + +# X4: 评论可读性 = X2/X3(X3为0时记0,避免报错) +print("4. 计算 X4 (评论可读性)") +df['X4'] = df.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1) + +# 数据清洗 - 确保所有值都是纯数字,无文本、无空值、无错误 +print("\n5. 数据清洗...") +regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'] +for col in regression_cols: + # 转换为数字,错误值转为0 + df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0) + # 替换无穷大 + df[col] = df[col].replace([float('inf'), float('-inf')], 0) + +# 验证数据 +print("\n6. 验证数据...") +print(f"总行数: {len(df)}") +print(f"总列数: {len(df.columns)}") +print(f"\n回归数据列统计:") +print(df[regression_cols].describe()) +print(f"\n前5行回归数据:") +print(df[regression_cols].head()) + +# 检查是否有空值或错误值 +print(f"\n空值检查:") +for col in regression_cols: + null_count = df[col].isnull().sum() + print(f" {col}: {null_count} 个空值") + +# 保存文件 +print("\n7. 保存文件...") +print(f"正在保存到: {output_file}") +df.to_excel(output_file, index=False, engine='openpyxl') + +# 验证文件 +print("\n8. 
验证文件...") +if os.path.exists(output_file): + print(f"文件已成功保存: {output_file}") + print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") + # 重新读取检查 + df_check = pd.read_excel(output_file) + print(f"输出文件行数: {len(df_check)}") + print(f"输出文件列数: {len(df_check.columns)}") + print(f"\n回归数据列: {[col for col in df_check.columns if col in regression_cols]}") +else: + print("文件保存失败!") + +print() +print("=" * 60) +print(" 任务完成") +print("=" * 60) +print(f"新文件已保存: {output_file}") +print(f"包含原始数据的所有列以及新增的Y, X1-X6回归数据列") diff --git a/DataCleaner/simple_calculate.py b/DataCleaner/simple_calculate.py new file mode 100644 index 0000000..3b4161c --- /dev/null +++ b/DataCleaner/simple_calculate.py @@ -0,0 +1,100 @@ +import os +import openpyxl +import re + +# 文件路径 +input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx' +output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx' + +print("========================================") +print(" 简单计算UGC回归数据") +print("========================================") +print(f"输入文件: {input_file}") +print(f"输出文件: {output_file}") +print() + +# 检查文件是否存在 +if not os.path.exists(input_file): + print("错误: 输入文件不存在!") + exit(1) + +if not os.path.exists(output_file): + print("错误: 输出文件不存在!") + exit(1) + +print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") + +# 读取输入文件 +try: + print("正在读取输入文件...") + wb_input = openpyxl.load_workbook(input_file) + ws_input = wb_input.active + + print(f"输入工作表名称: {ws_input.title}") + print(f"输入文件最大行数: {ws_input.max_row}") + print(f"输入文件最大列数: {ws_input.max_column}") + + # 读取输出文件 + print("\n正在读取输出文件...") + wb_output = openpyxl.load_workbook(output_file) + ws_output = wb_output.active + + print(f"输出工作表名称: {ws_output.title}") + + # 识别列 + print("\n识别列...") + headers = [] + for col in range(1, ws_input.max_column + 1): + header = ws_input.cell(row=1, column=col).value + headers.append(header) + if header and 'helpfull' in str(header): + helpfull_col = col + print(f"找到 helpfull 列: {col}") + elif header and 
('评论总数' in str(header) or '帖子评论总数' in str(header)): + comment_count_col = col + print(f"找到评论总数列: {col}") + elif header and '评论' in str(header): + print(f"找到评论列: {col} - {header}") + + # 计算并填充数据 + print("\n计算并填充数据...") + max_rows = min(ws_input.max_row, 10) # 只处理前10行用于测试 + print(f"处理前 {max_rows - 1} 行数据") + + for row in range(2, max_rows + 1): + print(f"处理行 {row}") + + # Y (UGC有用性) + if 'helpfull_col' in locals(): + y_value = ws_input.cell(row=row, column=helpfull_col).value + ws_output.cell(row=row, column=1, value=y_value if y_value else 0) + else: + ws_output.cell(row=row, column=1, value=0) + + # X1 (评论数量) + if 'comment_count_col' in locals(): + x1_value = ws_input.cell(row=row, column=comment_count_col).value + ws_output.cell(row=row, column=2, value=x1_value if x1_value else 0) + else: + ws_output.cell(row=row, column=2, value=0) + + # X2-X6 暂时设为0 + for col in range(3, 8): + ws_output.cell(row=row, column=col, value=0) + + # 保存文件 + print("\n保存文件...") + wb_output.save(output_file) + + print(f"文件已成功保存: {output_file}") + print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") + + print() + print("========================================") + print(" 任务完成") + print("========================================") + +except Exception as e: + print(f"处理文件时出错: {str(e)}") + import traceback + traceback.print_exc() diff --git a/DataCleaner/simple_copy.py b/DataCleaner/simple_copy.py new file mode 100644 index 0000000..9077e92 --- /dev/null +++ b/DataCleaner/simple_copy.py @@ -0,0 +1,41 @@ +import os +import shutil + +# 输入输出文件路径 +input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx' +output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx' + +print("========================================") +print(" 简单文件复制脚本") +print("========================================") +print(f"输入文件: {input_file}") +print(f"输出文件: {output_file}") +print() + +# 检查文件是否存在 +if not os.path.exists(input_file): + print("错误: 输入文件不存在!") + exit(1) + +print(f"文件大小: 
"""Smoke test: read the experiment workbook and write a row sample of it.

Reads ``input_file`` with pandas, writes its first ``SAMPLE_ROWS`` rows to
``test_output`` and reads that file back to report its shape.
"""
import os
import sys
import traceback

import pandas as pd

# File paths
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'
test_output = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\test_output.xlsx'

# Number of leading rows copied into the test workbook.
SAMPLE_ROWS = 100

BANNER = "========================================"


def take_sample(df, n=SAMPLE_ROWS):
    """Return the first ``n`` rows of ``df`` (all columns are kept)."""
    return df.head(n)


def main():
    """Run the smoke test; exits with status 1 if the input is missing or on error."""
    print(BANNER)
    print(" 简单数据测试")
    print(BANNER)
    print(f"输入文件: {input_file}")
    print(f"输出文件: {output_file}")
    print()

    if not os.path.exists(input_file):
        print("错误: 输入文件不存在!")
        sys.exit(1)

    print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")

    try:
        print("正在读取原始数据...")
        df = pd.read_excel(input_file)
        print(f"成功读取 {len(df)} 行数据")
        print(f"列名: {list(df.columns)}")

        # Simple processing: write only the first SAMPLE_ROWS rows.
        print("\n创建测试文件...")
        take_sample(df).to_excel(test_output, index=False)

        # Check existence before touching the file, so the failure branch
        # is reachable instead of being pre-empted by a getsize() error.
        if os.path.exists(test_output):
            print(f"测试文件已创建: {test_output}")
            print(f"测试文件大小: {os.path.getsize(test_output) / 1024:.2f} KB")
            df_test = pd.read_excel(test_output)
            print(f"测试文件行数: {len(df_test)}")
            print(f"测试文件列数: {len(df_test.columns)}")
        else:
            print("测试文件创建失败!")

        print()
        print(BANNER)
        print(" 测试完成")
        print(BANNER)

    except Exception as e:
        print(f"处理文件时出错: {str(e)}")
        traceback.print_exc()
        # Report the failure to the caller instead of exiting with status 0.
        sys.exit(1)


if __name__ == "__main__":
    main()
"""Diagnostic script: report whether the data files and their folder are reachable."""
import os

# Paths under test
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'

BANNER = "========================================"


def file_report(path):
    """Return the report lines (path, existence, size in KB) for one file.

    The size is obtained with a single ``os.path.getsize`` call; an
    ``OSError`` from it is reported as "does not exist", which avoids the
    gap between a separate existence check and the size lookup.
    """
    lines = [f"路径: {path}"]
    try:
        size_kb = os.path.getsize(path) / 1024
    except OSError:
        lines.append(f"存在: {False}")
        lines.append("文件不存在!")
    else:
        lines.append(f"存在: {True}")
        lines.append(f"大小: {size_kb:.2f} KB")
    return lines


def directory_listing(dir_path, limit=10):
    """Return display lines for the first ``limit`` entries of ``dir_path``.

    Entries are sorted by name so the output is reproducible. An entry whose
    size cannot be read is reported on its own line instead of aborting the
    listing. A trailing summary line is added when entries were omitted.

    Raises:
        OSError: if ``dir_path`` itself cannot be listed.
    """
    names = sorted(os.listdir(dir_path))
    lines = []
    for name in names[:limit]:
        entry_path = os.path.join(dir_path, name)
        try:
            size_kb = os.path.getsize(entry_path) / 1024
        except OSError as err:
            lines.append(f" {name}: 无法读取大小 ({err})")
            continue
        lines.append(f" {name}: {size_kb:.2f} KB")
    if len(names) > limit:
        lines.append(f" ... 还有 {len(names) - limit} 个文件")
    return lines


def main():
    """Print the access report for both files and the input file's folder."""
    print(BANNER)
    print(" 测试文件访问")
    print(BANNER)
    print(f"当前目录: {os.getcwd()}")
    print()

    print("检查输入文件:")
    for line in file_report(input_file):
        print(line)

    print("\n检查输出文件:")
    for line in file_report(output_file):
        print(line)

    print("\n检查目录:")
    dir_path = os.path.dirname(input_file)
    dir_exists = os.path.exists(dir_path)
    print(f"目录: {dir_path}")
    print(f"存在: {dir_exists}")
    if dir_exists:
        print("目录内容:")
        # Only the first 10 entries are shown.
        for line in directory_listing(dir_path):
            print(line)

    print()
    print(BANNER)
    print(" 测试完成")
    print(BANNER)


if __name__ == "__main__":
    main()