import org.apache.poi.ss.usermodel.*; import org.apache.poi.xssf.usermodel.XSSFWorkbook; import java.io.*; import java.util.*; import java.util.regex.*; public class AddRegressionColumns { public static void main(String[] args) { String inputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子实验数据(新).xlsx"; String outputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子实验数据(新)_回归.xlsx"; System.out.println("========================================"); System.out.println(" 在原表中添加回归数据列"); System.out.println("========================================"); System.out.println("输入文件: " + inputFile); System.out.println("输出文件: " + outputFile); System.out.println(); try { // 读取输入文件 System.out.println("读取输入文件..."); FileInputStream fis = new FileInputStream(inputFile); Workbook wb = new XSSFWorkbook(fis); Sheet sheet = wb.getSheetAt(0); int totalRows = sheet.getLastRowNum(); System.out.println("总行数: " + totalRows); // 获取表头行 Row headerRow = sheet.getRow(0); int totalCols = headerRow.getLastCellNum(); System.out.println("总列数: " + totalCols); // 识别列 int helpfullCol = -1; int commentCountCol = -1; List commentCols = new ArrayList<>(); for (int i = 0; i < totalCols; i++) { Cell cell = headerRow.getCell(i); if (cell != null) { String header = cell.getStringCellValue().toLowerCase(); if (header.contains("helpfull") || header.contains("helpful")) { helpfullCol = i; System.out.println("找到 Y 列 (helpfull): 列 " + i); } else if (header.contains("评论总数") || header.contains("帖子评论总数")) { commentCountCol = i; System.out.println("找到 X1 列 (评论总数): 列 " + i); } else if (header.contains("评论") && header.contains("内容")) { for (int j = 1; j <= 5; j++) { if (header.contains(String.valueOf(j))) { commentCols.add(i); System.out.println("找到评论列 " + commentCols.size() + ": 列 " + i + " - " + header); break; } } } } } System.out.println("\n共找到 " + commentCols.size() + " 个评论列"); // 添加新列的表头 int yCol = totalCols; int x1Col = totalCols + 1; int x2Col = totalCols + 2; int x3Col = totalCols + 3; int x4Col = totalCols + 4; int x5Col = totalCols + 5; int x6Col = totalCols + 6; headerRow.createCell(yCol).setCellValue("Y"); headerRow.createCell(x1Col).setCellValue("X1"); headerRow.createCell(x2Col).setCellValue("X2"); headerRow.createCell(x3Col).setCellValue("X3"); headerRow.createCell(x4Col).setCellValue("X4"); headerRow.createCell(x5Col).setCellValue("X5"); headerRow.createCell(x6Col).setCellValue("X6"); // 处理每一行数据 System.out.println("\n处理数据..."); Pattern digitPattern = Pattern.compile("\\d"); Pattern urlPattern = Pattern.compile("http[s]?://|www\\."); Pattern emojiPattern = Pattern.compile("[\\u2600-\\u27BF\\uD83C-\\uDBFF\\uDC00-\\uDFFF]|[:;][-]?[)D]"); String[] positiveWords = {"好", "棒", "优秀", "喜欢", "满意", "赞", "positive", "good", "great", "excellent", "love", "like"}; String[] negativeWords = {"差", "糟糕", "不好", "失望", "不满", "negative", "bad", "terrible", "poor", "hate", "dislike"}; for (int i = 1; i <= totalRows; i++) { if (i % 1000 == 0) { System.out.println("处理第 " + i + "/" + totalRows + " 行..."); } Row row = sheet.getRow(i); if (row == null) continue; // Y (UGC有用性) double y = 0; if (helpfullCol >= 0) { Cell cell = row.getCell(helpfullCol); if (cell != null) { try { y = cell.getNumericCellValue(); } catch (Exception e) { y = 0; } } } row.createCell(yCol).setCellValue(y); // X1 (评论数量) double x1 = 0; if (commentCountCol >= 0) { Cell cell = row.getCell(commentCountCol); if (cell != null) { try { x1 = cell.getNumericCellValue(); } catch (Exception e) { x1 = 0; } } } row.createCell(x1Col).setCellValue(x1); // 计算评论相关指标 List lengths = new ArrayList<>(); List complexities = new ArrayList<>(); List sentiments = new ArrayList<>(); List richnessList = new ArrayList<>(); for (int colIdx : commentCols) { Cell cell = row.getCell(colIdx); if (cell != null) { String content = ""; try { content = cell.getStringCellValue(); } catch (Exception e) { try { content = String.valueOf(cell.getNumericCellValue()); } catch (Exception e2) { content = ""; } } if (content != null && !content.isEmpty() && !content.equals("nan") && !content.equals("null")) { // X2: 评论长度(剔空格后的字符数) double length = content.replace(" ", "").replace("\u3000", "").length(); lengths.add(length); // X3: 评论复杂度(按空格拆分的分词数) double complexity = content.split("\\s+").length; complexities.add(complexity); // X5: 情感分析 double sentiment = 0; String lowerContent = content.toLowerCase(); for (String word : positiveWords) { if (lowerContent.contains(word)) { sentiment = 1; break; } } if (sentiment == 0) { for (String word : negativeWords) { if (lowerContent.contains(word)) { sentiment = -1; break; } } } sentiments.add(sentiment); // X6: 信息丰富度 double richness = 0; if (digitPattern.matcher(content).find()) richness += 1; if (urlPattern.matcher(content).find()) richness += 1; if (emojiPattern.matcher(content).find()) richness += 1; richnessList.add(richness); } } } // 计算平均值(无评论记0) double x2 = lengths.isEmpty() ? 0 : lengths.stream().mapToDouble(Double::doubleValue).average().getAsDouble(); double x3 = complexities.isEmpty() ? 0 : complexities.stream().mapToDouble(Double::doubleValue).average().getAsDouble(); double x5 = sentiments.isEmpty() ? 0 : sentiments.stream().mapToDouble(Double::doubleValue).average().getAsDouble(); double x6 = richnessList.isEmpty() ? 0 : richnessList.stream().mapToDouble(Double::doubleValue).average().getAsDouble(); // X4: 评论可读性 = X2/X3(X3为0时记0) double x4 = (x3 > 0) ? x2 / x3 : 0; // 写入单元格 row.createCell(x2Col).setCellValue(x2); row.createCell(x3Col).setCellValue(x3); row.createCell(x4Col).setCellValue(x4); row.createCell(x5Col).setCellValue(x5); row.createCell(x6Col).setCellValue(x6); } // 保存文件 System.out.println("\n保存文件..."); FileOutputStream fos = new FileOutputStream(outputFile); wb.write(fos); fos.close(); wb.close(); fis.close(); // 验证文件 File output = new File(outputFile); if (output.exists()) { System.out.println("文件保存成功!"); System.out.println("文件大小: " + (output.length() / 1024) + " KB"); } System.out.println("\n========================================"); System.out.println(" 任务完成"); System.out.println("========================================"); } catch (Exception e) { System.out.println("错误: " + e.getMessage()); e.printStackTrace(); } } }