You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

224 lines
10 KiB

import org.apache.poi.ss.usermodel.*;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import java.io.*;
import java.util.*;
import java.util.regex.*;
public class AddRegressionColumns {
public static void main(String[] args) {
String inputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子实验数据(新).xlsx";
String outputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子实验数据(新)_回归.xlsx";
System.out.println("========================================");
System.out.println(" 在原表中添加回归数据列");
System.out.println("========================================");
System.out.println("输入文件: " + inputFile);
System.out.println("输出文件: " + outputFile);
System.out.println();
try {
// 读取输入文件
System.out.println("读取输入文件...");
FileInputStream fis = new FileInputStream(inputFile);
Workbook wb = new XSSFWorkbook(fis);
Sheet sheet = wb.getSheetAt(0);
int totalRows = sheet.getLastRowNum();
System.out.println("总行数: " + totalRows);
// 获取表头行
Row headerRow = sheet.getRow(0);
int totalCols = headerRow.getLastCellNum();
System.out.println("总列数: " + totalCols);
// 识别列
int helpfullCol = -1;
int commentCountCol = -1;
List<Integer> commentCols = new ArrayList<>();
for (int i = 0; i < totalCols; i++) {
Cell cell = headerRow.getCell(i);
if (cell != null) {
String header = cell.getStringCellValue().toLowerCase();
if (header.contains("helpfull") || header.contains("helpful")) {
helpfullCol = i;
System.out.println("找到 Y 列 (helpfull): 列 " + i);
} else if (header.contains("评论总数") || header.contains("帖子评论总数")) {
commentCountCol = i;
System.out.println("找到 X1 列 (评论总数): 列 " + i);
} else if (header.contains("评论") && header.contains("内容")) {
for (int j = 1; j <= 5; j++) {
if (header.contains(String.valueOf(j))) {
commentCols.add(i);
System.out.println("找到评论列 " + commentCols.size() + ": 列 " + i + " - " + header);
break;
}
}
}
}
}
System.out.println("\n共找到 " + commentCols.size() + " 个评论列");
// 添加新列的表头
int yCol = totalCols;
int x1Col = totalCols + 1;
int x2Col = totalCols + 2;
int x3Col = totalCols + 3;
int x4Col = totalCols + 4;
int x5Col = totalCols + 5;
int x6Col = totalCols + 6;
headerRow.createCell(yCol).setCellValue("Y");
headerRow.createCell(x1Col).setCellValue("X1");
headerRow.createCell(x2Col).setCellValue("X2");
headerRow.createCell(x3Col).setCellValue("X3");
headerRow.createCell(x4Col).setCellValue("X4");
headerRow.createCell(x5Col).setCellValue("X5");
headerRow.createCell(x6Col).setCellValue("X6");
// 处理每一行数据
System.out.println("\n处理数据...");
Pattern digitPattern = Pattern.compile("\\d");
Pattern urlPattern = Pattern.compile("http[s]?://|www\\.");
Pattern emojiPattern = Pattern.compile("[\\u2600-\\u27BF\\uD83C-\\uDBFF\\uDC00-\\uDFFF]|[:;][-]?[)D]");
String[] positiveWords = {"好", "棒", "优秀", "喜欢", "满意", "赞", "positive", "good", "great", "excellent", "love", "like"};
String[] negativeWords = {"差", "糟糕", "不好", "失望", "不满", "negative", "bad", "terrible", "poor", "hate", "dislike"};
for (int i = 1; i <= totalRows; i++) {
if (i % 1000 == 0) {
System.out.println("处理第 " + i + "/" + totalRows + " 行...");
}
Row row = sheet.getRow(i);
if (row == null) continue;
// Y (UGC有用性)
double y = 0;
if (helpfullCol >= 0) {
Cell cell = row.getCell(helpfullCol);
if (cell != null) {
try {
y = cell.getNumericCellValue();
} catch (Exception e) {
y = 0;
}
}
}
row.createCell(yCol).setCellValue(y);
// X1 (评论数量)
double x1 = 0;
if (commentCountCol >= 0) {
Cell cell = row.getCell(commentCountCol);
if (cell != null) {
try {
x1 = cell.getNumericCellValue();
} catch (Exception e) {
x1 = 0;
}
}
}
row.createCell(x1Col).setCellValue(x1);
// 计算评论相关指标
List<Double> lengths = new ArrayList<>();
List<Double> complexities = new ArrayList<>();
List<Double> sentiments = new ArrayList<>();
List<Double> richnessList = new ArrayList<>();
for (int colIdx : commentCols) {
Cell cell = row.getCell(colIdx);
if (cell != null) {
String content = "";
try {
content = cell.getStringCellValue();
} catch (Exception e) {
try {
content = String.valueOf(cell.getNumericCellValue());
} catch (Exception e2) {
content = "";
}
}
if (content != null && !content.isEmpty() && !content.equals("nan") && !content.equals("null")) {
// X2: 评论长度(剔空格后的字符数)
double length = content.replace(" ", "").replace("\u3000", "").length();
lengths.add(length);
// X3: 评论复杂度(按空格拆分的分词数)
double complexity = content.split("\\s+").length;
complexities.add(complexity);
// X5: 情感分析
double sentiment = 0;
String lowerContent = content.toLowerCase();
for (String word : positiveWords) {
if (lowerContent.contains(word)) {
sentiment = 1;
break;
}
}
if (sentiment == 0) {
for (String word : negativeWords) {
if (lowerContent.contains(word)) {
sentiment = -1;
break;
}
}
}
sentiments.add(sentiment);
// X6: 信息丰富度
double richness = 0;
if (digitPattern.matcher(content).find()) richness += 1;
if (urlPattern.matcher(content).find()) richness += 1;
if (emojiPattern.matcher(content).find()) richness += 1;
richnessList.add(richness);
}
}
}
// 计算平均值(无评论记0)
double x2 = lengths.isEmpty() ? 0 : lengths.stream().mapToDouble(Double::doubleValue).average().getAsDouble();
double x3 = complexities.isEmpty() ? 0 : complexities.stream().mapToDouble(Double::doubleValue).average().getAsDouble();
double x5 = sentiments.isEmpty() ? 0 : sentiments.stream().mapToDouble(Double::doubleValue).average().getAsDouble();
double x6 = richnessList.isEmpty() ? 0 : richnessList.stream().mapToDouble(Double::doubleValue).average().getAsDouble();
// X4: 评论可读性 = X2/X3(X3为0时记0)
double x4 = (x3 > 0) ? x2 / x3 : 0;
// 写入单元格
row.createCell(x2Col).setCellValue(x2);
row.createCell(x3Col).setCellValue(x3);
row.createCell(x4Col).setCellValue(x4);
row.createCell(x5Col).setCellValue(x5);
row.createCell(x6Col).setCellValue(x6);
}
// 保存文件
System.out.println("\n保存文件...");
FileOutputStream fos = new FileOutputStream(outputFile);
wb.write(fos);
fos.close();
wb.close();
fis.close();
// 验证文件
File output = new File(outputFile);
if (output.exists()) {
System.out.println("文件保存成功!");
System.out.println("文件大小: " + (output.length() / 1024) + " KB");
}
System.out.println("\n========================================");
System.out.println(" 任务完成");
System.out.println("========================================");
} catch (Exception e) {
System.out.println("错误: " + e.getMessage());
e.printStackTrace();
}
}
}