You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
224 lines
10 KiB
224 lines
10 KiB
import org.apache.poi.ss.usermodel.*;
|
|
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
|
import java.io.*;
|
|
import java.util.*;
|
|
import java.util.regex.*;
|
|
|
|
public class AddRegressionColumns {
|
|
public static void main(String[] args) {
|
|
String inputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子实验数据(新).xlsx";
|
|
String outputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子实验数据(新)_回归.xlsx";
|
|
|
|
System.out.println("========================================");
|
|
System.out.println(" 在原表中添加回归数据列");
|
|
System.out.println("========================================");
|
|
System.out.println("输入文件: " + inputFile);
|
|
System.out.println("输出文件: " + outputFile);
|
|
System.out.println();
|
|
|
|
try {
|
|
// 读取输入文件
|
|
System.out.println("读取输入文件...");
|
|
FileInputStream fis = new FileInputStream(inputFile);
|
|
Workbook wb = new XSSFWorkbook(fis);
|
|
Sheet sheet = wb.getSheetAt(0);
|
|
|
|
int totalRows = sheet.getLastRowNum();
|
|
System.out.println("总行数: " + totalRows);
|
|
|
|
// 获取表头行
|
|
Row headerRow = sheet.getRow(0);
|
|
int totalCols = headerRow.getLastCellNum();
|
|
System.out.println("总列数: " + totalCols);
|
|
|
|
// 识别列
|
|
int helpfullCol = -1;
|
|
int commentCountCol = -1;
|
|
List<Integer> commentCols = new ArrayList<>();
|
|
|
|
for (int i = 0; i < totalCols; i++) {
|
|
Cell cell = headerRow.getCell(i);
|
|
if (cell != null) {
|
|
String header = cell.getStringCellValue().toLowerCase();
|
|
if (header.contains("helpfull") || header.contains("helpful")) {
|
|
helpfullCol = i;
|
|
System.out.println("找到 Y 列 (helpfull): 列 " + i);
|
|
} else if (header.contains("评论总数") || header.contains("帖子评论总数")) {
|
|
commentCountCol = i;
|
|
System.out.println("找到 X1 列 (评论总数): 列 " + i);
|
|
} else if (header.contains("评论") && header.contains("内容")) {
|
|
for (int j = 1; j <= 5; j++) {
|
|
if (header.contains(String.valueOf(j))) {
|
|
commentCols.add(i);
|
|
System.out.println("找到评论列 " + commentCols.size() + ": 列 " + i + " - " + header);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
System.out.println("\n共找到 " + commentCols.size() + " 个评论列");
|
|
|
|
// 添加新列的表头
|
|
int yCol = totalCols;
|
|
int x1Col = totalCols + 1;
|
|
int x2Col = totalCols + 2;
|
|
int x3Col = totalCols + 3;
|
|
int x4Col = totalCols + 4;
|
|
int x5Col = totalCols + 5;
|
|
int x6Col = totalCols + 6;
|
|
|
|
headerRow.createCell(yCol).setCellValue("Y");
|
|
headerRow.createCell(x1Col).setCellValue("X1");
|
|
headerRow.createCell(x2Col).setCellValue("X2");
|
|
headerRow.createCell(x3Col).setCellValue("X3");
|
|
headerRow.createCell(x4Col).setCellValue("X4");
|
|
headerRow.createCell(x5Col).setCellValue("X5");
|
|
headerRow.createCell(x6Col).setCellValue("X6");
|
|
|
|
// 处理每一行数据
|
|
System.out.println("\n处理数据...");
|
|
Pattern digitPattern = Pattern.compile("\\d");
|
|
Pattern urlPattern = Pattern.compile("http[s]?://|www\\.");
|
|
Pattern emojiPattern = Pattern.compile("[\\u2600-\\u27BF\\uD83C-\\uDBFF\\uDC00-\\uDFFF]|[:;][-]?[)D]");
|
|
|
|
String[] positiveWords = {"好", "棒", "优秀", "喜欢", "满意", "赞", "positive", "good", "great", "excellent", "love", "like"};
|
|
String[] negativeWords = {"差", "糟糕", "不好", "失望", "不满", "negative", "bad", "terrible", "poor", "hate", "dislike"};
|
|
|
|
for (int i = 1; i <= totalRows; i++) {
|
|
if (i % 1000 == 0) {
|
|
System.out.println("处理第 " + i + "/" + totalRows + " 行...");
|
|
}
|
|
|
|
Row row = sheet.getRow(i);
|
|
if (row == null) continue;
|
|
|
|
// Y (UGC有用性)
|
|
double y = 0;
|
|
if (helpfullCol >= 0) {
|
|
Cell cell = row.getCell(helpfullCol);
|
|
if (cell != null) {
|
|
try {
|
|
y = cell.getNumericCellValue();
|
|
} catch (Exception e) {
|
|
y = 0;
|
|
}
|
|
}
|
|
}
|
|
row.createCell(yCol).setCellValue(y);
|
|
|
|
// X1 (评论数量)
|
|
double x1 = 0;
|
|
if (commentCountCol >= 0) {
|
|
Cell cell = row.getCell(commentCountCol);
|
|
if (cell != null) {
|
|
try {
|
|
x1 = cell.getNumericCellValue();
|
|
} catch (Exception e) {
|
|
x1 = 0;
|
|
}
|
|
}
|
|
}
|
|
row.createCell(x1Col).setCellValue(x1);
|
|
|
|
// 计算评论相关指标
|
|
List<Double> lengths = new ArrayList<>();
|
|
List<Double> complexities = new ArrayList<>();
|
|
List<Double> sentiments = new ArrayList<>();
|
|
List<Double> richnessList = new ArrayList<>();
|
|
|
|
for (int colIdx : commentCols) {
|
|
Cell cell = row.getCell(colIdx);
|
|
if (cell != null) {
|
|
String content = "";
|
|
try {
|
|
content = cell.getStringCellValue();
|
|
} catch (Exception e) {
|
|
try {
|
|
content = String.valueOf(cell.getNumericCellValue());
|
|
} catch (Exception e2) {
|
|
content = "";
|
|
}
|
|
}
|
|
|
|
if (content != null && !content.isEmpty() && !content.equals("nan") && !content.equals("null")) {
|
|
// X2: 评论长度(剔空格后的字符数)
|
|
double length = content.replace(" ", "").replace("\u3000", "").length();
|
|
lengths.add(length);
|
|
|
|
// X3: 评论复杂度(按空格拆分的分词数)
|
|
double complexity = content.split("\\s+").length;
|
|
complexities.add(complexity);
|
|
|
|
// X5: 情感分析
|
|
double sentiment = 0;
|
|
String lowerContent = content.toLowerCase();
|
|
for (String word : positiveWords) {
|
|
if (lowerContent.contains(word)) {
|
|
sentiment = 1;
|
|
break;
|
|
}
|
|
}
|
|
if (sentiment == 0) {
|
|
for (String word : negativeWords) {
|
|
if (lowerContent.contains(word)) {
|
|
sentiment = -1;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
sentiments.add(sentiment);
|
|
|
|
// X6: 信息丰富度
|
|
double richness = 0;
|
|
if (digitPattern.matcher(content).find()) richness += 1;
|
|
if (urlPattern.matcher(content).find()) richness += 1;
|
|
if (emojiPattern.matcher(content).find()) richness += 1;
|
|
richnessList.add(richness);
|
|
}
|
|
}
|
|
}
|
|
|
|
// 计算平均值(无评论记0)
|
|
double x2 = lengths.isEmpty() ? 0 : lengths.stream().mapToDouble(Double::doubleValue).average().getAsDouble();
|
|
double x3 = complexities.isEmpty() ? 0 : complexities.stream().mapToDouble(Double::doubleValue).average().getAsDouble();
|
|
double x5 = sentiments.isEmpty() ? 0 : sentiments.stream().mapToDouble(Double::doubleValue).average().getAsDouble();
|
|
double x6 = richnessList.isEmpty() ? 0 : richnessList.stream().mapToDouble(Double::doubleValue).average().getAsDouble();
|
|
|
|
// X4: 评论可读性 = X2/X3(X3为0时记0)
|
|
double x4 = (x3 > 0) ? x2 / x3 : 0;
|
|
|
|
// 写入单元格
|
|
row.createCell(x2Col).setCellValue(x2);
|
|
row.createCell(x3Col).setCellValue(x3);
|
|
row.createCell(x4Col).setCellValue(x4);
|
|
row.createCell(x5Col).setCellValue(x5);
|
|
row.createCell(x6Col).setCellValue(x6);
|
|
}
|
|
|
|
// 保存文件
|
|
System.out.println("\n保存文件...");
|
|
FileOutputStream fos = new FileOutputStream(outputFile);
|
|
wb.write(fos);
|
|
fos.close();
|
|
wb.close();
|
|
fis.close();
|
|
|
|
// 验证文件
|
|
File output = new File(outputFile);
|
|
if (output.exists()) {
|
|
System.out.println("文件保存成功!");
|
|
System.out.println("文件大小: " + (output.length() / 1024) + " KB");
|
|
}
|
|
|
|
System.out.println("\n========================================");
|
|
System.out.println(" 任务完成");
|
|
System.out.println("========================================");
|
|
|
|
} catch (Exception e) {
|
|
System.out.println("错误: " + e.getMessage());
|
|
e.printStackTrace();
|
|
}
|
|
}
|
|
}
|
|
|