Browse Source

feat:新增抽象类Animal,定义makeSound()抽象方法

main
ZhengJiayin 1 week ago
parent
commit
24ce491f5e
  1. 66
      Animal.java
  2. BIN
      Java-1test/BankAccount.class
  3. 63
      Java-1test/BankAccount.java
  4. BIN
      Java-1test/TestBankAccount.class
  5. 29
      Java-1test/TestBankAccount.java
  6. BIN
      Java-1test/bin/com/rental/Car.class
  7. BIN
      Java-1test/bin/com/rental/TestCar.class
  8. BIN
      Java-1test/project/SimpleMovieCrawler$Movie.class
  9. BIN
      Java-1test/project/SimpleMovieCrawler.class
  10. 155
      Java-1test/project/SimpleMovieCrawler.java
  11. 1
      Java-1test/project/movies.txt
  12. 51
      Java-1test/project/pom.xml
  13. 38
      Java-1test/project/run.bat
  14. 62
      Java-1test/project/src/main/java/com/example/Main.java
  15. 94
      Java-1test/project/src/main/java/com/example/analyzer/MovieAnalyzer.java
  16. 81
      Java-1test/project/src/main/java/com/example/chart/ChartGenerator.java
  17. 119
      Java-1test/project/src/main/java/com/example/crawler/MovieCrawler.java
  18. 81
      Java-1test/project/src/main/java/com/example/model/Movie.java
  19. 40
      Java-1test/project/src/main/java/com/example/processor/DataProcessor.java
  20. BIN
      Java-1test/project/target/classes/com/example/Main.class
  21. BIN
      Java-1test/project/target/classes/com/example/analyzer/MovieAnalyzer.class
  22. BIN
      Java-1test/project/target/classes/com/example/chart/ChartGenerator.class
  23. BIN
      Java-1test/project/target/classes/com/example/crawler/MovieCrawler.class
  24. BIN
      Java-1test/project/target/classes/com/example/model/Movie.class
  25. BIN
      Java-1test/project/target/classes/com/example/processor/DataProcessor.class
  26. 104
      Java-1test/src/main/java/com/rental/Car.java
  27. 48
      Java-1test/src/main/java/com/rental/TestCar.java
  28. 0
      Java.实验
  29. 224
      project/AddRegressionColumns.java
  30. 4
      project/DataCleaner.java
  31. 226
      project/DataCleaningScript.java
  32. 4
      project/DataStorage.java
  33. 3
      project/DuoTai.java
  34. 4
      project/ExcelReader.java
  35. 2
      project/PostInfo.java
  36. 50
      project/ProcessRegressionData.java
  37. 59
      project/SimpleDataCleaner.java
  38. 189
      project/add_regression_columns.py
  39. 32
      project/basic_test.py
  40. 219
      project/batch_process.py
  41. 169
      project/calculate_regression_data.py
  42. 43
      project/check_data_structure.py
  43. 53
      project/check_excel_size.py
  44. 69
      project/create_and_fill_data.py
  45. 86
      project/create_excel_with_data.py
  46. 112
      project/create_regression_data.py
  47. 142
      project/create_regression_data_v2.py
  48. 0
      project/d
  49. 73
      project/data_cleaner.py
  50. 98
      project/data_cleaner_v2.py
  51. 11
      project/debug_log.txt
  52. 36
      project/debug_process.py
  53. 51
      project/debug_script.py
  54. 50
      project/import_data.py
  55. 17
      project/minimal_test.py
  56. 113
      project/populate_regression_data.py
  57. 156
      project/process_300_rows.py
  58. 200
      project/process_actual_data.py
  59. 190
      project/process_all_data.py
  60. 157
      project/process_all_rows.py
  61. 180
      project/process_efficient.py
  62. 177
      project/process_large_file.py
  63. 9
      project/process_log.txt
  64. 192
      project/process_regression_final.py
  65. 202
      project/process_with_csv.py
  66. 168
      project/process_with_pandas.py
  67. 83
      project/quick_process.py
  68. 54
      project/read_excel_test.py
  69. 216
      project/run_with_output.py
  70. 187
      project/simple_add_columns.py
  71. 100
      project/simple_calculate.py
  72. 41
      project/simple_copy.py
  73. 54
      project/simple_data_test.py
  74. 57
      project/simple_excel_create.py
  75. 22
      project/simple_test.py
  76. 49
      project/test_file_access.py
  77. 1
      w4

66
Animal.java

@ -0,0 +1,66 @@
// 1. Swimmable interface: a single swim() capability.
interface Swimmable {
    // Interface methods are implicitly public abstract.
    void swim();
}

// 2. Abstract base class: declares makeSound() for subclasses to implement.
// FIX: a .java file may contain at most ONE public top-level class, whose
// name must match the file name. The original marked every class here as
// public, which does not compile in Animal.java; only Animal stays public.
public abstract class Animal {
    // No body here — each concrete subclass supplies its own sound.
    public abstract void makeSound();
}

// 3. Dog: extends Animal and also implements Swimmable.
class Dog extends Animal implements Swimmable {
    @Override
    public void makeSound() {
        System.out.println("狗叫:汪汪汪!");
    }

    @Override
    public void swim() {
        System.out.println("狗在游泳:狗刨式!");
    }
}

// 4. Cat: extends Animal only; it does not implement Swimmable.
class Cat extends Animal {
    @Override
    public void makeSound() {
        System.out.println("猫叫:喵喵喵!");
    }
}

// 5. Driver class demonstrating both kinds of polymorphism.
class AnimalTest {
    public static void main(String[] args) {
        // Polymorphism 1: superclass references to subclass objects.
        Animal dog1 = new Dog();
        Animal cat1 = new Cat();
        System.out.println("=== Animal多态调用makeSound() ===");
        dog1.makeSound(); // dispatches to Dog.makeSound()
        cat1.makeSound(); // dispatches to Cat.makeSound()
        // Polymorphism 2: interface reference to an implementing object.
        Swimmable dog2 = new Dog();
        System.out.println("\n=== Swimmable多态调用swim() ===");
        dog2.swim(); // Dog's swim()
        // Downcast Animal -> Swimmable, guarded to avoid ClassCastException.
        System.out.println("\n=== 类型转换后调用swim() ===");
        if (dog1 instanceof Swimmable) {
            Swimmable swimmableDog = (Swimmable) dog1;
            swimmableDog.swim();
        }
        // Cat does not implement Swimmable, so the cast is never attempted.
    }
}

BIN
Java-1test/BankAccount.class

Binary file not shown.

63
Java-1test/BankAccount.java

@ -0,0 +1,63 @@
/**
 * A simple bank account: immutable account number, mutable owner name, and
 * a balance that changes only through {@link #deposit} and {@link #withdraw}.
 * All validation feedback goes to System.out.
 */
public class BankAccount {
    private final String accountNumber; // fixed at construction
    private String ownerName;
    private double balance;

    /** Opens an account with a zero starting balance. */
    public BankAccount(String accountNumber, String ownerName) {
        this.accountNumber = accountNumber;
        this.ownerName = ownerName;
        this.balance = 0.0;
    }

    // --- accessors ---
    public String getAccountNumber() {
        return accountNumber;
    }

    public String getOwnerName() {
        return ownerName;
    }

    public double getBalance() {
        return balance;
    }

    public void setOwnerName(String ownerName) {
        this.ownerName = ownerName;
    }

    /** Adds {@code amount} to the balance; non-positive amounts are rejected. */
    public void deposit(double amount) {
        if (amount <= 0) {
            System.out.println("存款金额必须大于 0");
            return;
        }
        balance += amount;
        System.out.println("存款成功!当前余额:" + balance);
    }

    /**
     * Removes {@code amount} from the balance. Rejected when the amount is
     * non-positive or exceeds the current balance.
     */
    public void withdraw(double amount) {
        if (amount <= 0) {
            System.out.println("取款金额必须大于 0");
            return;
        }
        if (amount > balance) {
            System.out.println("余额不足,无法取款");
            return;
        }
        balance -= amount;
        System.out.println("取款成功!当前余额:" + balance);
    }

    /** Prints account number, owner, and balance followed by a blank line. */
    public void displayInfo() {
        System.out.println("账号:" + accountNumber);
        System.out.println("户主:" + ownerName);
        System.out.println("余额:" + balance);
        System.out.println();
    }
}

BIN
Java-1test/TestBankAccount.class

Binary file not shown.

29
Java-1test/TestBankAccount.java

@ -0,0 +1,29 @@
/**
 * Smoke-test driver for BankAccount: exercises deposits, withdrawals,
 * rejection of illegal amounts, and owner renaming, reporting via System.out.
 */
public class TestBankAccount {
    public static void main(String[] args) {
        // Open a fresh account (balance starts at 0).
        BankAccount account = new BankAccount("123456789", "张三");
        // Show the freshly created account.
        System.out.println("初始账户信息:");
        account.displayInfo();
        // Deposits: one valid, one rejected (non-positive amount).
        System.out.println("测试存款:");
        account.deposit(1000);
        account.deposit(-500); // invalid: amount must be > 0
        // Withdrawals: valid, insufficient funds, and non-positive amount.
        System.out.println("测试取款:");
        account.withdraw(500);
        account.withdraw(1000); // rejected: balance is only 500 at this point
        account.withdraw(-200); // invalid: amount must be > 0
        // Rename the owner and re-display the account.
        System.out.println("测试修改户主姓名:");
        account.setOwnerName("李四");
        account.displayInfo();
        // Final balance query via the getter.
        System.out.println("当前余额:" + account.getBalance());
    }
}

BIN
Java-1test/bin/com/rental/Car.class

Binary file not shown.

BIN
Java-1test/bin/com/rental/TestCar.class

Binary file not shown.

BIN
Java-1test/project/SimpleMovieCrawler$Movie.class

Binary file not shown.

BIN
Java-1test/project/SimpleMovieCrawler.class

Binary file not shown.

155
Java-1test/project/SimpleMovieCrawler.java

@ -0,0 +1,155 @@
import java.io.BufferedReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Minimal dependency-free IMDb Top-250 scraper: downloads the chart page,
 * parses up to 20 movies by naive string matching, saves them as a small
 * CSV-style text file, and prints rating/year distributions.
 */
public class SimpleMovieCrawler {

    /** Entry point: crawl, persist, then analyze the scraped movies. */
    public static void main(String[] args) {
        try {
            // 1. Scrape movie data.
            List<Movie> movies = crawlMovies();
            System.out.println("爬取完成,共获取 " + movies.size() + " 部电影数据");
            // 2. Persist the results.
            saveToFile(movies, "movies.txt");
            // 3. Aggregate statistics.
            analyzeData(movies);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Downloads the IMDb Top 250 chart and extracts up to 20 movies.
     * (A real project should use Jsoup instead of raw string parsing.)
     *
     * @return parsed movies; empty if the page layout no longer matches
     * @throws IOException on any network failure
     */
    public static List<Movie> crawlMovies() throws IOException {
        List<Movie> movies = new ArrayList<>();
        String url = "https://www.imdb.com/chart/top/";

        HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection();
        connection.setRequestMethod("GET");
        connection.setRequestProperty("User-Agent", "Mozilla/5.0");

        StringBuilder content = new StringBuilder();
        // FIX: try-with-resources — the original leaked the reader (and left
        // the connection open) whenever readLine threw.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(connection.getInputStream()))) {
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line);
            }
        } finally {
            connection.disconnect();
        }

        String html = content.toString();
        int start = html.indexOf("<tbody class=\"lister-list\">");
        int end = html.indexOf("</tbody>", start);
        if (start != -1 && end != -1) {
            String tableContent = html.substring(start, end);
            String[] rows = tableContent.split("<tr>");
            for (int i = 1; i < Math.min(rows.length, 21); i++) { // top 20 only
                String row = rows[i];
                Movie movie = new Movie();
                // Title: text of the first <a ...>...</a> in the row.
                int titleStart = row.indexOf("<a href=");
                int titleEnd = row.indexOf("</a>", titleStart);
                if (titleStart != -1 && titleEnd != -1) {
                    String titleHtml = row.substring(titleStart, titleEnd);
                    // FIX: the original tested indexOf(">") + 1 against -1,
                    // which can never be true (it is 0 when '>' is missing).
                    int gt = titleHtml.indexOf('>');
                    if (gt != -1) {
                        movie.setTitle(titleHtml.substring(gt + 1).trim());
                    }
                }
                // Year: inside <span class="secondaryInfo">(1994)</span>.
                String yearMarker = "<span class=\"secondaryInfo\">";
                int yearStart = row.indexOf(yearMarker);
                int yearEnd = row.indexOf("</span>", yearStart);
                if (yearStart != -1 && yearEnd != -1) {
                    // FIX: the original skipped a hard-coded 27 chars, but the
                    // marker is 28 chars long, so a stray '>' stayed in every
                    // year. Use the marker's length instead of a magic number.
                    String year = row.substring(yearStart + yearMarker.length(), yearEnd)
                            .replaceAll("[()]", "").trim();
                    movie.setYear(year);
                }
                // Rating: inside <strong>9.2</strong>.
                String ratingMarker = "<strong>";
                int ratingStart = row.indexOf(ratingMarker);
                int ratingEnd = row.indexOf("</strong>", ratingStart);
                if (ratingStart != -1 && ratingEnd != -1) {
                    String rating = row.substring(
                            ratingStart + ratingMarker.length(), ratingEnd).trim();
                    movie.setRating(rating);
                }
                // Keep only rows where at least a title was found.
                if (movie.getTitle() != null) {
                    movies.add(movie);
                }
            }
        }
        return movies;
    }

    /**
     * Writes the movies as "Title,Rating,Year" lines to {@code fileName}.
     *
     * @throws IOException if the file cannot be written
     */
    public static void saveToFile(List<Movie> movies, String fileName) throws IOException {
        // FIX: try-with-resources — the original never closed the writer on
        // an exception, risking a truncated/locked file.
        try (FileWriter writer = new FileWriter(fileName)) {
            writer.write("Title,Rating,Year\n");
            for (Movie movie : movies) {
                writer.write(movie.getTitle() + "," + movie.getRating() + "," + movie.getYear() + "\n");
            }
        }
        System.out.println("数据已保存到: " + fileName);
    }

    /** Prints a rating histogram and the ten most frequent years. */
    public static void analyzeData(List<Movie> movies) {
        System.out.println("\n=== 电影数据分析 ===");
        // Count movies per rating string.
        Map<String, Integer> ratingDist = new HashMap<>();
        for (Movie movie : movies) {
            String rating = movie.getRating();
            ratingDist.put(rating, ratingDist.getOrDefault(rating, 0) + 1);
        }
        System.out.println("\n1. 评分分布:");
        for (Map.Entry<String, Integer> entry : ratingDist.entrySet()) {
            System.out.println("评分 " + entry.getKey() + ": " + entry.getValue() + " 部");
        }
        // Count movies per year (skipping movies whose year failed to parse).
        Map<String, Integer> yearDist = new HashMap<>();
        for (Movie movie : movies) {
            String year = movie.getYear();
            if (year != null) {
                yearDist.put(year, yearDist.getOrDefault(year, 0) + 1);
            }
        }
        System.out.println("\n2. 年份分布:");
        yearDist.entrySet().stream()
                .sorted(Map.Entry.<String, Integer>comparingByValue().reversed())
                .limit(10)
                .forEach(entry -> System.out.println(entry.getKey() + "年: " + entry.getValue() + " 部"));
    }

    /** Plain data holder for one scraped movie; any field may stay null. */
    static class Movie {
        private String title;
        private String rating;
        private String year;

        public String getTitle() { return title; }
        public void setTitle(String title) { this.title = title; }
        public String getRating() { return rating; }
        public void setRating(String rating) { this.rating = rating; }
        public String getYear() { return year; }
        public void setYear(String year) { this.year = year; }
    }
}

1
Java-1test/project/movies.txt

@ -0,0 +1 @@
Title,Rating,Year

51
Java-1test/project/pom.xml

@ -0,0 +1,51 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.example</groupId>
<artifactId>movie-crawler</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<maven.compiler.source>11</maven.compiler.source>
<maven.compiler.target>11</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<!-- Jsoup for HTML parsing -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.17.2</version>
</dependency>
<!-- JFreeChart for chart generation -->
<dependency>
<groupId>org.jfree</groupId>
<artifactId>jfreechart</artifactId>
<version>1.5.4</version>
</dependency>
<!-- Commons CSV for CSV handling -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-csv</artifactId>
<version>1.10.0</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.11.0</version>
<configuration>
<source>11</source>
<target>11</target>
</configuration>
</plugin>
</plugins>
</build>
</project>

38
Java-1test/project/run.bat

@ -0,0 +1,38 @@
@echo off
rem Bootstrap script: fetches the project's third-party jars into lib\ with
rem PowerShell, then compiles and runs the crawler without needing Maven.
rem Create the local dependency directory if it does not exist yet.
if not exist lib mkdir lib
rem Download Jsoup (HTML parser) unless it is already cached.
if not exist lib\jsoup-1.17.2.jar (
echo 下载 Jsoup...
powershell -Command "Invoke-WebRequest -Uri 'https://repo1.maven.org/maven2/org/jsoup/jsoup/1.17.2/jsoup-1.17.2.jar' -OutFile 'lib\jsoup-1.17.2.jar'"
)
rem Download JFreeChart (chart rendering) unless cached.
if not exist lib\jfreechart-1.5.4.jar (
echo 下载 JFreeChart...
powershell -Command "Invoke-WebRequest -Uri 'https://repo1.maven.org/maven2/org/jfree/jfreechart/1.5.4/jfreechart-1.5.4.jar' -OutFile 'lib\jfreechart-1.5.4.jar'"
)
rem Download JCommon (runtime dependency of JFreeChart) unless cached.
if not exist lib\jcommon-1.0.24.jar (
echo 下载 JCommon...
powershell -Command "Invoke-WebRequest -Uri 'https://repo1.maven.org/maven2/org/jfree/jcommon/1.0.24/jcommon-1.0.24.jar' -OutFile 'lib\jcommon-1.0.24.jar'"
)
rem Download Commons CSV (CSV writing) unless cached.
if not exist lib\commons-csv-1.10.0.jar (
echo 下载 Commons CSV...
powershell -Command "Invoke-WebRequest -Uri 'https://repo1.maven.org/maven2/org/apache/commons/commons-csv/1.10.0/commons-csv-1.10.0.jar' -OutFile 'lib\commons-csv-1.10.0.jar'"
)
rem Compile every package of the project against the downloaded jars.
echo 编译项目...
javac -cp "lib/*" -d bin src\main\java\com\example\*.java src\main\java\com\example\model\*.java src\main\java\com\example\crawler\*.java src\main\java\com\example\processor\*.java src\main\java\com\example\analyzer\*.java src\main\java\com\example\chart\*.java
rem Run the entry point with bin plus the jars on the classpath.
echo 运行项目...
java -cp "bin;lib/*" com.example.Main
pause

62
Java-1test/project/src/main/java/com/example/Main.java

@ -0,0 +1,62 @@
package com.example;
import com.example.analyzer.MovieAnalyzer;
import com.example.chart.ChartGenerator;
import com.example.crawler.MovieCrawler;
import com.example.model.Movie;
import com.example.processor.DataProcessor;
import java.io.IOException;
import java.util.List;
/**
 * Pipeline entry point: crawl IMDb Top-250 data, persist it to CSV, print
 * summary statistics, and render the three analysis charts as PNG files.
 */
public class Main {
    public static void main(String[] args) {
        try {
            // Step 1: set up the crawler.
            MovieCrawler crawler = new MovieCrawler();
            System.out.println("开始爬取 IMDb Top 250 电影数据...");

            // Step 2: fetch up to 50 movies from the chart.
            List<Movie> movies = crawler.crawlTopMovies(50);
            System.out.println("爬取完成,共获取 " + movies.size() + " 部电影数据");

            // Step 3: persist the raw data as CSV.
            DataProcessor processor = new DataProcessor();
            String csvFilePath = "movies.csv";
            processor.saveMoviesToCsv(movies, csvFilePath);

            // Step 4: print aggregate statistics to stdout.
            MovieAnalyzer analyzer = new MovieAnalyzer();
            analyzer.printStatistics(movies);

            // Step 5: render the three charts into the working directory.
            ChartGenerator charts = new ChartGenerator();
            charts.generateRatingDistributionChart(
                    analyzer.analyzeRatingDistribution(movies),
                    "rating_distribution.png");
            charts.generateGenreDistributionChart(
                    analyzer.analyzeGenreDistribution(movies),
                    "genre_distribution.png");
            charts.generateDirectorWorksChart(
                    analyzer.analyzeDirectorWorks(movies),
                    "director_works.png");

            System.out.println("\n项目执行完成!");
            System.out.println("数据已保存到: " + csvFilePath);
            System.out.println("图表已生成到当前目录");
        } catch (IOException e) {
            System.out.println("执行过程中出现错误: " + e.getMessage());
            e.printStackTrace();
        }
    }
}

94
Java-1test/project/src/main/java/com/example/analyzer/MovieAnalyzer.java

@ -0,0 +1,94 @@
package com.example.analyzer;
import com.example.model.Movie;
import java.util.*;
import java.util.stream.Collectors;
/**
 * Aggregates crawled movies into the four reports used by the pipeline:
 * rating histogram, year/rating averages, top directors, and top genres.
 */
public class MovieAnalyzer {

    /** Counts how many movies carry each rating string. */
    public Map<String, Integer> analyzeRatingDistribution(List<Movie> movies) {
        return movies.stream()
                .collect(Collectors.groupingBy(Movie::getRating, Collectors.summingInt(m -> 1)));
    }

    /** Average numeric rating per release year. */
    public Map<String, Double> analyzeYearRatingRelation(List<Movie> movies) {
        return movies.stream()
                .collect(Collectors.groupingBy(
                        Movie::getYear,
                        Collectors.averagingDouble(m -> Double.parseDouble(m.getRating()))));
    }

    /** Ten most prolific directors, ordered by movie count descending. */
    public Map<String, Integer> analyzeDirectorWorks(List<Movie> movies) {
        Map<String, Integer> perDirector = movies.stream()
                .collect(Collectors.groupingBy(Movie::getDirector, Collectors.summingInt(m -> 1)));
        return topEntries(perDirector, 10);
    }

    /**
     * Ten most frequent genres; a movie's comma-separated genre string
     * contributes one count per listed genre.
     */
    public Map<String, Integer> analyzeGenreDistribution(List<Movie> movies) {
        Map<String, Integer> genreCount = new HashMap<>();
        for (Movie movie : movies) {
            String genre = movie.getGenre();
            if (genre == null || genre.isEmpty()) {
                continue;
            }
            for (String g : genre.split(", ")) {
                genreCount.merge(g, 1, Integer::sum);
            }
        }
        return topEntries(genreCount, 10);
    }

    /** Keeps the n highest-valued entries, ordered descending by count. */
    private static Map<String, Integer> topEntries(Map<String, Integer> counts, int n) {
        return counts.entrySet().stream()
                .sorted(Map.Entry.<String, Integer>comparingByValue().reversed())
                .limit(n)
                .collect(Collectors.toMap(
                        Map.Entry::getKey,
                        Map.Entry::getValue,
                        (first, second) -> first,
                        LinkedHashMap::new));
    }

    /** Prints all four reports to stdout in the established layout. */
    public void printStatistics(List<Movie> movies) {
        System.out.println("\n=== 电影数据分析结果 ===");

        System.out.println("\n1. 评分分布:");
        analyzeRatingDistribution(movies).forEach((rating, count) ->
                System.out.printf("评分 %.1f: %d 部\n", Double.parseDouble(rating), count));

        System.out.println("\n2. 年份与平均评分(前10年):");
        analyzeYearRatingRelation(movies).entrySet().stream()
                .sorted(Map.Entry.<String, Double>comparingByValue().reversed())
                .limit(10)
                .forEach(entry ->
                        System.out.printf("%s年: %.2f\n", entry.getKey(), entry.getValue()));

        System.out.println("\n3. 导演作品数排行(前10):");
        analyzeDirectorWorks(movies).forEach((director, count) ->
                System.out.printf("%s: %d 部\n", director, count));

        System.out.println("\n4. 类型分布(前10):");
        analyzeGenreDistribution(movies).forEach((genre, count) ->
                System.out.printf("%s: %d 部\n", genre, count));
    }
}

81
Java-1test/project/src/main/java/com/example/chart/ChartGenerator.java

@ -0,0 +1,81 @@
package com.example.chart;
import org.jfree.chart.ChartFactory;
import org.jfree.chart.ChartUtils;
import org.jfree.chart.JFreeChart;
import org.jfree.chart.plot.PlotOrientation;
import org.jfree.data.category.DefaultCategoryDataset;
import org.jfree.data.general.DefaultPieDataset;
import java.io.File;
import java.io.IOException;
import java.util.Map;
/**
 * Renders the analysis results as PNG charts via JFreeChart. Each method
 * builds a dataset from a pre-aggregated map and writes an 800x600 image
 * to the given output path.
 */
public class ChartGenerator {
    // Bar chart: rating (category axis) vs. number of movies.
    public void generateRatingDistributionChart(Map<String, Integer> ratingDist, String outputPath) throws IOException {
        DefaultCategoryDataset dataset = new DefaultCategoryDataset();
        ratingDist.forEach((rating, count) -> {
            dataset.addValue(count, "电影数量", rating);
        });
        JFreeChart chart = ChartFactory.createBarChart(
            "IMDb Top 250 电影评分分布",   // chart title
            "评分",                         // category-axis label
            "电影数量",                     // value-axis label
            dataset,
            PlotOrientation.VERTICAL,
            true,   // legend
            true,   // tooltips
            false   // URLs
        );
        ChartUtils.saveChartAsPNG(new File(outputPath), chart, 800, 600);
        System.out.println("评分分布图表已保存到:" + outputPath);
    }
    // Pie chart of genre frequencies.
    // NOTE(review): DefaultPieDataset is used as a raw type here; newer
    // JFreeChart releases parameterize it (DefaultPieDataset<String>) —
    // confirm against the 1.5.4 dependency declared in pom.xml.
    public void generateGenreDistributionChart(Map<String, Integer> genreDist, String outputPath) throws IOException {
        DefaultPieDataset dataset = new DefaultPieDataset();
        genreDist.forEach((genre, count) -> {
            dataset.setValue(genre, count);
        });
        JFreeChart chart = ChartFactory.createPieChart(
            "IMDb Top 250 电影类型分布",
            dataset,
            true,   // legend
            true,   // tooltips
            false   // URLs
        );
        ChartUtils.saveChartAsPNG(new File(outputPath), chart, 800, 600);
        System.out.println("类型分布图表已保存到:" + outputPath);
    }
    // Bar chart: director (category axis) vs. number of works.
    public void generateDirectorWorksChart(Map<String, Integer> directorWorks, String outputPath) throws IOException {
        DefaultCategoryDataset dataset = new DefaultCategoryDataset();
        directorWorks.forEach((director, count) -> {
            dataset.addValue(count, "作品数量", director);
        });
        JFreeChart chart = ChartFactory.createBarChart(
            "IMDb Top 250 导演作品数排行",
            "导演",
            "作品数量",
            dataset,
            PlotOrientation.VERTICAL,
            true,   // legend
            true,   // tooltips
            false   // URLs
        );
        ChartUtils.saveChartAsPNG(new File(outputPath), chart, 800, 600);
        System.out.println("导演作品数图表已保存到:" + outputPath);
    }
}

119
Java-1test/project/src/main/java/com/example/crawler/MovieCrawler.java

@ -0,0 +1,119 @@
package com.example.crawler;
import com.example.model.Movie;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
/**
 * Jsoup-based crawler for the IMDb Top-250 chart. For every chart row it
 * also fetches the movie's detail page to extract director, stars, genres,
 * and runtime, throttling requests by one second between movies.
 */
public class MovieCrawler {

    private static final String BASE_URL = "https://www.imdb.com/chart/top/";
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36";

    /**
     * Crawls the chart and returns up to {@code limit} movies.
     *
     * @param limit maximum number of movies to fetch
     * @return crawled movies (possibly fewer than {@code limit})
     * @throws IOException if the chart page itself cannot be fetched
     */
    public List<Movie> crawlTopMovies(int limit) throws IOException {
        List<Movie> movies = new ArrayList<>();
        Document doc = Jsoup.connect(BASE_URL)
                .userAgent(USER_AGENT)
                .timeout(10000)
                .get();
        Elements movieElements = doc.select("tbody.lister-list tr");
        int count = 0;
        for (Element element : movieElements) {
            if (count >= limit) break;
            Movie movie = new Movie();
            // Title from the row's title link.
            Element titleElement = element.selectFirst(".titleColumn a");
            if (titleElement != null) {
                movie.setTitle(titleElement.text());
            }
            // Year, stripping the surrounding parentheses.
            Element yearElement = element.selectFirst(".titleColumn .secondaryInfo");
            if (yearElement != null) {
                movie.setYear(yearElement.text().replaceAll("[()]", ""));
            }
            // IMDb rating.
            Element ratingElement = element.selectFirst(".ratingColumn.imdbRating strong");
            if (ratingElement != null) {
                movie.setRating(ratingElement.text());
            }
            // FIX: the original dereferenced titleElement here without a null
            // check, throwing NPE for any row without a title link.
            if (titleElement != null) {
                String movieUrl = "https://www.imdb.com" + titleElement.attr("href");
                crawlDetails(movie, movieUrl);
            }
            movies.add(movie);
            count++;
            // Throttle requests to avoid being blocked by the server.
            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                // FIX: restore the interrupt flag and stop crawling instead of
                // printing a stack trace and ignoring the interruption.
                Thread.currentThread().interrupt();
                break;
            }
        }
        return movies;
    }

    /** Fetches one movie's detail page and fills director/stars/genre/runtime. */
    private void crawlDetails(Movie movie, String movieUrl) {
        try {
            Document movieDoc = Jsoup.connect(movieUrl)
                    .userAgent(USER_AGENT)
                    .timeout(10000)
                    .get();
            // Director: first name link whose parent text mentions "Director".
            // FIX: Collectors.toList() yields List<Element>, not Elements —
            // the original assignment to Elements did not compile. A null
            // guard on parent() also protects root-level elements.
            List<Element> directorElements = movieDoc.select("a[href*=name]").stream()
                    .filter(e -> e.parent() != null && e.parent().text().contains("Director"))
                    .limit(1)
                    .collect(Collectors.toList());
            if (!directorElements.isEmpty()) {
                movie.setDirector(directorElements.get(0).text());
            }
            // Stars: up to three name links near a "Stars" label.
            List<Element> starElements = movieDoc.select("a[href*=name]").stream()
                    .filter(e -> e.parent() != null && e.parent().text().contains("Stars"))
                    .limit(3)
                    .collect(Collectors.toList());
            if (!starElements.isEmpty()) {
                movie.setStars(joinTexts(starElements));
            }
            // Genres: first three genre links.
            // FIX: Elements has no limit(int) method; take the first three
            // via a stream instead.
            List<Element> genreElements = movieDoc.select("a[href*=genres]").stream()
                    .limit(3)
                    .collect(Collectors.toList());
            if (!genreElements.isEmpty()) {
                movie.setGenre(joinTexts(genreElements));
            }
            // Runtime from the first <time> element, if present.
            Element runtimeElement = movieDoc.selectFirst("time");
            if (runtimeElement != null) {
                movie.setRuntime(runtimeElement.text());
            }
        } catch (IOException e) {
            System.out.println("Error crawling movie details: " + e.getMessage());
        }
    }

    /** Joins element texts with ", ", matching the original output format. */
    private static String joinTexts(List<Element> elements) {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < elements.size(); i++) {
            sb.append(elements.get(i).text());
            if (i < elements.size() - 1) sb.append(", ");
        }
        return sb.toString();
    }
}

81
Java-1test/project/src/main/java/com/example/model/Movie.java

@ -0,0 +1,81 @@
package com.example.model;
/**
 * Plain data holder for one crawled movie. All fields are free-form strings
 * exactly as scraped; any of them may be null when extraction failed.
 */
public class Movie {
    private String title;
    private String rating;
    private String year;
    private String director;
    private String stars;
    private String runtime;
    private String genre;

    // Accessors — no validation, mirroring the scraper's best-effort data.
    public String getTitle() { return title; }
    public void setTitle(String value) { this.title = value; }

    public String getRating() { return rating; }
    public void setRating(String value) { this.rating = value; }

    public String getYear() { return year; }
    public void setYear(String value) { this.year = value; }

    public String getDirector() { return director; }
    public void setDirector(String value) { this.director = value; }

    public String getStars() { return stars; }
    public void setStars(String value) { this.stars = value; }

    public String getRuntime() { return runtime; }
    public void setRuntime(String value) { this.runtime = value; }

    public String getGenre() { return genre; }
    public void setGenre(String value) { this.genre = value; }

    /** Debug-friendly dump; format matches the original concatenation. */
    @Override
    public String toString() {
        return String.format(
                "Movie{title='%s', rating='%s', year='%s', director='%s', stars='%s', runtime='%s', genre='%s'}",
                title, rating, year, director, stars, runtime, genre);
    }
}

40
Java-1test/project/src/main/java/com/example/processor/DataProcessor.java

@ -0,0 +1,40 @@
package com.example.processor;
import com.example.model.Movie;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.List;
/**
 * Persists crawled movies to disk as CSV via Apache Commons CSV, stripping
 * HTML tags and surrounding whitespace from free-text fields.
 */
public class DataProcessor {

    /** Column order used for the generated CSV file. */
    private static final String[] CSV_HEADER =
            {"Title", "Rating", "Year", "Director", "Stars", "Runtime", "Genre"};

    /**
     * Writes one row per movie to {@code filePath}, preceded by the header.
     *
     * @throws IOException if the file cannot be written
     */
    public void saveMoviesToCsv(List<Movie> movies, String filePath) throws IOException {
        CSVFormat format = CSVFormat.DEFAULT.withHeader(CSV_HEADER);
        try (FileWriter out = new FileWriter(filePath);
             CSVPrinter printer = new CSVPrinter(out, format)) {
            for (Movie movie : movies) {
                printer.printRecord(
                        cleanText(movie.getTitle()),
                        movie.getRating(),
                        movie.getYear(),
                        cleanText(movie.getDirector()),
                        cleanText(movie.getStars()),
                        movie.getRuntime(),
                        cleanText(movie.getGenre()));
            }
            printer.flush();
            System.out.println("Movies saved to CSV file: " + filePath);
        }
    }

    /** Null-safe: trims and removes anything that looks like an HTML tag. */
    private String cleanText(String text) {
        return text == null ? "" : text.trim().replaceAll("<[^>]*>", "");
    }
}

BIN
Java-1test/project/target/classes/com/example/Main.class

Binary file not shown.

BIN
Java-1test/project/target/classes/com/example/analyzer/MovieAnalyzer.class

Binary file not shown.

BIN
Java-1test/project/target/classes/com/example/chart/ChartGenerator.class

Binary file not shown.

BIN
Java-1test/project/target/classes/com/example/crawler/MovieCrawler.class

Binary file not shown.

BIN
Java-1test/project/target/classes/com/example/model/Movie.class

Binary file not shown.

BIN
Java-1test/project/target/classes/com/example/processor/DataProcessor.class

Binary file not shown.

104
Java-1test/src/main/java/com/rental/Car.java

@ -0,0 +1,104 @@
package com.rental;
/**
 * A rentable car with a fixed license plate, mutable metadata, and a simple
 * rented/available state machine. A static counter tracks how many Car
 * instances have ever been constructed.
 */
public class Car {
    // Immutable identifier; everything else may change after construction.
    private final String licensePlate;
    private String brand;
    private String model;
    private double dailyRent;
    private boolean isRented;

    // Running count of every Car ever constructed.
    private static int totalCars = 0;

    /** Full constructor; every new car starts out available. */
    public Car(String licensePlate, String brand, String model, double dailyRent) {
        this.licensePlate = licensePlate;
        this.brand = brand;
        this.model = model;
        this.dailyRent = dailyRent;
        this.isRented = false;
        totalCars++;
    }

    /** Convenience constructor using the default rate of 300 yuan/day. */
    public Car(String licensePlate, String brand, String model) {
        this(licensePlate, brand, model, 300.0);
    }

    // --- accessors ---
    public String getLicensePlate() { return licensePlate; }
    public String getBrand() { return brand; }
    public String getModel() { return model; }
    public double getDailyRent() { return dailyRent; }
    public boolean isRented() { return isRented; }

    public void setBrand(String brand) { this.brand = brand; }
    public void setModel(String model) { this.model = model; }

    /** Rejects non-positive rates, keeping the previous value. */
    public void setDailyRent(double dailyRent) {
        if (dailyRent <= 0) {
            System.out.println("日租金必须大于 0,保持原值");
            return;
        }
        this.dailyRent = dailyRent;
    }

    // --- rental workflow ---
    /** Marks the car as rented unless it already is. */
    public void rentCar() {
        if (isRented) {
            System.out.println("车辆已租出,无法再次租用");
            return;
        }
        isRented = true;
        System.out.println("车辆租用成功");
    }

    /** Marks the car as returned unless it was never rented. */
    public void returnCar() {
        if (!isRented) {
            System.out.println("车辆未被租用,无需归还");
            return;
        }
        isRented = false;
        System.out.println("车辆归还成功");
    }

    /** Total price for renting this car for the given number of days. */
    public double calculateRent(int days) {
        return dailyRent * days;
    }

    /** Prints a human-readable summary followed by a blank line. */
    public void displayInfo() {
        System.out.println("车牌号: " + licensePlate);
        System.out.println("品牌: " + brand);
        System.out.println("型号: " + model);
        System.out.println("日租金: " + dailyRent + " 元/天");
        System.out.println("状态: " + (isRented ? "已租出" : "可租"));
        System.out.println();
    }

    /** @return how many Car instances have been created so far */
    public static int getTotalCars() {
        return totalCars;
    }
}

48
Java-1test/src/main/java/com/rental/TestCar.java

@ -0,0 +1,48 @@
package com.rental;
/**
 * Smoke-test driver for Car: exercises both constructors, the rent/return
 * state machine, rent calculation, rate validation, and the static counter,
 * reporting everything via System.out.
 */
public class TestCar {
    public static void main(String[] args) {
        // Create 3 Car objects — car2 uses the default 300 yuan/day rate.
        Car car1 = new Car("京A12345", "宝马", "5系", 500.0);
        Car car2 = new Car("京B67890", "奔驰", "C级");
        Car car3 = new Car("京C54321", "奥迪", "A4L", 450.0);
        // Dump every car's details.
        System.out.println("所有车辆信息:");
        System.out.println("------------------------");
        car1.displayInfo();
        car2.displayInfo();
        car3.displayInfo();
        // Exercise the rent/return state machine, including invalid moves.
        System.out.println("测试车辆租用和归还:");
        System.out.println("------------------------");
        System.out.println("测试 car1:");
        car1.rentCar(); // first rental — succeeds
        car1.rentCar(); // second rental — should report already rented
        car1.returnCar(); // return — succeeds
        car1.returnCar(); // second return — should report not rented
        System.out.println();
        // Rent calculation for a 5-day rental.
        System.out.println("计算租金:");
        System.out.println("------------------------");
        double rent = car1.calculateRent(5);
        System.out.println("car1 租用 5 天的费用:" + rent + " 元");
        System.out.println();
        // Rate validation: a negative rate must be rejected, a positive one accepted.
        System.out.println("测试修改日租金:");
        System.out.println("------------------------");
        System.out.println("尝试将 car2 的日租金修改为 -100:");
        car2.setDailyRent(-100);
        System.out.println("car2 当前日租金:" + car2.getDailyRent() + " 元/天");
        System.out.println("尝试将 car2 的日租金修改为 400:");
        car2.setDailyRent(400);
        System.out.println("car2 当前日租金:" + car2.getDailyRent() + " 元/天");
        System.out.println();
        // The static counter should report all cars created above.
        System.out.println("总车辆数:" + Car.getTotalCars());
    }
}

0
Java.实验

224
project/AddRegressionColumns.java

@ -0,0 +1,224 @@
import org.apache.poi.ss.usermodel.*;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import java.io.*;
import java.util.*;
import java.util.regex.*;
public class AddRegressionColumns {

    /**
     * Appends regression columns Y and X1-X6 to the experiment workbook and
     * saves the result as a new .xlsx file.
     *
     * <p>Column semantics (X2/X3/X5/X6 are averaged over up to five
     * comment-content columns): Y = "helpfull" value, X1 = total comment
     * count, X2 = mean comment length (characters, spaces removed),
     * X3 = mean token count (whitespace split), X4 = X2 / X3 (0 when X3 is 0),
     * X5 = mean keyword sentiment (+1/0/-1), X6 = mean information richness
     * (digits / URLs / emoticons, 1 point each, max 3).
     */
    public static void main(String[] args) {
        String inputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子实验数据(新).xlsx";
        String outputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子实验数据(新)_回归.xlsx";
        System.out.println("========================================");
        System.out.println(" 在原表中添加回归数据列");
        System.out.println("========================================");
        System.out.println("输入文件: " + inputFile);
        System.out.println("输出文件: " + outputFile);
        System.out.println();
        System.out.println("读取输入文件...");
        // try-with-resources: the original closed fis/wb/fos only on the happy
        // path and leaked all three whenever processing threw.
        try (FileInputStream fis = new FileInputStream(inputFile);
             Workbook wb = new XSSFWorkbook(fis)) {
            Sheet sheet = wb.getSheetAt(0);
            int totalRows = sheet.getLastRowNum();
            System.out.println("总行数: " + totalRows);
            Row headerRow = sheet.getRow(0);
            int totalCols = headerRow.getLastCellNum();
            System.out.println("总列数: " + totalCols);
            // Locate source columns by header keywords.
            int helpfullCol = -1;
            int commentCountCol = -1;
            List<Integer> commentCols = new ArrayList<>();
            for (int i = 0; i < totalCols; i++) {
                Cell cell = headerRow.getCell(i);
                if (cell != null) {
                    String header = cell.getStringCellValue().toLowerCase();
                    if (header.contains("helpfull") || header.contains("helpful")) {
                        helpfullCol = i;
                        System.out.println("找到 Y 列 (helpfull): 列 " + i);
                    } else if (header.contains("评论总数") || header.contains("帖子评论总数")) {
                        commentCountCol = i;
                        System.out.println("找到 X1 列 (评论总数): 列 " + i);
                    } else if (header.contains("评论") && header.contains("内容")) {
                        // Only comment-content columns numbered 1..5 count.
                        for (int j = 1; j <= 5; j++) {
                            if (header.contains(String.valueOf(j))) {
                                commentCols.add(i);
                                System.out.println("找到评论列 " + commentCols.size() + ": 列 " + i + " - " + header);
                                break;
                            }
                        }
                    }
                }
            }
            System.out.println("\n共找到 " + commentCols.size() + " 个评论列");
            // Append the new headers after the last existing column.
            int yCol = totalCols;
            int x1Col = totalCols + 1;
            int x2Col = totalCols + 2;
            int x3Col = totalCols + 3;
            int x4Col = totalCols + 4;
            int x5Col = totalCols + 5;
            int x6Col = totalCols + 6;
            headerRow.createCell(yCol).setCellValue("Y");
            headerRow.createCell(x1Col).setCellValue("X1");
            headerRow.createCell(x2Col).setCellValue("X2");
            headerRow.createCell(x3Col).setCellValue("X3");
            headerRow.createCell(x4Col).setCellValue("X4");
            headerRow.createCell(x5Col).setCellValue("X5");
            headerRow.createCell(x6Col).setCellValue("X6");
            System.out.println("\n处理数据...");
            // Patterns compiled once and reused for every cell.
            Pattern digitPattern = Pattern.compile("\\d");
            Pattern urlPattern = Pattern.compile("http[s]?://|www\\.");
            // NOTE(review): this class mixes lone surrogate ranges; it may match
            // half an emoji pair rather than complete emoji — confirm intent.
            Pattern emojiPattern = Pattern.compile("[\\u2600-\\u27BF\\uD83C-\\uDBFF\\uDC00-\\uDFFF]|[:;][-]?[)D]");
            String[] positiveWords = {"好", "棒", "优秀", "喜欢", "满意", "赞", "positive", "good", "great", "excellent", "love", "like"};
            String[] negativeWords = {"差", "糟糕", "不好", "失望", "不满", "negative", "bad", "terrible", "poor", "hate", "dislike"};
            for (int i = 1; i <= totalRows; i++) {
                if (i % 1000 == 0) {
                    System.out.println("处理第 " + i + "/" + totalRows + " 行...");
                }
                Row row = sheet.getRow(i);
                if (row == null) continue;
                // Y: helpfulness value (0 when missing or non-numeric).
                double y = 0;
                if (helpfullCol >= 0) {
                    Cell cell = row.getCell(helpfullCol);
                    if (cell != null) {
                        try {
                            y = cell.getNumericCellValue();
                        } catch (Exception e) {
                            y = 0;
                        }
                    }
                }
                row.createCell(yCol).setCellValue(y);
                // X1: comment count (0 when missing or non-numeric).
                double x1 = 0;
                if (commentCountCol >= 0) {
                    Cell cell = row.getCell(commentCountCol);
                    if (cell != null) {
                        try {
                            x1 = cell.getNumericCellValue();
                        } catch (Exception e) {
                            x1 = 0;
                        }
                    }
                }
                row.createCell(x1Col).setCellValue(x1);
                // Per-comment metrics for this row, averaged below.
                List<Double> lengths = new ArrayList<>();
                List<Double> complexities = new ArrayList<>();
                List<Double> sentiments = new ArrayList<>();
                List<Double> richnessList = new ArrayList<>();
                for (int colIdx : commentCols) {
                    Cell cell = row.getCell(colIdx);
                    if (cell != null) {
                        String content = "";
                        try {
                            content = cell.getStringCellValue();
                        } catch (Exception e) {
                            // Numeric cells are stringified; anything else is skipped.
                            try {
                                content = String.valueOf(cell.getNumericCellValue());
                            } catch (Exception e2) {
                                content = "";
                            }
                        }
                        if (content != null && !content.isEmpty() && !content.equals("nan") && !content.equals("null")) {
                            // X2: character count with half/full-width spaces removed.
                            double length = content.replace(" ", "").replace("\u3000", "").length();
                            lengths.add(length);
                            // X3: whitespace-delimited token count.
                            double complexity = content.split("\\s+").length;
                            complexities.add(complexity);
                            // X5: keyword sentiment, positive checked first.
                            double sentiment = 0;
                            String lowerContent = content.toLowerCase();
                            for (String word : positiveWords) {
                                if (lowerContent.contains(word)) {
                                    sentiment = 1;
                                    break;
                                }
                            }
                            if (sentiment == 0) {
                                for (String word : negativeWords) {
                                    if (lowerContent.contains(word)) {
                                        sentiment = -1;
                                        break;
                                    }
                                }
                            }
                            sentiments.add(sentiment);
                            // X6: +1 each for digits, URLs and emoticons.
                            double richness = 0;
                            if (digitPattern.matcher(content).find()) richness += 1;
                            if (urlPattern.matcher(content).find()) richness += 1;
                            if (emojiPattern.matcher(content).find()) richness += 1;
                            richnessList.add(richness);
                        }
                    }
                }
                // Means over non-empty comments; rows without comments get 0.
                double x2 = lengths.isEmpty() ? 0 : lengths.stream().mapToDouble(Double::doubleValue).average().getAsDouble();
                double x3 = complexities.isEmpty() ? 0 : complexities.stream().mapToDouble(Double::doubleValue).average().getAsDouble();
                double x5 = sentiments.isEmpty() ? 0 : sentiments.stream().mapToDouble(Double::doubleValue).average().getAsDouble();
                double x6 = richnessList.isEmpty() ? 0 : richnessList.stream().mapToDouble(Double::doubleValue).average().getAsDouble();
                // X4: readability = X2 / X3 (0 when X3 is 0 to avoid divide-by-zero).
                double x4 = (x3 > 0) ? x2 / x3 : 0;
                row.createCell(x2Col).setCellValue(x2);
                row.createCell(x3Col).setCellValue(x3);
                row.createCell(x4Col).setCellValue(x4);
                row.createCell(x5Col).setCellValue(x5);
                row.createCell(x6Col).setCellValue(x6);
            }
            System.out.println("\n保存文件...");
            // Separate try-with-resources so the output stream is always closed.
            try (FileOutputStream fos = new FileOutputStream(outputFile)) {
                wb.write(fos);
            }
            File output = new File(outputFile);
            if (output.exists()) {
                System.out.println("文件保存成功!");
                System.out.println("文件大小: " + (output.length() / 1024) + " KB");
            }
            System.out.println("\n========================================");
            System.out.println(" 任务完成");
            System.out.println("========================================");
        } catch (Exception e) {
            System.out.println("错误: " + e.getMessage());
            e.printStackTrace();
        }
    }
}

4
project/DataCleaner.java

@ -1,7 +1,3 @@
package com.project.util;
import com.project.model.PostInfo;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;

226
project/DataCleaningScript.java

@ -0,0 +1,226 @@
import java.io.*;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
public class DataCleaningScript {
    /** Expected format of the post-date column. */
    private static final DateTimeFormatter DATE_FORMATTER = DateTimeFormatter.ofPattern("yyyy-MM-dd", Locale.CHINA);

    /**
     * Entry point: read raw post records, clean each one, drop invalid rows,
     * and write the survivors to a UTF-8 CSV (with BOM for Excel).
     */
    public static void main(String[] args) {
        String inputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子原始信息计量实验使用.xlsx";
        String outputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子实验数据(新).csv";
        System.out.println("========================================");
        System.out.println(" 数据清洗脚本");
        System.out.println("========================================");
        System.out.println("输入文件: " + inputFile);
        System.out.println("输出文件: " + outputFile);
        System.out.println();
        List<PostInfo> rawPosts = readExcelData(inputFile);
        System.out.println("读取数据完成,共 " + rawPosts.size() + " 条记录");
        List<PostInfo> cleanedPosts = cleanPosts(rawPosts);
        System.out.println("数据清洗完成,有效记录: " + cleanedPosts.size() + " 条");
        saveToCSV(cleanedPosts, outputFile);
        System.out.println("数据保存完成!");
        System.out.println();
        System.out.println("========================================");
        System.out.println(" 数据清洗任务完成");
        System.out.println("========================================");
    }

    /**
     * Reads the input as CSV text (header skipped, one record per line).
     *
     * <p>NOTE(review): despite the name, this parses plain text. A genuine
     * .xlsx file is a ZIP archive and cannot be read line-by-line — confirm
     * whether the input is actually a CSV export.
     */
    private static List<PostInfo> readExcelData(String filePath) {
        List<PostInfo> posts = new ArrayList<>();
        try (BufferedReader reader = new BufferedReader(new FileReader(filePath, java.nio.charset.StandardCharsets.UTF_8))) {
            String line;
            boolean isFirstLine = true;
            while ((line = reader.readLine()) != null) {
                if (isFirstLine) {
                    isFirstLine = false; // skip the header row
                    continue;
                }
                String[] parts = parseCSVLine(line);
                if (parts.length >= 9) { // require all nine expected fields
                    PostInfo post = parsePostInfo(parts);
                    if (post != null) {
                        posts.add(post);
                    }
                }
            }
        } catch (IOException e) {
            System.err.println("读取文件时出错: " + e.getMessage());
        }
        return posts;
    }

    /**
     * Splits one CSV line into trimmed fields, honoring quoted sections.
     *
     * <p>Fix: a doubled quote ({@code ""}) inside a quoted field is now
     * emitted as a literal quote per RFC 4180; the original toggled the quote
     * state twice, dropping the quote and then wrongly splitting on commas
     * that belonged to the field.
     */
    private static String[] parseCSVLine(String line) {
        List<String> fields = new ArrayList<>();
        StringBuilder currentField = new StringBuilder();
        boolean inQuotes = false;
        char[] chars = line.toCharArray();
        for (int i = 0; i < chars.length; i++) {
            char c = chars[i];
            if (c == '"') {
                if (inQuotes && i + 1 < chars.length && chars[i + 1] == '"') {
                    currentField.append('"'); // escaped quote inside a field
                    i++;
                } else {
                    inQuotes = !inQuotes;
                }
            } else if (c == ',' && !inQuotes) {
                fields.add(currentField.toString().trim());
                currentField.setLength(0);
            } else {
                currentField.append(c);
            }
        }
        fields.add(currentField.toString().trim());
        return fields.toArray(new String[0]);
    }

    /**
     * Maps one parsed CSV record onto a {@code PostInfo} bean.
     *
     * @return the populated bean, or {@code null} when any field fails to
     *     parse (e.g. a malformed date) so the row is skipped upstream
     */
    private static PostInfo parsePostInfo(String[] parts) {
        try {
            PostInfo post = new PostInfo();
            post.setTitle(parts[0]);
            post.setContent(parts[1]);
            post.setAuthor(parts[2]);
            if (!parts[3].isEmpty()) {
                post.setPostDate(LocalDate.parse(parts[3], DATE_FORMATTER));
            }
            post.setLikeCount(parseInt(parts[4]));
            post.setCommentCount(parseInt(parts[5]));
            post.setViewCount(parseInt(parts[6]));
            post.setTags(parts[7]);
            post.setSentiment(parts[8]);
            return post;
        } catch (Exception e) {
            return null;
        }
    }

    /** Lenient int parse: blank or malformed input yields 0. */
    private static int parseInt(String value) {
        try {
            if (value == null || value.isEmpty()) {
                return 0;
            }
            return Integer.parseInt(value);
        } catch (NumberFormatException e) {
            return 0;
        }
    }

    /** Cleans every record and keeps only those that remain valid. */
    private static List<PostInfo> cleanPosts(List<PostInfo> rawPosts) {
        List<PostInfo> cleanedPosts = new ArrayList<>();
        for (PostInfo post : rawPosts) {
            PostInfo cleaned = cleanPost(post);
            if (isValidPost(cleaned)) {
                cleanedPosts.add(cleaned);
            }
        }
        return cleanedPosts;
    }

    /** Produces a cleaned copy of one record (the input is not mutated). */
    private static PostInfo cleanPost(PostInfo post) {
        PostInfo cleaned = new PostInfo();
        cleaned.setTitle(cleanText(post.getTitle()));
        cleaned.setContent(cleanContent(post.getContent()));
        cleaned.setAuthor(cleanText(post.getAuthor()));
        cleaned.setPostDate(post.getPostDate());
        cleaned.setLikeCount(post.getLikeCount());
        cleaned.setCommentCount(post.getCommentCount());
        cleaned.setViewCount(post.getViewCount());
        cleaned.setTags(cleanText(post.getTags()));
        cleaned.setSentiment(normalizeSentiment(post.getSentiment()));
        return cleaned;
    }

    /** Trims and collapses whitespace runs to single spaces; null → "". */
    private static String cleanText(String text) {
        if (text == null) {
            return "";
        }
        return text.trim().replaceAll("\\s+", " ");
    }

    /**
     * Cleans body text: collapses whitespace and strips HTML tags plus
     * bracketed/parenthesised asides.
     */
    private static String cleanContent(String content) {
        if (content == null) {
            return "";
        }
        return content.trim()
                .replaceAll("\\s+", " ")
                .replaceAll("[\\r\\n]+", " ")
                .replaceAll("<[^>]+>", "")
                .replaceAll("\\[.*?\\]", "")
                .replaceAll("\\(.*?\\)", "");
    }

    /** Maps free-form sentiment text onto 积极 / 消极 / 中性 (default 中性). */
    private static String normalizeSentiment(String sentiment) {
        if (sentiment == null || sentiment.isEmpty()) {
            return "中性";
        }
        String lower = sentiment.toLowerCase();
        if (lower.contains("积极") || lower.contains("正面") || lower.contains("positive")) {
            return "积极";
        } else if (lower.contains("消极") || lower.contains("负面") || lower.contains("negative")) {
            return "消极";
        } else {
            return "中性";
        }
    }

    /** A record is valid when both title and content are non-empty. */
    private static boolean isValidPost(PostInfo post) {
        return post.getTitle() != null && !post.getTitle().isEmpty() &&
               post.getContent() != null && !post.getContent().isEmpty();
    }

    /** Writes the records as a BOM-prefixed UTF-8 CSV, creating directories. */
    private static void saveToCSV(List<PostInfo> posts, String filePath) {
        if (posts == null || posts.isEmpty()) {
            System.out.println("没有数据需要保存");
            return;
        }
        try {
            File file = new File(filePath);
            File parentDir = file.getParentFile();
            if (parentDir != null && !parentDir.exists()) {
                parentDir.mkdirs();
            }
            try (BufferedWriter writer = new BufferedWriter(
                    new FileWriter(file, java.nio.charset.StandardCharsets.UTF_8))) {
                writer.write("\uFEFF"); // BOM so Excel detects UTF-8
                writer.write("标题,内容,作者,发布日期,点赞数,评论数,浏览量,标签,情感倾向\n");
                for (PostInfo post : posts) {
                    writer.write(post.toCSV());
                    writer.write("\n");
                }
            }
            System.out.println("数据已保存到: " + filePath);
        } catch (IOException e) {
            System.err.println("保存CSV文件时出错: " + e.getMessage());
        }
    }
}

4
project/DataStorage.java

@ -1,7 +1,3 @@
package com.project.storage;
import com.project.model.PostInfo;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;

3
project/DuoTai.java

@ -0,0 +1,3 @@
/**
 * Empty placeholder class. "DuoTai" (多态) is Chinese for "polymorphism" —
 * presumably reserved for a polymorphism exercise; TODO confirm and fill in.
 */
public class DuoTai {
}

4
project/ExcelReader.java

@ -1,7 +1,3 @@
package com.project.reader;
import com.project.model.PostInfo;
import java.io.*;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;

2
project/PostInfo.java

@ -1,5 +1,3 @@
package com.project.model;
import java.time.LocalDate;
public class PostInfo {

50
project/ProcessRegressionData.java

@ -0,0 +1,50 @@
import java.io.*;
import java.util.*;
import java.util.regex.*;
public class ProcessRegressionData {

    /**
     * Prints an overview of the regression-data preparation workflow and
     * verifies that the source workbook exists; the actual processing is
     * delegated to the Python scripts named at the end of the output.
     */
    public static void main(String[] args) {
        final String inputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子实验数据(新).xlsx";
        final String outputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子实验数据(新)_回归.xlsx";
        emit("========================================",
             " 处理回归数据",
             "========================================",
             "输入文件: " + inputFile,
             "输出文件: " + outputFile,
             "");
        // Abort early when the workbook is missing.
        final File source = new File(inputFile);
        if (!source.exists()) {
            System.out.println("错误: 输入文件不存在!");
            return;
        }
        System.out.println("输入文件大小: " + (source.length() / 1024) + " KB");
        emit("\n注意: 这是一个简化版本,用于演示处理逻辑。",
             "实际处理需要使用Apache POI库来读取和写入Excel文件。",
             "",
             "处理逻辑:",
             "1. 读取原始数据",
             "2. 识别列: helpfull( Y ), 帖子评论总数( X1 ), 评论1-5内容列",
             "3. 计算 X2-X6:",
             " - X2: 评论长度平均值(剔空格后的字符数)",
             " - X3: 评论复杂度平均值(按空格拆分的分词数)",
             " - X4: X2/X3(X3为0时记0)",
             " - X5: 情感性平均值(正面=1、中性=0、负面=-1)",
             " - X6: 信息丰富度平均值(含数字/链接/表情各1分)",
             "4. 数据清洗: 确保所有值为纯数字,无空值/错误值",
             "5. 保存到新文件",
             "",
             "由于数据量较大(3万+行),建议使用Python的pandas库处理。",
             "请确保Python脚本能够完整执行,可能需要增加内存或分批处理。",
             "",
             "========================================",
             " 建议使用以下Python命令运行",
             "========================================",
             "cd d:\\java\\project",
             "python process_300_rows.py (测试前300行)",
             "python process_all_rows.py (处理全部数据)");
    }

    /** Prints each line in order; "" produces the same blank line as println(). */
    private static void emit(String... lines) {
        for (String line : lines) {
            System.out.println(line);
        }
    }
}

59
project/SimpleDataCleaner.java

@ -0,0 +1,59 @@
import java.io.*;
import java.util.ArrayList;
import java.util.List;
public class SimpleDataCleaner {

    /**
     * Copies the source workbook to the output path under a .csv name.
     *
     * <p>NOTE(review): this is a byte-for-byte copy of a binary .xlsx file,
     * not a real CSV conversion — Apache POI (or similar) is required for
     * that, as the original comments acknowledged.
     */
    public static void main(String[] args) {
        String inputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子原始信息计量实验使用.xlsx";
        String outputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子实验数据(新).csv";
        System.out.println("========================================");
        System.out.println(" 简单数据清洗脚本");
        System.out.println("========================================");
        System.out.println("输入文件: " + inputFile);
        System.out.println("输出文件: " + outputFile);
        System.out.println();
        File input = new File(inputFile);
        if (!input.exists()) {
            System.out.println("错误: 输入文件不存在!");
            return;
        }
        System.out.println("文件大小: " + (input.length() / 1024) + " KB");
        try {
            File output = new File(outputFile);
            // Create the output directory if needed.
            File parentDir = output.getParentFile();
            if (parentDir != null && !parentDir.exists()) {
                parentDir.mkdirs();
            }
            // InputStream.transferTo (Java 9+) replaces the original manual
            // 1 KiB buffer loop — same result, simpler and faster.
            try (FileInputStream fis = new FileInputStream(input);
                 FileOutputStream fos = new FileOutputStream(output)) {
                fis.transferTo(fos);
            }
            System.out.println("文件已成功复制并重命名为: " + outputFile);
            System.out.println();
            System.out.println("========================================");
            System.out.println(" 任务完成");
            System.out.println("========================================");
        } catch (IOException e) {
            System.err.println("处理文件时出错: " + e.getMessage());
        }
    }
}

189
project/add_regression_columns.py

@ -0,0 +1,189 @@
import os
import pandas as pd
import re
# Source/target workbook paths (Windows share used by the original author).
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx'
print("========================================")
print(" 在原表中添加回归数据列")
print("========================================")
print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()
# Abort early when the workbook is missing.
if not os.path.exists(input_file):
    print("错误: 输入文件不存在!")
    exit(1)
print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
try:
    print("\n正在读取原始数据...")
    df = pd.read_excel(input_file)
    print(f"成功读取 {len(df)} 行数据")
    print(f"原始列名: {list(df.columns)}")
    # Identify the source columns by header keywords.
    print("\n识别列...")
    helpfull_col = None
    comment_count_col = None
    comment_cols = []
    for col in df.columns:
        col_str = str(col).lower()
        if 'helpfull' in col_str or 'helpful' in col_str:
            helpfull_col = col
            print(f"找到 Y 列 (helpfull): {col}")
        elif '评论总数' in str(col) or '帖子评论总数' in str(col):
            comment_count_col = col
            print(f"找到 X1 列 (评论总数): {col}")
        elif '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6)):
            comment_cols.append(col)
            print(f"找到评论列 {len(comment_cols)}: {col}")
    print(f"\n共找到 {len(comment_cols)} 个评论列")
    print("\n添加回归数据列...")
    # Y: the "helpfull" column coerced to numeric (0 for missing/bad values).
    print("1. 添加 Y (UGC有用性)")
    if helpfull_col:
        df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0)
    else:
        df['Y'] = 0
    # X1: the comment-count column coerced to numeric.
    print("2. 添加 X1 (评论数量)")
    if comment_count_col:
        df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0)
    else:
        df['X1'] = 0

    def calculate_comment_metrics(content):
        """Return (length, token count, sentiment, richness) for one comment."""
        if pd.isna(content) or str(content) in ['None', 'nan', '']:
            return 0, 0, 0, 0
        content = str(content)
        # X2: character count with half/full-width spaces removed.
        length = len(content.replace(' ', '').replace('\u3000', ''))
        # X3: whitespace-delimited token count.
        complexity = len(content.split())
        # X5: keyword sentiment (+1 positive, 0 neutral, -1 negative).
        # BUG FIX: the original lists contained empty strings (keywords lost in
        # an encoding mishap); '' is a substring of every string, so every
        # non-blank comment scored +1. Empty entries are removed; restore the
        # lost Chinese keywords (e.g. 好/棒/赞, 差) if they can be recovered.
        positive_words = ['优秀', '喜欢', '满意', 'positive', 'good', 'great', 'excellent', 'love', 'like']
        negative_words = ['糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike']
        sentiment = 0
        lower_content = content.lower()
        if any(word in lower_content for word in positive_words):
            sentiment = 1
        elif any(word in lower_content for word in negative_words):
            sentiment = -1
        # X6: +1 each for digits, URLs and emoticons (max 3).
        richness = 0
        if re.search(r'\d', content):  # contains a digit
            richness += 1
        if re.search(r'http[s]?://|www\.', content):  # contains a link
            richness += 1
        # NOTE(review): the surrogate ranges \uD83C-\uDBFF match lone
        # surrogates in Python; full emoji are covered by \U0001F300-\U0001F9FF.
        if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content):
            richness += 1
        return length, complexity, sentiment, richness

    # Row-wise metric computation (averaged over non-empty comments).
    print("3. 计算评论相关指标...")
    df['X2'] = 0.0  # mean comment length
    df['X3'] = 0.0  # mean token count
    df['X5'] = 0.0  # mean sentiment
    df['X6'] = 0.0  # mean information richness
    total_rows = len(df)
    for i in range(total_rows):
        if i % 1000 == 0:
            print(f" 处理到第 {i}/{total_rows} 行...")
        lengths = []
        complexities = []
        sentiments = []
        richness = []
        for col in comment_cols:
            content = df.iloc[i].get(col, '')
            length, complexity, sentiment, r = calculate_comment_metrics(content)
            if length > 0:  # only comments with content count toward the mean
                lengths.append(length)
                complexities.append(complexity)
                sentiments.append(sentiment)
                richness.append(r)
        # Mean over non-empty comments; rows without comments keep 0.
        if lengths:
            df.loc[i, 'X2'] = sum(lengths) / len(lengths)
            df.loc[i, 'X3'] = sum(complexities) / len(complexities)
            df.loc[i, 'X5'] = sum(sentiments) / len(sentiments)
            df.loc[i, 'X6'] = sum(richness) / len(richness)
    # X4: readability = X2 / X3 (0 when X3 is 0 to avoid division errors).
    print("4. 计算 X4 (评论可读性)")
    df['X4'] = df.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1)
    # Force every regression column to clean, finite numeric values.
    print("\n5. 数据清洗...")
    regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
    for col in regression_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
        df[col] = df[col].replace([float('inf'), float('-inf')], 0)
    # Sanity report before saving.
    print("\n6. 验证数据...")
    print(f"总行数: {len(df)}")
    print(f"总列数: {len(df.columns)}")
    print(f"\n回归数据列统计:")
    print(df[regression_cols].describe())
    print(f"\n前5行回归数据:")
    print(df[regression_cols].head())
    print(f"\n空值检查:")
    for col in regression_cols:
        null_count = df[col].isnull().sum()
        print(f" {col}: {null_count} 个空值")
    print("\n7. 保存文件...")
    df.to_excel(output_file, index=False)
    # Re-open the saved workbook to confirm it round-trips.
    print("\n8. 验证文件...")
    if os.path.exists(output_file):
        print(f"文件已成功保存: {output_file}")
        print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
        df_check = pd.read_excel(output_file)
        print(f"输出文件行数: {len(df_check)}")
        print(f"输出文件列数: {len(df_check.columns)}")
        print(f"\n回归数据列: {[col for col in df_check.columns if col in regression_cols]}")
    else:
        print("文件保存失败!")
    print()
    print("========================================")
    print(" 任务完成")
    print("========================================")
    print(f"新文件已保存: {output_file}")
    print(f"包含原始数据的所有列以及新增的Y, X1-X6回归数据列")
except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()

32
project/basic_test.py

@ -0,0 +1,32 @@
import os
import sys

# Smoke test: report interpreter info and inspect the experiment data folder.
print("========================================")
print(" 基本测试")
print("========================================")
print(f"当前目录: {os.getcwd()}")
print(f"Python版本:")
print(sys.version)

# Does the experiment data directory exist?
print("\n检查目录:")
target_dir = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求'
print(f"目录: {target_dir}")
print(f"存在: {os.path.exists(target_dir)}")

# List up to the first 15 regular files with their sizes in KB.
if os.path.exists(target_dir):
    print("\n目录文件:")
    for name in os.listdir(target_dir)[:15]:
        full_path = os.path.join(target_dir, name)
        if not os.path.isfile(full_path):
            continue
        kb = os.path.getsize(full_path) / 1024
        print(f" {name}: {kb:.2f} KB")

print()
print("========================================")
print(" 测试完成")
print("========================================")

219
project/batch_process.py

@ -0,0 +1,219 @@
import os
import pandas as pd
import re
import gc
print("=" * 60)
print(" 分批处理回归数据")
print("=" * 60)
# Source/target workbook paths.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx'
print(f"输入文件: {input_file}")
print()
# Abort early when the workbook is missing.
if not os.path.exists(input_file):
    print("错误: 输入文件不存在!")
    exit(1)
print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
print("\n正在读取原始数据...")
try:
    df = pd.read_excel(input_file, engine='openpyxl')
    print(f"成功读取 {len(df)} 行数据")
    print(f"原始列数: {len(df.columns)}")
except Exception as e:
    print(f"读取失败: {e}")
    import traceback
    traceback.print_exc()
    exit(1)
# Identify the source columns by header keywords.
print("\n识别列...")
helpfull_col = None
comment_count_col = None
comment_cols = []
for col in df.columns:
    col_str = str(col).lower()
    if 'helpfull' in col_str or 'helpful' in col_str:
        helpfull_col = col
        print(f"找到 Y 列 (helpfull): {col}")
    elif '评论总数' in str(col) or '帖子评论总数' in str(col):
        comment_count_col = col
        print(f"找到 X1 列 (评论总数): {col}")
    elif '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6)) and '内容' in str(col):
        comment_cols.append(col)
        print(f"找到评论列 {len(comment_cols)}: {col}")
print(f"\n共找到 {len(comment_cols)} 个评论内容列")
print("\n添加回归数据列...")
# Y: copy of the "helpfull" column, coerced to numeric (0 for bad values).
print("1. 添加 Y (UGC有用性)")
if helpfull_col:
    df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0)
else:
    df['Y'] = 0
# X1: copy of the comment-count column, coerced to numeric.
print("2. 添加 X1 (评论数量)")
if comment_count_col:
    df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0)
else:
    df['X1'] = 0

def calculate_comment_metrics(content):
    """Return (length, token count, sentiment, richness) for one comment."""
    if pd.isna(content) or str(content) in ['None', 'nan', '']:
        return 0, 0, 0, 0
    content = str(content)
    # X2: character count with half/full-width spaces removed.
    length = len(content.replace(' ', '').replace('\u3000', ''))
    # X3: whitespace-delimited token count.
    complexity = len(content.split())
    # X5: keyword sentiment (+1 positive, 0 neutral, -1 negative).
    # BUG FIX: the original lists contained empty strings (keywords lost in an
    # encoding mishap); '' is a substring of every string, so every non-blank
    # comment scored +1. Empty entries are removed; restore the lost Chinese
    # keywords (e.g. 好/棒/赞, 差) if they can be recovered.
    positive_words = ['优秀', '喜欢', '满意', 'positive', 'good', 'great', 'excellent', 'love', 'like']
    negative_words = ['糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike']
    sentiment = 0
    lower_content = content.lower()
    if any(word in lower_content for word in positive_words):
        sentiment = 1
    elif any(word in lower_content for word in negative_words):
        sentiment = -1
    # X6: +1 each for digits, URLs and emoticons (max 3).
    richness = 0
    if re.search(r'\d', content):  # contains a digit
        richness += 1
    if re.search(r'http[s]?://|www\.', content):  # contains a link
        richness += 1
    if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content):
        richness += 1
    return length, complexity, sentiment, richness

# Row-wise metric computation, batched to keep memory bounded.
print("3. 计算评论相关指标...")
df['X2'] = 0.0  # mean comment length
df['X3'] = 0.0  # mean token count
df['X5'] = 0.0  # mean sentiment
df['X6'] = 0.0  # mean information richness
total_rows = len(df)
print(f"总数据行数: {total_rows}")
batch_size = 5000
num_batches = (total_rows + batch_size - 1) // batch_size
for batch in range(num_batches):
    start_idx = batch * batch_size
    end_idx = min((batch + 1) * batch_size, total_rows)
    print(f"处理批次 {batch + 1}/{num_batches} (行 {start_idx}{end_idx})...")
    for i in range(start_idx, end_idx):
        lengths = []
        complexities = []
        sentiments = []
        richness = []
        for col in comment_cols:
            content = df.iloc[i].get(col, '')
            length, complexity, sentiment, r = calculate_comment_metrics(content)
            if length > 0:  # only comments with content count toward the mean
                lengths.append(length)
                complexities.append(complexity)
                sentiments.append(sentiment)
                richness.append(r)
        # Mean over non-empty comments; rows without comments keep 0.
        if lengths:
            df.loc[i, 'X2'] = sum(lengths) / len(lengths)
            df.loc[i, 'X3'] = sum(complexities) / len(complexities)
            df.loc[i, 'X5'] = sum(sentiments) / len(sentiments)
            df.loc[i, 'X6'] = sum(richness) / len(richness)
    # Encourage collection between batches.
    gc.collect()
# X4: readability = X2 / X3 (0 when X3 is 0 to avoid division errors).
print("4. 计算 X4 (评论可读性)")
df['X4'] = df.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1)
# Force every regression column to clean, finite numeric values.
print("\n5. 数据清洗...")
regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
for col in regression_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
    df[col] = df[col].replace([float('inf'), float('-inf')], 0)
# Sanity report before saving.
print("\n6. 验证数据...")
print(f"总行数: {len(df)}")
print(f"总列数: {len(df.columns)}")
print(f"\n回归数据列统计:")
print(df[regression_cols].describe())
print(f"\n前5行回归数据:")
print(df[regression_cols].head())
print(f"\n空值检查:")
for col in regression_cols:
    null_count = df[col].isnull().sum()
    print(f" {col}: {null_count} 个空值")
print("\n7. 保存文件...")
print(f"正在保存到: {output_file}")
try:
    # Prefer xlsxwriter; fall back to openpyxl if it is unavailable.
    df.to_excel(output_file, index=False, engine='xlsxwriter')
    print("文件保存成功!")
except Exception as e:
    print(f"xlsxwriter保存失败: {e}")
    try:
        print("尝试使用openpyxl引擎...")
        df.to_excel(output_file, index=False, engine='openpyxl')
        print("文件保存成功!")
    except Exception as e2:
        print(f"openpyxl保存也失败: {e2}")
        import traceback
        traceback.print_exc()
# Re-open the saved workbook to confirm it round-trips.
print("\n8. 验证文件...")
if os.path.exists(output_file):
    print(f"文件已成功保存: {output_file}")
    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
    try:
        df_check = pd.read_excel(output_file)
        print(f"输出文件行数: {len(df_check)}")
        print(f"输出文件列数: {len(df_check.columns)}")
        print(f"\n回归数据列: {[col for col in df_check.columns if col in regression_cols]}")
    except Exception as e:
        print(f"验证文件时出错: {e}")
else:
    print("文件保存失败!")
print()
print("=" * 60)
print(" 任务完成")
print("=" * 60)
if os.path.exists(output_file):
    print(f"新文件已保存: {output_file}")
    print(f"包含原始数据的所有列以及新增的Y, X1-X6回归数据列")

169
project/calculate_regression_data.py

@ -0,0 +1,169 @@
import os
import pandas as pd
import re
# Source/target workbook paths.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'
print("========================================")
print(" 计算UGC回归数据")
print("========================================")
print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()
# Abort early when the workbook is missing.
if not os.path.exists(input_file):
    print("错误: 输入文件不存在!")
    exit(1)
print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
try:
    print("正在读取原始数据...")
    df = pd.read_excel(input_file)
    print(f"成功读取 {len(df)} 行数据")
    print(f"列名: {list(df.columns)}")
    # Comment-content columns are named like "评论N..." with N in 1..5.
    comment_columns = [col for col in df.columns if '评论' in col and any(str(i) in col for i in range(1, 6))]
    print(f"\n找到评论列: {comment_columns}")
    regression_data = pd.DataFrame()
    # 1. Y: the "helpfull" column (0 when absent).
    print("\n1. 计算 Y (UGC有用性)")
    if 'helpfull' in df.columns:
        regression_data['Y'] = df['helpfull'].fillna(0).astype(float)
        print(f"成功提取 Y 列,共 {len(regression_data['Y'])} 个值")
    else:
        print("警告: 未找到 helpfull 列,使用默认值 0")
        regression_data['Y'] = 0
    # 2. X1: the total comment count column (0 when absent).
    print("\n2. 计算 X1 (评论数量)")
    comment_count_columns = [col for col in df.columns if '评论总数' in col or '帖子评论总数' in col]
    if comment_count_columns:
        regression_data['X1'] = df[comment_count_columns[0]].fillna(0).astype(float)
        print(f"成功提取 X1 列,使用列: {comment_count_columns[0]}")
    else:
        print("警告: 未找到评论总数列,使用默认值 0")
        regression_data['X1'] = 0
    # 3. X2: mean comment length (spaces removed).
    print("\n3. 计算 X2 (评论长度)")
    def calculate_comment_length(row):
        lengths = []
        for col in comment_columns:
            content = str(row.get(col, ''))
            if content and content != 'nan':
                length = len(content.replace(' ', ''))
                lengths.append(length)
        return sum(lengths) / len(lengths) if lengths else 0
    regression_data['X2'] = df.apply(calculate_comment_length, axis=1)
    # 4. X3: mean token count (whitespace split).
    print("\n4. 计算 X3 (评论复杂度)")
    def calculate_comment_complexity(row):
        complexities = []
        for col in comment_columns:
            content = str(row.get(col, ''))
            if content and content != 'nan':
                complexities.append(len(content.split()))
        return sum(complexities) / len(complexities) if complexities else 0
    regression_data['X3'] = df.apply(calculate_comment_complexity, axis=1)
    # 5. X4: readability = X2 / X3 (0 when X3 is 0).
    print("\n5. 计算 X4 (评论可读性)")
    regression_data['X4'] = regression_data.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1)
    # 6. X5: keyword sentiment averaged over the comments.
    print("\n6. 计算 X5 (内容情感性)")
    def calculate_sentiment(row):
        sentiments = []
        for col in comment_columns:
            content = str(row.get(col, ''))
            if content and content != 'nan':
                # BUG FIX: the original keyword lists contained empty strings
                # (characters lost in an encoding mishap); '' matches every
                # string, so sentiment was always +1. Empty entries are
                # removed; restore the lost keywords (e.g. 好/棒/赞, 差) if
                # they can be recovered.
                positive_words = ['优秀', '喜欢', '满意', 'positive']
                negative_words = ['糟糕', '不好', '失望', '不满', 'negative']
                sentiment = 0
                lower_content = content.lower()
                if any(word in lower_content for word in positive_words):
                    sentiment = 1
                elif any(word in lower_content for word in negative_words):
                    sentiment = -1
                sentiments.append(sentiment)
        return sum(sentiments) / len(sentiments) if sentiments else 0
    regression_data['X5'] = df.apply(calculate_sentiment, axis=1)
    # 7. X6: information richness (digits / links / emoticons, 1 point each).
    print("\n7. 计算 X6 (信息丰富度)")
    def calculate_information_richness(row):
        richness_scores = []
        for col in comment_columns:
            content = str(row.get(col, ''))
            if content and content != 'nan':
                score = 0
                if re.search(r'\d', content):  # contains a digit
                    score += 1
                if re.search(r'http[s]?://', content):  # contains a link
                    score += 1
                if re.search(r'[\u2600-\u27BF]|[:;][-]?[)D]', content):  # emoticon
                    score += 1
                richness_scores.append(score)
        return sum(richness_scores) / len(richness_scores) if richness_scores else 0
    regression_data['X6'] = df.apply(calculate_information_richness, axis=1)
    # 8. Force every column to clean numeric values.
    print("\n8. 数据清洗")
    for col in regression_data.columns:
        regression_data[col] = pd.to_numeric(regression_data[col], errors='coerce').fillna(0)
    # 9. Sanity report before saving.
    print("\n9. 数据验证")
    print(f"行数: {len(regression_data)}")
    print(f"列数: {len(regression_data.columns)}")
    print(f"列名: {list(regression_data.columns)}")
    print(f"数据类型:")
    print(regression_data.dtypes)
    print(f"\n前5行数据:")
    print(regression_data.head())
    print("\n10. 保存文件")
    regression_data.to_excel(output_file, index=False)
    # Confirm the file landed on disk.
    if os.path.exists(output_file):
        print(f"文件已成功保存到: {output_file}")
        print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
    else:
        print("错误: 文件保存失败")
    print()
    print("========================================")
    print(" 任务完成")
    print("========================================")
except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()

43
project/check_data_structure.py

@ -0,0 +1,43 @@
import os
import pandas as pd

# Source workbook whose structure we want to inspect.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
print("========================================")
print(" 检查数据结构")
print("========================================")
print(f"输入文件: {input_file}")
print()

# Bail out early when the workbook is missing.
if not os.path.exists(input_file):
    print("错误: 输入文件不存在!")
    exit(1)
print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")

try:
    # Load the sheet and report its shape, columns, sample rows and dtypes.
    print("正在读取原始数据...")
    frame = pd.read_excel(input_file)
    print(f"成功读取 {len(frame)} 行数据")
    print(f"列数: {len(frame.columns)}")
    print(f"\n所有列名:")
    for position, column in enumerate(frame.columns, 1):
        print(f"{position}. {column}")
    print("\n前3行数据:")
    print(frame.head(3))
    print("\n数据类型:")
    print(frame.dtypes)
    print("\n========================================")
    print(" 数据结构检查完成")
    print("========================================")
except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()

53
project/check_excel_size.py

@ -0,0 +1,53 @@
import os
import openpyxl

# Workbooks whose on-disk size and sheet dimensions we want to report.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'
print("========================================")
print(" 检查Excel文件大小")
print("========================================")
print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()

# --- input workbook: size plus sheet dimensions ---
if not os.path.exists(input_file):
    print("输入文件不存在!")
else:
    print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
    try:
        book = openpyxl.load_workbook(input_file)
        sheet = book.active
        print(f"输入文件行数: {sheet.max_row}")
        print(f"输入文件列数: {sheet.max_column}")
    except Exception as e:
        print(f"读取输入文件出错: {e}")

# --- output workbook: same report, plus the first rows for eyeballing ---
if not os.path.exists(output_file):
    print("输出文件不存在!")
else:
    print(f"\n输出文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
    try:
        book = openpyxl.load_workbook(output_file)
        sheet = book.active
        print(f"输出文件行数: {sheet.max_row}")
        print(f"输出文件列数: {sheet.max_column}")
        print("\n前10行数据:")
        for r in range(1, min(11, sheet.max_row + 1)):
            values = [sheet.cell(row=r, column=c).value
                      for c in range(1, sheet.max_column + 1)]
            print(f"行{r}: {values}")
    except Exception as e:
        print(f"读取输出文件出错: {e}")

print()
print("========================================")
print(" 检查完成")
print("========================================")

69
project/create_and_fill_data.py

@ -0,0 +1,69 @@
import os
import csv

# CSV destination for the synthetic regression sample.
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.csv'

print("========================================")
print(" 创建并填充UGC回归数据")
print("========================================")
print(f"输出文件: {output_file}")
print()

# Make sure the destination directory exists before writing.
output_dir = os.path.dirname(output_file)
print(f"输出目录: {output_dir}")
print(f"目录存在: {os.path.exists(output_dir)}")
if not os.path.exists(output_dir):
    print("正在创建输出目录...")
    try:
        os.makedirs(output_dir)
        print("目录创建成功")
    except Exception as e:
        print(f"创建目录失败: {e}")
        exit(1)

try:
    print("\n创建并填充CSV文件...")
    # utf-8-sig adds a BOM so Excel opens the headers correctly.
    with open(output_file, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'])
        # Ten rows of deterministic sample values:
        # Y=UGC有用性, X1=评论数量, X2=评论长度, X3=评论复杂度,
        # X4=评论可读性, X5=内容情感性, X6=信息丰富度.
        writer.writerows(
            [i * 0.5, i * 2, i * 10, i * 2, 5.0, (i % 3) - 1, i * 0.3]
            for i in range(1, 11)
        )
    print(f"文件已成功创建: {output_file}")
    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")

    # Echo the first lines back as a sanity check.
    print("\n文件内容:")
    with open(output_file, 'r', encoding='utf-8-sig') as f:
        for line_no, row in enumerate(csv.reader(f), 1):
            if line_no <= 5:
                print(f"{line_no}: {row}")

    print()
    print("========================================")
    print(" 任务完成")
    print("========================================")
except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()

86
project/create_excel_with_data.py

@ -0,0 +1,86 @@
import os
import openpyxl

# Excel destination for the synthetic regression sample.
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'

print("========================================")
print(" 创建Excel文件并填充数据")
print("========================================")
print(f"输出文件: {output_file}")
print()

# Make sure the destination directory exists before writing.
output_dir = os.path.dirname(output_file)
print(f"输出目录: {output_dir}")
print(f"目录存在: {os.path.exists(output_dir)}")
if not os.path.exists(output_dir):
    print("正在创建输出目录...")
    try:
        os.makedirs(output_dir)
        print("目录创建成功")
    except Exception as e:
        print(f"创建目录失败: {e}")
        exit(1)

try:
    print("\n创建Excel文件...")
    wb = openpyxl.Workbook()
    ws = wb.active
    # Header row, then ten rows of deterministic sample values.
    ws.append(['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'])
    print("填充示例数据...")
    for i in range(1, 11):
        # Y=UGC有用性, X1=评论数量, X2=评论长度, X3=评论复杂度,
        # X4=评论可读性, X5=内容情感性, X6=信息丰富度
        ws.append([i * 0.5, i * 2, i * 10, i * 2, 5.0, (i % 3) - 1, i * 0.3])

    print("保存文件...")
    wb.save(output_file)
    print(f"文件已成功创建: {output_file}")
    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")

    # Reopen the workbook and preview the first rows to confirm the write.
    print("\n验证文件...")
    if os.path.exists(output_file):
        print("文件创建成功!")
        wb_check = openpyxl.load_workbook(output_file)
        ws_check = wb_check.active
        print(f"工作表名称: {ws_check.title}")
        print(f"行数: {ws_check.max_row}")
        print(f"列数: {ws_check.max_column}")
        print("\n前5行数据:")
        for row in range(1, min(6, ws_check.max_row + 1)):
            row_data = [ws_check.cell(row=row, column=col).value
                        for col in range(1, ws_check.max_column + 1)]
            print(f"{row}: {row_data}")
    else:
        print("文件创建失败!")

    print()
    print("========================================")
    print(" 任务完成")
    print("========================================")
except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()

112
project/create_regression_data.py

@ -0,0 +1,112 @@
import os
import pandas as pd
import numpy as np
import re

# Source workbook and the regression-data workbook to produce.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'

print("========================================")
print(" 创建UGC回归数据文件")
print("========================================")
print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()

if not os.path.exists(input_file):
    print("错误: 输入文件不存在!")
    exit(1)
print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")

try:
    print("正在读取原始数据...")
    df = pd.read_excel(input_file)
    print(f"成功读取 {len(df)} 行数据")
    print(f"列名: {list(df.columns)}")
    print()

    # Frame that will hold only the regression variables Y, X1..X6.
    regression_data = pd.DataFrame()

    # 1. Dependent variable Y from the 'helpfull' column.
    print("1. 提取因变量Y (helpfull列)")
    if 'helpfull' in df.columns:
        regression_data['Y'] = df['helpfull'].fillna(0)
        print(f"成功提取 Y 列,共 {len(regression_data['Y'])} 个值")
    else:
        print("警告: 未找到 helpfull 列,使用默认值 0")
        regression_data['Y'] = 0

    # 2. X1 from the first header mentioning both 评论 and 总数.
    print("\n2. 提取X1 (评论总数列)")
    comment_columns = [col for col in df.columns if '评论' in col and '总数' in col]
    if comment_columns:
        regression_data['X1'] = df[comment_columns[0]].fillna(0)
        print(f"成功提取 X1 列,使用列: {comment_columns[0]}")
    else:
        print("警告: 未找到评论总数列,使用默认值 0")
        regression_data['X1'] = 0

    # 3. X2..X6 are placeholders (all zero) in this version.
    print("\n3. 计算X2-X6")
    for name, desc in [('X2', '评论长度'), ('X3', '评论复杂度'),
                       ('X4', '评论可读性'), ('X5', '内容情感性'),
                       ('X6', '信息丰富度')]:
        print(f" - 计算{name} ({desc})")
        regression_data[name] = 0

    # 4. Force every column to numeric; unparsable values become 0.
    print("\n4. 数据清洗")
    for col in regression_data.columns:
        regression_data[col] = pd.to_numeric(regression_data[col], errors='coerce').fillna(0)

    # 5. Report shape, dtypes and a preview.
    print("\n5. 数据验证")
    print(f"行数: {len(regression_data)}")
    print(f"列数: {len(regression_data.columns)}")
    print(f"列名: {list(regression_data.columns)}")
    print(f"数据类型:")
    print(regression_data.dtypes)
    print(f"\n前5行数据:")
    print(regression_data.head())

    # 6. Write the workbook and confirm it landed on disk.
    print("\n6. 保存文件")
    regression_data.to_excel(output_file, index=False)
    if os.path.exists(output_file):
        print(f"文件已成功保存到: {output_file}")
        print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
    else:
        print("错误: 文件保存失败")

    print()
    print("========================================")
    print(" 任务完成")
    print("========================================")
except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()

142
project/create_regression_data_v2.py

@ -0,0 +1,142 @@
import os
import pandas as pd
import numpy as np

# Source workbook and the regression-data workbook to produce.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'

print("========================================")
print(" 创建UGC回归数据文件 v2")
print("========================================")
print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()

# Input must exist; report its size for a quick sanity check.
if not os.path.exists(input_file):
    print("错误: 输入文件不存在!")
    print(f"检查路径: {input_file}")
    exit(1)
print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
print(f"文件存在: {os.path.exists(input_file)}")

# Create the destination directory if it is missing.
output_dir = os.path.dirname(output_file)
print(f"输出目录: {output_dir}")
print(f"目录存在: {os.path.exists(output_dir)}")
if not os.path.exists(output_dir):
    print("正在创建输出目录...")
    try:
        os.makedirs(output_dir)
        print("目录创建成功")
    except Exception as e:
        print(f"创建目录失败: {e}")
        exit(1)

try:
    print("\n正在读取原始数据...")
    df = pd.read_excel(input_file)
    print(f"成功读取 {len(df)} 行数据")
    print(f"列名: {list(df.columns)}")
    # Preview the structure before extracting columns.
    print("\n前3行数据:")
    print(df.head(3))

    # Frame that will hold only the regression variables Y, X1..X6.
    regression_data = pd.DataFrame()

    # 1. Dependent variable Y from the 'helpfull' column.
    print("\n1. 提取因变量Y (helpfull列)")
    if 'helpfull' in df.columns:
        regression_data['Y'] = df['helpfull'].fillna(0)
        print(f"成功提取 Y 列,共 {len(regression_data['Y'])} 个值")
        print(f"Y列前5个值: {list(regression_data['Y'].head())}")
    else:
        print("警告: 未找到 helpfull 列,使用默认值 0")
        regression_data['Y'] = 0

    # 2. X1 from the first header containing 评论.
    print("\n2. 提取X1 (评论总数列)")
    comment_columns = [col for col in df.columns if '评论' in col]
    print(f"找到评论相关列: {comment_columns}")
    if comment_columns:
        regression_data['X1'] = df[comment_columns[0]].fillna(0)
        print(f"成功提取 X1 列,使用列: {comment_columns[0]}")
        print(f"X1列前5个值: {list(regression_data['X1'].head())}")
    else:
        print("警告: 未找到评论列,使用默认值 0")
        regression_data['X1'] = 0

    # 3. X2..X6 are placeholders (all zero) in this version.
    print("\n3. 计算X2-X6")
    for name, desc in [('X2', '评论长度'), ('X3', '评论复杂度'),
                       ('X4', '评论可读性'), ('X5', '内容情感性'),
                       ('X6', '信息丰富度')]:
        print(f" - 计算{name} ({desc})")
        regression_data[name] = 0

    # 4. Force every column to numeric; unparsable values become 0.
    print("\n4. 数据清洗")
    for col in regression_data.columns:
        regression_data[col] = pd.to_numeric(regression_data[col], errors='coerce').fillna(0)

    # 5. Report shape, dtypes and a preview.
    print("\n5. 数据验证")
    print(f"行数: {len(regression_data)}")
    print(f"列数: {len(regression_data.columns)}")
    print(f"列名: {list(regression_data.columns)}")
    print(f"数据类型:")
    print(regression_data.dtypes)
    print(f"\n前5行数据:")
    print(regression_data.head())

    # 6. Save, then double-check the file landed on disk.
    print("\n6. 保存文件")
    print(f"保存路径: {output_file}")
    try:
        regression_data.to_excel(output_file, index=False)
        print("文件保存成功")
    except Exception as e:
        print(f"保存文件失败: {e}")
    if os.path.exists(output_file):
        print(f"文件已成功保存到: {output_file}")
        print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
    else:
        print("错误: 文件保存失败,未找到输出文件")

    print()
    print("========================================")
    print(" 任务完成")
    print("========================================")
except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()

0
project/d

73
project/data_cleaner.py

@ -0,0 +1,73 @@
import os
import pandas as pd

# Excel source and cleaned CSV destination.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).csv'

print("========================================")
print(" Python 数据清洗脚本")
print("========================================")
print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()

if not os.path.exists(input_file):
    print("错误: 输入文件不存在!")
    exit(1)
print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")

try:
    print("正在读取Excel文件...")
    df = pd.read_excel(input_file)
    print(f"成功读取 {len(df)} 行数据")

    print("正在清洗数据...")
    # 1. Replace missing values with empty strings.
    df = df.fillna('')
    # 2. Trim and collapse runs of whitespace in every text column.
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = (df[col].astype(str)
                       .str.strip()
                       .str.replace('\\s+', ' ', regex=True))
    # 3. Map the sentiment column onto the three canonical labels.
    if '情感倾向' in df.columns:
        def normalize_sentiment(sentiment):
            if pd.isna(sentiment) or sentiment == '':
                return '中性'
            text = str(sentiment).lower()
            if any(keyword in text for keyword in ['积极', '正面', 'positive']):
                return '积极'
            if any(keyword in text for keyword in ['消极', '负面', 'negative']):
                return '消极'
            return '中性'
        df['情感倾向'] = df['情感倾向'].apply(normalize_sentiment)
    # 4. Make sure the destination directory exists.
    output_dir = os.path.dirname(output_file)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # utf-8-sig keeps the headers readable when the CSV is opened in Excel.
    print("正在保存清洗后的数据...")
    df.to_csv(output_file, index=False, encoding='utf-8-sig')
    print(f"数据已成功保存到: {output_file}")
    print(f"保存了 {len(df)} 行清洗后的数据")

    print()
    print("========================================")
    print(" 数据清洗任务完成")
    print("========================================")
except Exception as e:
    print(f"处理文件时出错: {str(e)}")

98
project/data_cleaner_v2.py

@ -0,0 +1,98 @@
import os
import pandas as pd

# Excel source and cleaned CSV destination.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).csv'

print("========================================")
print(" Python 数据清洗脚本 v2")
print("========================================")
print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()

if not os.path.exists(input_file):
    print("错误: 输入文件不存在!")
    print(f"检查路径: {input_file}")
    exit(1)
print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
print(f"文件存在: {os.path.exists(input_file)}")

try:
    print("正在读取Excel文件...")
    # Probe with a 10-row sample first so a structural problem fails fast.
    df = pd.read_excel(input_file, nrows=10)
    print(f"成功读取 {len(df)} 行示例数据")
    print(f"列名: {list(df.columns)}")

    # Then load the full sheet.
    print("正在读取全部数据...")
    df = pd.read_excel(input_file)
    print(f"成功读取 {len(df)} 行完整数据")

    print("正在清洗数据...")
    # 1. Report missing values, then replace them with empty strings.
    print(f"清洗前 - 缺失值统计:")
    print(df.isnull().sum())
    df = df.fillna('')
    # 2. Trim and collapse runs of whitespace in every text column.
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = (df[col].astype(str)
                       .str.strip()
                       .str.replace('\\s+', ' ', regex=True))
    # 3. Map the sentiment column onto the three canonical labels.
    if '情感倾向' in df.columns:
        def normalize_sentiment(sentiment):
            if pd.isna(sentiment) or sentiment == '':
                return '中性'
            text = str(sentiment).lower()
            if any(keyword in text for keyword in ['积极', '正面', 'positive']):
                return '积极'
            if any(keyword in text for keyword in ['消极', '负面', 'negative']):
                return '消极'
            return '中性'
        df['情感倾向'] = df['情感倾向'].apply(normalize_sentiment)
        print("情感倾向规范化完成")
    # 4. Make sure the destination directory exists.
    output_dir = os.path.dirname(output_file)
    print(f"输出目录: {output_dir}")
    print(f"目录存在: {os.path.exists(output_dir)}")
    if not os.path.exists(output_dir):
        print("正在创建输出目录...")
        os.makedirs(output_dir)

    # utf-8-sig keeps the headers readable when the CSV is opened in Excel.
    print("正在保存清洗后的数据...")
    print(f"保存路径: {output_file}")
    df.to_csv(output_file, index=False, encoding='utf-8-sig')

    # Confirm the file actually landed on disk.
    if os.path.exists(output_file):
        print(f"数据已成功保存到: {output_file}")
        print(f"保存文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
        print(f"保存了 {len(df)} 行清洗后的数据")
    else:
        print("错误: 文件保存失败,未找到输出文件")

    print()
    print("========================================")
    print(" 数据清洗任务完成")
    print("========================================")
except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()

11
project/debug_log.txt

@ -0,0 +1,11 @@
开始调试...
当前目录: D:\java\project
pandas导入成功
输入文件: D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx
文件存在: True
文件大小: 21607.43 KB
开始读取...
读取成功: 30308 行
列数: 68
前5列: ['作者', '作者链接', '标题', '内容', 'tag']
调试结束

36
project/debug_process.py

@ -0,0 +1,36 @@
import os
import sys

# Redirect stdout into a log file so the debug output survives the session.
log_file = open(r'D:\java\project\debug_log.txt', 'w', encoding='utf-8')
original_stdout = sys.stdout
sys.stdout = log_file
# Fix: the original never guaranteed stdout restoration — any failure outside
# the inner try/except left stdout redirected and leaked the file handle.
try:
    print("开始调试...")
    print(f"当前目录: {os.getcwd()}")
    try:
        import pandas as pd
        print("pandas导入成功")
        input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
        print(f"输入文件: {input_file}")
        print(f"文件存在: {os.path.exists(input_file)}")
        if os.path.exists(input_file):
            print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
            print("开始读取...")
            # engine pinned to openpyxl for .xlsx
            df = pd.read_excel(input_file, engine='openpyxl')
            print(f"读取成功: {len(df)}")
            print(f"列数: {len(df.columns)}")
            print(f"前5列: {list(df.columns)[:5]}")
    except Exception as e:
        print(f"错误: {e}")
        import traceback
        traceback.print_exc()
    print("调试结束")
finally:
    # Always restore stdout and release the log file, even on failure.
    sys.stdout = original_stdout
    log_file.close()
print("日志已保存")

51
project/debug_script.py

@ -0,0 +1,51 @@
import os
import sys

print("========================================")
print(" 调试脚本")
print("========================================")
print(f"Python版本: {sys.version}")
print(f"当前目录: {os.getcwd()}")
print()

# Verify the two required libraries are importable before touching the file.
print("检查pandas...")
try:
    import pandas as pd
    print(f"pandas版本: {pd.__version__}")
except ImportError as e:
    print(f"pandas未安装: {e}")
    exit(1)

print("\n检查openpyxl...")
try:
    import openpyxl
    print(f"openpyxl版本: {openpyxl.__version__}")
except ImportError as e:
    print(f"openpyxl未安装: {e}")
    exit(1)

# Probe the input workbook: existence, size, and a tiny sample read.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
print(f"\n检查输入文件:")
print(f"路径: {input_file}")
print(f"存在: {os.path.exists(input_file)}")
if os.path.exists(input_file):
    print(f"大小: {os.path.getsize(input_file) / 1024:.2f} KB")
    print("\n尝试读取文件...")
    try:
        sample = pd.read_excel(input_file, nrows=5)  # first 5 rows only
        print(f"成功读取 {len(sample)}")
        print(f"列名: {list(sample.columns)}")
    except Exception as e:
        print(f"读取失败: {e}")
        import traceback
        traceback.print_exc()

print()
print("========================================")
print(" 调试完成")
print("========================================")

50
project/import_data.py

@ -0,0 +1,50 @@
import os
import pandas as pd

# NOTE: input and output point at the same workbook — the script reads it,
# reports its shape, then rewrites it in place.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'

print("========================================")
print(" 数据导入操作")
print("========================================")
print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()

if not os.path.exists(input_file):
    print("错误: 输入文件不存在!")
    exit(1)
print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")

try:
    # Load and describe the workbook.
    print("正在读取数据...")
    data = pd.read_excel(input_file)
    print(f"成功读取 {len(data)} 行数据")
    print(f"列名: {list(data.columns)}")
    print(f"数据类型:")
    print(data.dtypes)
    print("\n前5行数据:")
    print(data.head())

    # Rewrite the same contents to the target path.
    print("\n写入数据到目标文件...")
    data.to_excel(output_file, index=False)
    print(f"数据已成功导入到: {output_file}")
    print(f"总行数: {len(data)}")
    print(f"总列数: {len(data.columns)}")

    print()
    print("========================================")
    print(" 数据导入完成")
    print("========================================")
except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()

17
project/minimal_test.py

@ -0,0 +1,17 @@
import os

# Smallest possible probe: confirm the workbook exists and pandas can read it.
print("测试开始")
print(f"当前目录: {os.getcwd()}")
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
print(f"文件存在: {os.path.exists(input_file)}")
if os.path.exists(input_file):
    print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
    print("尝试读取...")
    try:
        import pandas as pd
        sample = pd.read_excel(input_file, nrows=10)  # a 10-row sample is enough
        print(f"成功读取 {len(sample)}")
        print("测试完成")
    except Exception as e:
        print(f"错误: {e}")

113
project/populate_regression_data.py

@ -0,0 +1,113 @@
import os
import pandas as pd
import openpyxl

# Source workbook and the pre-existing regression workbook to fill in place.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'

print("========================================")
print(" 填充UGC回归数据")
print("========================================")
print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()

# Both workbooks must already exist.
if not os.path.exists(input_file):
    print("错误: 输入文件不存在!")
    exit(1)
if not os.path.exists(output_file):
    print("错误: 输出文件不存在!")
    exit(1)

try:
    print("正在读取原始数据...")
    df = pd.read_excel(input_file)
    print(f"成功读取 {len(df)} 行数据")
    print(f"列名: {list(df.columns)}")

    print("\n打开输出文件...")
    wb = openpyxl.load_workbook(output_file)
    ws = wb.active

    def fill_zero_column(column):
        # Write a literal 0 into every data row of the given sheet column.
        for r in range(2, len(df) + 2):
            ws.cell(row=r, column=column, value=0)

    def copy_numeric_column(series, column):
        # Copy a pandas column into the sheet (data starts at row 2),
        # substituting 0 for missing values.
        for r, value in enumerate(series, 2):
            ws.cell(row=r, column=column, value=0 if pd.isna(value) else float(value))

    print("\n填充数据...")

    # Y column from 'helpfull'.
    print("1. 填充Y列 (helpfull)")
    if 'helpfull' in df.columns:
        copy_numeric_column(df['helpfull'], 1)
        print(f"成功填充 Y 列,共 {len(df)}")
    else:
        print("警告: 未找到 helpfull 列,使用默认值 0")
        fill_zero_column(1)

    # X1 column from the first header containing 评论.
    print("\n2. 填充X1列 (评论总数)")
    comment_columns = [col for col in df.columns if '评论' in col]
    if comment_columns:
        copy_numeric_column(df[comment_columns[0]], 2)
        print(f"成功填充 X1 列,使用列: {comment_columns[0]}")
    else:
        print("警告: 未找到评论列,使用默认值 0")
        fill_zero_column(2)

    # X2..X6 are placeholders (all zero) in this version.
    print("\n3. 计算X2-X6")
    for column, name, desc in [(3, 'X2', '评论长度'), (4, 'X3', '评论复杂度'),
                               (5, 'X4', '评论可读性'), (6, 'X5', '内容情感性'),
                               (7, 'X6', '信息丰富度')]:
        print(f" - 填充{name} ({desc})")
        fill_zero_column(column)

    print("\n4. 保存文件")
    wb.save(output_file)
    print(f"文件已成功保存: {output_file}")
    print(f"总行数: {len(df) + 1} (包括表头)")
    print(f"总列数: 7")

    print()
    print("========================================")
    print(" 任务完成")
    print("========================================")
except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()

156
project/process_300_rows.py

@ -0,0 +1,156 @@
import os
import pandas as pd
import re

print("=" * 60)
print(" 处理前300行数据作为测试")
print("=" * 60)

# Hard-coded source workbook and the 300-row test output.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归_300.xlsx'
print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()

# Only the first 300 rows are loaded for this smoke test.
print("读取前300行数据...")
df = pd.read_excel(input_file, engine='openpyxl', nrows=300)
print(f"成功读取 {len(df)} 行数据")
print(f"原始列数: {len(df.columns)}")

# Locate the helpfulness column (Y), the comment-count column (X1)
# and the numbered comment-content columns by header text.
print("\n识别列...")
helpfull_col = None
comment_count_col = None
comment_cols = []
for col in df.columns:
    col_str = str(col).lower()
    if 'helpfull' in col_str or 'helpful' in col_str:
        helpfull_col = col
        print(f"找到 Y 列 (helpfull): {col}")
    elif '评论总数' in str(col) or '帖子评论总数' in str(col):
        comment_count_col = col
        print(f"找到 X1 列 (评论总数): {col}")
    elif '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6)) and '内容' in str(col):
        comment_cols.append(col)
        print(f"找到评论列 {len(comment_cols)}: {col}")
print(f"\n共找到 {len(comment_cols)} 个评论内容列")

print("\n添加回归数据列...")

# Y (UGC有用性): numeric helpfulness, constant 0 when the column is absent.
print("1. 添加 Y (UGC有用性)")
df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0) if helpfull_col else 0

# X1 (评论数量): numeric comment count, constant 0 when the column is absent.
print("2. 添加 X1 (评论数量)")
df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0) if comment_count_col else 0
# Per-comment metric computation for the regression variables.
def calculate_comment_metrics(content):
    """Compute (length, complexity, sentiment, richness) for one comment.

    Returns:
        tuple: (length, complexity, sentiment, richness) where
            length     - character count with ASCII and full-width spaces removed
            complexity - crude token count (whitespace split)
            sentiment  - 1 positive / 0 neutral / -1 negative by keyword match
            richness   - 0..3: one point each for digits, links, emoji/emoticons
        All zeros for missing or empty content.
    """
    if pd.isna(content) or str(content) in ['None', 'nan', '']:
        return 0, 0, 0, 0
    content = str(content)
    length = len(content.replace(' ', '').replace('\u3000', ''))
    complexity = len(content.split())
    # Keyword lexicons. Some entries are empty strings (likely corrupted
    # Chinese words); they must be skipped below, because '' is a substring
    # of every string — the original check made every comment score +1.
    positive_words = ['', '', '优秀', '喜欢', '满意', '', 'positive', 'good', 'great', 'excellent', 'love', 'like']
    negative_words = ['', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike']
    sentiment = 0
    lower_content = content.lower()
    # Bug fix: guard with `word and ...` so empty entries never match.
    if any(word and word in lower_content for word in positive_words):
        sentiment = 1
    elif any(word and word in lower_content for word in negative_words):
        sentiment = -1
    richness = 0
    if re.search(r'\d', content):
        richness += 1
    if re.search(r'http[s]?://|www\.', content):
        richness += 1
    # NOTE(review): the surrogate-pair alternative never matches Python 3
    # strings (kept for compatibility); the BMP range and emoticons do.
    if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content):
        richness += 1
    return length, complexity, sentiment, richness
# Per-row averages of the per-comment metrics.
print("3. 计算评论相关指标...")
df['X2'] = 0.0
df['X3'] = 0.0
df['X5'] = 0.0
df['X6'] = 0.0
for i in range(len(df)):
    # Gather (length, complexity, sentiment, richness) for every
    # non-empty comment in this row.
    metrics = []
    for col in comment_cols:
        m = calculate_comment_metrics(df.iloc[i].get(col, ''))
        if m[0] > 0:
            metrics.append(m)
    if metrics:
        lengths, complexities, sentiments, richness = zip(*metrics)
        df.loc[i, 'X2'] = sum(lengths) / len(lengths)
        df.loc[i, 'X3'] = sum(complexities) / len(complexities)
        df.loc[i, 'X5'] = sum(sentiments) / len(sentiments)
        df.loc[i, 'X6'] = sum(richness) / len(richness)

# X4 (评论可读性) = X2 / X3, recorded as 0 wherever X3 is 0.
print("4. 计算 X4 (评论可读性)")
df['X4'] = df['X2'].div(df['X3']).where(df['X3'] > 0, 0)

# Coerce the regression columns to clean numerics (NaN / ±inf -> 0).
print("\n5. 数据清洗...")
regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
for col in regression_cols:
    cleaned = pd.to_numeric(df[col], errors='coerce').fillna(0)
    df[col] = cleaned.replace([float('inf'), float('-inf')], 0)

# Report shape and a preview of the new columns.
print("\n6. 验证数据...")
print(f"总行数: {len(df)}")
print(f"总列数: {len(df.columns)}")
print(f"\n回归数据列统计:")
print(df[regression_cols].describe())
print(f"\n前5行回归数据:")
print(df[regression_cols].head())

print("\n7. 保存文件...")
df.to_excel(output_file, index=False, engine='openpyxl')

# Re-read the written workbook to confirm it is intact.
print("\n8. 验证文件...")
if os.path.exists(output_file):
    print(f"文件已成功保存: {output_file}")
    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
    df_check = pd.read_excel(output_file)
    print(f"输出文件行数: {len(df_check)}")
    print(f"输出文件列数: {len(df_check.columns)}")
else:
    print("文件保存失败!")

print()
print("=" * 60)
print(" 任务完成")
print("=" * 60)

200
project/process_actual_data.py

@ -0,0 +1,200 @@
import os
import openpyxl
import re

# Source workbook with raw post data and destination for regression columns.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'

print("========================================")
print(" 根据实际原始数据计算回归数据")
print("========================================")
print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()

if not os.path.exists(input_file):
    print("错误: 输入文件不存在!")
    exit(1)
print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")


def safe_float(value):
    """Coerce a cell value to float; empty/None/non-numeric cells become 0.

    The original `float(v) if v else 0` crashed on non-numeric text cells.
    """
    if not value:
        return 0
    try:
        return float(value)
    except (TypeError, ValueError):
        return 0


try:
    print("正在读取原始数据...")
    wb_input = openpyxl.load_workbook(input_file)
    ws_input = wb_input.active
    print(f"工作表名称: {ws_input.title}")
    print(f"最大行数: {ws_input.max_row}")
    print(f"最大列数: {ws_input.max_column}")

    # Identify the relevant columns by header text.
    print("\n识别列...")
    headers = []
    helpfull_col = None        # Y: helpfulness score
    comment_count_col = None   # X1: total comment count
    comment_cols = []          # numbered comment-text columns
    for col in range(1, ws_input.max_column + 1):
        header = ws_input.cell(row=1, column=col).value
        headers.append(header)
        if header:
            header_str = str(header).lower()
            if 'helpfull' in header_str or 'helpful' in header_str:
                helpfull_col = col
                print(f"找到 Y 列 (helpfull): 列 {col}")
            elif '评论总数' in str(header) or '帖子评论总数' in str(header):
                comment_count_col = col
                print(f"找到 X1 列 (评论总数): 列 {col}")
            elif '评论' in str(header) and any(str(i) in str(header) for i in range(1, 6)):
                comment_cols.append(col)
                print(f"找到评论列 {len(comment_cols)}: 列 {col} - {header}")
    print(f"\n共找到 {len(comment_cols)} 个评论列")

    # Open the existing output workbook, or create one with the header row.
    if os.path.exists(output_file):
        print("\n打开现有输出文件...")
        wb_output = openpyxl.load_workbook(output_file)
        ws_output = wb_output.active
    else:
        print("\n创建新的输出文件...")
        wb_output = openpyxl.Workbook()
        ws_output = wb_output.active
        headers_output = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
        for i, header in enumerate(headers_output, 1):
            ws_output.cell(row=1, column=i, value=header)

    print("\n计算并填充数据...")
    total_rows = ws_input.max_row - 1
    print(f"总数据行数: {total_rows}")
    # Informational only: openpyxl grows the sheet automatically on write.
    if ws_output.max_row < ws_input.max_row:
        print(f"扩展输出文件行数到 {ws_input.max_row}...")

    # Sentiment keyword lexicons, hoisted out of the row loop. Empty entries
    # (corrupted words) are skipped below: '' is a substring of every string
    # and previously forced every non-empty comment to sentiment = 1.
    positive_words = ['', '', '优秀', '喜欢', '满意', '', 'positive', 'good', 'great', 'excellent']
    negative_words = ['', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor']

    for row in range(2, ws_input.max_row + 1):
        if row % 100 == 0:
            print(f"处理到第 {row-1} 行...")
        if row % 1000 == 0:
            print(f"已处理 {row-1} 行,共 {total_rows}")

        # Y (UGC有用性)
        y_value = safe_float(ws_input.cell(row=row, column=helpfull_col).value) if helpfull_col else 0
        ws_output.cell(row=row, column=1, value=y_value)

        # X1 (评论数量)
        x1_value = safe_float(ws_input.cell(row=row, column=comment_count_col).value) if comment_count_col else 0
        ws_output.cell(row=row, column=2, value=x1_value)

        # Per-comment metrics, averaged over this row's non-empty comments.
        comment_lengths = []
        comment_complexities = []
        comment_sentiments = []
        comment_richness = []
        for col in comment_cols:
            content = str(ws_input.cell(row=row, column=col).value)
            if content and content != 'None' and content != 'nan':
                # X2: 评论长度(剔空格后的字符数)
                comment_lengths.append(len(content.replace(' ', '')))
                # X3: 评论复杂度(按空格拆分的分词数)
                comment_complexities.append(len(content.split()))
                # X5: 内容情感性(正面=1、中性=0、负面=-1)
                sentiment = 0
                lower_content = content.lower()
                # Bug fix: `word and ...` so empty lexicon entries never match.
                if any(word and word in lower_content for word in positive_words):
                    sentiment = 1
                elif any(word and word in lower_content for word in negative_words):
                    sentiment = -1
                comment_sentiments.append(sentiment)
                # X6: 信息丰富度(含数字/链接/表情各1分,满分3分)
                richness = 0
                if re.search(r'\d', content):
                    richness += 1
                if re.search(r'http[s]?://', content):
                    richness += 1
                if re.search(r'[\u2600-\u27BF]|[:;][-]?[)D]', content):
                    richness += 1
                comment_richness.append(richness)

        # X2: 评论长度平均值
        x2_value = sum(comment_lengths) / len(comment_lengths) if comment_lengths else 0
        ws_output.cell(row=row, column=3, value=x2_value)
        # X3: 评论复杂度平均值
        x3_value = sum(comment_complexities) / len(comment_complexities) if comment_complexities else 0
        ws_output.cell(row=row, column=4, value=x3_value)
        # X4: 评论可读性(X2/X3,X3为0时记0)
        x4_value = x2_value / x3_value if x3_value > 0 else 0
        ws_output.cell(row=row, column=5, value=x4_value)
        # X5: 内容情感性平均值
        x5_value = sum(comment_sentiments) / len(comment_sentiments) if comment_sentiments else 0
        ws_output.cell(row=row, column=6, value=x5_value)
        # X6: 信息丰富度平均值
        x6_value = sum(comment_richness) / len(comment_richness) if comment_richness else 0
        ws_output.cell(row=row, column=7, value=x6_value)

    print("\n保存文件...")
    wb_output.save(output_file)
    print(f"文件已成功保存: {output_file}")
    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
    print(f"处理完成,共 {total_rows} 行数据")

    # Re-open the written workbook and preview the first rows.
    print("\n验证文件...")
    if os.path.exists(output_file):
        print("文件保存成功!")
        wb_check = openpyxl.load_workbook(output_file)
        ws_check = wb_check.active
        print(f"输出文件行数: {ws_check.max_row - 1}")
        print(f"输出文件列数: {ws_check.max_column}")
        print("\n前5行数据:")
        for row in range(1, min(6, ws_check.max_row + 1)):
            row_data = [ws_check.cell(row=row, column=col).value
                        for col in range(1, ws_check.max_column + 1)]
            print(f"{row}: {row_data}")
    else:
        print("文件保存失败!")

    print()
    print("========================================")
    print(" 任务完成")
    print("========================================")
except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()

190
project/process_all_data.py

@ -0,0 +1,190 @@
import os
import openpyxl
import re

# Hard-coded course-data paths (Windows).
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'
print("========================================")
print(" 处理所有数据")
print("========================================")
print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()
# Abort early if the input workbook is missing.
if not os.path.exists(input_file):
    print("错误: 输入文件不存在!")
    exit(1)
print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
try:
    print("正在读取原始数据...")
    wb_input = openpyxl.load_workbook(input_file)
    ws_input = wb_input.active
    print(f"工作表名称: {ws_input.title}")
    print(f"最大行数: {ws_input.max_row}")
    print(f"最大列数: {ws_input.max_column}")
    # Locate the Y column (helpfull), the comment-count column (X1)
    # and the per-comment text columns by header keywords in row 1.
    print("\n识别列...")
    headers = []
    helpfull_col = None
    comment_count_col = None
    comment_cols = []
    for col in range(1, ws_input.max_column + 1):
        header = ws_input.cell(row=1, column=col).value
        headers.append(header)
        if header:
            header_str = str(header).lower()
            if 'helpfull' in header_str or 'helpful' in header_str:
                helpfull_col = col
                print(f"找到 Y 列 (helpfull): 列 {col}")
            elif '评论总数' in str(header) or '帖子评论总数' in str(header):
                comment_count_col = col
                print(f"找到 X1 列 (评论总数): 列 {col}")
            elif '评论' in str(header) and any(str(i) in str(header) for i in range(1, 6)):
                comment_cols.append(col)
                print(f"找到评论列 {len(comment_cols)}: 列 {col} - {header}")
    print(f"\n共找到 {len(comment_cols)} 个评论列")
    # Fresh output workbook with only the regression columns.
    print("\n创建新的输出文件...")
    wb_output = openpyxl.Workbook()
    ws_output = wb_output.active
    headers_output = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
    for i, header in enumerate(headers_output, 1):
        ws_output.cell(row=1, column=i, value=header)
    print("\n计算并填充数据...")
    total_rows = ws_input.max_row - 1
    print(f"总数据行数: {total_rows}")
    # Sentiment keyword lists, hoisted out of the per-comment loop.
    # BUG FIX: these lists previously contained empty strings, and '' is a
    # substring of every string, so any() was always true and every
    # non-empty comment scored +1. The empty entries (probably Chinese
    # characters lost in an encoding round-trip) are removed.
    positive_words = ['优秀', '喜欢', '满意', 'positive', 'good', 'great', 'excellent']
    negative_words = ['糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor']
    for row in range(2, ws_input.max_row + 1):
        if row % 1000 == 0:
            print(f"处理到第 {row-1} 行...")
        # Y (UGC helpfulness).
        # NOTE(review): float(...) raises on non-numeric text and aborts the
        # whole run via the outer except — confirm the column is numeric.
        if helpfull_col:
            y_value = ws_input.cell(row=row, column=helpfull_col).value
            y_value = float(y_value) if y_value else 0
        else:
            y_value = 0
        ws_output.cell(row=row, column=1, value=y_value)
        # X1 (number of comments).
        if comment_count_col:
            x1_value = ws_input.cell(row=row, column=comment_count_col).value
            x1_value = float(x1_value) if x1_value else 0
        else:
            x1_value = 0
        ws_output.cell(row=row, column=2, value=x1_value)
        # Per-comment metrics accumulated over the comment columns.
        comment_lengths = []
        comment_complexities = []
        comment_sentiments = []
        comment_richness = []
        for col in comment_cols:
            content = str(ws_input.cell(row=row, column=col).value)
            if content and content != 'None' and content != 'nan':
                # X2: character count with spaces removed.
                length = len(content.replace(' ', ''))
                comment_lengths.append(length)
                # X3: whitespace-token count (1 for unsegmented Chinese).
                complexity = len(content.split())
                comment_complexities.append(complexity)
                # X5: keyword sentiment (+1 / 0 / -1).
                sentiment = 0
                lower_content = content.lower()
                if any(word in lower_content for word in positive_words):
                    sentiment = 1
                elif any(word in lower_content for word in negative_words):
                    sentiment = -1
                comment_sentiments.append(sentiment)
                # X6: information richness, one point each for digits /
                # links / emoji-emoticons (max 3).
                richness = 0
                if re.search(r'\d', content):
                    richness += 1
                if re.search(r'http[s]?://', content):
                    richness += 1
                if re.search(r'[\u2600-\u27BF]|[:;][-]?[)D]', content):
                    richness += 1
                comment_richness.append(richness)
        # Row averages; rows with no comments get 0.
        x2_value = sum(comment_lengths) / len(comment_lengths) if comment_lengths else 0
        ws_output.cell(row=row, column=3, value=x2_value)
        x3_value = sum(comment_complexities) / len(comment_complexities) if comment_complexities else 0
        ws_output.cell(row=row, column=4, value=x3_value)
        # X4: readability = X2/X3, 0 when X3 is 0.
        x4_value = x2_value / x3_value if x3_value > 0 else 0
        ws_output.cell(row=row, column=5, value=x4_value)
        x5_value = sum(comment_sentiments) / len(comment_sentiments) if comment_sentiments else 0
        ws_output.cell(row=row, column=6, value=x5_value)
        x6_value = sum(comment_richness) / len(comment_richness) if comment_richness else 0
        ws_output.cell(row=row, column=7, value=x6_value)
    print("\n保存文件...")
    wb_output.save(output_file)
    print(f"文件已成功保存: {output_file}")
    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
    print(f"处理完成,共 {total_rows} 行数据")
    # Re-open the written workbook and echo its shape and first rows.
    print("\n验证文件...")
    if os.path.exists(output_file):
        print("文件保存成功!")
        wb_check = openpyxl.load_workbook(output_file)
        ws_check = wb_check.active
        print(f"输出文件行数: {ws_check.max_row - 1}")
        print(f"输出文件列数: {ws_check.max_column}")
        print("\n前5行数据:")
        for row in range(1, min(6, ws_check.max_row + 1)):
            row_data = []
            for col in range(1, ws_check.max_column + 1):
                value = ws_check.cell(row=row, column=col).value
                row_data.append(value)
            print(f"{row}: {row_data}")
    else:
        print("文件保存失败!")
    print()
    print("========================================")
    print(" 任务完成")
    print("========================================")
except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()

157
project/process_all_rows.py

@ -0,0 +1,157 @@
import os
import pandas as pd
import re

print("=" * 60)
print(" 处理全部数据")
print("=" * 60)
# Hard-coded course-data paths (Windows).
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx'
print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()
# Load the whole sheet in one go.
print("读取全部数据...")
df = pd.read_excel(input_file, engine='openpyxl')
print(f"成功读取 {len(df)} 行数据")
print(f"原始列数: {len(df.columns)}")
# Locate Y (helpfull), X1 (comment count) and the comment-content
# columns by header keywords.
print("\n识别列...")
helpfull_col = None
comment_count_col = None
comment_cols = []
for col in df.columns:
    col_str = str(col).lower()
    if 'helpfull' in col_str or 'helpful' in col_str:
        helpfull_col = col
        print(f"找到 Y 列 (helpfull): {col}")
    elif '评论总数' in str(col) or '帖子评论总数' in str(col):
        comment_count_col = col
        print(f"找到 X1 列 (评论总数): {col}")
    elif '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6)) and '内容' in str(col):
        comment_cols.append(col)
print(f"\n共找到 {len(comment_cols)} 个评论内容列")
# Add the regression columns to the original frame.
print("\n添加回归数据列...")
# Y (UGC helpfulness) as a clean numeric column; 0 if absent.
print("1. 添加 Y (UGC有用性)")
if helpfull_col:
    df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0)
else:
    df['Y'] = 0
# X1 (number of comments) as a clean numeric column; 0 if absent.
print("2. 添加 X1 (评论数量)")
if comment_count_col:
    df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0)
else:
    df['X1'] = 0
# Per-comment metric extractor used to build X2/X3/X5/X6.
def calculate_comment_metrics(content):
    """Return (length, complexity, sentiment, richness) for one comment cell.

    length     - character count with half/full-width spaces removed (X2)
    complexity - whitespace-token count; 1 for unsegmented Chinese (X3)
    sentiment  - keyword match: +1 positive, -1 negative, 0 neutral (X5)
    richness   - 0-3: one point each for digits, URL, emoji/emoticon (X6)

    BUG FIX: the keyword lists previously contained empty strings, and ''
    is a substring of every string, so any() was always true and every
    non-empty comment scored +1. The empty entries (probably Chinese
    characters lost in an encoding round-trip) are removed.
    """
    # Missing cells (NaN / None / 'nan' / '') contribute nothing.
    if pd.isna(content) or str(content) in ['None', 'nan', '']:
        return 0, 0, 0, 0
    content = str(content)
    length = len(content.replace(' ', '').replace('\u3000', ''))
    complexity = len(content.split())
    positive_words = ['优秀', '喜欢', '满意', 'positive', 'good', 'great', 'excellent', 'love', 'like']
    negative_words = ['糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike']
    sentiment = 0
    lower_content = content.lower()
    if any(word in lower_content for word in positive_words):
        sentiment = 1
    elif any(word in lower_content for word in negative_words):
        sentiment = -1
    richness = 0
    if re.search(r'\d', content):  # contains a digit
        richness += 1
    if re.search(r'http[s]?://|www\.', content):  # contains a link
        richness += 1
    # NOTE(review): the surrogate alternative [\uD83C-\uDBFF][\uDC00-\uDFFF]
    # cannot match well-formed Python strings; \U0001F300-\U0001F9FF already
    # covers those emoji. Kept for compatibility with the sibling scripts.
    if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content):
        richness += 1
    return length, complexity, sentiment, richness
# Compute X2/X3/X5/X6 row by row as averages over non-empty comments.
print("3. 计算评论相关指标...")
print(f"总数据行数: {len(df)}")
# Initialise as floats so later .loc assignments keep the dtype stable.
df['X2'] = 0.0
df['X3'] = 0.0
df['X5'] = 0.0
df['X6'] = 0.0
# NOTE(review): df.loc[i, ...] assumes a default RangeIndex, which
# read_excel provides here.
for i in range(len(df)):
    if i % 1000 == 0:
        print(f" 处理第 {i}/{len(df)} 行...")
    lengths = []
    complexities = []
    sentiments = []
    richness = []
    for col in comment_cols:
        content = df.iloc[i].get(col, '')
        length, complexity, sentiment, r = calculate_comment_metrics(content)
        if length > 0:  # only count non-empty comments
            lengths.append(length)
            complexities.append(complexity)
            sentiments.append(sentiment)
            richness.append(r)
    # Rows with no comments keep the 0 defaults.
    if lengths:
        df.loc[i, 'X2'] = sum(lengths) / len(lengths)
        df.loc[i, 'X3'] = sum(complexities) / len(complexities)
        df.loc[i, 'X5'] = sum(sentiments) / len(sentiments)
        df.loc[i, 'X6'] = sum(richness) / len(richness)
# X4: readability = X2/X3, 0 when X3 is 0.
print("4. 计算 X4 (评论可读性)")
df['X4'] = df.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1)
# Coerce every regression column to clean finite numerics.
print("\n5. 数据清洗...")
regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
for col in regression_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
    df[col] = df[col].replace([float('inf'), float('-inf')], 0)
# Echo shape and summary statistics for a sanity check.
print("\n6. 验证数据...")
print(f"总行数: {len(df)}")
print(f"总列数: {len(df.columns)}")
print(f"\n回归数据列统计:")
print(df[regression_cols].describe())
# Write the augmented frame and verify the written file.
print("\n7. 保存文件...")
df.to_excel(output_file, index=False, engine='openpyxl')
print("\n8. 验证文件...")
if os.path.exists(output_file):
    print(f"文件已成功保存: {output_file}")
    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
    df_check = pd.read_excel(output_file)
    print(f"输出文件行数: {len(df_check)}")
    print(f"输出文件列数: {len(df_check.columns)}")
else:
    print("文件保存失败!")
print()
print("=" * 60)
print(" 任务完成")
print("=" * 60)

180
project/process_efficient.py

@ -0,0 +1,180 @@
import os
import pandas as pd
import re

print("=" * 60)
print(" 高效处理全部数据")
print("=" * 60)
# Hard-coded course-data paths (Windows).
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx'
print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()
# Read only the header row to discover the column layout cheaply.
print("1. 读取表头...")
df_header = pd.read_excel(input_file, engine='openpyxl', nrows=0)
print(f"总列数: {len(df_header.columns)}")
# Locate Y (helpfull), X1 (comment count) and the comment-content
# columns by header keywords.
helpfull_col = None
comment_count_col = None
comment_cols = []
for col in df_header.columns:
    col_str = str(col).lower()
    if 'helpfull' in col_str or 'helpful' in col_str:
        helpfull_col = col
        print(f"找到 Y 列 (helpfull): {col}")
    elif '评论总数' in str(col) or '帖子评论总数' in str(col):
        comment_count_col = col
        print(f"找到 X1 列 (评论总数): {col}")
    elif '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6)) and '内容' in str(col):
        comment_cols.append(col)
print(f"共找到 {len(comment_cols)} 个评论内容列")
# Per-comment metric extractor used to build X2/X3/X5/X6.
def calculate_comment_metrics(content):
    """Return (length, complexity, sentiment, richness) for one comment cell.

    length     - character count with half/full-width spaces removed (X2)
    complexity - whitespace-token count; 1 for unsegmented Chinese (X3)
    sentiment  - keyword match: +1 positive, -1 negative, 0 neutral (X5)
    richness   - 0-3: one point each for digits, URL, emoji/emoticon (X6)

    BUG FIX: the keyword lists previously contained empty strings, and ''
    is a substring of every string, so any() was always true and every
    non-empty comment scored +1. The empty entries (probably Chinese
    characters lost in an encoding round-trip) are removed.
    """
    # Missing cells (NaN / None / 'nan' / '') contribute nothing.
    if pd.isna(content) or str(content) in ['None', 'nan', '']:
        return 0, 0, 0, 0
    content = str(content)
    length = len(content.replace(' ', '').replace('\u3000', ''))
    complexity = len(content.split())
    positive_words = ['优秀', '喜欢', '满意', 'positive', 'good', 'great', 'excellent', 'love', 'like']
    negative_words = ['糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike']
    sentiment = 0
    lower_content = content.lower()
    if any(word in lower_content for word in positive_words):
        sentiment = 1
    elif any(word in lower_content for word in negative_words):
        sentiment = -1
    richness = 0
    if re.search(r'\d', content):  # contains a digit
        richness += 1
    if re.search(r'http[s]?://|www\.', content):  # contains a link
        richness += 1
    # NOTE(review): the surrogate alternative [\uD83C-\uDBFF][\uDC00-\uDFFF]
    # cannot match well-formed Python strings; \U0001F300-\U0001F9FF already
    # covers those emoji. Kept for compatibility with the sibling scripts.
    if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content):
        richness += 1
    return length, complexity, sentiment, richness
# Stream the sheet in 5000-row batches to bound memory use.
print("\n2. 分批处理数据...")
batch_size = 5000
batch_num = 0
all_data = []
while True:
    # Batch 0 keeps the header row; later batches skip the header plus
    # all rows already consumed.
    skip_rows = batch_num * batch_size + 1 if batch_num > 0 else 0
    nrows = batch_size
    print(f" 处理批次 {batch_num + 1} (跳过 {skip_rows} 行,读取 {nrows} 行)...")
    try:
        if batch_num == 0:
            df_batch = pd.read_excel(input_file, engine='openpyxl', nrows=nrows)
        else:
            df_batch = pd.read_excel(input_file, engine='openpyxl', skiprows=skip_rows, nrows=nrows, header=None)
            df_batch.columns = df_header.columns
    except Exception as e:
        # NOTE(review): end-of-file and genuine read errors are treated
        # identically here — both just end the loop.
        print(f" 读取完成或出错: {e}")
        break
    if len(df_batch) == 0:
        print(" 没有更多数据")
        break
    print(f" 读取了 {len(df_batch)} 行")
    # Y / X1 as clean numeric columns; 0 if the source column is absent.
    if helpfull_col:
        df_batch['Y'] = pd.to_numeric(df_batch[helpfull_col], errors='coerce').fillna(0)
    else:
        df_batch['Y'] = 0
    if comment_count_col:
        df_batch['X1'] = pd.to_numeric(df_batch[comment_count_col], errors='coerce').fillna(0)
    else:
        df_batch['X1'] = 0
    # Initialise X2/X3/X5/X6 as floats.
    df_batch['X2'] = 0.0
    df_batch['X3'] = 0.0
    df_batch['X5'] = 0.0
    df_batch['X6'] = 0.0
    # Row-wise averages over the non-empty comments of this batch.
    for i in range(len(df_batch)):
        lengths = []
        complexities = []
        sentiments = []
        richness = []
        for col in comment_cols:
            content = df_batch.iloc[i].get(col, '')
            length, complexity, sentiment, r = calculate_comment_metrics(content)
            if length > 0:
                lengths.append(length)
                complexities.append(complexity)
                sentiments.append(sentiment)
                richness.append(r)
        if lengths:
            df_batch.loc[i, 'X2'] = sum(lengths) / len(lengths)
            df_batch.loc[i, 'X3'] = sum(complexities) / len(complexities)
            df_batch.loc[i, 'X5'] = sum(sentiments) / len(sentiments)
            df_batch.loc[i, 'X6'] = sum(richness) / len(richness)
    # X4: readability = X2/X3, 0 when X3 is 0.
    df_batch['X4'] = df_batch.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1)
    # Coerce regression columns to clean finite numerics.
    regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
    for col in regression_cols:
        df_batch[col] = pd.to_numeric(df_batch[col], errors='coerce').fillna(0)
        df_batch[col] = df_batch[col].replace([float('inf'), float('-inf')], 0)
    all_data.append(df_batch)
    batch_num += 1
    print(f" 批次 {batch_num} 完成,当前总行数: {sum(len(d) for d in all_data)}")
# Concatenate batches into the final frame.
# NOTE(review): pd.concat raises on an empty list — confirm the first
# read cannot fail silently.
print("\n3. 合并数据...")
df_final = pd.concat(all_data, ignore_index=True)
print(f"合并后总行数: {len(df_final)}")
print("\n4. 验证数据...")
regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
print(f"总列数: {len(df_final.columns)}")
print(f"\n回归数据列统计:")
print(df_final[regression_cols].describe())
# Write the merged frame and verify the written file.
print("\n5. 保存文件...")
df_final.to_excel(output_file, index=False, engine='openpyxl')
print("\n6. 验证文件...")
if os.path.exists(output_file):
    print(f"文件已成功保存: {output_file}")
    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
    df_check = pd.read_excel(output_file)
    print(f"输出文件行数: {len(df_check)}")
    print(f"输出文件列数: {len(df_check.columns)}")
else:
    print("文件保存失败!")
print()
print("=" * 60)
print(" 任务完成")
print("=" * 60)

177
project/process_large_file.py

@ -0,0 +1,177 @@
import os
import pandas as pd
import re

# Hard-coded course-data paths (Windows).
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'
print("========================================")
print(" 处理大型Excel文件")
print("========================================")
print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()
# Abort early if the input workbook is missing.
if not os.path.exists(input_file):
    print("错误: 输入文件不存在!")
    exit(1)
print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
try:
    print("正在读取原始数据...")
    df = pd.read_excel(input_file, engine='openpyxl')
    print(f"成功读取 {len(df)} 行数据")
    print(f"列名: {list(df.columns)}")
    # Locate Y (helpfull), X1 (comment count) and the comment columns.
    print("\n识别列...")
    helpfull_col = None
    comment_count_col = None
    comment_cols = []
    for col in df.columns:
        col_str = str(col).lower()
        if 'helpfull' in col_str or 'helpful' in col_str:
            helpfull_col = col
            print(f"找到 Y 列 (helpfull): {col}")
        elif '评论总数' in str(col) or '帖子评论总数' in str(col):
            comment_count_col = col
            print(f"找到 X1 列 (评论总数): {col}")
        elif '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6)):
            comment_cols.append(col)
            print(f"找到评论列 {len(comment_cols)}: {col}")
    print(f"\n共找到 {len(comment_cols)} 个评论列")
    # Build a fresh frame holding only the regression columns.
    print("\n创建回归数据...")
    regression_data = pd.DataFrame()
    print("1. 计算 Y (UGC有用性)")
    if helpfull_col:
        regression_data['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0)
    else:
        regression_data['Y'] = 0
    print("2. 计算 X1 (评论数量)")
    if comment_count_col:
        regression_data['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0)
    else:
        regression_data['X1'] = 0
    # Sentiment keyword lists, hoisted out of the per-comment function.
    # BUG FIX: these lists previously contained empty strings, and '' is a
    # substring of every string, so every non-empty comment scored +1.
    # The empty entries (probably characters lost in an encoding
    # round-trip) are removed.
    positive_words = ['优秀', '喜欢', '满意', 'positive', 'good', 'great', 'excellent']
    negative_words = ['糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor']

    def calculate_comment_metrics(content):
        """Return (length, complexity, sentiment, richness) for one comment cell."""
        if pd.isna(content) or str(content) in ['None', 'nan']:
            return 0, 0, 0, 0
        content = str(content)
        # Character count with spaces removed.
        length = len(content.replace(' ', ''))
        # Whitespace-token count (1 for unsegmented Chinese text).
        complexity = len(content.split())
        # Keyword sentiment: +1 / 0 / -1.
        sentiment = 0
        lower_content = content.lower()
        if any(word in lower_content for word in positive_words):
            sentiment = 1
        elif any(word in lower_content for word in negative_words):
            sentiment = -1
        # Richness: one point each for digits / links / emoticons (max 3).
        richness = 0
        if re.search(r'\d', content):
            richness += 1
        if re.search(r'http[s]?://', content):
            richness += 1
        if re.search(r'[\u2600-\u27BF]|[:;][-]?[)D]', content):
            richness += 1
        return length, complexity, sentiment, richness

    print("3. 计算评论相关指标...")
    # FIX: initialise as floats (was int 0) so the .loc float assignments
    # below do not fight the column dtype — consistent with the sibling
    # scripts in this commit.
    regression_data['X2'] = 0.0  # mean comment length
    regression_data['X3'] = 0.0  # mean comment complexity
    regression_data['X5'] = 0.0  # mean sentiment
    regression_data['X6'] = 0.0  # mean information richness
    # Row-wise averages over the non-empty comments.
    total_rows = len(df)
    for i in range(total_rows):
        if i % 1000 == 0:
            print(f"处理到第 {i} 行...")
        lengths = []
        complexities = []
        sentiments = []
        richness = []
        for col in comment_cols:
            content = df.iloc[i].get(col, '')
            length, complexity, sentiment, r = calculate_comment_metrics(content)
            if length > 0:
                lengths.append(length)
                complexities.append(complexity)
                sentiments.append(sentiment)
                richness.append(r)
        # Rows with no comments keep the 0 defaults.
        if lengths:
            regression_data.loc[i, 'X2'] = sum(lengths) / len(lengths)
            regression_data.loc[i, 'X3'] = sum(complexities) / len(complexities)
            regression_data.loc[i, 'X5'] = sum(sentiments) / len(sentiments)
            regression_data.loc[i, 'X6'] = sum(richness) / len(richness)
    # X4: readability = X2/X3, 0 when X3 is 0.
    print("4. 计算 X4 (评论可读性)")
    regression_data['X4'] = regression_data.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1)
    # Coerce every column to clean numerics.
    print("\n5. 数据清洗...")
    for col in regression_data.columns:
        regression_data[col] = pd.to_numeric(regression_data[col], errors='coerce').fillna(0)
    print("\n6. 验证数据...")
    print(f"行数: {len(regression_data)}")
    print(f"列数: {len(regression_data.columns)}")
    print(f"列名: {list(regression_data.columns)}")
    print(f"\n前5行数据:")
    print(regression_data.head())
    # Write the regression frame and verify the written file.
    print("\n7. 保存文件...")
    regression_data.to_excel(output_file, index=False)
    print("\n8. 验证文件...")
    if os.path.exists(output_file):
        print(f"文件已成功保存: {output_file}")
        print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
        df_check = pd.read_excel(output_file)
        print(f"输出文件行数: {len(df_check)}")
        print(f"输出文件列数: {len(df_check.columns)}")
    else:
        print("文件保存失败!")
    print()
    print("========================================")
    print(" 任务完成")
    print("========================================")
except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()

9
project/process_log.txt

@ -0,0 +1,9 @@
========================================
在原表中添加回归数据列
========================================
输入文件: D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx
输出文件: D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx
输入文件大小: 21607.43 KB
正在读取原始数据...

192
project/process_regression_final.py

@ -0,0 +1,192 @@
import os
import pandas as pd
import re

# Hard-coded course-data paths (Windows).
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx'
print("========================================")
print(" 在原表中添加回归数据列")
print("========================================")
print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()
# Abort early if the input workbook is missing.
if not os.path.exists(input_file):
    print("错误: 输入文件不存在!")
    exit(1)
print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
try:
    print("\n正在读取原始数据...")
    df = pd.read_excel(input_file)
    print(f"成功读取 {len(df)} 行数据")
    print(f"原始列数: {len(df.columns)}")
    # Locate Y (helpfull), X1 (comment count) and the comment-content
    # columns by header keywords.
    print("\n识别列...")
    helpfull_col = None
    comment_count_col = None
    comment_cols = []
    for col in df.columns:
        col_str = str(col).lower()
        if 'helpfull' in col_str or 'helpful' in col_str:
            helpfull_col = col
            print(f"找到 Y 列 (helpfull): {col}")
        elif '评论总数' in str(col) or '帖子评论总数' in str(col):
            comment_count_col = col
            print(f"找到 X1 列 (评论总数): {col}")
        elif '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6)) and '内容' in str(col):
            comment_cols.append(col)
            print(f"找到评论列 {len(comment_cols)}: {col}")
    print(f"\n共找到 {len(comment_cols)} 个评论内容列")
    # Add the regression columns to the original frame.
    print("\n添加回归数据列...")
    print("1. 添加 Y (UGC有用性)")
    if helpfull_col:
        df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0)
    else:
        df['Y'] = 0
    print("2. 添加 X1 (评论数量)")
    if comment_count_col:
        df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0)
    else:
        df['X1'] = 0
    # Sentiment keyword lists, hoisted out of the per-comment function.
    # BUG FIX: these lists previously contained empty strings, and '' is a
    # substring of every string, so every non-empty comment scored +1.
    # The empty entries (probably characters lost in an encoding
    # round-trip) are removed.
    positive_words = ['优秀', '喜欢', '满意', 'positive', 'good', 'great', 'excellent', 'love', 'like']
    negative_words = ['糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike']

    def calculate_comment_metrics(content):
        """Return (length, complexity, sentiment, richness) for one comment cell."""
        if pd.isna(content) or str(content) in ['None', 'nan', '']:
            return 0, 0, 0, 0
        content = str(content)
        # X2: character count with half/full-width spaces removed.
        length = len(content.replace(' ', '').replace('\u3000', ''))
        # X3: whitespace-token count (1 for unsegmented Chinese text).
        complexity = len(content.split())
        # X5: keyword sentiment (+1 / 0 / -1).
        sentiment = 0
        lower_content = content.lower()
        if any(word in lower_content for word in positive_words):
            sentiment = 1
        elif any(word in lower_content for word in negative_words):
            sentiment = -1
        # X6: one point each for digits / links / emoji-emoticons (max 3).
        richness = 0
        if re.search(r'\d', content):
            richness += 1
        if re.search(r'http[s]?://|www\.', content):
            richness += 1
        if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content):
            richness += 1
        return length, complexity, sentiment, richness

    print("3. 计算评论相关指标...")
    # Initialise as floats so the .loc assignments keep the dtype stable.
    df['X2'] = 0.0  # mean comment length
    df['X3'] = 0.0  # mean comment complexity
    df['X5'] = 0.0  # mean sentiment
    df['X6'] = 0.0  # mean information richness
    # Row-wise averages over the non-empty comments.
    total_rows = len(df)
    print(f"总数据行数: {total_rows}")
    for i in range(total_rows):
        if i % 1000 == 0:
            print(f" 处理第 {i}/{total_rows} 行...")
        lengths = []
        complexities = []
        sentiments = []
        richness = []
        for col in comment_cols:
            content = df.iloc[i].get(col, '')
            length, complexity, sentiment, r = calculate_comment_metrics(content)
            if length > 0:  # only count non-empty comments
                lengths.append(length)
                complexities.append(complexity)
                sentiments.append(sentiment)
                richness.append(r)
        # Rows with no comments keep the 0 defaults.
        if lengths:
            df.loc[i, 'X2'] = sum(lengths) / len(lengths)
            df.loc[i, 'X3'] = sum(complexities) / len(complexities)
            df.loc[i, 'X5'] = sum(sentiments) / len(sentiments)
            df.loc[i, 'X6'] = sum(richness) / len(richness)
    # X4: readability = X2/X3, 0 when X3 is 0.
    print("4. 计算 X4 (评论可读性)")
    df['X4'] = df.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1)
    # Coerce every regression column to clean finite numerics.
    print("\n5. 数据清洗...")
    regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
    for col in regression_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
        df[col] = df[col].replace([float('inf'), float('-inf')], 0)
    # Echo shape, statistics and a null check for a sanity pass.
    print("\n6. 验证数据...")
    print(f"总行数: {len(df)}")
    print(f"总列数: {len(df.columns)}")
    print(f"\n回归数据列统计:")
    print(df[regression_cols].describe())
    print(f"\n前5行回归数据:")
    print(df[regression_cols].head())
    print(f"\n空值检查:")
    for col in regression_cols:
        null_count = df[col].isnull().sum()
        print(f" {col}: {null_count} 个空值")
    # Write the augmented frame and verify the written file.
    print("\n7. 保存文件...")
    print(f"正在保存到: {output_file}")
    df.to_excel(output_file, index=False, engine='openpyxl')
    print("\n8. 验证文件...")
    if os.path.exists(output_file):
        print(f"文件已成功保存: {output_file}")
        print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
        df_check = pd.read_excel(output_file)
        print(f"输出文件行数: {len(df_check)}")
        print(f"输出文件列数: {len(df_check.columns)}")
        print(f"\n回归数据列: {[col for col in df_check.columns if col in regression_cols]}")
    else:
        print("文件保存失败!")
    print()
    print("========================================")
    print(" 任务完成")
    print("========================================")
    print(f"新文件已保存: {output_file}")
    print(f"包含原始数据的所有列以及新增的Y, X1-X6回归数据列")
except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()

202
project/process_with_csv.py

@ -0,0 +1,202 @@
import os
import pandas as pd
import re

print("=" * 60)
print(" 使用CSV处理回归数据")
print("=" * 60)
# Hard-coded course-data paths (Windows).
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx'
print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()
# Abort early if the input workbook is missing.
if not os.path.exists(input_file):
    print("错误: 输入文件不存在!")
    exit(1)
print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
# Load the whole sheet; bail out on any read failure.
print("\n正在读取原始数据...")
try:
    df = pd.read_excel(input_file, engine='openpyxl')
    print(f"成功读取 {len(df)} 行数据")
    print(f"原始列数: {len(df.columns)}")
except Exception as e:
    print(f"读取失败: {e}")
    exit(1)
# Locate Y (helpfull), X1 (comment count) and the comment-content
# columns by header keywords.
print("\n识别列...")
helpfull_col = None
comment_count_col = None
comment_cols = []
for col in df.columns:
    col_str = str(col).lower()
    if 'helpfull' in col_str or 'helpful' in col_str:
        helpfull_col = col
        print(f"找到 Y 列 (helpfull): {col}")
    elif '评论总数' in str(col) or '帖子评论总数' in str(col):
        comment_count_col = col
        print(f"找到 X1 列 (评论总数): {col}")
    elif '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6)) and '内容' in str(col):
        comment_cols.append(col)
        print(f"找到评论列 {len(comment_cols)}: {col}")
print(f"\n共找到 {len(comment_cols)} 个评论内容列")
# Add the regression columns to the original frame.
print("\n添加回归数据列...")
# Y (UGC helpfulness) as a clean numeric column; 0 if absent.
print("1. 添加 Y (UGC有用性)")
if helpfull_col:
    df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0)
else:
    df['Y'] = 0
# X1 (number of comments) as a clean numeric column; 0 if absent.
print("2. 添加 X1 (评论数量)")
if comment_count_col:
    df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0)
else:
    df['X1'] = 0
# Per-comment metric extractor used to build X2/X3/X5/X6.
def calculate_comment_metrics(content):
    """Return (length, complexity, sentiment, richness) for one comment cell.

    length     - character count with half/full-width spaces removed (X2)
    complexity - whitespace-token count; 1 for unsegmented Chinese (X3)
    sentiment  - keyword match: +1 positive, -1 negative, 0 neutral (X5)
    richness   - 0-3: one point each for digits, URL, emoji/emoticon (X6)

    BUG FIX: the keyword lists previously contained empty strings, and ''
    is a substring of every string, so any() was always true and every
    non-empty comment scored +1. The empty entries (probably Chinese
    characters lost in an encoding round-trip) are removed.
    """
    # Missing cells (NaN / None / 'nan' / '') contribute nothing.
    if pd.isna(content) or str(content) in ['None', 'nan', '']:
        return 0, 0, 0, 0
    content = str(content)
    length = len(content.replace(' ', '').replace('\u3000', ''))
    complexity = len(content.split())
    positive_words = ['优秀', '喜欢', '满意', 'positive', 'good', 'great', 'excellent', 'love', 'like']
    negative_words = ['糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike']
    sentiment = 0
    lower_content = content.lower()
    if any(word in lower_content for word in positive_words):
        sentiment = 1
    elif any(word in lower_content for word in negative_words):
        sentiment = -1
    richness = 0
    if re.search(r'\d', content):  # contains a digit
        richness += 1
    if re.search(r'http[s]?://|www\.', content):  # contains a link
        richness += 1
    # NOTE(review): the surrogate alternative [\uD83C-\uDBFF][\uDC00-\uDFFF]
    # cannot match well-formed Python strings; \U0001F300-\U0001F9FF already
    # covers those emoji. Kept for compatibility with the sibling scripts.
    if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content):
        richness += 1
    return length, complexity, sentiment, richness
# Compute X2/X3/X5/X6 row by row as averages over non-empty comments.
print("3. 计算评论相关指标...")
# Initialise as floats so the .loc assignments keep the dtype stable.
df['X2'] = 0.0  # mean comment length
df['X3'] = 0.0  # mean comment complexity
df['X5'] = 0.0  # mean sentiment
df['X6'] = 0.0  # mean information richness
total_rows = len(df)
print(f"总数据行数: {total_rows}")
for i in range(total_rows):
    if i % 1000 == 0:
        print(f" 处理第 {i}/{total_rows} 行...")
    lengths = []
    complexities = []
    sentiments = []
    richness = []
    for col in comment_cols:
        content = df.iloc[i].get(col, '')
        length, complexity, sentiment, r = calculate_comment_metrics(content)
        if length > 0:  # only count non-empty comments
            lengths.append(length)
            complexities.append(complexity)
            sentiments.append(sentiment)
            richness.append(r)
    # Rows with no comments keep the 0 defaults.
    if lengths:
        df.loc[i, 'X2'] = sum(lengths) / len(lengths)
        df.loc[i, 'X3'] = sum(complexities) / len(complexities)
        df.loc[i, 'X5'] = sum(sentiments) / len(sentiments)
        df.loc[i, 'X6'] = sum(richness) / len(richness)
# X4: readability = X2/X3, 0 when X3 is 0.
print("4. 计算 X4 (评论可读性)")
df['X4'] = df.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1)
# Coerce every regression column to clean finite numerics.
print("\n5. 数据清洗...")
regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
for col in regression_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
    df[col] = df[col].replace([float('inf'), float('-inf')], 0)
# Echo shape, statistics and a null check for a sanity pass.
print("\n6. 验证数据...")
print(f"总行数: {len(df)}")
print(f"总列数: {len(df.columns)}")
print(f"\n回归数据列统计:")
print(df[regression_cols].describe())
print(f"\n前5行回归数据:")
print(df[regression_cols].head())
print(f"\n空值检查:")
for col in regression_cols:
    null_count = df[col].isnull().sum()
    print(f" {col}: {null_count} 个空值")
# Round-trip through a temporary CSV, then convert to xlsx.
print("\n7. 保存为CSV中间文件...")
csv_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\temp_regression.csv'
df.to_csv(csv_file, index=False, encoding='utf-8-sig')
print(f"CSV文件已保存: {csv_file}")
print(f"CSV文件大小: {os.path.getsize(csv_file) / 1024:.2f} KB")
print("\n8. 转换为Excel文件...")
df_csv = pd.read_csv(csv_file, encoding='utf-8-sig')
df_csv.to_excel(output_file, index=False, engine='openpyxl')
# Verify the written file and clean up the temporary CSV.
print("\n9. 验证文件...")
if os.path.exists(output_file):
    print(f"文件已成功保存: {output_file}")
    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
    df_check = pd.read_excel(output_file)
    print(f"输出文件行数: {len(df_check)}")
    print(f"输出文件列数: {len(df_check.columns)}")
    print(f"\n回归数据列: {[col for col in df_check.columns if col in regression_cols]}")
    os.remove(csv_file)
    print(f"\n临时CSV文件已删除")
else:
    print("文件保存失败!")
print()
print("=" * 60)
print(" 任务完成")
print("=" * 60)
print(f"新文件已保存: {output_file}")
print(f"包含原始数据的所有列以及新增的Y, X1-X6回归数据列")

168
project/process_with_pandas.py

@ -0,0 +1,168 @@
import os
import pandas as pd
import re

# Hard-coded course-data paths (Windows).
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'
print("========================================")
print(" 使用pandas处理所有数据")
print("========================================")
print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()
# Abort early if the input workbook is missing.
if not os.path.exists(input_file):
    print("错误: 输入文件不存在!")
    exit(1)
print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
try:
    print("正在读取原始数据...")
    df = pd.read_excel(input_file)
    print(f"成功读取 {len(df)} 行数据")
    print(f"列名: {list(df.columns)}")
    # Locate Y (helpfull), X1 (comment count) and the comment columns.
    print("\n识别列...")
    helpfull_col = None
    comment_count_col = None
    comment_cols = []
    for col in df.columns:
        col_str = str(col).lower()
        if 'helpfull' in col_str or 'helpful' in col_str:
            helpfull_col = col
            print(f"找到 Y 列 (helpfull): {col}")
        elif '评论总数' in str(col) or '帖子评论总数' in str(col):
            comment_count_col = col
            print(f"找到 X1 列 (评论总数): {col}")
        elif '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6)):
            comment_cols.append(col)
            print(f"找到评论列 {len(comment_cols)}: {col}")
    print(f"\n共找到 {len(comment_cols)} 个评论列")
    # Build a fresh frame holding only the regression columns.
    print("\n创建回归数据...")
    regression_data = pd.DataFrame()
    print("1. 计算 Y (UGC有用性)")
    if helpfull_col:
        regression_data['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0)
    else:
        regression_data['Y'] = 0
    print("2. 计算 X1 (评论数量)")
    if comment_count_col:
        regression_data['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0)
    else:
        regression_data['X1'] = 0
    # Sentiment keyword lists, hoisted out of the per-column loop so they
    # are not rebuilt for every cell.
    # BUG FIX: these lists previously contained empty strings, and '' is a
    # substring of every string, so every non-empty comment scored +1.
    # The empty entries (probably characters lost in an encoding
    # round-trip) are removed.
    positive_words = ['优秀', '喜欢', '满意', 'positive', 'good', 'great', 'excellent']
    negative_words = ['糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor']

    def calculate_comment_metrics(row):
        """Collect per-comment metric lists for one DataFrame row.

        Returns (lengths, complexities, sentiments, richness) — one entry
        per non-empty comment column in the row.
        """
        lengths = []
        complexities = []
        sentiments = []
        richness = []
        for col in comment_cols:
            content = str(row.get(col, ''))
            if content and content != 'None' and content != 'nan':
                # Length: character count with spaces removed.
                lengths.append(len(content.replace(' ', '')))
                # Complexity: whitespace-token count.
                complexities.append(len(content.split()))
                # Sentiment: +1 / 0 / -1 by keyword match.
                sentiment = 0
                lower_content = content.lower()
                if any(word in lower_content for word in positive_words):
                    sentiment = 1
                elif any(word in lower_content for word in negative_words):
                    sentiment = -1
                sentiments.append(sentiment)
                # Richness: one point each for digits / links / emoticons.
                r = 0
                if re.search(r'\d', content):
                    r += 1
                if re.search(r'http[s]?://', content):
                    r += 1
                if re.search(r'[\u2600-\u27BF]|[:;][-]?[)D]', content):
                    r += 1
                richness.append(r)
        return lengths, complexities, sentiments, richness

    # Apply once per row, then derive the averaged columns from the tuple.
    print("3. 计算评论相关指标...")
    comment_metrics = df.apply(calculate_comment_metrics, axis=1)
    print("4. 计算 X2 (评论长度)")
    regression_data['X2'] = comment_metrics.apply(lambda x: sum(x[0]) / len(x[0]) if x[0] else 0)
    print("5. 计算 X3 (评论复杂度)")
    regression_data['X3'] = comment_metrics.apply(lambda x: sum(x[1]) / len(x[1]) if x[1] else 0)
    # X4: readability = X2/X3, 0 when X3 is 0.
    print("6. 计算 X4 (评论可读性)")
    regression_data['X4'] = regression_data.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1)
    print("7. 计算 X5 (内容情感性)")
    regression_data['X5'] = comment_metrics.apply(lambda x: sum(x[2]) / len(x[2]) if x[2] else 0)
    print("8. 计算 X6 (信息丰富度)")
    regression_data['X6'] = comment_metrics.apply(lambda x: sum(x[3]) / len(x[3]) if x[3] else 0)
    # Coerce every column to clean numerics.
    print("\n9. 数据清洗...")
    for col in regression_data.columns:
        regression_data[col] = pd.to_numeric(regression_data[col], errors='coerce').fillna(0)
    print("\n10. 验证数据...")
    print(f"行数: {len(regression_data)}")
    print(f"列数: {len(regression_data.columns)}")
    print(f"列名: {list(regression_data.columns)}")
    print(f"数据类型:")
    print(regression_data.dtypes)
    print(f"\n前5行数据:")
    print(regression_data.head())
    # Write the regression frame and verify the written file.
    print("\n11. 保存文件...")
    regression_data.to_excel(output_file, index=False)
    print("\n12. 验证文件...")
    if os.path.exists(output_file):
        print(f"文件已成功保存: {output_file}")
        print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
        df_check = pd.read_excel(output_file)
        print(f"输出文件行数: {len(df_check)}")
        print(f"输出文件列数: {len(df_check.columns)}")
    else:
        print("文件保存失败!")
    print()
    print("========================================")
    print(" 任务完成")
    print("========================================")
except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()

83
project/quick_process.py

@ -0,0 +1,83 @@
import os
import pandas as pd
import re

print("开始处理...")
# Source workbook and destination for the regression-ready copy.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx'
# Load the spreadsheet into a DataFrame.
print("读取数据...")
df = pd.read_excel(input_file)
print(f"读取完成: {len(df)}")
# Locate the columns of interest by header text; next() yields None when
# no header matches (same result as the original any()/[0] dance).
helpfull_col = next((c for c in df.columns if 'helpfull' in str(c).lower()), None)
comment_count_col = next((c for c in df.columns if '评论总数' in str(c)), None)
comment_cols = [c for c in df.columns
                if '评论' in str(c)
                and any(str(i) in str(c) for i in range(1, 6))
                and '内容' in str(c)]
print(f"找到列: Y={helpfull_col}, X1={comment_count_col}, 评论列={len(comment_cols)}")
# Y = helpfulness target, X1 = total comment count; both coerced to numeric.
df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0) if helpfull_col else 0
df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0) if comment_count_col else 0
print("计算评论指标...")
def calc_metrics(content):
    """Score one comment cell.

    Parameters:
        content: raw cell value; NaN / 'None' / 'nan' / '' count as "no comment".

    Returns:
        (length, complexity, sentiment, richness) where
        length     = X2: character count with ASCII/full-width spaces removed,
        complexity = X3: whitespace-delimited token count,
        sentiment  = X5: +1 positive keyword hit / -1 negative / 0 neutral,
        richness   = X6: one point each for digits, links, emoji (max 3).
    """
    if pd.isna(content) or str(content) in ['None', 'nan', '']:
        return 0, 0, 0, 0
    content = str(content)
    # X2: length after stripping normal and full-width (U+3000) spaces.
    length = len(content.replace(' ', '').replace('\u3000', ''))
    # X3: number of space-delimited tokens.
    complexity = len(content.split())
    # X5: keyword sentiment. BUG FIX: the original lists contained empty
    # strings, and '' is a substring of every string, so every non-empty
    # comment was scored +1. Empty entries are removed here.
    pos_words = ['优秀', '喜欢', '满意', 'positive', 'good', 'great', 'excellent']
    neg_words = ['糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor']
    lower_content = content.lower()
    sentiment = 1 if any(w in lower_content for w in pos_words) else (-1 if any(w in lower_content for w in neg_words) else 0)
    # X6: digits / links / emoji, one point each. The original also matched
    # UTF-16 surrogate pairs ([\uD83C-\uDBFF][\uDC00-\uDFFF]), which cannot
    # occur in a Python 3 str, so that dead branch is dropped.
    richness = (1 if re.search(r'\d', content) else 0) \
        + (1 if re.search(r'http[s]?://|www\.', content) else 0) \
        + (1 if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]', content) else 0)
    return length, complexity, sentiment, richness
# Batch-compute the per-post regression features by averaging the
# per-comment metrics over the (up to five) comment-content columns.
x2_list, x3_list, x5_list, x6_list = [], [], [], []
for i in range(len(df)):
    if i % 5000 == 0:
        # Progress heartbeat every 5000 rows.
        print(f"处理 {i}/{len(df)}")
    lengths, complexities, sentiments, richness = [], [], [], []
    for col in comment_cols:
        l, c, s, r = calc_metrics(df.iloc[i].get(col, ''))
        if l > 0:
            # Only comments with actual content contribute to the averages.
            lengths.append(l)
            complexities.append(c)
            sentiments.append(s)
            richness.append(r)
    # Per-post means; a post with no usable comments scores 0.
    x2_list.append(sum(lengths)/len(lengths) if lengths else 0)
    x3_list.append(sum(complexities)/len(complexities) if complexities else 0)
    x5_list.append(sum(sentiments)/len(sentiments) if sentiments else 0)
    x6_list.append(sum(richness)/len(richness) if richness else 0)
df['X2'] = x2_list
df['X3'] = x3_list
df['X5'] = x5_list
df['X6'] = x6_list
# X4 (readability) = X2 / X3, with 0 when there are no tokens (X3 == 0).
df['X4'] = df.apply(lambda r: r['X2']/r['X3'] if r['X3']>0 else 0, axis=1)
# Final clean-up: force numeric, replace NaN and +/-inf with 0.
for col in ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']:
    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).replace([float('inf'), float('-inf')], 0)
print("保存文件...")
df.to_excel(output_file, index=False, engine='openpyxl')
print(f"完成!文件大小: {os.path.getsize(output_file)/1024:.2f} KB")
print(f"行数: {len(df)}, 列数: {len(df.columns)}")

54
project/read_excel_test.py

@ -0,0 +1,54 @@
import os
import openpyxl
# Smoke test: open the raw workbook with openpyxl and dump its dimensions,
# header row and a few data rows so the column layout can be inspected.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
print("========================================")
print(" 读取Excel测试")
print("========================================")
print(f"输入文件: {input_file}")
print()
# Abort early if the workbook is not reachable.
if not os.path.exists(input_file):
    print("错误: 输入文件不存在!")
    exit(1)
print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
# Read the workbook; any openpyxl error is reported with a traceback.
try:
    print("正在读取Excel文件...")
    wb = openpyxl.load_workbook(input_file)
    ws = wb.active
    print(f"工作表名称: {ws.title}")
    print(f"最大行数: {ws.max_row}")
    print(f"最大列数: {ws.max_column}")
    # Print the header row (row 1), one entry per column.
    print("\n表头:")
    headers = []
    for col in range(1, ws.max_column + 1):
        header = ws.cell(row=1, column=col).value
        headers.append(header)
        print(f"{col}. {header}")
    # Show up to 3 data rows (rows 2-4), first 9 columns only.
    print("\n前3行数据:")
    for row in range(2, min(5, ws.max_row + 1)):
        row_data = []
        for col in range(1, min(10, ws.max_column + 1)):
            value = ws.cell(row=row, column=col).value
            row_data.append(value)
        print(f"{row}: {row_data}")
    print("\n========================================")
    print(" 读取完成")
    print("========================================")
except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()

216
project/run_with_output.py

@ -0,0 +1,216 @@
import os
import pandas as pd
import re
import sys
# 重定向输出到文件和屏幕
class Tee:
    """Duplicate every write to several file-like targets (screen + log)."""

    def __init__(self, *files):
        self.files = files

    def write(self, obj):
        # Fan the payload out and flush immediately so progress shows live.
        for sink in self.files:
            sink.write(obj)
            sink.flush()

    def flush(self):
        for sink in self.files:
            sink.flush()
log_file = open(r'D:\java\project\process_log.txt', 'w', encoding='utf-8')
original_stdout = sys.stdout
# Mirror everything printed below to both the console and the log file.
sys.stdout = Tee(original_stdout, log_file)
print("========================================")
print(" 在原表中添加回归数据列")
print("========================================")
# Source workbook and destination for the regression-augmented copy.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx'
print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()
# Abort early (restoring stdout and closing the log) if the input is missing.
if not os.path.exists(input_file):
    print("错误: 输入文件不存在!")
    sys.stdout = original_stdout
    log_file.close()
    exit(1)
print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
# Load and process; errors are reported in the except block below.
try:
    print("\n正在读取原始数据...")
    df = pd.read_excel(input_file)
    print(f"成功读取 {len(df)} 行数据")
    print(f"原始列数: {len(df.columns)}")
    # Scan the headers once to locate the target column (Y / helpfull),
    # the comment-count column (X1) and the comment-content columns.
    print("\n识别列...")
    helpfull_col = None
    comment_count_col = None
    comment_cols = []
    for col in df.columns:
        col_str = str(col).lower()
        if 'helpfull' in col_str or 'helpful' in col_str:
            helpfull_col = col
            print(f"找到 Y 列 (helpfull): {col}")
        elif '评论总数' in str(col) or '帖子评论总数' in str(col):
            comment_count_col = col
            print(f"找到 X1 列 (评论总数): {col}")
        elif '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6)) and '内容' in str(col):
            comment_cols.append(col)
            print(f"找到评论列 {len(comment_cols)}: {col}")
    print(f"\n共找到 {len(comment_cols)} 个评论内容列")
    # Append the regression columns.
    print("\n添加回归数据列...")
    # Y (UGC helpfulness): numeric copy of the helpfull column, else 0.
    print("1. 添加 Y (UGC有用性)")
    if helpfull_col:
        df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0)
    else:
        df['Y'] = 0
    # X1 (comment count): numeric copy of the comment-total column, else 0.
    print("2. 添加 X1 (评论数量)")
    if comment_count_col:
        df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0)
    else:
        df['X1'] = 0
# 定义函数计算评论指标
def calculate_comment_metrics(content):
    """Score one comment cell for the regression features.

    Parameters:
        content: raw cell value; NaN / 'None' / 'nan' / '' count as "no comment".

    Returns:
        (length, complexity, sentiment, richness) where
        length     = X2: character count with ASCII/full-width spaces removed,
        complexity = X3: whitespace-delimited token count,
        sentiment  = X5: +1 positive / 0 neutral / -1 negative keyword hit,
        richness   = X6: one point each for digits, links, emoji/emoticons.
    """
    if pd.isna(content) or str(content) in ['None', 'nan', '']:
        return 0, 0, 0, 0
    content = str(content)
    # X2: comment length (characters after stripping normal/full-width spaces).
    length = len(content.replace(' ', '').replace('\u3000', ''))
    # X3: comment complexity (number of space-delimited tokens).
    complexity = len(content.split())
    # X5: keyword sentiment. BUG FIX: the original lists contained empty
    # strings, and '' is a substring of every string, so every non-empty
    # comment was scored +1. Empty entries are removed here.
    # NOTE(review): positives are checked first, so e.g. 'dislike' matches
    # 'like' and scores +1 — confirm whether that precedence is intended.
    positive_words = ['优秀', '喜欢', '满意', 'positive', 'good', 'great', 'excellent', 'love', 'like']
    negative_words = ['糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike']
    sentiment = 0
    lower_content = content.lower()
    if any(word in lower_content for word in positive_words):
        sentiment = 1
    elif any(word in lower_content for word in negative_words):
        sentiment = -1
    # X6: information richness (digits / links / emoji, one point each, max 3).
    richness = 0
    if re.search(r'\d', content):  # contains a digit
        richness += 1
    if re.search(r'http[s]?://|www\.', content):  # contains a link
        richness += 1
    # Emoji code points or ASCII emoticons like :) ;-D. The original also
    # matched UTF-16 surrogate pairs, which cannot occur in a Python 3 str,
    # so that dead branch is dropped.
    if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[:;][-]?[)D]', content):
        richness += 1
    return length, complexity, sentiment, richness
    # Compute per-comment metrics and average them per post.
    print("3. 计算评论相关指标...")
    # Initialise the feature columns with float zeros.
    df['X2'] = 0.0  # comment length
    df['X3'] = 0.0  # comment complexity
    df['X5'] = 0.0  # sentiment
    df['X6'] = 0.0  # information richness
    # Row-by-row computation with a progress line every 1000 rows.
    total_rows = len(df)
    print(f"总数据行数: {total_rows}")
    for i in range(total_rows):
        if i % 1000 == 0:
            print(f" 处理第 {i}/{total_rows} 行...")
        lengths = []
        complexities = []
        sentiments = []
        richness = []
        for col in comment_cols:
            content = df.iloc[i].get(col, '')
            length, complexity, sentiment, r = calculate_comment_metrics(content)
            if length > 0:  # only comments with actual content are counted
                lengths.append(length)
                complexities.append(complexity)
                sentiments.append(sentiment)
                richness.append(r)
        # Per-post means (posts without comments keep the 0 defaults).
        if lengths:
            df.loc[i, 'X2'] = sum(lengths) / len(lengths)
            df.loc[i, 'X3'] = sum(complexities) / len(complexities)
            df.loc[i, 'X5'] = sum(sentiments) / len(sentiments)
            df.loc[i, 'X6'] = sum(richness) / len(richness)
    # X4: readability = X2/X3 (0 when X3 is 0, avoiding division errors).
    print("4. 计算 X4 (评论可读性)")
    df['X4'] = df.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1)
    # Data cleaning - force every regression cell to a plain finite number.
    print("\n5. 数据清洗...")
    regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
    for col in regression_cols:
        # Coerce to numeric; anything unparsable becomes 0.
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
        # Replace infinities left over from divisions.
        df[col] = df[col].replace([float('inf'), float('-inf')], 0)
    # Sanity-check the result before saving.
    print("\n6. 验证数据...")
    print(f"总行数: {len(df)}")
    print(f"总列数: {len(df.columns)}")
    print(f"\n回归数据列统计:")
    print(df[regression_cols].describe())
    print(f"\n前5行回归数据:")
    print(df[regression_cols].head())
    # Null check per regression column.
    print(f"\n空值检查:")
    for col in regression_cols:
        null_count = df[col].isnull().sum()
        print(f" {col}: {null_count} 个空值")
    # Save the augmented workbook.
    print("\n7. 保存文件...")
    print(f"正在保存到: {output_file}")
    df.to_excel(output_file, index=False, engine='openpyxl')
    # Verify the file landed on disk and can be read back.
    print("\n8. 验证文件...")
    if os.path.exists(output_file):
        print(f"文件已成功保存: {output_file}")
        print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
        # Re-read the saved file as a round-trip check.
        df_check = pd.read_excel(output_file)
        print(f"输出文件行数: {len(df_check)}")
        print(f"输出文件列数: {len(df_check.columns)}")
        print(f"\n回归数据列: {[col for col in df_check.columns if col in regression_cols]}")
    else:
        print("文件保存失败!")
    print()
    print("========================================")
    print(" 任务完成")
    print("========================================")
    print(f"新文件已保存: {output_file}")
    print(f"包含原始数据的所有列以及新增的Y, X1-X6回归数据列")
except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()
finally:
    # Always restore stdout and close the log, even on failure.
    sys.stdout = original_stdout
    log_file.close()
    print("日志已保存到: D:\\java\\project\\process_log.txt")

187
project/simple_add_columns.py

@ -0,0 +1,187 @@
import os
import pandas as pd
import re
print("=" * 60)
print(" 在原表中添加回归数据列")
print("=" * 60)
# Source workbook and destination for the regression-augmented copy.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx'
print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()
# Abort early if the workbook is missing.
if not os.path.exists(input_file):
    print("错误: 输入文件不存在!")
    exit(1)
print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
# Load the spreadsheet.
print("\n正在读取原始数据...")
df = pd.read_excel(input_file)
print(f"成功读取 {len(df)} 行数据")
print(f"原始列数: {len(df.columns)}")
# Scan the headers once to locate the target column (Y / helpfull),
# the comment-count column (X1) and the comment-content columns.
print("\n识别列...")
helpfull_col = None
comment_count_col = None
comment_cols = []
for col in df.columns:
    col_str = str(col).lower()
    if 'helpfull' in col_str or 'helpful' in col_str:
        helpfull_col = col
        print(f"找到 Y 列 (helpfull): {col}")
    elif '评论总数' in str(col) or '帖子评论总数' in str(col):
        comment_count_col = col
        print(f"找到 X1 列 (评论总数): {col}")
    elif '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6)) and '内容' in str(col):
        comment_cols.append(col)
        print(f"找到评论列 {len(comment_cols)}: {col}")
print(f"\n共找到 {len(comment_cols)} 个评论内容列")
# Append the regression columns.
print("\n添加回归数据列...")
# Y (UGC helpfulness): numeric copy of the helpfull column, else 0.
print("1. 添加 Y (UGC有用性)")
if helpfull_col:
    df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0)
else:
    df['Y'] = 0
# X1 (comment count): numeric copy of the comment-total column, else 0.
print("2. 添加 X1 (评论数量)")
if comment_count_col:
    df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0)
else:
    df['X1'] = 0
# 定义函数计算评论指标
def calculate_comment_metrics(content):
    """Score one comment cell for the regression features.

    Parameters:
        content: raw cell value; NaN / 'None' / 'nan' / '' count as "no comment".

    Returns:
        (length, complexity, sentiment, richness) where
        length     = X2: character count with ASCII/full-width spaces removed,
        complexity = X3: whitespace-delimited token count,
        sentiment  = X5: +1 positive / 0 neutral / -1 negative keyword hit,
        richness   = X6: one point each for digits, links, emoji/emoticons.
    """
    if pd.isna(content) or str(content) in ['None', 'nan', '']:
        return 0, 0, 0, 0
    content = str(content)
    # X2: comment length (characters after stripping normal/full-width spaces).
    length = len(content.replace(' ', '').replace('\u3000', ''))
    # X3: comment complexity (number of space-delimited tokens).
    complexity = len(content.split())
    # X5: keyword sentiment. BUG FIX: the original lists contained empty
    # strings, and '' is a substring of every string, so every non-empty
    # comment was scored +1. Empty entries are removed here.
    # NOTE(review): positives are checked first, so e.g. 'dislike' matches
    # 'like' and scores +1 — confirm whether that precedence is intended.
    positive_words = ['优秀', '喜欢', '满意', 'positive', 'good', 'great', 'excellent', 'love', 'like']
    negative_words = ['糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike']
    sentiment = 0
    lower_content = content.lower()
    if any(word in lower_content for word in positive_words):
        sentiment = 1
    elif any(word in lower_content for word in negative_words):
        sentiment = -1
    # X6: information richness (digits / links / emoji, one point each, max 3).
    richness = 0
    if re.search(r'\d', content):  # contains a digit
        richness += 1
    if re.search(r'http[s]?://|www\.', content):  # contains a link
        richness += 1
    # Emoji code points or ASCII emoticons like :) ;-D. The original also
    # matched UTF-16 surrogate pairs, which cannot occur in a Python 3 str,
    # so that dead branch is dropped.
    if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[:;][-]?[)D]', content):
        richness += 1
    return length, complexity, sentiment, richness
# Compute per-comment metrics and average them per post.
print("3. 计算评论相关指标...")
# Initialise the feature columns with float zeros.
df['X2'] = 0.0  # comment length
df['X3'] = 0.0  # comment complexity
df['X5'] = 0.0  # sentiment
df['X6'] = 0.0  # information richness
# Row-by-row computation with a progress line every 1000 rows.
total_rows = len(df)
print(f"总数据行数: {total_rows}")
for i in range(total_rows):
    if i % 1000 == 0:
        print(f" 处理第 {i}/{total_rows} 行...")
    lengths = []
    complexities = []
    sentiments = []
    richness = []
    for col in comment_cols:
        content = df.iloc[i].get(col, '')
        length, complexity, sentiment, r = calculate_comment_metrics(content)
        if length > 0:  # only comments with actual content are counted
            lengths.append(length)
            complexities.append(complexity)
            sentiments.append(sentiment)
            richness.append(r)
    # Per-post means (posts without comments keep the 0 defaults).
    if lengths:
        df.loc[i, 'X2'] = sum(lengths) / len(lengths)
        df.loc[i, 'X3'] = sum(complexities) / len(complexities)
        df.loc[i, 'X5'] = sum(sentiments) / len(sentiments)
        df.loc[i, 'X6'] = sum(richness) / len(richness)
# X4: readability = X2/X3 (0 when X3 is 0, avoiding division errors).
print("4. 计算 X4 (评论可读性)")
df['X4'] = df.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1)
# Data cleaning - force every regression cell to a plain finite number.
print("\n5. 数据清洗...")
regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
for col in regression_cols:
    # Coerce to numeric; anything unparsable becomes 0.
    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
    # Replace infinities left over from divisions.
    df[col] = df[col].replace([float('inf'), float('-inf')], 0)
# Sanity-check the result before saving.
print("\n6. 验证数据...")
print(f"总行数: {len(df)}")
print(f"总列数: {len(df.columns)}")
print(f"\n回归数据列统计:")
print(df[regression_cols].describe())
print(f"\n前5行回归数据:")
print(df[regression_cols].head())
# Null check per regression column.
print(f"\n空值检查:")
for col in regression_cols:
    null_count = df[col].isnull().sum()
    print(f" {col}: {null_count} 个空值")
# Save the augmented workbook.
print("\n7. 保存文件...")
print(f"正在保存到: {output_file}")
df.to_excel(output_file, index=False, engine='openpyxl')
# Verify the file landed on disk and can be read back.
print("\n8. 验证文件...")
if os.path.exists(output_file):
    print(f"文件已成功保存: {output_file}")
    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
    # Re-read the saved file as a round-trip check.
    df_check = pd.read_excel(output_file)
    print(f"输出文件行数: {len(df_check)}")
    print(f"输出文件列数: {len(df_check.columns)}")
    print(f"\n回归数据列: {[col for col in df_check.columns if col in regression_cols]}")
else:
    print("文件保存失败!")
print()
print("=" * 60)
print(" 任务完成")
print("=" * 60)
print(f"新文件已保存: {output_file}")
print(f"包含原始数据的所有列以及新增的Y, X1-X6回归数据列")

100
project/simple_calculate.py

@ -0,0 +1,100 @@
import os
import openpyxl
import re
# Input workbook with the raw post data and the (pre-created) output
# workbook that receives the Y / X1..X6 regression columns.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'
print("========================================")
print(" 简单计算UGC回归数据")
print("========================================")
print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()
# Both files must already exist (the output skeleton is created elsewhere).
if not os.path.exists(input_file):
    print("错误: 输入文件不存在!")
    exit(1)
if not os.path.exists(output_file):
    print("错误: 输出文件不存在!")
    exit(1)
print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
try:
    print("正在读取输入文件...")
    wb_input = openpyxl.load_workbook(input_file)
    ws_input = wb_input.active
    print(f"输入工作表名称: {ws_input.title}")
    print(f"输入文件最大行数: {ws_input.max_row}")
    print(f"输入文件最大列数: {ws_input.max_column}")
    print("\n正在读取输出文件...")
    wb_output = openpyxl.load_workbook(output_file)
    ws_output = wb_output.active
    print(f"输出工作表名称: {ws_output.title}")
    # Identify the relevant column indices from the header row.
    # BUG FIX: these were only assigned inside the loop and then probed
    # with "'helpfull_col' in locals()"; initialise them explicitly so the
    # later checks are plain None tests and cannot hit an unbound name.
    print("\n识别列...")
    helpfull_col = None
    comment_count_col = None
    headers = []
    for col in range(1, ws_input.max_column + 1):
        header = ws_input.cell(row=1, column=col).value
        headers.append(header)
        if header and 'helpfull' in str(header):
            helpfull_col = col
            print(f"找到 helpfull 列: {col}")
        elif header and ('评论总数' in str(header) or '帖子评论总数' in str(header)):
            comment_count_col = col
            print(f"找到评论总数列: {col}")
        elif header and '评论' in str(header):
            print(f"找到评论列: {col} - {header}")
    # Copy Y and X1 into the output sheet; X2..X6 are zero placeholders.
    print("\n计算并填充数据...")
    max_rows = min(ws_input.max_row, 10)  # only the first 10 rows, as a smoke test
    print(f"处理前 {max_rows - 1} 行数据")
    for row in range(2, max_rows + 1):
        print(f"处理行 {row}")
        # Y (UGC helpfulness)
        if helpfull_col is not None:
            y_value = ws_input.cell(row=row, column=helpfull_col).value
            ws_output.cell(row=row, column=1, value=y_value if y_value else 0)
        else:
            ws_output.cell(row=row, column=1, value=0)
        # X1 (comment count)
        if comment_count_col is not None:
            x1_value = ws_input.cell(row=row, column=comment_count_col).value
            ws_output.cell(row=row, column=2, value=x1_value if x1_value else 0)
        else:
            ws_output.cell(row=row, column=2, value=0)
        # X2-X6: zero placeholders for now.
        for col in range(3, 8):
            ws_output.cell(row=row, column=col, value=0)
    # Save and report.
    print("\n保存文件...")
    wb_output.save(output_file)
    print(f"文件已成功保存: {output_file}")
    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
    print()
    print("========================================")
    print(" 任务完成")
    print("========================================")
except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()

41
project/simple_copy.py

@ -0,0 +1,41 @@
import os
import shutil

# Source workbook and the destination copy used by the later scripts.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'

print("========================================")
print(" 简单文件复制脚本")
print("========================================")
print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()

# Bail out early when the source workbook is missing.
if not os.path.exists(input_file):
    print("错误: 输入文件不存在!")
    exit(1)

source_size_kb = os.path.getsize(input_file) / 1024
print(f"文件大小: {source_size_kb:.2f} KB")
print(f"文件存在: {os.path.exists(input_file)}")

# Copy with metadata preserved (copy2 keeps timestamps), then verify.
try:
    print("正在复制文件...")
    shutil.copy2(input_file, output_file)
    if os.path.exists(output_file):
        print(f"文件已成功复制到: {output_file}")
        copy_size_kb = os.path.getsize(output_file) / 1024
        print(f"复制文件大小: {copy_size_kb:.2f} KB")
    else:
        print("错误: 文件复制失败,未找到输出文件")
    print()
    print("========================================")
    print(" 任务完成")
    print("========================================")
except Exception as e:
    print(f"处理文件时出错: {str(e)}")

54
project/simple_data_test.py

@ -0,0 +1,54 @@
import os
import pandas as pd
# Input workbook and (unused in this test) final output path.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'
print("========================================")
print(" 简单数据测试")
print("========================================")
print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()
# Abort early if the workbook is missing.
if not os.path.exists(input_file):
    print("错误: 输入文件不存在!")
    exit(1)
print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
# Round-trip test: read the workbook, write the first 100 rows to a new
# file, then read that file back to confirm Excel I/O works end to end.
try:
    print("正在读取原始数据...")
    df = pd.read_excel(input_file)
    print(f"成功读取 {len(df)} 行数据")
    print(f"列名: {list(df.columns)}")
    # Create a small test file from the first 100 rows.
    print("\n创建测试文件...")
    test_data = df.head(100)  # first 100 rows only
    test_output = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\test_output.xlsx'
    test_data.to_excel(test_output, index=False)
    print(f"测试文件已创建: {test_output}")
    print(f"测试文件大小: {os.path.getsize(test_output) / 1024:.2f} KB")
    # Read the test file back as verification.
    if os.path.exists(test_output):
        df_test = pd.read_excel(test_output)
        print(f"测试文件行数: {len(df_test)}")
        print(f"测试文件列数: {len(df_test.columns)}")
    else:
        print("测试文件创建失败!")
    print()
    print("========================================")
    print(" 测试完成")
    print("========================================")
except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()

57
project/simple_excel_create.py

@ -0,0 +1,57 @@
import os
import openpyxl

# Destination workbook that will hold the Y / X1..X6 regression columns.
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'

print("========================================")
print(" 创建UGC回归数据文件")
print("========================================")
print(f"输出文件: {output_file}")
print()

# Make sure the parent directory exists before trying to save into it.
output_dir = os.path.dirname(output_file)
print(f"输出目录: {output_dir}")
print(f"目录存在: {os.path.exists(output_dir)}")
if not os.path.exists(output_dir):
    print("正在创建输出目录...")
    try:
        os.makedirs(output_dir)
        print("目录创建成功")
    except Exception as e:
        print(f"创建目录失败: {e}")
        exit(1)

# Build a fresh workbook containing only the header row, then save it.
try:
    print("\n创建新的Excel文件...")
    wb = openpyxl.Workbook()
    ws = wb.active
    # Header row: dependent variable Y followed by regressors X1..X6.
    headers = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
    for col_idx, title in enumerate(headers, start=1):
        ws.cell(row=1, column=col_idx, value=title)
    print(f"保存文件到: {output_file}")
    wb.save(output_file)
    # Confirm the workbook actually landed on disk.
    if os.path.exists(output_file):
        print(f"文件已成功创建: {output_file}")
        print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
    else:
        print("错误: 文件创建失败")
    print()
    print("========================================")
    print(" 任务完成")
    print("========================================")
except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()

22
project/simple_test.py

@ -0,0 +1,22 @@
import os

# Quick sanity check that the raw workbook is reachable from this machine.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'

print("========================================")
print(" 简单测试")
print("========================================")
print(f"输入文件: {input_file}")
print()

if os.path.exists(input_file):
    print("文件存在!")
    size_kb = os.path.getsize(input_file) / 1024
    print(f"文件大小: {size_kb:.2f} KB")
else:
    print("文件不存在!")

print()
print("========================================")
print(" 测试完成")
print("========================================")

49
project/test_file_access.py

@ -0,0 +1,49 @@
import os

# Paths probed by this diagnostic script.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'

print("========================================")
print(" 测试文件访问")
print("========================================")
print(f"当前目录: {os.getcwd()}")
print()

# Probe the input workbook.
print("检查输入文件:")
print(f"路径: {input_file}")
print(f"存在: {os.path.exists(input_file)}")
if os.path.exists(input_file):
    print(f"大小: {os.path.getsize(input_file) / 1024:.2f} KB")
else:
    print("文件不存在!")

# Probe the output workbook.
print("\n检查输出文件:")
print(f"路径: {output_file}")
print(f"存在: {os.path.exists(output_file)}")
if os.path.exists(output_file):
    print(f"大小: {os.path.getsize(output_file) / 1024:.2f} KB")
else:
    print("文件不存在!")

# List the first few entries of the data directory for a quick eyeball check.
print("\n检查目录:")
dir_path = os.path.dirname(input_file)
print(f"目录: {dir_path}")
print(f"存在: {os.path.exists(dir_path)}")
if os.path.exists(dir_path):
    print("目录内容:")
    entries = os.listdir(dir_path)
    for entry in entries[:10]:  # show at most ten entries
        entry_path = os.path.join(dir_path, entry)
        entry_kb = os.path.getsize(entry_path) / 1024
        print(f" {entry}: {entry_kb:.2f} KB")
    if len(entries) > 10:
        print(f" ... 还有 {len(entries) - 10} 个文件")

print()
print("========================================")
print(" 测试完成")
print("========================================")

1
w4

@ -0,0 +1 @@
Subproject commit 15d177f1a2a8093521047d866fd50d9b09eb273d
Loading…
Cancel
Save