Compare commits
No commits in common. 'main' and 'master' have entirely different histories.
92 changed files with 94 additions and 6999 deletions
@ -0,0 +1,2 @@ |
|||||
|
本次作业借助豆包AI完成核心任务。首先我上传Python温度转换程序,请求将其完整移植为Java程序,要求保留注释并补充方法说明,AI生成了Java源码框架。随后我遇到文件名后缀错误、JDK环境变量未配置的问题,连续提问排查方法,AI给出了修改和配置指引。最后我请求编写符合作业要求的README模板、梳理Git提交与Innoschool提交流程,AI提供了精准模板和操作步骤,助力我完成作业准备。 |
||||
|
|
||||
@ -1,66 +0,0 @@ |
|||||
// 1. Swimmable: capability interface for anything that can swim.
// The helper types in this file are package-private because a Java source
// file may declare only one public top-level type (AnimalTest, below).
interface Swimmable {
    /** Performs the swimming action. Interface methods are implicitly public abstract. */
    void swim();
}

// 2. Animal: abstract base class; every concrete animal supplies its own sound.
abstract class Animal {
    /** Prints the sound this animal makes. */
    public abstract void makeSound();
}

// 3. Dog: an Animal that can also swim.
class Dog extends Animal implements Swimmable {
    /** Prints the dog's bark. */
    @Override
    public void makeSound() {
        System.out.println("狗叫:汪汪汪!");
    }

    /** Prints how the dog swims. */
    @Override
    public void swim() {
        System.out.println("狗在游泳:狗刨式!");
    }
}

// 4. Cat: an Animal that deliberately does NOT implement Swimmable.
class Cat extends Animal {
    /** Prints the cat's meow. */
    @Override
    public void makeSound() {
        System.out.println("猫叫:喵喵喵!");
    }
}

// 5. Entry point: demonstrates polymorphic dispatch through a superclass
// reference, through an interface reference, and after a checked downcast.
public class AnimalTest {
    /**
     * Runs the polymorphism demo and prints the results to standard output.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // Polymorphism 1: superclass references pointing at subclass objects.
        Animal dog1 = new Dog();
        Animal cat1 = new Cat();

        System.out.println("=== Animal多态调用makeSound() ===");
        dog1.makeSound(); // dispatches to Dog.makeSound()
        cat1.makeSound(); // dispatches to Cat.makeSound()

        // Polymorphism 2: interface reference pointing at an implementing object.
        Swimmable dog2 = new Dog();
        System.out.println("\n=== Swimmable多态调用swim() ===");
        dog2.swim(); // dispatches to Dog.swim()

        // Downcast: view the Animal-typed dog1 as a Swimmable.
        System.out.println("\n=== 类型转换后调用swim() ===");
        if (dog1 instanceof Swimmable) { // guard against ClassCastException
            Swimmable swimmableDog = (Swimmable) dog1;
            swimmableDog.swim();
        }

        // cat1 is intentionally not cast: Cat does not implement Swimmable, so
        // an unchecked (Swimmable) cast on it would throw ClassCastException.
    }
}
|
||||
Binary file not shown.
@ -1,63 +0,0 @@ |
|||||
/**
 * A simple bank account with a fixed account number, an editable owner name
 * and a balance that changes only through {@link #deposit} and {@link #withdraw}.
 * Every operation reports its outcome on standard output.
 */
public class BankAccount {
    private final String accountNumber;
    private String ownerName;
    private double balance;

    /**
     * Opens an account with a zero balance.
     *
     * @param accountNumber identifier of the account; cannot be changed later
     * @param ownerName     name of the account holder
     */
    public BankAccount(String accountNumber, String ownerName) {
        this.accountNumber = accountNumber;
        this.ownerName = ownerName;
        this.balance = 0.0;
    }

    /** @return the account number */
    public String getAccountNumber() {
        return accountNumber;
    }

    /** @return the current owner name */
    public String getOwnerName() {
        return ownerName;
    }

    /** @return the current balance */
    public double getBalance() {
        return balance;
    }

    /** @param ownerName the new owner name */
    public void setOwnerName(String ownerName) {
        this.ownerName = ownerName;
    }

    /**
     * Adds {@code amount} to the balance when it is greater than zero;
     * otherwise prints a rejection message and leaves the balance untouched.
     *
     * @param amount the amount to deposit
     */
    public void deposit(double amount) {
        if (!(amount > 0)) {
            System.out.println("存款金额必须大于 0");
            return;
        }
        balance = balance + amount;
        System.out.println("存款成功!当前余额:" + balance);
    }

    /**
     * Subtracts {@code amount} from the balance when it is greater than zero
     * and does not exceed the balance; otherwise prints the reason for the
     * rejection and leaves the balance untouched.
     *
     * @param amount the amount to withdraw
     */
    public void withdraw(double amount) {
        if (!(amount > 0)) {
            System.out.println("取款金额必须大于 0");
            return;
        }
        if (!(amount <= balance)) {
            System.out.println("余额不足,无法取款");
            return;
        }
        balance = balance - amount;
        System.out.println("取款成功!当前余额:" + balance);
    }

    /** Prints account number, owner and balance, followed by a blank line. */
    public void displayInfo() {
        System.out.println("账号:" + accountNumber);
        System.out.println("户主:" + ownerName);
        System.out.println("余额:" + balance);
        System.out.println();
    }
}
|
||||
Binary file not shown.
@ -1,29 +0,0 @@ |
|||||
public class TestBankAccount { |
|
||||
public static void main(String[] args) { |
|
||||
// 创建银行账户
|
|
||||
BankAccount account = new BankAccount("123456789", "张三"); |
|
||||
|
|
||||
// 显示初始账户信息
|
|
||||
System.out.println("初始账户信息:"); |
|
||||
account.displayInfo(); |
|
||||
|
|
||||
// 测试存款
|
|
||||
System.out.println("测试存款:"); |
|
||||
account.deposit(1000); |
|
||||
account.deposit(-500); // 测试非法存款金额
|
|
||||
|
|
||||
// 测试取款
|
|
||||
System.out.println("测试取款:"); |
|
||||
account.withdraw(500); |
|
||||
account.withdraw(1000); // 测试余额不足
|
|
||||
account.withdraw(-200); // 测试非法取款金额
|
|
||||
|
|
||||
// 测试修改户主姓名
|
|
||||
System.out.println("测试修改户主姓名:"); |
|
||||
account.setOwnerName("李四"); |
|
||||
account.displayInfo(); |
|
||||
|
|
||||
// 测试查询余额
|
|
||||
System.out.println("当前余额:" + account.getBalance()); |
|
||||
} |
|
||||
} |
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -1,155 +0,0 @@ |
|||||
import java.io.BufferedReader; |
|
||||
import java.io.FileWriter; |
|
||||
import java.io.IOException; |
|
||||
import java.io.InputStreamReader; |
|
||||
import java.net.HttpURLConnection; |
|
||||
import java.net.URL; |
|
||||
import java.util.ArrayList; |
|
||||
import java.util.HashMap; |
|
||||
import java.util.List; |
|
||||
import java.util.Map; |
|
||||
|
|
||||
/**
 * Dependency-free crawler: downloads the IMDb top chart page, extracts
 * title / year / rating for the first rows with plain string searches, writes
 * them to a CSV-style text file and prints simple distributions.
 */
public class SimpleMovieCrawler {

    /** Page that is downloaded by {@link #crawlMovies()}. */
    private static final String TOP_CHART_URL = "https://www.imdb.com/chart/top/";

    /** Maximum number of table rows turned into movies. */
    private static final int MAX_MOVIES = 20;

    private static final String TABLE_OPEN = "<tbody class=\"lister-list\">";
    private static final String YEAR_OPEN = "<span class=\"secondaryInfo\">";
    private static final String RATING_OPEN = "<strong>";

    /**
     * Crawls the chart, saves it to {@code movies.txt} and prints the analysis.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        try {
            // 1. Fetch the movie data.
            List<Movie> movies = crawlMovies();
            System.out.println("爬取完成,共获取 " + movies.size() + " 部电影数据");

            // 2. Save it to a file.
            saveToFile(movies, "movies.txt");

            // 3. Analyse it.
            analyzeData(movies);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Downloads the chart page and parses up to {@value #MAX_MOVIES} movies from it.
     *
     * @return the parsed movies; empty when the expected table is not in the page
     * @throws IOException if the page cannot be downloaded
     */
    public static List<Movie> crawlMovies() throws IOException {
        return parseMovies(fetchPage(TOP_CHART_URL));
    }

    /** Downloads {@code url} and returns the body with line breaks removed. */
    private static String fetchPage(String url) throws IOException {
        HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection();
        try {
            connection.setRequestMethod("GET");
            connection.setRequestProperty("User-Agent", "Mozilla/5.0");

            StringBuilder content = new StringBuilder();
            // Decode explicitly as UTF-8 instead of the platform default charset;
            // try-with-resources closes the stream even when reading fails.
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                    connection.getInputStream(), java.nio.charset.StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    content.append(line);
                }
            }
            return content.toString();
        } finally {
            connection.disconnect();
        }
    }

    /**
     * Extracts movies from the HTML of the chart page. A real project should
     * use an HTML parser such as Jsoup; this is a deliberately simple scan.
     *
     * @param html the page markup
     * @return movies that have a title, in page order, at most {@value #MAX_MOVIES}
     */
    static List<Movie> parseMovies(String html) {
        List<Movie> movies = new ArrayList<>();
        int start = html.indexOf(TABLE_OPEN);
        if (start == -1) {
            return movies;
        }
        int end = html.indexOf("</tbody>", start);
        if (end == -1) {
            return movies;
        }

        // rows[0] is the text before the first <tr>, so real rows start at 1.
        String[] rows = html.substring(start, end).split("<tr>");
        int stop = Math.min(rows.length, MAX_MOVIES + 1);
        for (int i = 1; i < stop; i++) {
            Movie movie = parseRow(rows[i]);
            if (movie.getTitle() != null) {
                movies.add(movie);
            }
        }
        return movies;
    }

    /** Builds a movie from one table row; fields that are not found stay null. */
    private static Movie parseRow(String row) {
        Movie movie = new Movie();

        // Title: the text of the first anchor in the row.
        int anchorStart = row.indexOf("<a href=");
        int anchorEnd = anchorStart == -1 ? -1 : row.indexOf("</a>", anchorStart);
        if (anchorEnd != -1) {
            String anchor = row.substring(anchorStart, anchorEnd);
            int tagClose = anchor.indexOf('>');
            // Test the raw index: adding 1 first would hide the "not found" -1.
            if (tagClose != -1) {
                movie.setTitle(anchor.substring(tagClose + 1).trim());
            }
        }

        // Year: skip the whole opening tag (by its length, not a hand-counted
        // offset) so no '>' leaks into the value, then drop the parentheses.
        String year = textBetween(row, YEAR_OPEN, "</span>");
        if (year != null) {
            movie.setYear(year.replaceAll("[()]", "").trim());
        }

        // Rating.
        String rating = textBetween(row, RATING_OPEN, "</strong>");
        if (rating != null) {
            movie.setRating(rating.trim());
        }
        return movie;
    }

    /** Returns the text between the first {@code open} and the next {@code close}, or null. */
    private static String textBetween(String text, String open, String close) {
        int openAt = text.indexOf(open);
        if (openAt == -1) {
            return null;
        }
        int from = openAt + open.length();
        int closeAt = text.indexOf(close, from);
        return closeAt == -1 ? null : text.substring(from, closeAt);
    }

    /**
     * Writes the movies as comma-separated lines under a {@code Title,Rating,Year} header.
     *
     * @param movies   movies to write
     * @param fileName path of the file to create or overwrite
     * @throws IOException if the file cannot be written
     */
    public static void saveToFile(List<Movie> movies, String fileName) throws IOException {
        // try-with-resources closes the file even when a write fails.
        try (FileWriter writer = new FileWriter(fileName)) {
            writer.write("Title,Rating,Year\n");
            for (Movie movie : movies) {
                writer.write(csvField(movie.getTitle()) + "," + csvField(movie.getRating())
                        + "," + csvField(movie.getYear()) + "\n");
            }
        }
        System.out.println("数据已保存到: " + fileName);
    }

    /**
     * Formats one CSV field: null becomes empty, and values containing a
     * comma, quote or line break are quoted so titles with commas keep the
     * row at three columns.
     */
    private static String csvField(String value) {
        if (value == null) {
            return "";
        }
        boolean needsQuotes = value.indexOf(',') >= 0 || value.indexOf('"') >= 0
                || value.indexOf('\n') >= 0 || value.indexOf('\r') >= 0;
        return needsQuotes ? "\"" + value.replace("\"", "\"\"") + "\"" : value;
    }

    /**
     * Prints how many movies share each rating, and the ten most common years.
     * Movies whose rating (or year) is missing are left out of that tally.
     *
     * @param movies movies to analyse
     */
    public static void analyzeData(List<Movie> movies) {
        System.out.println("\n=== 电影数据分析 ===");

        Map<String, Integer> ratingDist = new HashMap<>();
        Map<String, Integer> yearDist = new HashMap<>();
        for (Movie movie : movies) {
            tally(ratingDist, movie.getRating());
            tally(yearDist, movie.getYear());
        }

        System.out.println("\n1. 评分分布:");
        for (Map.Entry<String, Integer> entry : ratingDist.entrySet()) {
            System.out.println("评分 " + entry.getKey() + ": " + entry.getValue() + " 部");
        }

        System.out.println("\n2. 年份分布:");
        yearDist.entrySet().stream()
                .sorted(Map.Entry.<String, Integer>comparingByValue().reversed())
                .limit(10)
                .forEach(entry -> System.out.println(entry.getKey() + "年: " + entry.getValue() + " 部"));
    }

    /** Increments the counter for {@code key}; null keys are ignored. */
    private static void tally(Map<String, Integer> counts, String key) {
        if (key != null) {
            counts.merge(key, 1, Integer::sum);
        }
    }

    /** Mutable holder for the three scraped fields of a movie. */
    static class Movie {
        private String title;
        private String rating;
        private String year;

        public String getTitle() { return title; }
        public void setTitle(String title) { this.title = title; }
        public String getRating() { return rating; }
        public void setRating(String rating) { this.rating = rating; }
        public String getYear() { return year; }
        public void setYear(String year) { this.year = year; }
    }
}
|
||||
@ -1 +0,0 @@ |
|||||
Title,Rating,Year |
|
||||
@ -1,51 +0,0 @@ |
|||||
<?xml version="1.0" encoding="UTF-8"?> |
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" |
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" |
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> |
|
||||
<modelVersion>4.0.0</modelVersion> |
|
||||
|
|
||||
<groupId>com.example</groupId> |
|
||||
<artifactId>movie-crawler</artifactId> |
|
||||
<version>1.0-SNAPSHOT</version> |
|
||||
|
|
||||
<properties> |
|
||||
<maven.compiler.source>11</maven.compiler.source> |
|
||||
<maven.compiler.target>11</maven.compiler.target> |
|
||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> |
|
||||
</properties> |
|
||||
|
|
||||
<dependencies> |
|
||||
<!-- Jsoup for HTML parsing --> |
|
||||
<dependency> |
|
||||
<groupId>org.jsoup</groupId> |
|
||||
<artifactId>jsoup</artifactId> |
|
||||
<version>1.17.2</version> |
|
||||
</dependency> |
|
||||
<!-- JFreeChart for chart generation --> |
|
||||
<dependency> |
|
||||
<groupId>org.jfree</groupId> |
|
||||
<artifactId>jfreechart</artifactId> |
|
||||
<version>1.5.4</version> |
|
||||
</dependency> |
|
||||
<!-- Commons CSV for CSV handling --> |
|
||||
<dependency> |
|
||||
<groupId>org.apache.commons</groupId> |
|
||||
<artifactId>commons-csv</artifactId> |
|
||||
<version>1.10.0</version> |
|
||||
</dependency> |
|
||||
</dependencies> |
|
||||
|
|
||||
<build> |
|
||||
<plugins> |
|
||||
<plugin> |
|
||||
<groupId>org.apache.maven.plugins</groupId> |
|
||||
<artifactId>maven-compiler-plugin</artifactId> |
|
||||
<version>3.11.0</version> |
|
||||
<configuration> |
|
||||
<source>11</source> |
|
||||
<target>11</target> |
|
||||
</configuration> |
|
||||
</plugin> |
|
||||
</plugins> |
|
||||
</build> |
|
||||
</project> |
|
||||
@ -1,38 +0,0 @@ |
|||||
@echo off

rem Create the lib directory that holds the downloaded dependency jars
if not exist lib mkdir lib

rem Download Jsoup
if not exist lib\jsoup-1.17.2.jar (
    echo 下载 Jsoup...
    powershell -Command "Invoke-WebRequest -Uri 'https://repo1.maven.org/maven2/org/jsoup/jsoup/1.17.2/jsoup-1.17.2.jar' -OutFile 'lib\jsoup-1.17.2.jar'"
)

rem Download JFreeChart
if not exist lib\jfreechart-1.5.4.jar (
    echo 下载 JFreeChart...
    powershell -Command "Invoke-WebRequest -Uri 'https://repo1.maven.org/maven2/org/jfree/jfreechart/1.5.4/jfreechart-1.5.4.jar' -OutFile 'lib\jfreechart-1.5.4.jar'"
)

rem Download JCommon (downloaded alongside JFreeChart)
if not exist lib\jcommon-1.0.24.jar (
    echo 下载 JCommon...
    powershell -Command "Invoke-WebRequest -Uri 'https://repo1.maven.org/maven2/org/jfree/jcommon/1.0.24/jcommon-1.0.24.jar' -OutFile 'lib\jcommon-1.0.24.jar'"
)

rem Download Commons CSV
if not exist lib\commons-csv-1.10.0.jar (
    echo 下载 Commons CSV...
    powershell -Command "Invoke-WebRequest -Uri 'https://repo1.maven.org/maven2/org/apache/commons/commons-csv/1.10.0/commons-csv-1.10.0.jar' -OutFile 'lib\commons-csv-1.10.0.jar'"
)

rem Compile the project. Make sure the output directory exists, and read the
rem sources as UTF-8 to match project.build.sourceEncoding in pom.xml.
echo 编译项目...
if not exist bin mkdir bin
javac -encoding UTF-8 -cp "lib/*" -d bin src\main\java\com\example\*.java src\main\java\com\example\model\*.java src\main\java\com\example\crawler\*.java src\main\java\com\example\processor\*.java src\main\java\com\example\analyzer\*.java src\main\java\com\example\chart\*.java

rem Do not run stale or missing classes when compilation failed
if errorlevel 1 (
    echo Compilation failed - the project was not run.
    pause
    exit /b 1
)

rem Run the project
echo 运行项目...
java -cp "bin;lib/*" com.example.Main

pause
|
||||
@ -1,62 +0,0 @@ |
|||||
package com.example; |
|
||||
|
|
||||
import com.example.analyzer.MovieAnalyzer; |
|
||||
import com.example.chart.ChartGenerator; |
|
||||
import com.example.crawler.MovieCrawler; |
|
||||
import com.example.model.Movie; |
|
||||
import com.example.processor.DataProcessor; |
|
||||
|
|
||||
import java.io.IOException; |
|
||||
import java.util.List; |
|
||||
|
|
||||
public class Main { |
|
||||
public static void main(String[] args) { |
|
||||
try { |
|
||||
// 1. 初始化爬虫
|
|
||||
MovieCrawler crawler = new MovieCrawler(); |
|
||||
System.out.println("开始爬取 IMDb Top 250 电影数据..."); |
|
||||
|
|
||||
// 2. 抓取电影数据(限制为50部)
|
|
||||
List<Movie> movies = crawler.crawlTopMovies(50); |
|
||||
System.out.println("爬取完成,共获取 " + movies.size() + " 部电影数据"); |
|
||||
|
|
||||
// 3. 数据处理与存储
|
|
||||
DataProcessor processor = new DataProcessor(); |
|
||||
String csvFilePath = "movies.csv"; |
|
||||
processor.saveMoviesToCsv(movies, csvFilePath); |
|
||||
|
|
||||
// 4. 数据分析
|
|
||||
MovieAnalyzer analyzer = new MovieAnalyzer(); |
|
||||
analyzer.printStatistics(movies); |
|
||||
|
|
||||
// 5. 图表生成
|
|
||||
ChartGenerator chartGenerator = new ChartGenerator(); |
|
||||
|
|
||||
// 生成评分分布图表
|
|
||||
chartGenerator.generateRatingDistributionChart( |
|
||||
analyzer.analyzeRatingDistribution(movies), |
|
||||
"rating_distribution.png" |
|
||||
); |
|
||||
|
|
||||
// 生成类型分布图表
|
|
||||
chartGenerator.generateGenreDistributionChart( |
|
||||
analyzer.analyzeGenreDistribution(movies), |
|
||||
"genre_distribution.png" |
|
||||
); |
|
||||
|
|
||||
// 生成导演作品数图表
|
|
||||
chartGenerator.generateDirectorWorksChart( |
|
||||
analyzer.analyzeDirectorWorks(movies), |
|
||||
"director_works.png" |
|
||||
); |
|
||||
|
|
||||
System.out.println("\n项目执行完成!"); |
|
||||
System.out.println("数据已保存到: " + csvFilePath); |
|
||||
System.out.println("图表已生成到当前目录"); |
|
||||
|
|
||||
} catch (IOException e) { |
|
||||
System.out.println("执行过程中出现错误: " + e.getMessage()); |
|
||||
e.printStackTrace(); |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
@ -1,94 +0,0 @@ |
|||||
package com.example.analyzer; |
|
||||
|
|
||||
import com.example.model.Movie; |
|
||||
|
|
||||
import java.util.*; |
|
||||
import java.util.stream.Collectors; |
|
||||
|
|
||||
public class MovieAnalyzer { |
|
||||
|
|
||||
// 统计评分分布
|
|
||||
public Map<String, Integer> analyzeRatingDistribution(List<Movie> movies) { |
|
||||
return movies.stream() |
|
||||
.collect(Collectors.groupingBy(Movie::getRating, Collectors.summingInt(e -> 1))); |
|
||||
} |
|
||||
|
|
||||
// 统计年份与评分的关系
|
|
||||
public Map<String, Double> analyzeYearRatingRelation(List<Movie> movies) { |
|
||||
return movies.stream() |
|
||||
.collect(Collectors.groupingBy(Movie::getYear, |
|
||||
Collectors.averagingDouble(m -> Double.parseDouble(m.getRating())))); |
|
||||
} |
|
||||
|
|
||||
// 统计导演作品数排行
|
|
||||
public Map<String, Integer> analyzeDirectorWorks(List<Movie> movies) { |
|
||||
return movies.stream() |
|
||||
.collect(Collectors.groupingBy(Movie::getDirector, Collectors.summingInt(e -> 1))) |
|
||||
.entrySet().stream() |
|
||||
.sorted(Map.Entry.<String, Integer>comparingByValue().reversed()) |
|
||||
.limit(10) |
|
||||
.collect(Collectors.toMap( |
|
||||
Map.Entry::getKey, |
|
||||
Map.Entry::getValue, |
|
||||
(e1, e2) -> e1, |
|
||||
LinkedHashMap::new |
|
||||
)); |
|
||||
} |
|
||||
|
|
||||
// 统计类型分布
|
|
||||
public Map<String, Integer> analyzeGenreDistribution(List<Movie> movies) { |
|
||||
Map<String, Integer> genreCount = new HashMap<>(); |
|
||||
|
|
||||
for (Movie movie : movies) { |
|
||||
String genre = movie.getGenre(); |
|
||||
if (genre != null && !genre.isEmpty()) { |
|
||||
String[] genres = genre.split(", "); |
|
||||
for (String g : genres) { |
|
||||
genreCount.put(g, genreCount.getOrDefault(g, 0) + 1); |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
return genreCount.entrySet().stream() |
|
||||
.sorted(Map.Entry.<String, Integer>comparingByValue().reversed()) |
|
||||
.limit(10) |
|
||||
.collect(Collectors.toMap( |
|
||||
Map.Entry::getKey, |
|
||||
Map.Entry::getValue, |
|
||||
(e1, e2) -> e1, |
|
||||
LinkedHashMap::new |
|
||||
)); |
|
||||
} |
|
||||
|
|
||||
// 打印统计结果
|
|
||||
public void printStatistics(List<Movie> movies) { |
|
||||
System.out.println("\n=== 电影数据分析结果 ==="); |
|
||||
|
|
||||
// 评分分布
|
|
||||
System.out.println("\n1. 评分分布:"); |
|
||||
Map<String, Integer> ratingDist = analyzeRatingDistribution(movies); |
|
||||
ratingDist.forEach((rating, count) -> |
|
||||
System.out.printf("评分 %.1f: %d 部\n", Double.parseDouble(rating), count)); |
|
||||
|
|
||||
// 年份与评分关系(前10年)
|
|
||||
System.out.println("\n2. 年份与平均评分(前10年):"); |
|
||||
Map<String, Double> yearRating = analyzeYearRatingRelation(movies); |
|
||||
yearRating.entrySet().stream() |
|
||||
.sorted(Map.Entry.<String, Double>comparingByValue().reversed()) |
|
||||
.limit(10) |
|
||||
.forEach(entry -> |
|
||||
System.out.printf("%s年: %.2f\n", entry.getKey(), entry.getValue())); |
|
||||
|
|
||||
// 导演作品数排行
|
|
||||
System.out.println("\n3. 导演作品数排行(前10):"); |
|
||||
Map<String, Integer> directorWorks = analyzeDirectorWorks(movies); |
|
||||
directorWorks.forEach((director, count) -> |
|
||||
System.out.printf("%s: %d 部\n", director, count)); |
|
||||
|
|
||||
// 类型分布
|
|
||||
System.out.println("\n4. 类型分布(前10):"); |
|
||||
Map<String, Integer> genreDist = analyzeGenreDistribution(movies); |
|
||||
genreDist.forEach((genre, count) -> |
|
||||
System.out.printf("%s: %d 部\n", genre, count)); |
|
||||
} |
|
||||
} |
|
||||
@ -1,81 +0,0 @@ |
|||||
package com.example.chart; |
|
||||
|
|
||||
import org.jfree.chart.ChartFactory; |
|
||||
import org.jfree.chart.ChartUtils; |
|
||||
import org.jfree.chart.JFreeChart; |
|
||||
import org.jfree.chart.plot.PlotOrientation; |
|
||||
import org.jfree.data.category.DefaultCategoryDataset; |
|
||||
import org.jfree.data.general.DefaultPieDataset; |
|
||||
|
|
||||
import java.io.File; |
|
||||
import java.io.IOException; |
|
||||
import java.util.Map; |
|
||||
|
|
||||
public class ChartGenerator { |
|
||||
|
|
||||
// 生成评分分布柱状图
|
|
||||
public void generateRatingDistributionChart(Map<String, Integer> ratingDist, String outputPath) throws IOException { |
|
||||
DefaultCategoryDataset dataset = new DefaultCategoryDataset(); |
|
||||
|
|
||||
ratingDist.forEach((rating, count) -> { |
|
||||
dataset.addValue(count, "电影数量", rating); |
|
||||
}); |
|
||||
|
|
||||
JFreeChart chart = ChartFactory.createBarChart( |
|
||||
"IMDb Top 250 电影评分分布", |
|
||||
"评分", |
|
||||
"电影数量", |
|
||||
dataset, |
|
||||
PlotOrientation.VERTICAL, |
|
||||
true, |
|
||||
true, |
|
||||
false |
|
||||
); |
|
||||
|
|
||||
ChartUtils.saveChartAsPNG(new File(outputPath), chart, 800, 600); |
|
||||
System.out.println("评分分布图表已保存到:" + outputPath); |
|
||||
} |
|
||||
|
|
||||
// 生成类型分布饼图
|
|
||||
public void generateGenreDistributionChart(Map<String, Integer> genreDist, String outputPath) throws IOException { |
|
||||
DefaultPieDataset dataset = new DefaultPieDataset(); |
|
||||
|
|
||||
genreDist.forEach((genre, count) -> { |
|
||||
dataset.setValue(genre, count); |
|
||||
}); |
|
||||
|
|
||||
JFreeChart chart = ChartFactory.createPieChart( |
|
||||
"IMDb Top 250 电影类型分布", |
|
||||
dataset, |
|
||||
true, |
|
||||
true, |
|
||||
false |
|
||||
); |
|
||||
|
|
||||
ChartUtils.saveChartAsPNG(new File(outputPath), chart, 800, 600); |
|
||||
System.out.println("类型分布图表已保存到:" + outputPath); |
|
||||
} |
|
||||
|
|
||||
// 生成导演作品数柱状图
|
|
||||
public void generateDirectorWorksChart(Map<String, Integer> directorWorks, String outputPath) throws IOException { |
|
||||
DefaultCategoryDataset dataset = new DefaultCategoryDataset(); |
|
||||
|
|
||||
directorWorks.forEach((director, count) -> { |
|
||||
dataset.addValue(count, "作品数量", director); |
|
||||
}); |
|
||||
|
|
||||
JFreeChart chart = ChartFactory.createBarChart( |
|
||||
"IMDb Top 250 导演作品数排行", |
|
||||
"导演", |
|
||||
"作品数量", |
|
||||
dataset, |
|
||||
PlotOrientation.VERTICAL, |
|
||||
true, |
|
||||
true, |
|
||||
false |
|
||||
); |
|
||||
|
|
||||
ChartUtils.saveChartAsPNG(new File(outputPath), chart, 800, 600); |
|
||||
System.out.println("导演作品数图表已保存到:" + outputPath); |
|
||||
} |
|
||||
} |
|
||||
@ -1,119 +0,0 @@ |
|||||
package com.example.crawler; |
|
||||
|
|
||||
import com.example.model.Movie; |
|
||||
import org.jsoup.Jsoup; |
|
||||
import org.jsoup.nodes.Document; |
|
||||
import org.jsoup.nodes.Element; |
|
||||
import org.jsoup.select.Elements; |
|
||||
|
|
||||
import java.io.IOException; |
|
||||
import java.util.ArrayList; |
|
||||
import java.util.List; |
|
||||
import java.util.stream.Collectors; |
|
||||
|
|
||||
public class MovieCrawler { |
|
||||
private static final String BASE_URL = "https://www.imdb.com/chart/top/"; |
|
||||
|
|
||||
public List<Movie> crawlTopMovies(int limit) throws IOException { |
|
||||
List<Movie> movies = new ArrayList<>(); |
|
||||
|
|
||||
// 发送 HTTP 请求获取网页内容
|
|
||||
Document doc = Jsoup.connect(BASE_URL) |
|
||||
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36") |
|
||||
.timeout(10000) |
|
||||
.get(); |
|
||||
|
|
||||
// 解析电影列表
|
|
||||
Elements movieElements = doc.select("tbody.lister-list tr"); |
|
||||
|
|
||||
int count = 0; |
|
||||
for (Element element : movieElements) { |
|
||||
if (count >= limit) break; |
|
||||
|
|
||||
Movie movie = new Movie(); |
|
||||
|
|
||||
// 提取电影标题
|
|
||||
Element titleElement = element.selectFirst(".titleColumn a"); |
|
||||
if (titleElement != null) { |
|
||||
movie.setTitle(titleElement.text()); |
|
||||
} |
|
||||
|
|
||||
// 提取年份
|
|
||||
Element yearElement = element.selectFirst(".titleColumn .secondaryInfo"); |
|
||||
if (yearElement != null) { |
|
||||
String year = yearElement.text().replaceAll("[()]", ""); |
|
||||
movie.setYear(year); |
|
||||
} |
|
||||
|
|
||||
// 提取评分
|
|
||||
Element ratingElement = element.selectFirst(".ratingColumn.imdbRating strong"); |
|
||||
if (ratingElement != null) { |
|
||||
movie.setRating(ratingElement.text()); |
|
||||
} |
|
||||
|
|
||||
// 提取导演和主演(需要进入详情页)
|
|
||||
String movieUrl = "https://www.imdb.com" + titleElement.attr("href"); |
|
||||
try { |
|
||||
Document movieDoc = Jsoup.connect(movieUrl) |
|
||||
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36") |
|
||||
.timeout(10000) |
|
||||
.get(); |
|
||||
|
|
||||
// 提取导演
|
|
||||
Elements directorElements = movieDoc.select("a[href*=name]").stream() |
|
||||
.filter(e -> e.parent().text().contains("Director")) |
|
||||
.limit(1) |
|
||||
.collect(Collectors.toList()); |
|
||||
if (!directorElements.isEmpty()) { |
|
||||
movie.setDirector(directorElements.get(0).text()); |
|
||||
} |
|
||||
|
|
||||
// 提取主演
|
|
||||
Elements starElements = movieDoc.select("a[href*=name]").stream() |
|
||||
.filter(e -> e.parent().text().contains("Stars")) |
|
||||
.limit(3) |
|
||||
.collect(Collectors.toList()); |
|
||||
if (!starElements.isEmpty()) { |
|
||||
StringBuilder stars = new StringBuilder(); |
|
||||
for (int i = 0; i < starElements.size(); i++) { |
|
||||
stars.append(starElements.get(i).text()); |
|
||||
if (i < starElements.size() - 1) stars.append(", "); |
|
||||
} |
|
||||
movie.setStars(stars.toString()); |
|
||||
} |
|
||||
|
|
||||
// 提取类型
|
|
||||
Elements genreElements = movieDoc.select("a[href*=genres]").limit(3); |
|
||||
if (!genreElements.isEmpty()) { |
|
||||
StringBuilder genres = new StringBuilder(); |
|
||||
for (int i = 0; i < genreElements.size(); i++) { |
|
||||
genres.append(genreElements.get(i).text()); |
|
||||
if (i < genreElements.size() - 1) genres.append(", "); |
|
||||
} |
|
||||
movie.setGenre(genres.toString()); |
|
||||
} |
|
||||
|
|
||||
// 提取时长
|
|
||||
Element runtimeElement = movieDoc.selectFirst("time"); |
|
||||
if (runtimeElement != null) { |
|
||||
movie.setRuntime(runtimeElement.text()); |
|
||||
} |
|
||||
|
|
||||
} catch (IOException e) { |
|
||||
System.out.println("Error crawling movie details: " + e.getMessage()); |
|
||||
} |
|
||||
|
|
||||
movies.add(movie); |
|
||||
count++; |
|
||||
|
|
||||
// 控制请求频率,避免被封
|
|
||||
try { |
|
||||
Thread.sleep(1000); |
|
||||
} catch (InterruptedException e) { |
|
||||
e.printStackTrace(); |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
return movies; |
|
||||
} |
|
||||
} |
|
||||
@ -1,81 +0,0 @@ |
|||||
package com.example.model; |
|
||||
|
|
||||
/**
 * Mutable data holder for one crawled movie. Every field is a plain string
 * and stays {@code null} until its setter is called.
 */
public class Movie {
    private String title;
    private String rating;
    private String year;
    private String director;
    private String stars;
    private String runtime;
    private String genre;

    /** @return the movie title */
    public String getTitle() { return title; }

    /** @param title the movie title */
    public void setTitle(String title) { this.title = title; }

    /** @return the rating text */
    public String getRating() { return rating; }

    /** @param rating the rating text */
    public void setRating(String rating) { this.rating = rating; }

    /** @return the release year text */
    public String getYear() { return year; }

    /** @param year the release year text */
    public void setYear(String year) { this.year = year; }

    /** @return the director name */
    public String getDirector() { return director; }

    /** @param director the director name */
    public void setDirector(String director) { this.director = director; }

    /** @return the stars text */
    public String getStars() { return stars; }

    /** @param stars the stars text */
    public void setStars(String stars) { this.stars = stars; }

    /** @return the runtime text */
    public String getRuntime() { return runtime; }

    /** @param runtime the runtime text */
    public void setRuntime(String runtime) { this.runtime = runtime; }

    /** @return the genre text */
    public String getGenre() { return genre; }

    /** @param genre the genre text */
    public void setGenre(String genre) { this.genre = genre; }

    /** Returns all fields as {@code Movie{name='value', ...}}. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Movie{");
        text.append("title='").append(title).append('\'');
        text.append(", rating='").append(rating).append('\'');
        text.append(", year='").append(year).append('\'');
        text.append(", director='").append(director).append('\'');
        text.append(", stars='").append(stars).append('\'');
        text.append(", runtime='").append(runtime).append('\'');
        text.append(", genre='").append(genre).append('\'');
        return text.append('}').toString();
    }
}
|
||||
@ -1,40 +0,0 @@ |
|||||
package com.example.processor; |
|
||||
|
|
||||
import com.example.model.Movie; |
|
||||
import org.apache.commons.csv.CSVFormat; |
|
||||
import org.apache.commons.csv.CSVPrinter; |
|
||||
|
|
||||
import java.io.FileWriter; |
|
||||
import java.io.IOException; |
|
||||
import java.util.List; |
|
||||
|
|
||||
public class DataProcessor { |
|
||||
|
|
||||
public void saveMoviesToCsv(List<Movie> movies, String filePath) throws IOException { |
|
||||
try (FileWriter writer = new FileWriter(filePath); |
|
||||
CSVPrinter csvPrinter = new CSVPrinter(writer, CSVFormat.DEFAULT |
|
||||
.withHeader("Title", "Rating", "Year", "Director", "Stars", "Runtime", "Genre"))) { |
|
||||
|
|
||||
for (Movie movie : movies) { |
|
||||
csvPrinter.printRecord( |
|
||||
cleanText(movie.getTitle()), |
|
||||
movie.getRating(), |
|
||||
movie.getYear(), |
|
||||
cleanText(movie.getDirector()), |
|
||||
cleanText(movie.getStars()), |
|
||||
movie.getRuntime(), |
|
||||
cleanText(movie.getGenre()) |
|
||||
); |
|
||||
} |
|
||||
|
|
||||
csvPrinter.flush(); |
|
||||
System.out.println("Movies saved to CSV file: " + filePath); |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
private String cleanText(String text) { |
|
||||
if (text == null) return ""; |
|
||||
// 去除首尾空格,去除 HTML 标签
|
|
||||
return text.trim().replaceAll("<[^>]*>", ""); |
|
||||
} |
|
||||
} |
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -1,104 +0,0 @@ |
|||||
package com.rental; |
|
||||
|
|
||||
/**
 * A rental car identified by an immutable licence plate.
 *
 * <p>Holds the brand, model, daily rent and rented/available state of one car, and keeps a
 * class-wide counter of how many {@code Car} objects have been constructed.
 */
public class Car {

    /** Daily rent used by the three-argument constructor. */
    private static final double DEFAULT_DAILY_RENT = 300.0;

    /** Number of {@code Car} instances constructed so far. */
    private static int totalCars = 0;

    private final String licensePlate;
    private String brand;
    private String model;
    private double dailyRent;
    private boolean isRented;

    /**
     * Creates an available (not rented) car and increments the global car counter.
     *
     * <p>The rent is stored as given; unlike {@link #setDailyRent(double)} no positivity check is
     * applied here.
     *
     * @param licensePlate licence plate; cannot be changed afterwards
     * @param brand brand name
     * @param model model name
     * @param dailyRent rent per day
     */
    public Car(String licensePlate, String brand, String model, double dailyRent) {
        this.licensePlate = licensePlate;
        this.brand = brand;
        this.model = model;
        this.dailyRent = dailyRent;
        this.isRented = false;
        totalCars++;
    }

    /**
     * Creates an available car with the default daily rent of 300.
     *
     * @param licensePlate licence plate; cannot be changed afterwards
     * @param brand brand name
     * @param model model name
     */
    public Car(String licensePlate, String brand, String model) {
        this(licensePlate, brand, model, DEFAULT_DAILY_RENT);
    }

    /** @return the licence plate */
    public String getLicensePlate() {
        return licensePlate;
    }

    /** @return the brand name */
    public String getBrand() {
        return brand;
    }

    /** @return the model name */
    public String getModel() {
        return model;
    }

    /** @return the rent per day */
    public double getDailyRent() {
        return dailyRent;
    }

    /** @return {@code true} while the car is rented out */
    public boolean isRented() {
        return isRented;
    }

    /** @param brand new brand name */
    public void setBrand(String brand) {
        this.brand = brand;
    }

    /** @param model new model name */
    public void setModel(String model) {
        this.model = model;
    }

    /**
     * Updates the daily rent. Non-positive values are rejected: a message is printed and the
     * current value is kept.
     *
     * @param dailyRent new rent per day; must be greater than 0 to take effect
     */
    public void setDailyRent(double dailyRent) {
        if (dailyRent <= 0) {
            System.out.println("日租金必须大于 0,保持原值");
            return;
        }
        this.dailyRent = dailyRent;
    }

    /** Marks the car as rented, or prints a notice if it is already rented out. */
    public void rentCar() {
        if (isRented) {
            System.out.println("车辆已租出,无法再次租用");
            return;
        }
        isRented = true;
        System.out.println("车辆租用成功");
    }

    /** Marks the car as available again, or prints a notice if it was not rented. */
    public void returnCar() {
        if (!isRented) {
            System.out.println("车辆未被租用,无需归还");
            return;
        }
        isRented = false;
        System.out.println("车辆归还成功");
    }

    /**
     * Computes the rent for a rental period.
     *
     * @param days number of rental days (not validated)
     * @return {@code dailyRent * days}
     */
    public double calculateRent(int days) {
        return dailyRent * days;
    }

    /** Prints plate, brand, model, daily rent and status to standard output, then a blank line. */
    public void displayInfo() {
        System.out.println("车牌号: " + licensePlate);
        System.out.println("品牌: " + brand);
        System.out.println("型号: " + model);
        System.out.println("日租金: " + dailyRent + " 元/天");
        System.out.println("状态: " + (isRented ? "已租出" : "可租"));
        System.out.println();
    }

    /** @return the number of {@code Car} objects constructed so far */
    public static int getTotalCars() {
        return totalCars;
    }
}
|
||||
@ -1,48 +0,0 @@ |
|||||
package com.rental; |
|
||||
|
|
||||
public class TestCar { |
|
||||
public static void main(String[] args) { |
|
||||
// 创建 3 个 Car 对象
|
|
||||
Car car1 = new Car("京A12345", "宝马", "5系", 500.0); |
|
||||
Car car2 = new Car("京B67890", "奔驰", "C级"); |
|
||||
Car car3 = new Car("京C54321", "奥迪", "A4L", 450.0); |
|
||||
|
|
||||
// 输出所有车辆信息
|
|
||||
System.out.println("所有车辆信息:"); |
|
||||
System.out.println("------------------------"); |
|
||||
car1.displayInfo(); |
|
||||
car2.displayInfo(); |
|
||||
car3.displayInfo(); |
|
||||
|
|
||||
// 测试车辆租用和归还
|
|
||||
System.out.println("测试车辆租用和归还:"); |
|
||||
System.out.println("------------------------"); |
|
||||
System.out.println("测试 car1:"); |
|
||||
car1.rentCar(); // 首次租用
|
|
||||
car1.rentCar(); // 再次租用(应该提示已租出)
|
|
||||
car1.returnCar(); // 归还
|
|
||||
car1.returnCar(); // 再次归还(应该提示未租用)
|
|
||||
System.out.println(); |
|
||||
|
|
||||
// 计算租金
|
|
||||
System.out.println("计算租金:"); |
|
||||
System.out.println("------------------------"); |
|
||||
double rent = car1.calculateRent(5); |
|
||||
System.out.println("car1 租用 5 天的费用:" + rent + " 元"); |
|
||||
System.out.println(); |
|
||||
|
|
||||
// 测试修改日租金为非法值
|
|
||||
System.out.println("测试修改日租金:"); |
|
||||
System.out.println("------------------------"); |
|
||||
System.out.println("尝试将 car2 的日租金修改为 -100:"); |
|
||||
car2.setDailyRent(-100); |
|
||||
System.out.println("car2 当前日租金:" + car2.getDailyRent() + " 元/天"); |
|
||||
System.out.println("尝试将 car2 的日租金修改为 400:"); |
|
||||
car2.setDailyRent(400); |
|
||||
System.out.println("car2 当前日租金:" + car2.getDailyRent() + " 元/天"); |
|
||||
System.out.println(); |
|
||||
|
|
||||
// 输出总车辆数
|
|
||||
System.out.println("总车辆数:" + Car.getTotalCars()); |
|
||||
} |
|
||||
} |
|
||||
@ -1,2 +1,22 @@ |
|||||
# java |
# 温度转换程序 - 作业提交
||||
|
|
||||
|
作业标题:W1-郑佳音-202401060101 |
||||
|
|
||||
|
|
||||
|
|
||||
|
## 编译与运行命令
||||
|
|
||||
|
### 1. 编译代码
||||
|
|
||||
|
javac TemperatureConverter.java |
||||
|
|
||||
|
|
||||
|
|
||||
|
### 2. 运行程序
||||
|
|
||||
|
- 交互式模式:`java TemperatureConverter`
||||
|
|
||||
|
- 命令行参数模式:`java TemperatureConverter 36.6 C`
||||
|
|
||||
|
- 批量转换模式:`java TemperatureConverter -f temperatures.txt`
||||
|
|
||||
|
|||||
@ -0,0 +1,71 @@ |
|||||
|
import java.util.Scanner; |
||||
|
|
||||
|
/**
 * Temperature converter between Celsius and Fahrenheit.
 *
 * <p>Runs interactively (reads one line such as {@code 36.6 C} from standard input) or, when
 * command-line arguments are given, converts those instead (e.g. {@code 36.6 C}).
 */
public class TemperatureConverter {

    /**
     * Converts Celsius to Fahrenheit using {@code F = C * 9/5 + 32}.
     *
     * @param celsius temperature in degrees Celsius
     * @return the same temperature in degrees Fahrenheit
     */
    public static double celsiusToFahrenheit(double celsius) {
        return celsius * 9.0 / 5.0 + 32.0;
    }

    /**
     * Converts Fahrenheit to Celsius using {@code C = (F - 32) * 5/9}.
     *
     * @param fahrenheit temperature in degrees Fahrenheit
     * @return the same temperature in degrees Celsius
     */
    public static double fahrenheitToCelsius(double fahrenheit) {
        return (fahrenheit - 32.0) * 5.0 / 9.0;
    }

    /**
     * Program entry point.
     *
     * @param args optional value and unit (e.g. {@code 36.6 C}); when empty, one line is read
     *     from standard input after a prompt
     */
    public static void main(String[] args) {
        if (args != null && args.length > 0) {
            convertAndPrint(String.join(" ", args));
            return;
        }

        // try-with-resources closes the scanner on every path, including early exits.
        try (Scanner scanner = new Scanner(System.in)) {
            System.out.print("请输入温度和单位(例如:36.6 C 或 97 F):");
            // End of input is treated like an empty line instead of throwing.
            String line = scanner.hasNextLine() ? scanner.nextLine() : "";
            convertAndPrint(line);
        }
    }

    /** Parses "&lt;number&gt; &lt;C|F&gt;", prints the converted value or a message describing the input problem. */
    private static void convertAndPrint(String rawInput) {
        String input = rawInput.trim();
        if (input.isEmpty()) {
            System.out.println("输入不能为空,请重新输入。");
            return;
        }

        String[] parts = input.split("\\s+");
        if (parts.length < 2) {
            System.out.println("输入格式不正确,请输入如 '36.6 C' 的格式。");
            return;
        }

        double value;
        try {
            value = Double.parseDouble(parts[0]);
        } catch (NumberFormatException e) {
            System.out.println("温度数值格式不正确,请输入有效的数字。");
            return;
        }

        String unit = parts[1];
        if (unit.equalsIgnoreCase("C")) {
            System.out.printf("%.1f°C 转换为华氏度是: %.1f°F%n", value, celsiusToFahrenheit(value));
        } else if (unit.equalsIgnoreCase("F")) {
            System.out.printf("%.1f°F 转换为摄氏度是: %.1f°C%n", value, fahrenheitToCelsius(value));
        } else {
            System.out.println("不支持的单位,请使用 C 或 F。");
        }
    }
}
||||
@ -1,224 +0,0 @@ |
|||||
import org.apache.poi.ss.usermodel.*; |
|
||||
import org.apache.poi.xssf.usermodel.XSSFWorkbook; |
|
||||
import java.io.*; |
|
||||
import java.util.*; |
|
||||
import java.util.regex.*; |
|
||||
|
|
||||
public class AddRegressionColumns { |
|
||||
public static void main(String[] args) { |
|
||||
String inputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子实验数据(新).xlsx"; |
|
||||
String outputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子实验数据(新)_回归.xlsx"; |
|
||||
|
|
||||
System.out.println("========================================"); |
|
||||
System.out.println(" 在原表中添加回归数据列"); |
|
||||
System.out.println("========================================"); |
|
||||
System.out.println("输入文件: " + inputFile); |
|
||||
System.out.println("输出文件: " + outputFile); |
|
||||
System.out.println(); |
|
||||
|
|
||||
try { |
|
||||
// 读取输入文件
|
|
||||
System.out.println("读取输入文件..."); |
|
||||
FileInputStream fis = new FileInputStream(inputFile); |
|
||||
Workbook wb = new XSSFWorkbook(fis); |
|
||||
Sheet sheet = wb.getSheetAt(0); |
|
||||
|
|
||||
int totalRows = sheet.getLastRowNum(); |
|
||||
System.out.println("总行数: " + totalRows); |
|
||||
|
|
||||
// 获取表头行
|
|
||||
Row headerRow = sheet.getRow(0); |
|
||||
int totalCols = headerRow.getLastCellNum(); |
|
||||
System.out.println("总列数: " + totalCols); |
|
||||
|
|
||||
// 识别列
|
|
||||
int helpfullCol = -1; |
|
||||
int commentCountCol = -1; |
|
||||
List<Integer> commentCols = new ArrayList<>(); |
|
||||
|
|
||||
for (int i = 0; i < totalCols; i++) { |
|
||||
Cell cell = headerRow.getCell(i); |
|
||||
if (cell != null) { |
|
||||
String header = cell.getStringCellValue().toLowerCase(); |
|
||||
if (header.contains("helpfull") || header.contains("helpful")) { |
|
||||
helpfullCol = i; |
|
||||
System.out.println("找到 Y 列 (helpfull): 列 " + i); |
|
||||
} else if (header.contains("评论总数") || header.contains("帖子评论总数")) { |
|
||||
commentCountCol = i; |
|
||||
System.out.println("找到 X1 列 (评论总数): 列 " + i); |
|
||||
} else if (header.contains("评论") && header.contains("内容")) { |
|
||||
for (int j = 1; j <= 5; j++) { |
|
||||
if (header.contains(String.valueOf(j))) { |
|
||||
commentCols.add(i); |
|
||||
System.out.println("找到评论列 " + commentCols.size() + ": 列 " + i + " - " + header); |
|
||||
break; |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
System.out.println("\n共找到 " + commentCols.size() + " 个评论列"); |
|
||||
|
|
||||
// 添加新列的表头
|
|
||||
int yCol = totalCols; |
|
||||
int x1Col = totalCols + 1; |
|
||||
int x2Col = totalCols + 2; |
|
||||
int x3Col = totalCols + 3; |
|
||||
int x4Col = totalCols + 4; |
|
||||
int x5Col = totalCols + 5; |
|
||||
int x6Col = totalCols + 6; |
|
||||
|
|
||||
headerRow.createCell(yCol).setCellValue("Y"); |
|
||||
headerRow.createCell(x1Col).setCellValue("X1"); |
|
||||
headerRow.createCell(x2Col).setCellValue("X2"); |
|
||||
headerRow.createCell(x3Col).setCellValue("X3"); |
|
||||
headerRow.createCell(x4Col).setCellValue("X4"); |
|
||||
headerRow.createCell(x5Col).setCellValue("X5"); |
|
||||
headerRow.createCell(x6Col).setCellValue("X6"); |
|
||||
|
|
||||
// 处理每一行数据
|
|
||||
System.out.println("\n处理数据..."); |
|
||||
Pattern digitPattern = Pattern.compile("\\d"); |
|
||||
Pattern urlPattern = Pattern.compile("http[s]?://|www\\."); |
|
||||
Pattern emojiPattern = Pattern.compile("[\\u2600-\\u27BF\\uD83C-\\uDBFF\\uDC00-\\uDFFF]|[:;][-]?[)D]"); |
|
||||
|
|
||||
String[] positiveWords = {"好", "棒", "优秀", "喜欢", "满意", "赞", "positive", "good", "great", "excellent", "love", "like"}; |
|
||||
String[] negativeWords = {"差", "糟糕", "不好", "失望", "不满", "negative", "bad", "terrible", "poor", "hate", "dislike"}; |
|
||||
|
|
||||
for (int i = 1; i <= totalRows; i++) { |
|
||||
if (i % 1000 == 0) { |
|
||||
System.out.println("处理第 " + i + "/" + totalRows + " 行..."); |
|
||||
} |
|
||||
|
|
||||
Row row = sheet.getRow(i); |
|
||||
if (row == null) continue; |
|
||||
|
|
||||
// Y (UGC有用性)
|
|
||||
double y = 0; |
|
||||
if (helpfullCol >= 0) { |
|
||||
Cell cell = row.getCell(helpfullCol); |
|
||||
if (cell != null) { |
|
||||
try { |
|
||||
y = cell.getNumericCellValue(); |
|
||||
} catch (Exception e) { |
|
||||
y = 0; |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
row.createCell(yCol).setCellValue(y); |
|
||||
|
|
||||
// X1 (评论数量)
|
|
||||
double x1 = 0; |
|
||||
if (commentCountCol >= 0) { |
|
||||
Cell cell = row.getCell(commentCountCol); |
|
||||
if (cell != null) { |
|
||||
try { |
|
||||
x1 = cell.getNumericCellValue(); |
|
||||
} catch (Exception e) { |
|
||||
x1 = 0; |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
row.createCell(x1Col).setCellValue(x1); |
|
||||
|
|
||||
// 计算评论相关指标
|
|
||||
List<Double> lengths = new ArrayList<>(); |
|
||||
List<Double> complexities = new ArrayList<>(); |
|
||||
List<Double> sentiments = new ArrayList<>(); |
|
||||
List<Double> richnessList = new ArrayList<>(); |
|
||||
|
|
||||
for (int colIdx : commentCols) { |
|
||||
Cell cell = row.getCell(colIdx); |
|
||||
if (cell != null) { |
|
||||
String content = ""; |
|
||||
try { |
|
||||
content = cell.getStringCellValue(); |
|
||||
} catch (Exception e) { |
|
||||
try { |
|
||||
content = String.valueOf(cell.getNumericCellValue()); |
|
||||
} catch (Exception e2) { |
|
||||
content = ""; |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
if (content != null && !content.isEmpty() && !content.equals("nan") && !content.equals("null")) { |
|
||||
// X2: 评论长度(剔空格后的字符数)
|
|
||||
double length = content.replace(" ", "").replace("\u3000", "").length(); |
|
||||
lengths.add(length); |
|
||||
|
|
||||
// X3: 评论复杂度(按空格拆分的分词数)
|
|
||||
double complexity = content.split("\\s+").length; |
|
||||
complexities.add(complexity); |
|
||||
|
|
||||
// X5: 情感分析
|
|
||||
double sentiment = 0; |
|
||||
String lowerContent = content.toLowerCase(); |
|
||||
for (String word : positiveWords) { |
|
||||
if (lowerContent.contains(word)) { |
|
||||
sentiment = 1; |
|
||||
break; |
|
||||
} |
|
||||
} |
|
||||
if (sentiment == 0) { |
|
||||
for (String word : negativeWords) { |
|
||||
if (lowerContent.contains(word)) { |
|
||||
sentiment = -1; |
|
||||
break; |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
sentiments.add(sentiment); |
|
||||
|
|
||||
// X6: 信息丰富度
|
|
||||
double richness = 0; |
|
||||
if (digitPattern.matcher(content).find()) richness += 1; |
|
||||
if (urlPattern.matcher(content).find()) richness += 1; |
|
||||
if (emojiPattern.matcher(content).find()) richness += 1; |
|
||||
richnessList.add(richness); |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// 计算平均值(无评论记0)
|
|
||||
double x2 = lengths.isEmpty() ? 0 : lengths.stream().mapToDouble(Double::doubleValue).average().getAsDouble(); |
|
||||
double x3 = complexities.isEmpty() ? 0 : complexities.stream().mapToDouble(Double::doubleValue).average().getAsDouble(); |
|
||||
double x5 = sentiments.isEmpty() ? 0 : sentiments.stream().mapToDouble(Double::doubleValue).average().getAsDouble(); |
|
||||
double x6 = richnessList.isEmpty() ? 0 : richnessList.stream().mapToDouble(Double::doubleValue).average().getAsDouble(); |
|
||||
|
|
||||
// X4: 评论可读性 = X2/X3(X3为0时记0)
|
|
||||
double x4 = (x3 > 0) ? x2 / x3 : 0; |
|
||||
|
|
||||
// 写入单元格
|
|
||||
row.createCell(x2Col).setCellValue(x2); |
|
||||
row.createCell(x3Col).setCellValue(x3); |
|
||||
row.createCell(x4Col).setCellValue(x4); |
|
||||
row.createCell(x5Col).setCellValue(x5); |
|
||||
row.createCell(x6Col).setCellValue(x6); |
|
||||
} |
|
||||
|
|
||||
// 保存文件
|
|
||||
System.out.println("\n保存文件..."); |
|
||||
FileOutputStream fos = new FileOutputStream(outputFile); |
|
||||
wb.write(fos); |
|
||||
fos.close(); |
|
||||
wb.close(); |
|
||||
fis.close(); |
|
||||
|
|
||||
// 验证文件
|
|
||||
File output = new File(outputFile); |
|
||||
if (output.exists()) { |
|
||||
System.out.println("文件保存成功!"); |
|
||||
System.out.println("文件大小: " + (output.length() / 1024) + " KB"); |
|
||||
} |
|
||||
|
|
||||
System.out.println("\n========================================"); |
|
||||
System.out.println(" 任务完成"); |
|
||||
System.out.println("========================================"); |
|
||||
|
|
||||
} catch (Exception e) { |
|
||||
System.out.println("错误: " + e.getMessage()); |
|
||||
e.printStackTrace(); |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
@ -1,99 +0,0 @@ |
|||||
import java.util.ArrayList; |
|
||||
import java.util.List; |
|
||||
import java.util.regex.Matcher; |
|
||||
import java.util.regex.Pattern; |
|
||||
|
|
||||
public class DataCleaner { |
|
||||
|
|
||||
public static List<PostInfo> cleanPosts(List<PostInfo> rawPosts) { |
|
||||
List<PostInfo> cleanedPosts = new ArrayList<>(); |
|
||||
|
|
||||
for (PostInfo post : rawPosts) { |
|
||||
PostInfo cleaned = cleanPost(post); |
|
||||
if (isValidPost(cleaned)) { |
|
||||
cleanedPosts.add(cleaned); |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
System.out.println("数据清洗完成,有效数据: " + cleanedPosts.size() + " 条"); |
|
||||
return cleanedPosts; |
|
||||
} |
|
||||
|
|
||||
private static PostInfo cleanPost(PostInfo post) { |
|
||||
PostInfo cleaned = new PostInfo(); |
|
||||
|
|
||||
cleaned.setTitle(cleanText(post.getTitle())); |
|
||||
cleaned.setContent(cleanContent(post.getContent())); |
|
||||
cleaned.setAuthor(cleanText(post.getAuthor())); |
|
||||
cleaned.setPostDate(post.getPostDate()); |
|
||||
cleaned.setLikeCount(post.getLikeCount()); |
|
||||
cleaned.setCommentCount(post.getCommentCount()); |
|
||||
cleaned.setViewCount(post.getViewCount()); |
|
||||
cleaned.setTags(cleanText(post.getTags())); |
|
||||
cleaned.setSentiment(normalizeSentiment(post.getSentiment())); |
|
||||
|
|
||||
return cleaned; |
|
||||
} |
|
||||
|
|
||||
private static String cleanText(String text) { |
|
||||
if (text == null) { |
|
||||
return ""; |
|
||||
} |
|
||||
return text.trim().replaceAll("\\s+", " "); |
|
||||
} |
|
||||
|
|
||||
private static String cleanContent(String content) { |
|
||||
if (content == null) { |
|
||||
return ""; |
|
||||
} |
|
||||
return content.trim() |
|
||||
.replaceAll("\\s+", " ") |
|
||||
.replaceAll("[\\r\\n]+", " ") |
|
||||
.replaceAll("<[^>]+>", "") |
|
||||
.replaceAll("\\[.*?\\]", "") |
|
||||
.replaceAll("\\(.*?\\)", ""); |
|
||||
} |
|
||||
|
|
||||
private static String normalizeSentiment(String sentiment) { |
|
||||
if (sentiment == null || sentiment.isEmpty()) { |
|
||||
return "中性"; |
|
||||
} |
|
||||
|
|
||||
String lower = sentiment.toLowerCase(); |
|
||||
if (lower.contains("积极") || lower.contains("正面") || lower.contains("positive")) { |
|
||||
return "积极"; |
|
||||
} else if (lower.contains("消极") || lower.contains("负面") || lower.contains("negative")) { |
|
||||
return "消极"; |
|
||||
} else { |
|
||||
return "中性"; |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
private static boolean isValidPost(PostInfo post) { |
|
||||
return post.getTitle() != null && !post.getTitle().isEmpty() && |
|
||||
post.getContent() != null && !post.getContent().isEmpty(); |
|
||||
} |
|
||||
|
|
||||
public static String[] extractKeywords(String content) { |
|
||||
if (content == null || content.isEmpty()) { |
|
||||
return new String[0]; |
|
||||
} |
|
||||
|
|
||||
String[] commonKeywords = { |
|
||||
"数据", "分析", "学习", "技术", "互联网", "发展", "趋势", |
|
||||
"工具", "方法", "实践", "经验", "案例", "应用", "创新", |
|
||||
"挑战", "机遇", "未来", "智能", "算法", "模型", "平台" |
|
||||
}; |
|
||||
|
|
||||
List<String> keywords = new ArrayList<>(); |
|
||||
String lowerContent = content.toLowerCase(); |
|
||||
|
|
||||
for (String keyword : commonKeywords) { |
|
||||
if (lowerContent.contains(keyword.toLowerCase())) { |
|
||||
keywords.add(keyword); |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
return keywords.toArray(new String[0]); |
|
||||
} |
|
||||
} |
|
||||
@ -1,226 +0,0 @@ |
|||||
import java.io.*; |
|
||||
import java.time.LocalDate; |
|
||||
import java.time.format.DateTimeFormatter; |
|
||||
import java.util.ArrayList; |
|
||||
import java.util.List; |
|
||||
import java.util.Locale; |
|
||||
|
|
||||
public class DataCleaningScript { |
|
||||
|
|
||||
private static final DateTimeFormatter DATE_FORMATTER = DateTimeFormatter.ofPattern("yyyy-MM-dd", Locale.CHINA); |
|
||||
|
|
||||
public static void main(String[] args) { |
|
||||
String inputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子原始信息计量实验使用.xlsx"; |
|
||||
String outputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子实验数据(新).csv"; |
|
||||
|
|
||||
System.out.println("========================================"); |
|
||||
System.out.println(" 数据清洗脚本"); |
|
||||
System.out.println("========================================"); |
|
||||
System.out.println("输入文件: " + inputFile); |
|
||||
System.out.println("输出文件: " + outputFile); |
|
||||
System.out.println(); |
|
||||
|
|
||||
// 读取数据
|
|
||||
List<PostInfo> rawPosts = readExcelData(inputFile); |
|
||||
System.out.println("读取数据完成,共 " + rawPosts.size() + " 条记录"); |
|
||||
|
|
||||
// 清洗数据
|
|
||||
List<PostInfo> cleanedPosts = cleanPosts(rawPosts); |
|
||||
System.out.println("数据清洗完成,有效记录: " + cleanedPosts.size() + " 条"); |
|
||||
|
|
||||
// 保存清洗后的数据
|
|
||||
saveToCSV(cleanedPosts, outputFile); |
|
||||
System.out.println("数据保存完成!"); |
|
||||
System.out.println(); |
|
||||
System.out.println("========================================"); |
|
||||
System.out.println(" 数据清洗任务完成"); |
|
||||
System.out.println("========================================"); |
|
||||
} |
|
||||
|
|
||||
private static List<PostInfo> readExcelData(String filePath) { |
|
||||
List<PostInfo> posts = new ArrayList<>(); |
|
||||
|
|
||||
try (BufferedReader reader = new BufferedReader(new FileReader(filePath, java.nio.charset.StandardCharsets.UTF_8))) { |
|
||||
|
|
||||
String line; |
|
||||
boolean isFirstLine = true; |
|
||||
|
|
||||
while ((line = reader.readLine()) != null) { |
|
||||
if (isFirstLine) { |
|
||||
isFirstLine = false; |
|
||||
continue; |
|
||||
} |
|
||||
|
|
||||
String[] parts = parseCSVLine(line); |
|
||||
if (parts.length >= 9) { |
|
||||
PostInfo post = parsePostInfo(parts); |
|
||||
if (post != null) { |
|
||||
posts.add(post); |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
} catch (IOException e) { |
|
||||
System.err.println("读取文件时出错: " + e.getMessage()); |
|
||||
} |
|
||||
|
|
||||
return posts; |
|
||||
} |
|
||||
|
|
||||
private static String[] parseCSVLine(String line) { |
|
||||
List<String> fields = new ArrayList<>(); |
|
||||
StringBuilder currentField = new StringBuilder(); |
|
||||
boolean inQuotes = false; |
|
||||
|
|
||||
for (char c : line.toCharArray()) { |
|
||||
if (c == '"') { |
|
||||
inQuotes = !inQuotes; |
|
||||
} else if (c == ',' && !inQuotes) { |
|
||||
fields.add(currentField.toString().trim()); |
|
||||
currentField.setLength(0); |
|
||||
} else { |
|
||||
currentField.append(c); |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
fields.add(currentField.toString().trim()); |
|
||||
return fields.toArray(new String[0]); |
|
||||
} |
|
||||
|
|
||||
private static PostInfo parsePostInfo(String[] parts) { |
|
||||
try { |
|
||||
PostInfo post = new PostInfo(); |
|
||||
|
|
||||
post.setTitle(parts[0]); |
|
||||
post.setContent(parts[1]); |
|
||||
post.setAuthor(parts[2]); |
|
||||
|
|
||||
if (!parts[3].isEmpty()) { |
|
||||
post.setPostDate(LocalDate.parse(parts[3], DATE_FORMATTER)); |
|
||||
} |
|
||||
|
|
||||
post.setLikeCount(parseInt(parts[4])); |
|
||||
post.setCommentCount(parseInt(parts[5])); |
|
||||
post.setViewCount(parseInt(parts[6])); |
|
||||
|
|
||||
post.setTags(parts[7]); |
|
||||
post.setSentiment(parts[8]); |
|
||||
|
|
||||
return post; |
|
||||
} catch (Exception e) { |
|
||||
return null; |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
private static int parseInt(String value) { |
|
||||
try { |
|
||||
if (value == null || value.isEmpty()) { |
|
||||
return 0; |
|
||||
} |
|
||||
return Integer.parseInt(value); |
|
||||
} catch (NumberFormatException e) { |
|
||||
return 0; |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
private static List<PostInfo> cleanPosts(List<PostInfo> rawPosts) { |
|
||||
List<PostInfo> cleanedPosts = new ArrayList<>(); |
|
||||
|
|
||||
for (PostInfo post : rawPosts) { |
|
||||
PostInfo cleaned = cleanPost(post); |
|
||||
if (isValidPost(cleaned)) { |
|
||||
cleanedPosts.add(cleaned); |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
return cleanedPosts; |
|
||||
} |
|
||||
|
|
||||
private static PostInfo cleanPost(PostInfo post) { |
|
||||
PostInfo cleaned = new PostInfo(); |
|
||||
|
|
||||
cleaned.setTitle(cleanText(post.getTitle())); |
|
||||
cleaned.setContent(cleanContent(post.getContent())); |
|
||||
cleaned.setAuthor(cleanText(post.getAuthor())); |
|
||||
cleaned.setPostDate(post.getPostDate()); |
|
||||
cleaned.setLikeCount(post.getLikeCount()); |
|
||||
cleaned.setCommentCount(post.getCommentCount()); |
|
||||
cleaned.setViewCount(post.getViewCount()); |
|
||||
cleaned.setTags(cleanText(post.getTags())); |
|
||||
cleaned.setSentiment(normalizeSentiment(post.getSentiment())); |
|
||||
|
|
||||
return cleaned; |
|
||||
} |
|
||||
|
|
||||
private static String cleanText(String text) { |
|
||||
if (text == null) { |
|
||||
return ""; |
|
||||
} |
|
||||
return text.trim().replaceAll("\\s+", " "); |
|
||||
} |
|
||||
|
|
||||
private static String cleanContent(String content) { |
|
||||
if (content == null) { |
|
||||
return ""; |
|
||||
} |
|
||||
return content.trim() |
|
||||
.replaceAll("\\s+", " ") |
|
||||
.replaceAll("[\\r\\n]+", " ") |
|
||||
.replaceAll("<[^>]+>", "") |
|
||||
.replaceAll("\\[.*?\\]", "") |
|
||||
.replaceAll("\\(.*?\\)", ""); |
|
||||
} |
|
||||
|
|
||||
private static String normalizeSentiment(String sentiment) { |
|
||||
if (sentiment == null || sentiment.isEmpty()) { |
|
||||
return "中性"; |
|
||||
} |
|
||||
|
|
||||
String lower = sentiment.toLowerCase(); |
|
||||
if (lower.contains("积极") || lower.contains("正面") || lower.contains("positive")) { |
|
||||
return "积极"; |
|
||||
} else if (lower.contains("消极") || lower.contains("负面") || lower.contains("negative")) { |
|
||||
return "消极"; |
|
||||
} else { |
|
||||
return "中性"; |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
private static boolean isValidPost(PostInfo post) { |
|
||||
return post.getTitle() != null && !post.getTitle().isEmpty() && |
|
||||
post.getContent() != null && !post.getContent().isEmpty(); |
|
||||
} |
|
||||
|
|
||||
private static void saveToCSV(List<PostInfo> posts, String filePath) { |
|
||||
if (posts == null || posts.isEmpty()) { |
|
||||
System.out.println("没有数据需要保存"); |
|
||||
return; |
|
||||
} |
|
||||
|
|
||||
try { |
|
||||
// 确保目录存在
|
|
||||
File file = new File(filePath); |
|
||||
File parentDir = file.getParentFile(); |
|
||||
if (parentDir != null && !parentDir.exists()) { |
|
||||
parentDir.mkdirs(); |
|
||||
} |
|
||||
|
|
||||
try (BufferedWriter writer = new BufferedWriter( |
|
||||
new FileWriter(file, java.nio.charset.StandardCharsets.UTF_8))) { |
|
||||
|
|
||||
writer.write("\uFEFF"); // BOM for UTF-8
|
|
||||
writer.write("标题,内容,作者,发布日期,点赞数,评论数,浏览量,标签,情感倾向\n"); |
|
||||
|
|
||||
for (PostInfo post : posts) { |
|
||||
writer.write(post.toCSV()); |
|
||||
writer.write("\n"); |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
System.out.println("数据已保存到: " + filePath); |
|
||||
|
|
||||
} catch (IOException e) { |
|
||||
System.err.println("保存CSV文件时出错: " + e.getMessage()); |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
@ -1,121 +0,0 @@ |
|||||
import java.io.BufferedWriter; |
|
||||
import java.io.FileWriter; |
|
||||
import java.io.IOException; |
|
||||
import java.nio.charset.StandardCharsets; |
|
||||
import java.nio.file.Files; |
|
||||
import java.nio.file.Paths; |
|
||||
import java.time.LocalDateTime; |
|
||||
import java.time.format.DateTimeFormatter; |
|
||||
import java.util.List; |
|
||||
|
|
||||
public class DataStorage { |
|
||||
|
|
||||
public static void saveToCSV(List<PostInfo> posts, String directory) { |
|
||||
if (posts == null || posts.isEmpty()) { |
|
||||
System.out.println("没有数据需要保存"); |
|
||||
return; |
|
||||
} |
|
||||
|
|
||||
try { |
|
||||
java.nio.file.Path dirPath = Paths.get(directory); |
|
||||
if (!Files.exists(dirPath)) { |
|
||||
Files.createDirectories(dirPath); |
|
||||
} |
|
||||
|
|
||||
String timestamp = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss")); |
|
||||
String filename = "posts_" + timestamp + ".csv"; |
|
||||
java.nio.file.Path filePath = dirPath.resolve(filename); |
|
||||
|
|
||||
try (BufferedWriter writer = new BufferedWriter( |
|
||||
new FileWriter(filePath.toFile(), StandardCharsets.UTF_8))) { |
|
||||
|
|
||||
writer.write("\uFEFF"); |
|
||||
writer.write("标题,内容,作者,发布日期,点赞数,评论数,浏览量,标签,情感倾向\n"); |
|
||||
|
|
||||
for (PostInfo post : posts) { |
|
||||
writer.write(post.toCSV()); |
|
||||
writer.write("\n"); |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
System.out.println("数据已保存到: " + filePath.toAbsolutePath()); |
|
||||
|
|
||||
} catch (IOException e) { |
|
||||
System.err.println("保存CSV文件时出错: " + e.getMessage()); |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
public static void saveToJSON(List<PostInfo> posts, String directory) { |
|
||||
if (posts == null || posts.isEmpty()) { |
|
||||
System.out.println("没有数据需要保存"); |
|
||||
return; |
|
||||
} |
|
||||
|
|
||||
try { |
|
||||
java.nio.file.Path dirPath = Paths.get(directory); |
|
||||
if (!Files.exists(dirPath)) { |
|
||||
Files.createDirectories(dirPath); |
|
||||
} |
|
||||
|
|
||||
String timestamp = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss")); |
|
||||
String filename = "posts_" + timestamp + ".json"; |
|
||||
java.nio.file.Path filePath = dirPath.resolve(filename); |
|
||||
|
|
||||
try (BufferedWriter writer = new BufferedWriter( |
|
||||
new FileWriter(filePath.toFile(), StandardCharsets.UTF_8))) { |
|
||||
|
|
||||
writer.write("[\n"); |
|
||||
for (int i = 0; i < posts.size(); i++) { |
|
||||
writer.write(postToJSON(posts.get(i))); |
|
||||
if (i < posts.size() - 1) { |
|
||||
writer.write(",\n"); |
|
||||
} else { |
|
||||
writer.write("\n"); |
|
||||
} |
|
||||
} |
|
||||
writer.write("]\n"); |
|
||||
} |
|
||||
|
|
||||
System.out.println("数据已保存到: " + filePath.toAbsolutePath()); |
|
||||
|
|
||||
} catch (IOException e) { |
|
||||
System.err.println("保存JSON文件时出错: " + e.getMessage()); |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
private static String postToJSON(PostInfo post) { |
|
||||
return String.format( |
|
||||
" {\n" + |
|
||||
" \"title\": \"%s\",\n" + |
|
||||
" \"content\": \"%s\",\n" + |
|
||||
" \"author\": \"%s\",\n" + |
|
||||
" \"postDate\": \"%s\",\n" + |
|
||||
" \"likeCount\": %d,\n" + |
|
||||
" \"commentCount\": %d,\n" + |
|
||||
" \"viewCount\": %d,\n" + |
|
||||
" \"tags\": \"%s\",\n" + |
|
||||
" \"sentiment\": \"%s\"\n" + |
|
||||
" }", |
|
||||
escapeJSON(post.getTitle()), |
|
||||
escapeJSON(post.getContent()), |
|
||||
escapeJSON(post.getAuthor()), |
|
||||
post.getPostDate() != null ? post.getPostDate().toString() : "", |
|
||||
post.getLikeCount(), |
|
||||
post.getCommentCount(), |
|
||||
post.getViewCount(), |
|
||||
escapeJSON(post.getTags()), |
|
||||
escapeJSON(post.getSentiment()) |
|
||||
); |
|
||||
} |
|
||||
|
|
||||
private static String escapeJSON(String text) { |
|
||||
if (text == null) { |
|
||||
return ""; |
|
||||
} |
|
||||
return text.replace("\\", "\\\\") |
|
||||
.replace("\"", "\\\"") |
|
||||
.replace("\n", "\\n") |
|
||||
.replace("\r", "\\r") |
|
||||
.replace("\t", "\\t"); |
|
||||
} |
|
||||
} |
|
||||
@ -1,3 +0,0 @@ |
|||||
/** Empty placeholder class; it declares no fields or methods. */
public class DuoTai {
}
|
||||
@ -1,102 +0,0 @@ |
|||||
import java.io.*; |
|
||||
import java.time.LocalDate; |
|
||||
import java.time.format.DateTimeFormatter; |
|
||||
import java.util.ArrayList; |
|
||||
import java.util.List; |
|
||||
import java.util.Locale; |
|
||||
|
|
||||
public class ExcelReader { |
|
||||
|
|
||||
private static final DateTimeFormatter DATE_FORMATTER = DateTimeFormatter.ofPattern("yyyy-MM-dd", Locale.CHINA); |
|
||||
|
|
||||
public static List<PostInfo> readExcelData(String filePath, int maxRows) { |
|
||||
List<PostInfo> posts = new ArrayList<>(); |
|
||||
|
|
||||
try (BufferedReader reader = new BufferedReader(new FileReader(filePath, java.nio.charset.StandardCharsets.UTF_8))) { |
|
||||
|
|
||||
String line; |
|
||||
boolean isFirstLine = true; |
|
||||
int rowCount = 0; |
|
||||
|
|
||||
while ((line = reader.readLine()) != null && rowCount < maxRows) { |
|
||||
if (isFirstLine) { |
|
||||
isFirstLine = false; |
|
||||
continue; |
|
||||
} |
|
||||
|
|
||||
String[] parts = parseCSVLine(line); |
|
||||
if (parts.length >= 9) { |
|
||||
PostInfo post = parsePostInfo(parts); |
|
||||
if (post != null) { |
|
||||
posts.add(post); |
|
||||
rowCount++; |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
System.out.println("成功读取 " + posts.size() + " 条数据"); |
|
||||
|
|
||||
} catch (IOException e) { |
|
||||
System.err.println("读取文件时出错: " + e.getMessage()); |
|
||||
} |
|
||||
|
|
||||
return posts; |
|
||||
} |
|
||||
|
|
||||
private static String[] parseCSVLine(String line) { |
|
||||
List<String> fields = new ArrayList<>(); |
|
||||
StringBuilder currentField = new StringBuilder(); |
|
||||
boolean inQuotes = false; |
|
||||
|
|
||||
for (char c : line.toCharArray()) { |
|
||||
if (c == '"') { |
|
||||
inQuotes = !inQuotes; |
|
||||
} else if (c == ',' && !inQuotes) { |
|
||||
fields.add(currentField.toString().trim()); |
|
||||
currentField.setLength(0); |
|
||||
} else { |
|
||||
currentField.append(c); |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
fields.add(currentField.toString().trim()); |
|
||||
return fields.toArray(new String[0]); |
|
||||
} |
|
||||
|
|
||||
private static PostInfo parsePostInfo(String[] parts) { |
|
||||
try { |
|
||||
PostInfo post = new PostInfo(); |
|
||||
|
|
||||
post.setTitle(parts[0]); |
|
||||
post.setContent(parts[1]); |
|
||||
post.setAuthor(parts[2]); |
|
||||
|
|
||||
if (!parts[3].isEmpty()) { |
|
||||
post.setPostDate(LocalDate.parse(parts[3], DATE_FORMATTER)); |
|
||||
} |
|
||||
|
|
||||
post.setLikeCount(parseInt(parts[4])); |
|
||||
post.setCommentCount(parseInt(parts[5])); |
|
||||
post.setViewCount(parseInt(parts[6])); |
|
||||
|
|
||||
post.setTags(parts[7]); |
|
||||
post.setSentiment(parts[8]); |
|
||||
|
|
||||
return post; |
|
||||
} catch (Exception e) { |
|
||||
System.err.println("解析数据时出错: " + e.getMessage()); |
|
||||
return null; |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
private static int parseInt(String value) { |
|
||||
try { |
|
||||
if (value == null || value.isEmpty()) { |
|
||||
return 0; |
|
||||
} |
|
||||
return Integer.parseInt(value); |
|
||||
} catch (NumberFormatException e) { |
|
||||
return 0; |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
@ -1,214 +0,0 @@ |
|||||
package com.project.report; |
|
||||
|
|
||||
import com.project.analyzer.PostAnalyzer; |
|
||||
import com.project.model.PostInfo; |
|
||||
|
|
||||
import java.io.BufferedWriter; |
|
||||
import java.io.FileWriter; |
|
||||
import java.io.IOException; |
|
||||
import java.nio.charset.StandardCharsets; |
|
||||
import java.nio.file.Files; |
|
||||
import java.nio.file.Paths; |
|
||||
import java.time.LocalDateTime; |
|
||||
import java.time.format.DateTimeFormatter; |
|
||||
import java.util.Map; |
|
||||
|
|
||||
public class HTMLReportGenerator { |
|
||||
|
|
||||
private static final String OUTPUT_DIR = "d:\\java\\project\\reports"; |
|
||||
|
|
||||
public static void generateReport(PostAnalyzer analyzer) { |
|
||||
try { |
|
||||
Files.createDirectories(Paths.get(OUTPUT_DIR)); |
|
||||
|
|
||||
String timestamp = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss")); |
|
||||
String filename = "report_" + timestamp + ".html"; |
|
||||
String filepath = OUTPUT_DIR + "/" + filename; |
|
||||
|
|
||||
try (BufferedWriter writer = new BufferedWriter( |
|
||||
new FileWriter(filepath, StandardCharsets.UTF_8))) { |
|
||||
|
|
||||
writer.write(generateHTMLContent(analyzer)); |
|
||||
} |
|
||||
|
|
||||
System.out.println("HTML报告已生成: " + filepath); |
|
||||
|
|
||||
} catch (IOException e) { |
|
||||
System.err.println("生成HTML报告时出错: " + e.getMessage()); |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
private static String generateHTMLContent(PostAnalyzer analyzer) { |
|
||||
StringBuilder html = new StringBuilder(); |
|
||||
|
|
||||
html.append("<!DOCTYPE html>\n"); |
|
||||
html.append("<html lang=\"zh-CN\">\n"); |
|
||||
html.append("<head>\n"); |
|
||||
html.append(" <meta charset=\"UTF-8\">\n"); |
|
||||
html.append(" <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n"); |
|
||||
html.append(" <title>图文帖子数据分析报告</title>\n"); |
|
||||
html.append(" <style>\n"); |
|
||||
html.append(" * { margin: 0; padding: 0; box-sizing: border-box; }\n"); |
|
||||
html.append(" body { font-family: 'Microsoft YaHei', Arial, sans-serif; background: #f5f5f5; padding: 20px; }\n"); |
|
||||
html.append(" .container { max-width: 1200px; margin: 0 auto; background: white; padding: 30px; border-radius: 10px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); }\n"); |
|
||||
html.append(" h1 { color: #333; text-align: center; margin-bottom: 10px; }\n"); |
|
||||
html.append(" .subtitle { color: #666; text-align: center; margin-bottom: 30px; font-size: 14px; }\n"); |
|
||||
html.append(" .section { margin-bottom: 40px; }\n"); |
|
||||
html.append(" .section h2 { color: #2c3e50; border-bottom: 3px solid #3498db; padding-bottom: 10px; margin-bottom: 20px; }\n"); |
|
||||
html.append(" table { width: 100%; border-collapse: collapse; margin-bottom: 20px; }\n"); |
|
||||
html.append(" th, td { padding: 12px; text-align: left; border-bottom: 1px solid #ddd; }\n"); |
|
||||
html.append(" th { background: #3498db; color: white; font-weight: bold; }\n"); |
|
||||
html.append(" tr:hover { background: #f8f9fa; }\n"); |
|
||||
html.append(" .stat-card { display: inline-block; width: 200px; padding: 20px; margin: 10px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; border-radius: 10px; text-align: center; }\n"); |
|
||||
html.append(" .stat-card h3 { font-size: 36px; margin-bottom: 10px; }\n"); |
|
||||
html.append(" .stat-card p { font-size: 14px; opacity: 0.9; }\n"); |
|
||||
html.append(" .chart-container { text-align: center; margin: 20px 0; }\n"); |
|
||||
html.append(" .chart-container img { max-width: 100%; height: auto; border: 1px solid #ddd; border-radius: 5px; }\n"); |
|
||||
html.append(" .summary { background: #e8f4f8; padding: 20px; border-radius: 10px; margin-bottom: 30px; }\n"); |
|
||||
html.append(" .summary h3 { color: #2c3e50; margin-bottom: 15px; }\n"); |
|
||||
html.append(" .summary ul { list-style-position: inside; color: #555; }\n"); |
|
||||
html.append(" .summary li { margin: 8px 0; }\n"); |
|
||||
html.append(" </style>\n"); |
|
||||
html.append("</head>\n"); |
|
||||
html.append("<body>\n"); |
|
||||
html.append(" <div class=\"container\">\n"); |
|
||||
html.append(" <h1>图文帖子数据分析报告</h1>\n"); |
|
||||
html.append(" <p class=\"subtitle\">生成时间: ").append(LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"))).append("</p>\n"); |
|
||||
|
|
||||
html.append(generateSummarySection(analyzer)); |
|
||||
html.append(generateSentimentSection(analyzer)); |
|
||||
html.append(generateEngagementSection(analyzer)); |
|
||||
html.append(generateAuthorSection(analyzer)); |
|
||||
html.append(generateChartsSection()); |
|
||||
|
|
||||
html.append(" </div>\n"); |
|
||||
html.append("</body>\n"); |
|
||||
html.append("</html>"); |
|
||||
|
|
||||
return html.toString(); |
|
||||
} |
|
||||
|
|
||||
private static String generateSummarySection(PostAnalyzer analyzer) { |
|
||||
StringBuilder section = new StringBuilder(); |
|
||||
|
|
||||
int totalPosts = analyzer.getPosts().size(); |
|
||||
double avgLikes = analyzer.getPosts().stream() |
|
||||
.mapToInt(PostInfo::getLikeCount) |
|
||||
.average() |
|
||||
.orElse(0); |
|
||||
|
|
||||
section.append(" <div class=\"section\">\n"); |
|
||||
section.append(" <div class=\"stat-card\">\n"); |
|
||||
section.append(" <h3>").append(totalPosts).append("</h3>\n"); |
|
||||
section.append(" <p>帖子总数</p>\n"); |
|
||||
section.append(" </div>\n"); |
|
||||
section.append(" <div class=\"stat-card\">\n"); |
|
||||
section.append(" <h3>").append(String.format("%.1f", avgLikes)).append("</h3>\n"); |
|
||||
section.append(" <p>平均点赞</p>\n"); |
|
||||
section.append(" </div>\n"); |
|
||||
section.append(" </div>\n"); |
|
||||
|
|
||||
section.append(" <div class=\"summary\">\n"); |
|
||||
section.append(" <h3>分析摘要</h3>\n"); |
|
||||
section.append(" <ul>\n"); |
|
||||
section.append(" <li>本次分析共收集 ").append(totalPosts).append(" 条图文帖子数据</li>\n"); |
|
||||
section.append(" <li>数据来源:D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用</li>\n"); |
|
||||
section.append(" <li>分析内容包括情感倾向分布、互动指标、热门作者等多个维度</li>\n"); |
|
||||
section.append(" <li>通过数据可视化展示分析结果,便于直观理解</li>\n"); |
|
||||
section.append(" </ul>\n"); |
|
||||
section.append(" </div>\n"); |
|
||||
|
|
||||
return section.toString(); |
|
||||
} |
|
||||
|
|
||||
private static String generateSentimentSection(PostAnalyzer analyzer) { |
|
||||
StringBuilder section = new StringBuilder(); |
|
||||
Map<String, Long> sentimentData = analyzer.getSentimentDistributionData(); |
|
||||
|
|
||||
section.append(" <div class=\"section\">\n"); |
|
||||
section.append(" <h2>情感倾向分布分析</h2>\n"); |
|
||||
section.append(" <table>\n"); |
|
||||
section.append(" <tr><th>情感倾向</th><th>帖子数量</th><th>占比</th></tr>\n"); |
|
||||
|
|
||||
long total = sentimentData.values().stream().mapToLong(Long::longValue).sum(); |
|
||||
|
|
||||
for (Map.Entry<String, Long> entry : sentimentData.entrySet()) { |
|
||||
double percent = (entry.getValue() * 100.0) / total; |
|
||||
section.append(" <tr><td>").append(entry.getKey()) |
|
||||
.append("</td><td>").append(entry.getValue()) |
|
||||
.append("</td><td>").append(String.format("%.1f%%", percent)) |
|
||||
.append("</td></tr>\n"); |
|
||||
} |
|
||||
|
|
||||
section.append(" </table>\n"); |
|
||||
section.append(" </div>\n"); |
|
||||
|
|
||||
return section.toString(); |
|
||||
} |
|
||||
|
|
||||
private static String generateEngagementSection(PostAnalyzer analyzer) { |
|
||||
StringBuilder section = new StringBuilder(); |
|
||||
Map<String, Double> engagementData = analyzer.getEngagementData(); |
|
||||
|
|
||||
section.append(" <div class=\"section\">\n"); |
|
||||
section.append(" <h2>互动指标分析</h2>\n"); |
|
||||
section.append(" <table>\n"); |
|
||||
section.append(" <tr><th>指标</th><th>平均值</th></tr>\n"); |
|
||||
|
|
||||
for (Map.Entry<String, Double> entry : engagementData.entrySet()) { |
|
||||
section.append(" <tr><td>").append(entry.getKey()) |
|
||||
.append("</td><td>").append(String.format("%.1f", entry.getValue())) |
|
||||
.append("</td></tr>\n"); |
|
||||
} |
|
||||
|
|
||||
section.append(" </table>\n"); |
|
||||
section.append(" </div>\n"); |
|
||||
|
|
||||
return section.toString(); |
|
||||
} |
|
||||
|
|
||||
private static String generateAuthorSection(PostAnalyzer analyzer) { |
|
||||
StringBuilder section = new StringBuilder(); |
|
||||
Map<String, Integer> authorData = analyzer.getAuthorPostCount(); |
|
||||
|
|
||||
section.append(" <div class=\"section\">\n"); |
|
||||
section.append(" <h2>热门作者排行TOP10</h2>\n"); |
|
||||
section.append(" <table>\n"); |
|
||||
section.append(" <tr><th>排名</th><th>作者</th><th>帖子数量</th></tr>\n"); |
|
||||
|
|
||||
int rank = 1; |
|
||||
for (Map.Entry<String, Integer> entry : authorData.entrySet()) { |
|
||||
section.append(" <tr><td>").append(rank++) |
|
||||
.append("</td><td>").append(entry.getKey()) |
|
||||
.append("</td><td>").append(entry.getValue()) |
|
||||
.append("</td></tr>\n"); |
|
||||
} |
|
||||
|
|
||||
section.append(" </table>\n"); |
|
||||
section.append(" </div>\n"); |
|
||||
|
|
||||
return section.toString(); |
|
||||
} |
|
||||
|
|
||||
private static String generateChartsSection() { |
|
||||
StringBuilder section = new StringBuilder(); |
|
||||
|
|
||||
section.append(" <div class=\"section\">\n"); |
|
||||
section.append(" <h2>数据可视化图表</h2>\n"); |
|
||||
section.append(" <div class=\"chart-container\">\n"); |
|
||||
section.append(" <h3>情感倾向分布</h3>\n"); |
|
||||
section.append(" <img src=\"../charts/sentiment_distribution.png\" alt=\"情感倾向分布图\">\n"); |
|
||||
section.append(" </div>\n"); |
|
||||
section.append(" <div class=\"chart-container\">\n"); |
|
||||
section.append(" <h3>互动指标分析</h3>\n"); |
|
||||
section.append(" <img src=\"../charts/engagement_metrics.png\" alt=\"互动指标图\">\n"); |
|
||||
section.append(" </div>\n"); |
|
||||
section.append(" <div class=\"chart-container\">\n"); |
|
||||
section.append(" <h3>热门作者排行</h3>\n"); |
|
||||
section.append(" <img src=\"../charts/author_ranking.png\" alt=\"作者排行图\">\n"); |
|
||||
section.append(" </div>\n"); |
|
||||
section.append(" </div>\n"); |
|
||||
|
|
||||
return section.toString(); |
|
||||
} |
|
||||
} |
|
||||
@ -1,67 +0,0 @@ |
|||||
package com.project; |
|
||||
|
|
||||
import com.project.analyzer.PostAnalyzer; |
|
||||
import com.project.chart.SimpleChartGenerator; |
|
||||
import com.project.model.PostInfo; |
|
||||
import com.project.reader.ExcelReader; |
|
||||
import com.project.report.HTMLReportGenerator; |
|
||||
import com.project.storage.DataStorage; |
|
||||
import com.project.util.DataCleaner; |
|
||||
|
|
||||
import java.util.List; |
|
||||
import java.util.Scanner; |
|
||||
|
|
||||
public class Main { |
|
||||
|
|
||||
public static void main(String[] args) { |
|
||||
System.out.println("========================================"); |
|
||||
System.out.println(" Java网络爬虫与数据分析系统"); |
|
||||
System.out.println("========================================\n"); |
|
||||
|
|
||||
String dataFilePath = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子原始信息计量实验使用.xlsx"; |
|
||||
String outputDir = "d:\\java\\project\\data"; |
|
||||
int maxRows = 300; |
|
||||
|
|
||||
try { |
|
||||
System.out.println("开始读取本地数据文件..."); |
|
||||
System.out.println("数据文件: " + dataFilePath); |
|
||||
System.out.println("读取前 " + maxRows + " 条数据"); |
|
||||
|
|
||||
List<PostInfo> rawPosts = ExcelReader.readExcelData(dataFilePath, maxRows); |
|
||||
|
|
||||
if (rawPosts.isEmpty()) { |
|
||||
System.out.println("未获取到任何数据,程序退出"); |
|
||||
return; |
|
||||
} |
|
||||
|
|
||||
System.out.println("\n开始数据清洗..."); |
|
||||
List<PostInfo> cleanedPosts = DataCleaner.cleanPosts(rawPosts); |
|
||||
|
|
||||
System.out.println("\n保存数据到文件..."); |
|
||||
DataStorage.saveToCSV(cleanedPosts, outputDir); |
|
||||
DataStorage.saveToJSON(cleanedPosts, outputDir); |
|
||||
|
|
||||
System.out.println("\n开始数据分析..."); |
|
||||
PostAnalyzer analyzer = new PostAnalyzer(cleanedPosts); |
|
||||
analyzer.analyzeAll(); |
|
||||
|
|
||||
System.out.println("\n生成图表..."); |
|
||||
SimpleChartGenerator.generateAllCharts(analyzer); |
|
||||
|
|
||||
System.out.println("\n生成HTML报告..."); |
|
||||
HTMLReportGenerator.generateReport(analyzer); |
|
||||
|
|
||||
System.out.println("\n========================================"); |
|
||||
System.out.println(" 程序执行完成!"); |
|
||||
System.out.println("========================================"); |
|
||||
System.out.println("\n输出文件位置:"); |
|
||||
System.out.println("- 数据文件: " + outputDir); |
|
||||
System.out.println("- 图表文件: d:\\java\\project\\charts"); |
|
||||
System.out.println("- 报告文件: d:\\java\\project\\reports"); |
|
||||
|
|
||||
} catch (Exception e) { |
|
||||
System.err.println("程序执行出错: " + e.getMessage()); |
|
||||
e.printStackTrace(); |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
@ -1,200 +0,0 @@ |
|||||
package com.project.analyzer; |
|
||||
|
|
||||
import com.project.model.PostInfo; |
|
||||
|
|
||||
import java.util.*; |
|
||||
import java.util.stream.Collectors; |
|
||||
|
|
||||
public class PostAnalyzer { |
|
||||
|
|
||||
private final List<PostInfo> posts; |
|
||||
|
|
||||
public PostAnalyzer(List<PostInfo> posts) { |
|
||||
this.posts = posts; |
|
||||
} |
|
||||
|
|
||||
public List<PostInfo> getPosts() { |
|
||||
return posts; |
|
||||
} |
|
||||
|
|
||||
public void analyzeAll() { |
|
||||
System.out.println("\n========== 数据分析报告 ==========\n"); |
|
||||
|
|
||||
analyzeSentimentDistribution(); |
|
||||
analyzeEngagementMetrics(); |
|
||||
analyzePopularAuthors(); |
|
||||
analyzeContentLength(); |
|
||||
analyzeTemporalTrends(); |
|
||||
|
|
||||
System.out.println("\n========== 分析完成 ==========\n"); |
|
||||
} |
|
||||
|
|
||||
public void analyzeSentimentDistribution() { |
|
||||
System.out.println("【情感倾向分布分析】"); |
|
||||
System.out.println("----------------------------------------"); |
|
||||
|
|
||||
Map<String, Long> sentimentCounts = posts.stream() |
|
||||
.collect(Collectors.groupingBy( |
|
||||
PostInfo::getSentiment, |
|
||||
Collectors.counting() |
|
||||
)); |
|
||||
|
|
||||
System.out.printf("%-20s %s%n", "情感倾向", "帖子数量"); |
|
||||
System.out.println("----------------------------------------"); |
|
||||
|
|
||||
sentimentCounts.entrySet().stream() |
|
||||
.sorted(Map.Entry.<String, Long>comparingByValue().reversed()) |
|
||||
.forEach(entry -> System.out.printf("%-20s %d%n", entry.getKey(), entry.getValue())); |
|
||||
|
|
||||
System.out.println(); |
|
||||
} |
|
||||
|
|
||||
public void analyzeEngagementMetrics() { |
|
||||
System.out.println("【互动指标分析】"); |
|
||||
System.out.println("----------------------------------------"); |
|
||||
|
|
||||
double avgLikes = posts.stream() |
|
||||
.mapToInt(PostInfo::getLikeCount) |
|
||||
.average() |
|
||||
.orElse(0); |
|
||||
|
|
||||
double avgComments = posts.stream() |
|
||||
.mapToInt(PostInfo::getCommentCount) |
|
||||
.average() |
|
||||
.orElse(0); |
|
||||
|
|
||||
double avgViews = posts.stream() |
|
||||
.mapToInt(PostInfo::getViewCount) |
|
||||
.average() |
|
||||
.orElse(0); |
|
||||
|
|
||||
System.out.printf("平均点赞数: %.1f%n", avgLikes); |
|
||||
System.out.printf("平均评论数: %.1f%n", avgComments); |
|
||||
System.out.printf("平均浏览量: %.1f%n", avgViews); |
|
||||
|
|
||||
System.out.println(); |
|
||||
} |
|
||||
|
|
||||
public void analyzePopularAuthors() { |
|
||||
System.out.println("【热门作者排行】"); |
|
||||
System.out.println("----------------------------------------"); |
|
||||
System.out.printf("%-30s %10s %10s %10s%n", "作者", "帖子数", "总点赞", "总评论"); |
|
||||
System.out.println("----------------------------------------"); |
|
||||
|
|
||||
Map<String, List<PostInfo>> authorPosts = posts.stream() |
|
||||
.collect(Collectors.groupingBy(PostInfo::getAuthor)); |
|
||||
|
|
||||
authorPosts.entrySet().stream() |
|
||||
.sorted(Map.Entry.<String, List<PostInfo>>comparingByValue((a, b) -> b.size() - a.size())) |
|
||||
.limit(10) |
|
||||
.forEach(entry -> { |
|
||||
String author = entry.getKey(); |
|
||||
List<PostInfo> authorPostList = entry.getValue(); |
|
||||
int postCount = authorPostList.size(); |
|
||||
int totalLikes = authorPostList.stream().mapToInt(PostInfo::getLikeCount).sum(); |
|
||||
int totalComments = authorPostList.stream().mapToInt(PostInfo::getCommentCount).sum(); |
|
||||
|
|
||||
System.out.printf("%-30s %10d %10d %10d%n", |
|
||||
author.length() > 28 ? author.substring(0, 28) : author, |
|
||||
postCount, totalLikes, totalComments); |
|
||||
}); |
|
||||
|
|
||||
System.out.println(); |
|
||||
} |
|
||||
|
|
||||
public void analyzeContentLength() { |
|
||||
System.out.println("【内容长度分析】"); |
|
||||
System.out.println("----------------------------------------"); |
|
||||
|
|
||||
double avgLength = posts.stream() |
|
||||
.mapToInt(post -> post.getContent().length()) |
|
||||
.average() |
|
||||
.orElse(0); |
|
||||
|
|
||||
int maxLength = posts.stream() |
|
||||
.mapToInt(post -> post.getContent().length()) |
|
||||
.max() |
|
||||
.orElse(0); |
|
||||
|
|
||||
int minLength = posts.stream() |
|
||||
.mapToInt(post -> post.getContent().length()) |
|
||||
.min() |
|
||||
.orElse(0); |
|
||||
|
|
||||
System.out.printf("平均内容长度: %.1f 字符%n", avgLength); |
|
||||
System.out.printf("最长内容: %d 字符%n", maxLength); |
|
||||
System.out.printf("最短内容: %d 字符%n", minLength); |
|
||||
|
|
||||
System.out.println(); |
|
||||
} |
|
||||
|
|
||||
public void analyzeTemporalTrends() { |
|
||||
System.out.println("【时间趋势分析】"); |
|
||||
System.out.println("----------------------------------------"); |
|
||||
|
|
||||
Map<String, Long> monthlyPosts = posts.stream() |
|
||||
.filter(post -> post.getPostDate() != null) |
|
||||
.collect(Collectors.groupingBy( |
|
||||
post -> post.getPostDate().format(java.time.format.DateTimeFormatter.ofPattern("yyyy-MM")), |
|
||||
Collectors.counting() |
|
||||
)); |
|
||||
|
|
||||
System.out.printf("%-10s %s%n", "月份", "帖子数量"); |
|
||||
System.out.println("----------------------------------------"); |
|
||||
|
|
||||
monthlyPosts.entrySet().stream() |
|
||||
.sorted(Map.Entry.comparingByKey()) |
|
||||
.forEach(entry -> System.out.printf("%-10s %d%n", entry.getKey(), entry.getValue())); |
|
||||
|
|
||||
System.out.println(); |
|
||||
} |
|
||||
|
|
||||
public Map<String, Long> getSentimentDistributionData() { |
|
||||
return posts.stream() |
|
||||
.collect(Collectors.groupingBy( |
|
||||
PostInfo::getSentiment, |
|
||||
Collectors.counting() |
|
||||
)); |
|
||||
} |
|
||||
|
|
||||
public Map<String, Double> getEngagementData() { |
|
||||
Map<String, Double> engagementData = new LinkedHashMap<>(); |
|
||||
|
|
||||
double avgLikes = posts.stream() |
|
||||
.mapToInt(PostInfo::getLikeCount) |
|
||||
.average() |
|
||||
.orElse(0); |
|
||||
|
|
||||
double avgComments = posts.stream() |
|
||||
.mapToInt(PostInfo::getCommentCount) |
|
||||
.average() |
|
||||
.orElse(0); |
|
||||
|
|
||||
double avgViews = posts.stream() |
|
||||
.mapToInt(PostInfo::getViewCount) |
|
||||
.average() |
|
||||
.orElse(0); |
|
||||
|
|
||||
engagementData.put("点赞", avgLikes); |
|
||||
engagementData.put("评论", avgComments); |
|
||||
engagementData.put("浏览", avgViews); |
|
||||
|
|
||||
return engagementData; |
|
||||
} |
|
||||
|
|
||||
public Map<String, Integer> getAuthorPostCount() { |
|
||||
return posts.stream() |
|
||||
.collect(Collectors.groupingBy( |
|
||||
PostInfo::getAuthor, |
|
||||
Collectors.summingInt(post -> 1) |
|
||||
)).entrySet().stream() |
|
||||
.sorted(Map.Entry.<String, Integer>comparingByValue().reversed()) |
|
||||
.limit(10) |
|
||||
.collect(Collectors.toMap( |
|
||||
Map.Entry::getKey, |
|
||||
Map.Entry::getValue, |
|
||||
(e1, e2) -> e1, |
|
||||
LinkedHashMap::new |
|
||||
)); |
|
||||
} |
|
||||
} |
|
||||
@ -1,127 +0,0 @@ |
|||||
import java.time.LocalDate; |
|
||||
|
|
||||
/**
 * Mutable data holder for one post: its text fields, publication date, interaction
 * counters, tags, and sentiment label.
 */
public class PostInfo {
    private String title;
    private String content;
    private String author;
    private LocalDate postDate;
    private int likeCount;
    private int commentCount;
    private int viewCount;
    private String tags;
    private String sentiment;

    /** Creates a post with all text fields and the date unset and all counters at zero. */
    public PostInfo() {
    }

    /** Creates a post with every field supplied. */
    public PostInfo(String title, String content, String author, LocalDate postDate,
                    int likeCount, int commentCount, int viewCount, String tags, String sentiment) {
        this.title = title;
        this.content = content;
        this.author = author;
        this.postDate = postDate;
        this.likeCount = likeCount;
        this.commentCount = commentCount;
        this.viewCount = viewCount;
        this.tags = tags;
        this.sentiment = sentiment;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public String getAuthor() {
        return author;
    }

    public void setAuthor(String author) {
        this.author = author;
    }

    public LocalDate getPostDate() {
        return postDate;
    }

    public void setPostDate(LocalDate postDate) {
        this.postDate = postDate;
    }

    public int getLikeCount() {
        return likeCount;
    }

    public void setLikeCount(int likeCount) {
        this.likeCount = likeCount;
    }

    public int getCommentCount() {
        return commentCount;
    }

    public void setCommentCount(int commentCount) {
        this.commentCount = commentCount;
    }

    public int getViewCount() {
        return viewCount;
    }

    public void setViewCount(int viewCount) {
        this.viewCount = viewCount;
    }

    public String getTags() {
        return tags;
    }

    public void setTags(String tags) {
        this.tags = tags;
    }

    public String getSentiment() {
        return sentiment;
    }

    public void setSentiment(String sentiment) {
        this.sentiment = sentiment;
    }

    /** Returns a short debug form; content and tags are intentionally left out. */
    @Override
    public String toString() {
        return "PostInfo{" +
                "title='" + title + '\'' +
                ", author='" + author + '\'' +
                ", postDate=" + postDate +
                ", likeCount=" + likeCount +
                ", commentCount=" + commentCount +
                ", viewCount=" + viewCount +
                ", sentiment='" + sentiment + '\'' +
                '}';
    }

    /**
     * Formats this post as one CSV row.
     *
     * <p>Column order: title, content, author, date, likes, comments, views, tags,
     * sentiment. Text columns are wrapped in double quotes with embedded quotes doubled;
     * {@code null} text and a {@code null} date become empty quoted columns. Line breaks
     * in the content ({@code \r\n}, {@code \r}, {@code \n}) are each replaced by a single
     * space so the content cannot split the row.
     *
     * @return the CSV row without a trailing line terminator
     */
    public String toCSV() {
        return String.format("\"%s\",\"%s\",\"%s\",\"%s\",%d,%d,%d,\"%s\",\"%s\"",
                escapeQuotes(title),
                escapeQuotes(flattenLineBreaks(content)),
                escapeQuotes(author),
                postDate != null ? postDate.toString() : "",
                likeCount,
                commentCount,
                viewCount,
                escapeQuotes(tags),
                escapeQuotes(sentiment));
    }

    /** Doubles embedded double quotes for CSV; {@code null} becomes an empty string. */
    private static String escapeQuotes(String value) {
        return value != null ? value.replace("\"", "\"\"") : "";
    }

    /** Replaces every line break with one space; a {@code null} value is passed through. */
    private static String flattenLineBreaks(String value) {
        if (value == null) {
            return null;
        }
        // Handle the two-character Windows break first so it yields one space, not two.
        return value.replace("\r\n", " ").replace('\r', ' ').replace('\n', ' ');
    }
}
|
||||
@ -1,50 +0,0 @@ |
|||||
import java.io.*; |
|
||||
import java.util.*; |
|
||||
import java.util.regex.*; |
|
||||
|
|
||||
/**
 * Console stub for the regression-data preparation step.
 *
 * <p>It checks that the input workbook exists, reports its size, and prints a
 * description of the intended processing together with the Python commands to run.
 * It does not read or write any spreadsheet data itself.
 */
public class ProcessRegressionData {

    private static final String BANNER = "========================================";

    private static final String DATA_DIR =
            "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\";

    public static void main(String[] args) {
        String inputFile = DATA_DIR + "图文帖子实验数据(新).xlsx";
        String outputFile = DATA_DIR + "图文帖子实验数据(新)_回归.xlsx";

        emit(
                BANNER,
                " 处理回归数据",
                BANNER,
                "输入文件: " + inputFile,
                "输出文件: " + outputFile,
                "");

        // Stop early when the input workbook is missing.
        File source = new File(inputFile);
        boolean present = source.exists();
        if (!present) {
            System.out.println("错误: 输入文件不存在!");
            return;
        }

        long sizeInKb = source.length() / 1024;
        emit(
                "输入文件大小: " + sizeInKb + " KB",
                "\n注意: 这是一个简化版本,用于演示处理逻辑。",
                "实际处理需要使用Apache POI库来读取和写入Excel文件。",
                "",
                "处理逻辑:",
                "1. 读取原始数据",
                "2. 识别列: helpfull( Y ), 帖子评论总数( X1 ), 评论1-5内容列",
                "3. 计算 X2-X6:",
                " - X2: 评论长度平均值(剔空格后的字符数)",
                " - X3: 评论复杂度平均值(按空格拆分的分词数)",
                " - X4: X2/X3(X3为0时记0)",
                " - X5: 情感性平均值(正面=1、中性=0、负面=-1)",
                " - X6: 信息丰富度平均值(含数字/链接/表情各1分)",
                "4. 数据清洗: 确保所有值为纯数字,无空值/错误值",
                "5. 保存到新文件",
                "",
                "由于数据量较大(3万+行),建议使用Python的pandas库处理。",
                "请确保Python脚本能够完整执行,可能需要增加内存或分批处理。",
                "",
                BANNER,
                " 建议使用以下Python命令运行",
                BANNER,
                "cd d:\\java\\project",
                "python process_300_rows.py (测试前300行)",
                "python process_all_rows.py (处理全部数据)");
    }

    /** Prints each given line to standard output; an empty string yields a blank line. */
    private static void emit(String... lines) {
        for (String text : lines) {
            System.out.println(text);
        }
    }
}
|
||||
@ -1,2 +0,0 @@ |
|||||
# java |
|
||||
|
|
||||
@ -1,165 +0,0 @@ |
|||||
package com.project.chart; |
|
||||
|
|
||||
import com.project.analyzer.PostAnalyzer; |
|
||||
|
|
||||
import java.awt.*; |
|
||||
import java.awt.image.BufferedImage; |
|
||||
import java.io.File; |
|
||||
import java.io.IOException; |
|
||||
import java.nio.file.Files; |
|
||||
import java.nio.file.Paths; |
|
||||
import java.util.Map; |
|
||||
import javax.imageio.ImageIO; |
|
||||
|
|
||||
public class SimpleChartGenerator { |
|
||||
|
|
||||
private static final String OUTPUT_DIR = "d:\\java\\project\\charts"; |
|
||||
private static final int WIDTH = 800; |
|
||||
private static final int HEIGHT = 600; |
|
||||
|
|
||||
public static void generateAllCharts(PostAnalyzer analyzer) { |
|
||||
try { |
|
||||
Files.createDirectories(Paths.get(OUTPUT_DIR)); |
|
||||
|
|
||||
generateSentimentChart(analyzer); |
|
||||
generateEngagementChart(analyzer); |
|
||||
generateAuthorChart(analyzer); |
|
||||
|
|
||||
System.out.println("\n所有图表已生成,保存在: " + OUTPUT_DIR); |
|
||||
|
|
||||
} catch (IOException e) { |
|
||||
System.err.println("创建图表目录时出错: " + e.getMessage()); |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
public static void generateSentimentChart(PostAnalyzer analyzer) { |
|
||||
Map<String, Long> data = analyzer.getSentimentDistributionData(); |
|
||||
|
|
||||
BufferedImage image = new BufferedImage(WIDTH, HEIGHT, BufferedImage.TYPE_INT_RGB); |
|
||||
Graphics2D g2d = image.createGraphics(); |
|
||||
|
|
||||
g2d.setColor(Color.WHITE); |
|
||||
g2d.fillRect(0, 0, WIDTH, HEIGHT); |
|
||||
|
|
||||
g2d.setColor(Color.BLACK); |
|
||||
g2d.setFont(new Font("宋体", Font.BOLD, 24)); |
|
||||
g2d.drawString("情感倾向分布", 300, 40); |
|
||||
|
|
||||
int barWidth = 150; |
|
||||
int startX = 200; |
|
||||
int startY = 500; |
|
||||
int maxHeight = 400; |
|
||||
|
|
||||
long maxValue = data.values().stream().max(Long::compare).orElse(1L); |
|
||||
|
|
||||
int index = 0; |
|
||||
for (Map.Entry<String, Long> entry : data.entrySet()) { |
|
||||
int barHeight = (int) ((entry.getValue() * 1.0 / maxValue) * maxHeight); |
|
||||
|
|
||||
g2d.setColor(new Color(70, 130, 180)); |
|
||||
g2d.fillRect(startX + index * (barWidth + 50), startY - barHeight, barWidth, barHeight); |
|
||||
|
|
||||
g2d.setColor(Color.BLACK); |
|
||||
g2d.setFont(new Font("宋体", Font.PLAIN, 14)); |
|
||||
g2d.drawString(entry.getKey(), startX + index * (barWidth + 50) + 50, startY + 20); |
|
||||
g2d.drawString(String.valueOf(entry.getValue()), startX + index * (barWidth + 50) + 60, startY - barHeight - 5); |
|
||||
|
|
||||
index++; |
|
||||
} |
|
||||
|
|
||||
g2d.dispose(); |
|
||||
saveImage(image, "sentiment_distribution.png"); |
|
||||
} |
|
||||
|
|
||||
public static void generateEngagementChart(PostAnalyzer analyzer) { |
|
||||
Map<String, Double> data = analyzer.getEngagementData(); |
|
||||
|
|
||||
BufferedImage image = new BufferedImage(WIDTH, HEIGHT, BufferedImage.TYPE_INT_RGB); |
|
||||
Graphics2D g2d = image.createGraphics(); |
|
||||
|
|
||||
g2d.setColor(Color.WHITE); |
|
||||
g2d.fillRect(0, 0, WIDTH, HEIGHT); |
|
||||
|
|
||||
g2d.setColor(Color.BLACK); |
|
||||
g2d.setFont(new Font("宋体", Font.BOLD, 24)); |
|
||||
g2d.drawString("互动指标分析", 300, 40); |
|
||||
|
|
||||
int barWidth = 150; |
|
||||
int startX = 200; |
|
||||
int startY = 500; |
|
||||
int maxHeight = 400; |
|
||||
|
|
||||
double maxValue = data.values().stream().max(Double::compare).orElse(1.0); |
|
||||
|
|
||||
int index = 0; |
|
||||
for (Map.Entry<String, Double> entry : data.entrySet()) { |
|
||||
int barHeight = (int) ((entry.getValue() / maxValue) * maxHeight); |
|
||||
|
|
||||
g2d.setColor(new Color(60, 179, 113)); |
|
||||
g2d.fillRect(startX + index * (barWidth + 50), startY - barHeight, barWidth, barHeight); |
|
||||
|
|
||||
g2d.setColor(Color.BLACK); |
|
||||
g2d.setFont(new Font("宋体", Font.PLAIN, 14)); |
|
||||
g2d.drawString(entry.getKey(), startX + index * (barWidth + 50) + 60, startY + 20); |
|
||||
g2d.drawString(String.format("%.1f", entry.getValue()), startX + index * (barWidth + 50) + 50, startY - barHeight - 5); |
|
||||
|
|
||||
index++; |
|
||||
} |
|
||||
|
|
||||
g2d.dispose(); |
|
||||
saveImage(image, "engagement_metrics.png"); |
|
||||
} |
|
||||
|
|
||||
public static void generateAuthorChart(PostAnalyzer analyzer) { |
|
||||
Map<String, Integer> data = analyzer.getAuthorPostCount(); |
|
||||
|
|
||||
BufferedImage image = new BufferedImage(WIDTH, HEIGHT, BufferedImage.TYPE_INT_RGB); |
|
||||
Graphics2D g2d = image.createGraphics(); |
|
||||
|
|
||||
g2d.setColor(Color.WHITE); |
|
||||
g2d.fillRect(0, 0, WIDTH, HEIGHT); |
|
||||
|
|
||||
g2d.setColor(Color.BLACK); |
|
||||
g2d.setFont(new Font("宋体", Font.BOLD, 24)); |
|
||||
g2d.drawString("热门作者排行TOP10", 280, 40); |
|
||||
|
|
||||
int barHeight = 35; |
|
||||
int startY = 80; |
|
||||
int startX = 200; |
|
||||
int maxWidth = 500; |
|
||||
|
|
||||
int maxValue = data.values().stream().max(Integer::compare).orElse(1); |
|
||||
|
|
||||
int index = 0; |
|
||||
for (Map.Entry<String, Integer> entry : data.entrySet()) { |
|
||||
int barWidth = (int) ((entry.getValue() * 1.0 / maxValue) * maxWidth); |
|
||||
|
|
||||
g2d.setColor(new Color(255, 140, 0)); |
|
||||
g2d.fillRect(startX, startY + index * (barHeight + 10), barWidth, barHeight); |
|
||||
|
|
||||
g2d.setColor(Color.BLACK); |
|
||||
g2d.setFont(new Font("宋体", Font.PLAIN, 12)); |
|
||||
String author = entry.getKey(); |
|
||||
if (author.length() > 15) { |
|
||||
author = author.substring(0, 15) + "..."; |
|
||||
} |
|
||||
g2d.drawString(author, 50, startY + index * (barHeight + 10) + 23); |
|
||||
g2d.drawString(String.valueOf(entry.getValue()), startX + barWidth + 10, startY + index * (barHeight + 10) + 23); |
|
||||
|
|
||||
index++; |
|
||||
} |
|
||||
|
|
||||
g2d.dispose(); |
|
||||
saveImage(image, "author_ranking.png"); |
|
||||
} |
|
||||
|
|
||||
private static void saveImage(BufferedImage image, String filename) { |
|
||||
try { |
|
||||
File file = new File(OUTPUT_DIR, filename); |
|
||||
ImageIO.write(image, "PNG", file); |
|
||||
System.out.println("图表已保存: " + file.getAbsolutePath()); |
|
||||
} catch (IOException e) { |
|
||||
System.err.println("保存图表失败: " + e.getMessage()); |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
@ -1,59 +0,0 @@ |
|||||
import java.io.*; |
|
||||
import java.util.ArrayList; |
|
||||
import java.util.List; |
|
||||
|
|
||||
/**
 * Copies the raw post workbook to the experiment-data location.
 *
 * <p>No parsing or cleaning takes place: the input is copied byte for byte and given the
 * output name. An .xlsx file is a binary container, so the result is NOT a real CSV file;
 * a library such as Apache POI would be needed to convert the content.
 *
 * <p>Usage: {@code java SimpleDataCleaner [inputFile [outputFile]]}. Both arguments are
 * optional and default to the original hard-coded paths.
 */
public class SimpleDataCleaner {

    private static final String DEFAULT_INPUT =
            "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子原始信息计量实验使用.xlsx";
    private static final String DEFAULT_OUTPUT =
            "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子实验数据(新).csv";
    private static final String BANNER = "========================================";
    private static final int BUFFER_SIZE = 8192;

    /**
     * Runs the copy and prints progress to standard output.
     *
     * @param args optional input path (index 0) and output path (index 1)
     */
    public static void main(String[] args) {
        String inputFile = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT;
        String outputFile = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT;

        System.out.println(BANNER);
        System.out.println(" 简单数据清洗脚本");
        System.out.println(BANNER);
        System.out.println("输入文件: " + inputFile);
        System.out.println("输出文件: " + outputFile);
        System.out.println();

        // Check that the input file exists.
        File input = new File(inputFile);
        if (!input.exists()) {
            System.out.println("错误: 输入文件不存在!");
            return;
        }

        System.out.println("文件大小: " + (input.length() / 1024) + " KB");

        File output = new File(outputFile);

        // Opening the output truncates it, so copying a file onto itself would destroy it.
        if (input.getAbsoluteFile().equals(output.getAbsoluteFile())) {
            System.out.println("错误: 输入文件和输出文件不能相同!");
            return;
        }

        try {
            // Make sure the output directory exists and report a failure to create it.
            File parentDir = output.getParentFile();
            if (parentDir != null && !parentDir.exists() && !parentDir.mkdirs()) {
                System.err.println("处理文件时出错: 无法创建输出目录 " + parentDir.getAbsolutePath());
                return;
            }

            copy(input, output);

            System.out.println("文件已成功复制并重命名为: " + outputFile);
            System.out.println();
            System.out.println(BANNER);
            System.out.println(" 任务完成");
            System.out.println(BANNER);
        } catch (IOException e) {
            System.err.println("处理文件时出错: " + e.getMessage());
        }
    }

    /** Copies all bytes of {@code source} into {@code target}, replacing existing content. */
    private static void copy(File source, File target) throws IOException {
        try (FileInputStream fis = new FileInputStream(source);
             FileOutputStream fos = new FileOutputStream(target)) {
            byte[] buffer = new byte[BUFFER_SIZE];
            int length;
            // read() signals end of stream with -1.
            while ((length = fis.read(buffer)) != -1) {
                fos.write(buffer, 0, length);
            }
        }
    }
}
|
||||
@ -1,189 +0,0 @@ |
|||||
import os |
|
||||
import pandas as pd |
|
||||
import re |
|
||||
|
|
||||
# Adds the regression columns Y and X1-X6 to the experiment workbook and writes the
# result to a new workbook next to it.

# File paths
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx'

print("========================================")
print(" 在原表中添加回归数据列")
print("========================================")
print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()

# Stop early when the input workbook is missing.
if not os.path.exists(input_file):
    print("错误: 输入文件不存在!")
    # SystemExit works without the interactive-only `exit` helper from `site`.
    raise SystemExit(1)

print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")

try:
    # Read the raw data
    print("\n正在读取原始数据...")
    df = pd.read_excel(input_file)
    print(f"成功读取 {len(df)} 行数据")
    print(f"原始列名: {list(df.columns)}")

    # Identify the source columns by name
    print("\n识别列...")
    helpfull_col = None
    comment_count_col = None
    comment_cols = []

    for col in df.columns:
        col_name = str(col)
        col_lower = col_name.lower()
        if 'helpfull' in col_lower or 'helpful' in col_lower:
            helpfull_col = col
            print(f"找到 Y 列 (helpfull): {col}")
        elif '评论总数' in col_name:
            comment_count_col = col
            print(f"找到 X1 列 (评论总数): {col}")
        elif '评论' in col_name and any(str(i) in col_name for i in range(1, 6)):
            # NOTE(review): any column whose name holds '评论' and a digit 1-5 is treated as
            # comment text; confirm this does not pick up numeric per-comment columns.
            comment_cols.append(col)
            print(f"找到评论列 {len(comment_cols)}: {col}")

    print(f"\n共找到 {len(comment_cols)} 个评论列")

    # Add the regression columns
    print("\n添加回归数据列...")

    # Y (UGC helpfulness)
    print("1. 添加 Y (UGC有用性)")
    if helpfull_col is not None:
        df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0)
    else:
        df['Y'] = 0

    # X1 (number of comments)
    print("2. 添加 X1 (评论数量)")
    if comment_count_col is not None:
        df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0)
    else:
        df['X1'] = 0

    def calculate_comment_metrics(content):
        """Return (length, complexity, sentiment, richness) for one comment cell.

        length     -- characters left after removing ASCII and full-width spaces
        complexity -- number of whitespace-separated tokens
        sentiment  -- 1 / 0 / -1 from a keyword lookup
        richness   -- one point each for a digit, a link and an emoji/emoticon (0-3)

        Missing or empty cells yield (0, 0, 0, 0).
        """
        if pd.isna(content) or str(content) in ['None', 'nan', '']:
            return 0, 0, 0, 0

        content = str(content)
        length = len(content.replace(' ', '').replace('\u3000', ''))
        complexity = len(content.split())

        positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like']
        negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike']

        lower_content = content.lower()
        has_negative = any(word in lower_content for word in negative_words)
        # Blank out negative terms before the positive lookup: '不好' contains '好' and
        # 'dislike' contains 'like', so they would otherwise always count as positive.
        masked = lower_content
        for word in negative_words:
            masked = masked.replace(word, ' ')
        has_positive = any(word in masked for word in positive_words)

        # Positive still wins when a comment contains both kinds of terms.
        sentiment = 0
        if has_positive:
            sentiment = 1
        elif has_negative:
            sentiment = -1

        richness = 0
        if re.search(r'\d', content):  # contains a digit
            richness += 1
        if re.search(r'http[s]?://|www\.', content):  # contains a link
            richness += 1
        if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content):  # contains an emoji/emoticon
            richness += 1

        return length, complexity, sentiment, richness

    # Per-row averages over the non-empty comments
    print("3. 计算评论相关指标...")

    total_rows = len(df)
    # Values are collected in lists and assigned once at the end; this avoids mixing
    # positional reads with label-based writes and building one Series per cell.
    comment_values = df[comment_cols].to_numpy(dtype=object)
    averages = {'X2': [], 'X3': [], 'X5': [], 'X6': []}

    for i, row_values in enumerate(comment_values):
        if i % 1000 == 0:
            print(f" 处理到第 {i}/{total_rows} 行...")

        # Only comments with visible content take part in the averages.
        scored = [m for m in map(calculate_comment_metrics, row_values) if m[0] > 0]
        if scored:
            count = len(scored)
            totals = [sum(parts) for parts in zip(*scored)]
            row_means = [total / count for total in totals]
        else:
            row_means = [0.0, 0.0, 0.0, 0.0]

        for name, value in zip(('X2', 'X3', 'X5', 'X6'), row_means):
            averages[name].append(value)

    # X2 length, X3 complexity, X5 sentiment, X6 richness
    for name in ('X2', 'X3', 'X5', 'X6'):
        df[name] = pd.Series(averages[name], index=df.index, dtype='float64')

    # X4: readability = X2 / X3, defined as 0 when X3 is 0
    print("4. 计算 X4 (评论可读性)")
    df['X4'] = (df['X2'] / df['X3']).where(df['X3'] > 0, 0.0)

    # Cleaning: make every regression value a finite number
    print("\n5. 数据清洗...")
    regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
    for col in regression_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
        df[col] = df[col].replace([float('inf'), float('-inf')], 0)

    # Report what was produced
    print("\n6. 验证数据...")
    print(f"总行数: {len(df)}")
    print(f"总列数: {len(df.columns)}")
    print("\n回归数据列统计:")
    print(df[regression_cols].describe())
    print("\n前5行回归数据:")
    print(df[regression_cols].head())

    print("\n空值检查:")
    for col in regression_cols:
        null_count = df[col].isnull().sum()
        print(f" {col}: {null_count} 个空值")

    # Save
    print("\n7. 保存文件...")
    df.to_excel(output_file, index=False)

    # Re-read the written file as a sanity check
    print("\n8. 验证文件...")
    if os.path.exists(output_file):
        print(f"文件已成功保存: {output_file}")
        print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
        df_check = pd.read_excel(output_file)
        print(f"输出文件行数: {len(df_check)}")
        print(f"输出文件列数: {len(df_check.columns)}")
        print(f"\n回归数据列: {[col for col in df_check.columns if col in regression_cols]}")
    else:
        print("文件保存失败!")

    print()
    print("========================================")
    print(" 任务完成")
    print("========================================")
    print(f"新文件已保存: {output_file}")
    print("包含原始数据的所有列以及新增的Y, X1-X6回归数据列")

except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()
|
||||
@ -1,32 +0,0 @@ |
|||||
import os
import sys

# Smoke test: prints the working directory, the interpreter version and the first
# entries of the data directory.
BANNER = "========================================"
dir_path = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求'

print(BANNER)
print(" 基本测试")
print(BANNER)
print(f"当前目录: {os.getcwd()}")
print("Python版本:")
print(sys.version)

# Check the data directory
print("\n检查目录:")
print(f"目录: {dir_path}")
dir_exists = os.path.exists(dir_path)
print(f"存在: {dir_exists}")

# List regular files among the first 15 directory entries, with their size in KB
if dir_exists:
    print("\n目录文件:")
    for entry in os.listdir(dir_path)[:15]:
        entry_path = os.path.join(dir_path, entry)
        if not os.path.isfile(entry_path):
            continue
        size_kb = os.path.getsize(entry_path) / 1024
        print(f" {entry}: {size_kb:.2f} KB")

print()
print(BANNER)
print(" 测试完成")
print(BANNER)
|
||||
@ -1,219 +0,0 @@ |
|||||
import os
import pandas as pd
import re
import gc

# Batch variant of the regression-column script: this part loads the workbook,
# locates the source columns and derives Y and X1.
RULE = "=" * 60

print(RULE)
print(" 分批处理回归数据")
print(RULE)

# File paths
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx'

print(f"输入文件: {input_file}")
print()

# Stop when the input workbook is missing
if not os.path.exists(input_file):
    print("错误: 输入文件不存在!")
    exit(1)

input_size_kb = os.path.getsize(input_file) / 1024
print(f"输入文件大小: {input_size_kb:.2f} KB")

# Read the raw data
print("\n正在读取原始数据...")
try:
    df = pd.read_excel(input_file, engine='openpyxl')
    print(f"成功读取 {len(df)} 行数据")
    print(f"原始列数: {len(df.columns)}")
except Exception as e:
    print(f"读取失败: {e}")
    import traceback
    traceback.print_exc()
    exit(1)

# Identify the source columns by name
print("\n识别列...")
helpfull_col = None
comment_count_col = None
comment_cols = []

for col in df.columns:
    label = str(col)
    lowered = label.lower()
    is_helpful = 'helpfull' in lowered or 'helpful' in lowered
    is_comment_total = '评论总数' in label or '帖子评论总数' in label
    is_comment_text = (
        '评论' in label
        and any(str(n) in label for n in range(1, 6))
        and '内容' in label
    )

    if is_helpful:
        helpfull_col = col
        print(f"找到 Y 列 (helpfull): {col}")
    elif is_comment_total:
        comment_count_col = col
        print(f"找到 X1 列 (评论总数): {col}")
    elif is_comment_text:
        comment_cols.append(col)
        print(f"找到评论列 {len(comment_cols)}: {col}")

print(f"\n共找到 {len(comment_cols)} 个评论内容列")

# Add the regression columns
print("\n添加回归数据列...")

# Y (UGC helpfulness): numeric copy of the helpfull column, 0 where not numeric
print("1. 添加 Y (UGC有用性)")
if not helpfull_col:
    df['Y'] = 0
else:
    df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0)

# X1 (number of comments): numeric copy of the comment-total column
print("2. 添加 X1 (评论数量)")
if not comment_count_col:
    df['X1'] = 0
else:
    df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0)
|
||||
|
|
||||
# 定义函数计算评论指标 |
|
||||
# Keyword lists for the rule-based sentiment score (matched on the lower-cased text).
_POSITIVE_WORDS = ('好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like')
_NEGATIVE_WORDS = ('差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike')


def calculate_comment_metrics(content):
    """Return ``(length, complexity, sentiment, richness)`` for one comment cell.

    length     -- X2: characters left after removing ASCII and full-width spaces
    complexity -- X3: number of whitespace-separated tokens
    sentiment  -- X5: 1 (positive), 0 (neutral) or -1 (negative), by keyword lookup
    richness   -- X6: one point each for a digit, a link and an emoji/emoticon (0-3)

    Missing or empty cells (NaN, None, '', 'None', 'nan') yield ``(0, 0, 0, 0)``.
    """
    if pd.isna(content) or str(content) in ['None', 'nan', '']:
        return 0, 0, 0, 0

    content = str(content)

    # X2: comment length without spaces
    length = len(content.replace(' ', '').replace('\u3000', ''))
    # X3: comment complexity
    complexity = len(content.split())

    # X5: sentiment
    lower_content = content.lower()
    has_negative = any(word in lower_content for word in _NEGATIVE_WORDS)
    # Blank out negative terms before the positive lookup: '不好' contains '好' and
    # 'dislike' contains 'like', so they would otherwise always count as positive.
    masked = lower_content
    for word in _NEGATIVE_WORDS:
        masked = masked.replace(word, ' ')
    has_positive = any(word in masked for word in _POSITIVE_WORDS)

    # Positive still wins when a comment contains both kinds of terms.
    sentiment = 0
    if has_positive:
        sentiment = 1
    elif has_negative:
        sentiment = -1

    # X6: information richness
    richness = 0
    if re.search(r'\d', content):  # contains a digit
        richness += 1
    if re.search(r'http[s]?://|www\.', content):  # contains a link
        richness += 1
    if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content):  # contains an emoji/emoticon
        richness += 1

    return length, complexity, sentiment, richness
|
||||
|
|
||||
# Compute the comment-based indicators (X2, X3, X5, X6) batch by batch, derive X4,
# clean the regression columns and write the workbook.
print("3. 计算评论相关指标...")

total_rows = len(df)
print(f"总数据行数: {total_rows}")

batch_size = 5000
num_batches = (total_rows + batch_size - 1) // batch_size

# Values are collected in lists and assigned once at the end; this avoids mixing
# positional reads (iloc) with label-based writes (loc) and one Series per cell.
comment_values = df[comment_cols].to_numpy(dtype=object)
metric_names = ('X2', 'X3', 'X5', 'X6')  # length, complexity, sentiment, richness
averages = {name: [] for name in metric_names}

for batch in range(num_batches):
    start_idx = batch * batch_size
    end_idx = min((batch + 1) * batch_size, total_rows)
    print(f"处理批次 {batch + 1}/{num_batches} (行 {start_idx} 到 {end_idx})...")

    for row_values in comment_values[start_idx:end_idx]:
        # Only comments with visible content take part in the averages.
        scored = [m for m in map(calculate_comment_metrics, row_values) if m[0] > 0]
        if scored:
            count = len(scored)
            row_means = [sum(parts) / count for parts in zip(*scored)]
        else:
            # Rows without comments score 0 everywhere.
            row_means = [0.0, 0.0, 0.0, 0.0]
        for name, value in zip(metric_names, row_means):
            averages[name].append(value)

    # Release memory between batches
    gc.collect()

for name in metric_names:
    df[name] = pd.Series(averages[name], index=df.index, dtype='float64')

# X4: readability = X2 / X3, defined as 0 when X3 is 0
print("4. 计算 X4 (评论可读性)")
df['X4'] = (df['X2'] / df['X3']).where(df['X3'] > 0, 0.0)

# Cleaning: make every regression value a finite number
print("\n5. 数据清洗...")
regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
for col in regression_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
    df[col] = df[col].replace([float('inf'), float('-inf')], 0)

# Report what was produced
print("\n6. 验证数据...")
print(f"总行数: {len(df)}")
print(f"总列数: {len(df.columns)}")
print("\n回归数据列统计:")
print(df[regression_cols].describe())
print("\n前5行回归数据:")
print(df[regression_cols].head())

print("\n空值检查:")
for col in regression_cols:
    null_count = df[col].isnull().sum()
    print(f" {col}: {null_count} 个空值")

# Save: try xlsxwriter first, fall back to openpyxl
print("\n7. 保存文件...")
print(f"正在保存到: {output_file}")

saved = False
try:
    df.to_excel(output_file, index=False, engine='xlsxwriter')
    saved = True
    print("文件保存成功!")
except Exception as e:
    print(f"xlsxwriter保存失败: {e}")
    try:
        print("尝试使用openpyxl引擎...")
        df.to_excel(output_file, index=False, engine='openpyxl')
        saved = True
        print("文件保存成功!")
    except Exception as e2:
        print(f"openpyxl保存也失败: {e2}")
        import traceback
        traceback.print_exc()

# Verify the written file. The `saved` flag keeps a stale file from an earlier run
# from being reported as the result of this run.
print("\n8. 验证文件...")
if saved and os.path.exists(output_file):
    print(f"文件已成功保存: {output_file}")
    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
    try:
        df_check = pd.read_excel(output_file)
        print(f"输出文件行数: {len(df_check)}")
        print(f"输出文件列数: {len(df_check.columns)}")
        print(f"\n回归数据列: {[col for col in df_check.columns if col in regression_cols]}")
    except Exception as e:
        print(f"验证文件时出错: {e}")
else:
    print("文件保存失败!")

print()
print("=" * 60)
print(" 任务完成")
print("=" * 60)
if saved and os.path.exists(output_file):
    print(f"新文件已保存: {output_file}")
    print("包含原始数据的所有列以及新增的Y, X1-X6回归数据列")
|
||||
@ -1,169 +0,0 @@ |
|||||
import os |
|
||||
import pandas as pd |
|
||||
import re |
|
||||
|
|
||||
# Builds a standalone regression table (Y, X1-X6) from the raw post workbook.

# File paths
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'

print("========================================")
print(" 计算UGC回归数据")
print("========================================")
print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()

# Stop early when the input workbook is missing.
if not os.path.exists(input_file):
    print("错误: 输入文件不存在!")
    # SystemExit works without the interactive-only `exit` helper from `site`.
    raise SystemExit(1)

print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")

try:
    # Read the raw data
    print("正在读取原始数据...")
    df = pd.read_excel(input_file)
    print(f"成功读取 {len(df)} 行数据")
    print(f"列名: {list(df.columns)}")

    # Comment columns: name holds '评论' and one of the digits 1-5.
    # str() keeps non-string column labels from raising TypeError.
    comment_columns = [
        col for col in df.columns
        if '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6))
    ]
    print(f"\n找到评论列: {comment_columns}")

    # Sharing df's index lets the scalar defaults below fill every row.
    regression_data = pd.DataFrame(index=df.index)

    # 1. Y (UGC helpfulness)
    print("\n1. 计算 Y (UGC有用性)")
    if 'helpfull' in df.columns:
        # to_numeric(coerce) tolerates stray text cells that astype(float) would reject.
        regression_data['Y'] = pd.to_numeric(df['helpfull'], errors='coerce').fillna(0).astype(float)
        print(f"成功提取 Y 列,共 {len(regression_data['Y'])} 个值")
    else:
        print("警告: 未找到 helpfull 列,使用默认值 0")
        regression_data['Y'] = 0

    # 2. X1 (number of comments)
    print("\n2. 计算 X1 (评论数量)")
    comment_count_columns = [col for col in df.columns if '评论总数' in str(col)]
    if comment_count_columns:
        regression_data['X1'] = pd.to_numeric(df[comment_count_columns[0]], errors='coerce').fillna(0).astype(float)
        print(f"成功提取 X1 列,使用列: {comment_count_columns[0]}")
    else:
        print("警告: 未找到评论总数列,使用默认值 0")
        regression_data['X1'] = 0

    # Non-empty comment texts of every row, extracted once and reused by X2/X3/X5/X6.
    texts_per_row = [
        [text for text in map(str, values) if text and text != 'nan']
        for values in df[comment_columns].to_numpy(dtype=object)
    ]

    def row_average(metric):
        """Return a float Series holding, per row, the mean of metric(text) over its comments."""
        means = []
        for texts in texts_per_row:
            scores = [metric(text) for text in texts]
            means.append(sum(scores) / len(scores) if scores else 0)
        return pd.Series(means, index=df.index, dtype='float64')

    def comment_length(text):
        """Character count after removing ASCII spaces."""
        return len(text.replace(' ', ''))

    def comment_complexity(text):
        """Number of whitespace-separated tokens."""
        return len(text.split())

    def comment_sentiment(text):
        """Keyword sentiment: 1 positive, -1 negative, 0 neutral."""
        positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive']
        negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative']

        lower_content = text.lower()
        has_negative = any(word in lower_content for word in negative_words)
        # Blank out negative terms first: '不好' contains the positive term '好' and
        # would otherwise always be scored as positive.
        masked = lower_content
        for word in negative_words:
            masked = masked.replace(word, ' ')
        if any(word in masked for word in positive_words):
            return 1
        return -1 if has_negative else 0

    def comment_richness(text):
        """One point each for a digit, an http(s) link and an emoji/emoticon."""
        score = 0
        if re.search(r'\d', text):
            score += 1
        if re.search(r'http[s]?://', text):
            score += 1
        if re.search(r'[\u2600-\u27BF]|[:;][-]?[)D]', text):
            score += 1
        return score

    # 3. X2 (comment length)
    print("\n3. 计算 X2 (评论长度)")
    regression_data['X2'] = row_average(comment_length)

    # 4. X3 (comment complexity)
    print("\n4. 计算 X3 (评论复杂度)")
    regression_data['X3'] = row_average(comment_complexity)

    # 5. X4 (readability) = X2 / X3, defined as 0 when X3 is 0
    print("\n5. 计算 X4 (评论可读性)")
    regression_data['X4'] = (regression_data['X2'] / regression_data['X3']).where(regression_data['X3'] > 0, 0.0)

    # 6. X5 (sentiment)
    print("\n6. 计算 X5 (内容情感性)")
    regression_data['X5'] = row_average(comment_sentiment)

    # 7. X6 (information richness)
    print("\n7. 计算 X6 (信息丰富度)")
    regression_data['X6'] = row_average(comment_richness)

    # Cleaning: make sure every value is numeric
    print("\n8. 数据清洗")
    for col in regression_data.columns:
        regression_data[col] = pd.to_numeric(regression_data[col], errors='coerce').fillna(0)

    # Report what was produced
    print("\n9. 数据验证")
    print(f"行数: {len(regression_data)}")
    print(f"列数: {len(regression_data.columns)}")
    print(f"列名: {list(regression_data.columns)}")
    print("数据类型:")
    print(regression_data.dtypes)
    print("\n前5行数据:")
    print(regression_data.head())

    # Save
    print("\n10. 保存文件")
    regression_data.to_excel(output_file, index=False)

    # Confirm that the file was created
    if os.path.exists(output_file):
        print(f"文件已成功保存到: {output_file}")
        print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
    else:
        print("错误: 文件保存失败")

    print()
    print("========================================")
    print(" 任务完成")
    print("========================================")

except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()
|
||||
@ -1,43 +0,0 @@ |
|||||
import os |
|
||||
import pandas as pd |
|
||||
|
|
||||
# Prints the shape, column names, first rows and dtypes of the raw post workbook.

# File path
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'

BANNER = "========================================"

print(BANNER)
print(" 检查数据结构")
print(BANNER)
print(f"输入文件: {input_file}")
print()

# Stop when the input workbook is missing
if not os.path.exists(input_file):
    print("错误: 输入文件不存在!")
    exit(1)

size_kb = os.path.getsize(input_file) / 1024
print(f"文件大小: {size_kb:.2f} KB")

# Read the raw data and describe it
try:
    print("正在读取原始数据...")
    df = pd.read_excel(input_file)
    row_count = len(df)
    column_count = len(df.columns)
    print(f"成功读取 {row_count} 行数据")
    print(f"列数: {column_count}")

    print("\n所有列名:")
    position = 0
    for column_name in df.columns:
        position += 1
        print(f"{position}. {column_name}")

    print("\n前3行数据:")
    print(df.head(3))

    print("\n数据类型:")
    print(df.dtypes)

    print("\n" + BANNER)
    print(" 数据结构检查完成")
    print(BANNER)

except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()
|
||||
@ -1,53 +0,0 @@ |
|||||
import os |
|
||||
import openpyxl |
|
||||
|
|
||||
# Reports size and dimensions of the input and output workbooks and previews the
# first rows of the output workbook.

# File paths
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'


def report_workbook(path, label, preview_rows=0, leading_blank=False):
    """Print size, row count and column count of the active sheet of ``path``.

    label         -- prefix used in the messages ('输入' or '输出')
    preview_rows  -- number of leading rows to print as lists (0 disables the preview)
    leading_blank -- print an empty line before the size line when the file exists

    Read errors are reported and swallowed; the workbook is always closed.
    """
    if not os.path.exists(path):
        print(f"{label}文件不存在!")
        return

    if leading_blank:
        print()
    print(f"{label}文件大小: {os.path.getsize(path) / 1024:.2f} KB")

    wb = None
    try:
        wb = openpyxl.load_workbook(path)
        ws = wb.active
        print(f"{label}文件行数: {ws.max_row}")
        print(f"{label}文件列数: {ws.max_column}")

        if preview_rows > 0:
            print(f"\n前{preview_rows}行数据:")
            for row in range(1, min(preview_rows + 1, ws.max_row + 1)):
                row_data = [
                    ws.cell(row=row, column=col).value
                    for col in range(1, ws.max_column + 1)
                ]
                print(f"行 {row}: {row_data}")
    except Exception as e:
        print(f"读取{label}文件出错: {e}")
    finally:
        # Release the underlying file handle even when reading failed.
        if wb is not None:
            wb.close()


print("========================================")
print(" 检查Excel文件大小")
print("========================================")
print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()

# Input workbook: size and dimensions only
report_workbook(input_file, "输入")

# Output workbook: size, dimensions and the first 10 rows
report_workbook(output_file, "输出", preview_rows=10, leading_blank=True)

print()
print("========================================")
print(" 检查完成")
print("========================================")
|
||||
@ -1,69 +0,0 @@ |
|||||
import os |
|
||||
import csv |
|
||||
|
|
||||
# Writes a 10-row sample regression table (Y, X1-X6) as CSV and echoes its first lines.

# File path
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.csv'

BANNER = "========================================"

print(BANNER)
print(" 创建并填充UGC回归数据")
print(BANNER)
print(f"输出文件: {output_file}")
print()

# Make sure the output directory exists
output_dir = os.path.dirname(output_file)
dir_exists = os.path.exists(output_dir)
print(f"输出目录: {output_dir}")
print(f"目录存在: {dir_exists}")

if not dir_exists:
    print("正在创建输出目录...")
    try:
        os.makedirs(output_dir)
    except Exception as e:
        print(f"创建目录失败: {e}")
        exit(1)
    else:
        print("目录创建成功")

# Create and fill the CSV file
try:
    print("\n创建并填充CSV文件...")

    header_row = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
    # Sample rows for n = 1..10; column meanings:
    # Y helpfulness, X1 comment count, X2 length, X3 complexity,
    # X4 readability, X5 sentiment, X6 information richness.
    sample_rows = [
        [n * 0.5, n * 2, n * 10, n * 2, 5.0, (n % 3) - 1, n * 0.3]
        for n in range(1, 11)
    ]

    with open(output_file, 'w', newline='', encoding='utf-8-sig') as handle:
        csv_writer = csv.writer(handle)
        csv_writer.writerow(header_row)
        for sample in sample_rows:
            csv_writer.writerow(sample)

    print(f"文件已成功创建: {output_file}")
    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")

    # Read the file back and show its first five lines
    print("\n文件内容:")
    with open(output_file, 'r', encoding='utf-8-sig') as handle:
        line_no = 0
        for record in csv.reader(handle):
            line_no += 1
            if line_no <= 5:
                print(f"行 {line_no}: {record}")

    print()
    print(BANNER)
    print(" 任务完成")
    print(BANNER)

except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()
|
||||
@ -1,86 +0,0 @@ |
|||||
import os |
|
||||
import openpyxl |
|
||||
|
|
||||
# Writes a 10-row sample regression table (Y, X1-X6) as an Excel workbook and
# echoes its first rows.

# File path
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'

BANNER = "========================================"

print(BANNER)
print(" 创建Excel文件并填充数据")
print(BANNER)
print(f"输出文件: {output_file}")
print()

# Make sure the output directory exists
output_dir = os.path.dirname(output_file)
dir_exists = os.path.exists(output_dir)
print(f"输出目录: {output_dir}")
print(f"目录存在: {dir_exists}")

if not dir_exists:
    print("正在创建输出目录...")
    try:
        os.makedirs(output_dir)
    except Exception as e:
        print(f"创建目录失败: {e}")
        exit(1)
    else:
        print("目录创建成功")

# Create the workbook
try:
    print("\n创建Excel文件...")
    wb = openpyxl.Workbook()
    ws = wb.active

    # Header row
    column_no = 0
    for title in ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']:
        column_no += 1
        ws.cell(row=1, column=column_no, value=title)

    # Sample data rows for n = 1..10, written to sheet rows 2..11
    print("填充示例数据...")
    for n in range(1, 11):
        sample = (
            n * 0.5,      # Y: UGC helpfulness
            n * 2,        # X1: comment count
            n * 10,       # X2: comment length
            n * 2,        # X3: comment complexity
            5.0,          # X4: readability
            (n % 3) - 1,  # X5: sentiment
            n * 0.3,      # X6: information richness
        )
        for offset, cell_value in enumerate(sample):
            ws.cell(row=n + 1, column=offset + 1, value=cell_value)

    # Save
    print("保存文件...")
    wb.save(output_file)

    print(f"文件已成功创建: {output_file}")
    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")

    # Verify by reopening the file
    print("\n验证文件...")
    if not os.path.exists(output_file):
        print("文件创建失败!")
    else:
        print("文件创建成功!")
        wb_check = openpyxl.load_workbook(output_file)
        ws_check = wb_check.active
        print(f"工作表名称: {ws_check.title}")
        print(f"行数: {ws_check.max_row}")
        print(f"列数: {ws_check.max_column}")

        # Show the first five rows
        print("\n前5行数据:")
        last_preview_row = min(6, ws_check.max_row + 1)
        for row_no in range(1, last_preview_row):
            row_values = [
                ws_check.cell(row=row_no, column=col_no).value
                for col_no in range(1, ws_check.max_column + 1)
            ]
            print(f"行 {row_no}: {row_values}")

    print()
    print(BANNER)
    print(" 任务完成")
    print(BANNER)

except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()
|
||||
@ -1,112 +0,0 @@ |
|||||
import os
import sys
import traceback

import pandas as pd
import numpy as np
import re

# Raw post export that is read, and the regression workbook that is (re)created.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'

print("========================================")
print(" 创建UGC回归数据文件")
print("========================================")
print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()

# Stop early when the raw workbook is missing. sys.exit() is used because the
# bare exit() helper is injected by the `site` module for interactive use only.
if not os.path.exists(input_file):
    print("错误: 输入文件不存在!")
    sys.exit(1)

print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")

try:
    print("正在读取原始数据...")
    df = pd.read_excel(input_file)
    print(f"成功读取 {len(df)} 行数据")
    print(f"列名: {list(df.columns)}")
    print()

    # Share the source index so that scalar defaults assigned below are
    # broadcast to one value per source row. An index-less DataFrame would
    # stay empty (0 rows) if only scalars were assigned to it.
    regression_data = pd.DataFrame(index=df.index)

    # 1. Dependent variable Y is taken from the 'helpfull' column.
    print("1. 提取因变量Y (helpfull列)")
    if 'helpfull' in df.columns:
        regression_data['Y'] = df['helpfull'].fillna(0)
        print(f"成功提取 Y 列,共 {len(regression_data['Y'])} 个值")
    else:
        print("警告: 未找到 helpfull 列,使用默认值 0")
        regression_data['Y'] = 0

    # 2. X1 is the first column whose label mentions both "评论" and "总数".
    print("\n2. 提取X1 (评论总数列)")
    # str() keeps the membership test from raising on non-string labels.
    comment_columns = [
        col for col in df.columns
        if '评论' in str(col) and '总数' in str(col)
    ]
    if comment_columns:
        regression_data['X1'] = df[comment_columns[0]].fillna(0)
        print(f"成功提取 X1 列,使用列: {comment_columns[0]}")
    else:
        print("警告: 未找到评论总数列,使用默认值 0")
        regression_data['X1'] = 0

    # 3. X2-X6 are written as zero placeholders; nothing is computed here.
    print("\n3. 计算X2-X6")
    placeholders = (
        ('X2', '评论长度'),
        ('X3', '评论复杂度'),
        ('X4', '评论可读性'),
        ('X5', '内容情感性'),
        ('X6', '信息丰富度'),
    )
    for name, label in placeholders:
        print(f" - 计算{name} ({label})")
        regression_data[name] = 0

    # 4. Force every column to numeric; unparsable values become 0.
    print("\n4. 数据清洗")
    for col in regression_data.columns:
        regression_data[col] = pd.to_numeric(regression_data[col], errors='coerce').fillna(0)

    # 5. Print a summary of the frame that is about to be written.
    print("\n5. 数据验证")
    print(f"行数: {len(regression_data)}")
    print(f"列数: {len(regression_data.columns)}")
    print(f"列名: {list(regression_data.columns)}")
    print(f"数据类型:")
    print(regression_data.dtypes)
    print(f"\n前5行数据:")
    print(regression_data.head())

    # 6. Write the workbook and confirm that it exists on disk.
    print("\n6. 保存文件")
    regression_data.to_excel(output_file, index=False)

    if os.path.exists(output_file):
        print(f"文件已成功保存到: {output_file}")
        print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
    else:
        print("错误: 文件保存失败")

    print()
    print("========================================")
    print(" 任务完成")
    print("========================================")

except Exception as e:
    # Report the failure with its traceback instead of crashing.
    print(f"处理文件时出错: {str(e)}")
    traceback.print_exc()
|
||||
@ -1,142 +0,0 @@ |
|||||
import os
import sys
import traceback

import pandas as pd
import numpy as np

# Raw post export that is read, and the regression workbook that is (re)created.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'

print("========================================")
print(" 创建UGC回归数据文件 v2")
print("========================================")
print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()

# Stop early when the raw workbook is missing. sys.exit() replaces the
# interactive-only exit() helper provided by the `site` module.
if not os.path.exists(input_file):
    print("错误: 输入文件不存在!")
    print(f"检查路径: {input_file}")
    sys.exit(1)

print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
print(f"文件存在: {os.path.exists(input_file)}")

# Make sure the directory that will receive the output workbook exists.
output_dir = os.path.dirname(output_file)
print(f"输出目录: {output_dir}")
print(f"目录存在: {os.path.exists(output_dir)}")

if not os.path.exists(output_dir):
    print("正在创建输出目录...")
    try:
        os.makedirs(output_dir)
        print("目录创建成功")
    except Exception as e:
        print(f"创建目录失败: {e}")
        sys.exit(1)

try:
    print("\n正在读取原始数据...")
    df = pd.read_excel(input_file)
    print(f"成功读取 {len(df)} 行数据")
    print(f"列名: {list(df.columns)}")

    # Show a few rows so the structure of the source sheet is visible.
    print("\n前3行数据:")
    print(df.head(3))

    # Share the source index so that scalar defaults assigned below are
    # broadcast to one value per source row. An index-less DataFrame would
    # stay empty (0 rows) if only scalars were assigned to it.
    regression_data = pd.DataFrame(index=df.index)

    # 1. Dependent variable Y is taken from the 'helpfull' column.
    print("\n1. 提取因变量Y (helpfull列)")
    if 'helpfull' in df.columns:
        regression_data['Y'] = df['helpfull'].fillna(0)
        print(f"成功提取 Y 列,共 {len(regression_data['Y'])} 个值")
        print(f"Y列前5个值: {list(regression_data['Y'].head())}")
    else:
        print("警告: 未找到 helpfull 列,使用默认值 0")
        regression_data['Y'] = 0

    # 2. X1 (comment count).
    print("\n2. 提取X1 (评论总数列)")
    # str() keeps the membership test from raising on non-string labels.
    comment_columns = [col for col in df.columns if '评论' in str(col)]
    print(f"找到评论相关列: {comment_columns}")

    if comment_columns:
        # Prefer a column that also mentions "总数" (total). Otherwise fall back
        # to the first comment-related column, as before.
        total_columns = [col for col in comment_columns if '总数' in str(col)]
        x1_source = (total_columns or comment_columns)[0]
        regression_data['X1'] = df[x1_source].fillna(0)
        print(f"成功提取 X1 列,使用列: {x1_source}")
        print(f"X1列前5个值: {list(regression_data['X1'].head())}")
    else:
        print("警告: 未找到评论列,使用默认值 0")
        regression_data['X1'] = 0

    # 3. X2-X6 are written as zero placeholders; nothing is computed here.
    print("\n3. 计算X2-X6")
    placeholders = (
        ('X2', '评论长度'),
        ('X3', '评论复杂度'),
        ('X4', '评论可读性'),
        ('X5', '内容情感性'),
        ('X6', '信息丰富度'),
    )
    for name, label in placeholders:
        print(f" - 计算{name} ({label})")
        regression_data[name] = 0

    # 4. Force every column to numeric; unparsable values become 0.
    print("\n4. 数据清洗")
    for col in regression_data.columns:
        regression_data[col] = pd.to_numeric(regression_data[col], errors='coerce').fillna(0)

    # 5. Print a summary of the frame that is about to be written.
    print("\n5. 数据验证")
    print(f"行数: {len(regression_data)}")
    print(f"列数: {len(regression_data.columns)}")
    print(f"列名: {list(regression_data.columns)}")
    print(f"数据类型:")
    print(regression_data.dtypes)
    print(f"\n前5行数据:")
    print(regression_data.head())

    # 6. Write the workbook.
    print("\n6. 保存文件")
    print(f"保存路径: {output_file}")

    saved = False
    try:
        regression_data.to_excel(output_file, index=False)
        saved = True
        print("文件保存成功")
    except Exception as e:
        print(f"保存文件失败: {e}")

    # Only verify the file after a successful write, so a stale workbook left
    # by an earlier run is not reported as freshly saved.
    if saved and os.path.exists(output_file):
        print(f"文件已成功保存到: {output_file}")
        print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
    elif saved:
        print("错误: 文件保存失败,未找到输出文件")

    print()
    print("========================================")
    print(" 任务完成")
    print("========================================")

except Exception as e:
    # Report the failure with its traceback instead of crashing.
    print(f"处理文件时出错: {str(e)}")
    traceback.print_exc()
|
||||
@ -1,73 +0,0 @@ |
|||||
import os
import sys
import traceback

import pandas as pd

# Raw Excel export that is read, and the cleaned CSV that is written.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).csv'

print("========================================")
print(" Python 数据清洗脚本")
print("========================================")
print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()

# Stop early when the input is missing. sys.exit() replaces the
# interactive-only exit() helper provided by the `site` module.
if not os.path.exists(input_file):
    print("错误: 输入文件不存在!")
    sys.exit(1)

print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")


def normalize_sentiment(sentiment):
    """Map a free-form sentiment label onto '积极', '消极' or '中性'.

    Missing or empty values are neutral. Otherwise the lower-cased text is
    searched for positive keywords first, then negative ones; anything else
    is neutral.
    """
    if pd.isna(sentiment) or sentiment == '':
        return '中性'
    sentiment = str(sentiment).lower()
    if any(keyword in sentiment for keyword in ['积极', '正面', 'positive']):
        return '积极'
    elif any(keyword in sentiment for keyword in ['消极', '负面', 'negative']):
        return '消极'
    else:
        return '中性'


try:
    print("正在读取Excel文件...")
    df = pd.read_excel(input_file)
    print(f"成功读取 {len(df)} 行数据")

    print("正在清洗数据...")

    # 1. Replace missing values with empty strings.
    df = df.fillna('')

    # 2. Trim object columns and collapse runs of whitespace to one space.
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = df[col].astype(str).str.strip()
            df[col] = df[col].str.replace('\\s+', ' ', regex=True)

    # 3. Normalise the sentiment column when the sheet has one.
    if '情感倾向' in df.columns:
        df['情感倾向'] = df['情感倾向'].apply(normalize_sentiment)

    # 4. Ensure the output directory exists. exist_ok avoids the
    #    check-then-create race, and the guard skips makedirs('') for a bare
    #    file name.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # utf-8-sig writes a BOM at the start of the CSV.
    print("正在保存清洗后的数据...")
    df.to_csv(output_file, index=False, encoding='utf-8-sig')

    print(f"数据已成功保存到: {output_file}")
    print(f"保存了 {len(df)} 行清洗后的数据")

    print()
    print("========================================")
    print(" 数据清洗任务完成")
    print("========================================")

except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    # Print the traceback too, as the sibling scripts do, so the failing step
    # can be located.
    traceback.print_exc()
|
||||
@ -1,98 +0,0 @@ |
|||||
import os
import sys
import traceback

import pandas as pd

# Raw Excel export that is read, and the cleaned CSV that is written.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).csv'

print("========================================")
print(" Python 数据清洗脚本 v2")
print("========================================")
print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()

# Stop early when the input is missing. sys.exit() replaces the
# interactive-only exit() helper provided by the `site` module.
if not os.path.exists(input_file):
    print("错误: 输入文件不存在!")
    print(f"检查路径: {input_file}")
    sys.exit(1)

print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
print(f"文件存在: {os.path.exists(input_file)}")


def normalize_sentiment(sentiment):
    """Map a free-form sentiment label onto '积极', '消极' or '中性'.

    Missing or empty values are neutral. Otherwise the lower-cased text is
    searched for positive keywords first, then negative ones; anything else
    is neutral.
    """
    if pd.isna(sentiment) or sentiment == '':
        return '中性'
    sentiment = str(sentiment).lower()
    if any(keyword in sentiment for keyword in ['积极', '正面', 'positive']):
        return '积极'
    elif any(keyword in sentiment for keyword in ['消极', '负面', 'negative']):
        return '消极'
    else:
        return '中性'


try:
    print("正在读取Excel文件...")
    # Read a 10-row sample first so the column names are printed before the
    # full read starts.
    df = pd.read_excel(input_file, nrows=10)
    print(f"成功读取 {len(df)} 行示例数据")
    print(f"列名: {list(df.columns)}")

    print("正在读取全部数据...")
    df = pd.read_excel(input_file)
    print(f"成功读取 {len(df)} 行完整数据")

    print("正在清洗数据...")

    # 1. Report missing values per column, then replace them with ''.
    print(f"清洗前 - 缺失值统计:")
    print(df.isnull().sum())
    df = df.fillna('')

    # 2. Trim object columns and collapse runs of whitespace to one space.
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = df[col].astype(str).str.strip()
            df[col] = df[col].str.replace('\\s+', ' ', regex=True)

    # 3. Normalise the sentiment column when the sheet has one.
    if '情感倾向' in df.columns:
        df['情感倾向'] = df['情感倾向'].apply(normalize_sentiment)
        print("情感倾向规范化完成")

    # 4. Ensure the output directory exists.
    output_dir = os.path.dirname(output_file)
    print(f"输出目录: {output_dir}")
    print(f"目录存在: {os.path.exists(output_dir)}")

    if output_dir and not os.path.exists(output_dir):
        print("正在创建输出目录...")
        # exist_ok avoids failing if the directory appears between the check
        # above and this call.
        os.makedirs(output_dir, exist_ok=True)

    # utf-8-sig writes a BOM at the start of the CSV.
    print("正在保存清洗后的数据...")
    print(f"保存路径: {output_file}")

    df.to_csv(output_file, index=False, encoding='utf-8-sig')

    # Confirm that the file is on disk.
    if os.path.exists(output_file):
        print(f"数据已成功保存到: {output_file}")
        print(f"保存文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
        print(f"保存了 {len(df)} 行清洗后的数据")
    else:
        print("错误: 文件保存失败,未找到输出文件")

    print()
    print("========================================")
    print(" 数据清洗任务完成")
    print("========================================")

except Exception as e:
    # Report the failure with its traceback instead of crashing.
    print(f"处理文件时出错: {str(e)}")
    traceback.print_exc()
|
||||
@ -1,11 +0,0 @@ |
|||||
开始调试... |
|
||||
当前目录: D:\java\project |
|
||||
pandas导入成功 |
|
||||
输入文件: D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx |
|
||||
文件存在: True |
|
||||
文件大小: 21607.43 KB |
|
||||
开始读取... |
|
||||
读取成功: 30308 行 |
|
||||
列数: 68 |
|
||||
前5列: ['作者', '作者链接', '标题', '内容', 'tag'] |
|
||||
调试结束 |
|
||||
@ -1,36 +0,0 @@ |
|||||
import os
import sys
import traceback

# Send everything printed below into a log file instead of the console.
log_file = open(r'D:\java\project\debug_log.txt', 'w', encoding='utf-8')
original_stdout = sys.stdout
sys.stdout = log_file

try:
    print("开始调试...")
    print(f"当前目录: {os.getcwd()}")

    try:
        # Imported here so that an import failure is logged too.
        import pandas as pd
        print("pandas导入成功")

        input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
        print(f"输入文件: {input_file}")
        print(f"文件存在: {os.path.exists(input_file)}")

        if os.path.exists(input_file):
            print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
            print("开始读取...")
            df = pd.read_excel(input_file, engine='openpyxl')
            print(f"读取成功: {len(df)} 行")
            print(f"列数: {len(df.columns)}")
            print(f"前5列: {list(df.columns)[:5]}")

    except Exception as e:
        print(f"错误: {e}")
        # stderr still gets the traceback, as before ...
        traceback.print_exc()
        # ... and so does the log, which previously held only the one-line
        # message.
        traceback.print_exc(file=log_file)

    print("调试结束")
finally:
    # Always restore stdout and close the log, even on KeyboardInterrupt or
    # SystemExit, so the redirect cannot leak and the log is flushed.
    sys.stdout = original_stdout
    log_file.close()

print("日志已保存")
|
||||
@ -1,51 +0,0 @@ |
|||||
import os
import sys
import traceback

print("========================================")
print(" 调试脚本")
print("========================================")
print(f"Python版本: {sys.version}")
print(f"当前目录: {os.getcwd()}")
print()

# pandas is required for the read test below.
print("检查pandas...")
try:
    import pandas as pd
    print(f"pandas版本: {pd.__version__}")
except ImportError as e:
    print(f"pandas未安装: {e}")
    # sys.exit() instead of the interactive-only exit() helper from `site`.
    sys.exit(1)

# openpyxl is checked separately so a missing install is reported by name.
print("\n检查openpyxl...")
try:
    import openpyxl
    print(f"openpyxl版本: {openpyxl.__version__}")
except ImportError as e:
    print(f"openpyxl未安装: {e}")
    sys.exit(1)

# Report whether the workbook exists and how large it is.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
print(f"\n检查输入文件:")
print(f"路径: {input_file}")
print(f"存在: {os.path.exists(input_file)}")
if os.path.exists(input_file):
    print(f"大小: {os.path.getsize(input_file) / 1024:.2f} KB")

# Try a read limited to the first 5 rows. This runs even when the file is
# missing; the failure is then reported through the except branch.
print("\n尝试读取文件...")
try:
    df = pd.read_excel(input_file, nrows=5)
    print(f"成功读取 {len(df)} 行")
    print(f"列名: {list(df.columns)}")
except Exception as e:
    print(f"读取失败: {e}")
    traceback.print_exc()

print()
print("========================================")
print(" 调试完成")
print("========================================")
|
||||
@ -1,50 +0,0 @@ |
|||||
import os
import sys
import traceback

import pandas as pd

# NOTE(review): input and output are the same workbook, so this script
# round-trips the file through pandas. pd.read_excel reads the first sheet by
# default, so any other sheets or cell formatting are presumably lost on the
# rewrite. Confirm that is intended.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'

print("========================================")
print(" 数据导入操作")
print("========================================")
print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()

# Stop early when the workbook is missing. sys.exit() replaces the
# interactive-only exit() helper provided by the `site` module.
if not os.path.exists(input_file):
    print("错误: 输入文件不存在!")
    sys.exit(1)

print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")

try:
    # Load the sheet and print its shape, dtypes and first rows.
    print("正在读取数据...")
    df = pd.read_excel(input_file)
    print(f"成功读取 {len(df)} 行数据")
    print(f"列名: {list(df.columns)}")
    print(f"数据类型:")
    print(df.dtypes)

    print("\n前5行数据:")
    print(df.head())

    # Write the frame back without the pandas index column.
    print("\n写入数据到目标文件...")
    df.to_excel(output_file, index=False)

    print(f"数据已成功导入到: {output_file}")
    print(f"总行数: {len(df)}")
    print(f"总列数: {len(df.columns)}")

    print()
    print("========================================")
    print(" 数据导入完成")
    print("========================================")

except Exception as e:
    # Report the failure with its traceback instead of crashing.
    print(f"处理文件时出错: {str(e)}")
    traceback.print_exc()
|
||||
@ -1,17 +0,0 @@ |
|||||
import os

# Smoke test: report the working directory, check the workbook, read 10 rows.
print("测试开始")
print(f"当前目录: {os.getcwd()}")

input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
print(f"文件存在: {os.path.exists(input_file)}")

if os.path.exists(input_file):
    size_kb = os.path.getsize(input_file) / 1024
    print(f"文件大小: {size_kb:.2f} KB")
    print("尝试读取...")
    try:
        # pandas is imported lazily so a missing install is reported by the
        # except branch rather than aborting the script.
        import pandas as pd

        df = pd.read_excel(input_file, nrows=10)
        row_count = len(df)
        print(f"成功读取 {row_count} 行")
        print("测试完成")
    except Exception as e:
        print(f"错误: {e}")
|
||||
@ -1,113 +0,0 @@ |
|||||
import os
import sys
import traceback

import pandas as pd
import openpyxl

# Raw post export that is read, and the existing regression workbook to fill.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'

print("========================================")
print(" 填充UGC回归数据")
print("========================================")
print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()

# Both workbooks must already exist. sys.exit() replaces the interactive-only
# exit() helper provided by the `site` module.
if not os.path.exists(input_file):
    print("错误: 输入文件不存在!")
    sys.exit(1)

if not os.path.exists(output_file):
    print("错误: 输出文件不存在!")
    sys.exit(1)


def _cell_number(value):
    """Return value as a float, or 0 when it is missing or not numeric."""
    if pd.isna(value):
        return 0
    try:
        return float(value)
    except (TypeError, ValueError):
        # One stray text cell must not abort the whole fill.
        return 0


def _fill_column(ws, column, values):
    """Write values into the given column, starting at row 2 (row 1 is the header)."""
    for row, value in enumerate(values, 2):
        ws.cell(row=row, column=column, value=value)


try:
    print("正在读取原始数据...")
    df = pd.read_excel(input_file)
    print(f"成功读取 {len(df)} 行数据")
    print(f"列名: {list(df.columns)}")

    # Open the workbook that receives the values; the active sheet is filled.
    print("\n打开输出文件...")
    wb = openpyxl.load_workbook(output_file)
    ws = wb.active

    print("\n填充数据...")
    zeros = [0] * len(df)

    # Column 1: Y, from 'helpfull'.
    print("1. 填充Y列 (helpfull)")
    if 'helpfull' in df.columns:
        _fill_column(ws, 1, (_cell_number(v) for v in df['helpfull']))
        print(f"成功填充 Y 列,共 {len(df)} 行")
    else:
        print("警告: 未找到 helpfull 列,使用默认值 0")
        _fill_column(ws, 1, zeros)

    # Column 2: X1, the comment count.
    print("\n2. 填充X1列 (评论总数)")
    # str() keeps the membership test from raising on non-string labels.
    comment_columns = [col for col in df.columns if '评论' in str(col)]
    if comment_columns:
        # Prefer a column that also mentions "总数" (total). Otherwise fall back
        # to the first comment-related column, as before.
        total_columns = [col for col in comment_columns if '总数' in str(col)]
        x1_source = (total_columns or comment_columns)[0]
        _fill_column(ws, 2, (_cell_number(v) for v in df[x1_source]))
        print(f"成功填充 X1 列,使用列: {x1_source}")
    else:
        print("警告: 未找到评论列,使用默认值 0")
        _fill_column(ws, 2, zeros)

    # Columns 3-7: X2-X6 are written as zero placeholders; nothing is computed.
    print("\n3. 计算X2-X6")
    placeholders = (
        (3, 'X2', '评论长度'),
        (4, 'X3', '评论复杂度'),
        (5, 'X4', '评论可读性'),
        (6, 'X5', '内容情感性'),
        (7, 'X6', '信息丰富度'),
    )
    for column, name, label in placeholders:
        print(f" - 填充{name} ({label})")
        _fill_column(ws, column, zeros)

    print("\n4. 保存文件")
    wb.save(output_file)

    print(f"文件已成功保存: {output_file}")
    print(f"总行数: {len(df) + 1} (包括表头)")
    print(f"总列数: 7")

    print()
    print("========================================")
    print(" 任务完成")
    print("========================================")

except Exception as e:
    # Report the failure with its traceback instead of crashing.
    print(f"处理文件时出错: {str(e)}")
    traceback.print_exc()
|
||||
@ -1,156 +0,0 @@ |
|||||
import os
import pandas as pd
import re

RULE = "=" * 60
print(RULE)
print(" 处理前300行数据作为测试")
print(RULE)

# Source workbook and the 300-row regression sample written from it.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归_300.xlsx'

print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()

# Only the first 300 data rows are loaded.
print("读取前300行数据...")
df = pd.read_excel(input_file, engine='openpyxl', nrows=300)
print(f"成功读取 {len(df)} 行数据")
print(f"原始列数: {len(df.columns)}")

# Classify the columns by label: helpfulness (Y), comment total (X1), and
# the numbered comment-content columns.
print("\n识别列...")
helpfull_col = None
comment_count_col = None
comment_cols = []

for col in df.columns:
    label = str(col)
    # 'helpfull' contains 'helpful', so one substring test covers both spellings.
    if 'helpful' in label.lower():
        helpfull_col = col
        print(f"找到 Y 列 (helpfull): {col}")
        continue
    # '帖子评论总数' contains '评论总数', so one substring test covers both.
    if '评论总数' in label:
        comment_count_col = col
        print(f"找到 X1 列 (评论总数): {col}")
        continue
    has_digit_1_to_5 = any(digit in label for digit in ('1', '2', '3', '4', '5'))
    if '评论' in label and has_digit_1_to_5 and '内容' in label:
        comment_cols.append(col)
        print(f"找到评论列 {len(comment_cols)}: {col}")

print(f"\n共找到 {len(comment_cols)} 个评论内容列")

print("\n添加回归数据列...")

# Y: numeric helpfulness; unparsable or missing values become 0.
print("1. 添加 Y (UGC有用性)")
df['Y'] = (
    pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0)
    if helpfull_col
    else 0
)

# X1: numeric comment count; unparsable or missing values become 0.
print("2. 添加 X1 (评论数量)")
df['X1'] = (
    pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0)
    if comment_count_col
    else 0
)
|
||||
|
|
||||
# 定义函数计算评论指标 |
|
||||
def calculate_comment_metrics(content):
    """Score one comment and return ``(length, complexity, sentiment, richness)``.

    * length: character count after removing ASCII and ideographic spaces.
    * complexity: number of whitespace-separated tokens.
    * sentiment: 1 if a positive cue word is present, -1 if only negative
      cue words are present, 0 otherwise.
    * richness: 0-3, one point each for a digit, a URL, and an
      emoji/emoticon.

    Missing values (None/NaN) and the strings 'None', 'nan' and '' yield
    ``(0, 0, 0, 0)``.
    """
    if pd.isna(content) or str(content) in ['None', 'nan', '']:
        return 0, 0, 0, 0

    content = str(content)
    length = len(content.replace(' ', '').replace('\u3000', ''))
    complexity = len(content.split())

    positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like']
    negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike']

    lower_content = content.lower()

    # Several negative cues contain a positive one ('不好' contains '好',
    # 'dislike' contains 'like', '不满意' contains '满意'). Blank the negative
    # cues out before searching for positive ones, so such text is no longer
    # scored as positive. Replacing with a space keeps the neighbouring
    # characters from joining into a new cue.
    masked_content = lower_content
    for word in negative_words:
        masked_content = masked_content.replace(word, ' ')

    if any(word in masked_content for word in positive_words):
        sentiment = 1
    elif any(word in lower_content for word in negative_words):
        sentiment = -1
    else:
        sentiment = 0

    # One point per kind of extra information found in the text.
    richness = 0
    if re.search(r'\d', content):
        richness += 1
    if re.search(r'http[s]?://|www\.', content):
        richness += 1
    if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content):
        richness += 1

    return length, complexity, sentiment, richness
|
||||
|
|
||||
# Per-post averages of the comment metrics.
print("3. 计算评论相关指标...")

# Order matches the tuple returned by calculate_comment_metrics:
# (length, complexity, sentiment, richness) -> X2, X3, X5, X6.
metric_columns = ('X2', 'X3', 'X5', 'X6')
for metric_name in metric_columns:
    df[metric_name] = 0.0

for i in range(len(df)):
    record = df.iloc[i]
    scored = [calculate_comment_metrics(record.get(col, '')) for col in comment_cols]
    # Comments with zero length are left out of the averages.
    scored = [metrics for metrics in scored if metrics[0] > 0]
    if not scored:
        continue  # no usable comment: the row keeps its 0.0 defaults
    used = len(scored)
    for metric_name, values in zip(metric_columns, zip(*scored)):
        df.loc[i, metric_name] = sum(values) / used

# X4 (readability) = mean length / mean token count, or 0 when there are no tokens.
print("4. 计算 X4 (评论可读性)")
df['X4'] = df.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1)

# Coerce every regression column to numeric and zero out NaN / infinities.
print("\n5. 数据清洗...")
regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
infinities = [float('inf'), float('-inf')]
for col in regression_cols:
    numeric = pd.to_numeric(df[col], errors='coerce').fillna(0)
    df[col] = numeric
    df[col] = df[col].replace(infinities, 0)

# Print a summary of the regression columns.
print("\n6. 验证数据...")
print(f"总行数: {len(df)}")
print(f"总列数: {len(df.columns)}")
print(f"\n回归数据列统计:")
print(df[regression_cols].describe())
print(f"\n前5行回归数据:")
print(df[regression_cols].head())

# Write the original columns plus the regression columns.
print("\n7. 保存文件...")
df.to_excel(output_file, index=False, engine='openpyxl')

# Re-read the written workbook to confirm its shape.
print("\n8. 验证文件...")
if not os.path.exists(output_file):
    print("文件保存失败!")
else:
    print(f"文件已成功保存: {output_file}")
    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
    df_check = pd.read_excel(output_file)
    print(f"输出文件行数: {len(df_check)}")
    print(f"输出文件列数: {len(df_check.columns)}")

print()
print("=" * 60)
print(" 任务完成")
print("=" * 60)
|
||||
@ -1,200 +0,0 @@ |
|||||
import os
import sys
import traceback

import openpyxl
import re

# Workbook with the raw posts, and the regression workbook to fill.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'

# Cue words for the keyword-based sentiment score (X5).
POSITIVE_WORDS = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent']
NEGATIVE_WORDS = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor']

# Patterns for the richness score (X6), compiled once rather than per comment.
DIGIT_RE = re.compile(r'\d')
URL_RE = re.compile(r'http[s]?://')
EMOJI_RE = re.compile(r'[\u2600-\u27BF]|[:;][-]?[)D]')


def _to_float(value):
    """Return a cell value as a float; empty, zero or non-numeric cells give 0."""
    if not value:
        return 0
    try:
        return float(value)
    except (TypeError, ValueError):
        # One stray text or date cell must not abort the whole run.
        return 0


def _score_comment(raw):
    """Return (length, complexity, sentiment, richness) for one comment cell.

    Returns None when the cell is empty (its text is '', 'None' or 'nan').
    """
    content = str(raw)
    if not content or content in ('None', 'nan'):
        return None

    # X2: character count without ASCII spaces.
    length = len(content.replace(' ', ''))
    # X3: number of whitespace-separated tokens.
    complexity = len(content.split())

    # X5: positive = 1, negative = -1, neutral = 0. Negative cues are blanked
    # out before the positive search because some contain a positive cue
    # ('不好' contains '好', '不满意' contains '满意').
    lower_content = content.lower()
    masked_content = lower_content
    for word in NEGATIVE_WORDS:
        masked_content = masked_content.replace(word, ' ')
    if any(word in masked_content for word in POSITIVE_WORDS):
        sentiment = 1
    elif any(word in lower_content for word in NEGATIVE_WORDS):
        sentiment = -1
    else:
        sentiment = 0

    # X6: one point each for a digit, a link, an emoji/emoticon (max 3).
    richness = sum(
        1 for pattern in (DIGIT_RE, URL_RE, EMOJI_RE) if pattern.search(content)
    )
    return length, complexity, sentiment, richness


def _mean(values):
    """Arithmetic mean of a non-empty sequence."""
    return sum(values) / len(values)


print("========================================")
print(" 根据实际原始数据计算回归数据")
print("========================================")
print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()

# Stop early when the input is missing. sys.exit() replaces the
# interactive-only exit() helper provided by the `site` module.
if not os.path.exists(input_file):
    print("错误: 输入文件不存在!")
    sys.exit(1)

print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")

try:
    print("正在读取原始数据...")
    wb_input = openpyxl.load_workbook(input_file)
    ws_input = wb_input.active

    print(f"工作表名称: {ws_input.title}")
    print(f"最大行数: {ws_input.max_row}")
    print(f"最大列数: {ws_input.max_column}")

    # Classify the header row: helpfulness (Y), comment total (X1), and the
    # numbered comment columns. Column positions are 1-based.
    print("\n识别列...")
    headers = []
    helpfull_col = None
    comment_count_col = None
    comment_cols = []

    for col in range(1, ws_input.max_column + 1):
        header = ws_input.cell(row=1, column=col).value
        headers.append(header)

        if header:
            header_str = str(header).lower()
            if 'helpfull' in header_str or 'helpful' in header_str:
                helpfull_col = col
                print(f"找到 Y 列 (helpfull): 列 {col}")
            elif '评论总数' in str(header) or '帖子评论总数' in str(header):
                comment_count_col = col
                print(f"找到 X1 列 (评论总数): 列 {col}")
            elif '评论' in str(header) and any(str(i) in str(header) for i in range(1, 6)):
                comment_cols.append(col)
                print(f"找到评论列 {len(comment_cols)}: 列 {col} - {header}")

    print(f"\n共找到 {len(comment_cols)} 个评论列")

    # Reuse the output workbook when it exists, otherwise start a new one.
    if os.path.exists(output_file):
        print("\n打开现有输出文件...")
        wb_output = openpyxl.load_workbook(output_file)
    else:
        print("\n创建新的输出文件...")
        wb_output = openpyxl.Workbook()
    ws_output = wb_output.active

    # Write the header row in both cases so a reused workbook cannot keep
    # labels that do not match the columns written below.
    headers_output = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
    for i, header in enumerate(headers_output, 1):
        ws_output.cell(row=1, column=i, value=header)

    print("\n计算并填充数据...")
    total_rows = ws_input.max_row - 1
    print(f"总数据行数: {total_rows}")

    # Informational only: rows are created as the cells are written below.
    if ws_output.max_row < ws_input.max_row:
        print(f"扩展输出文件行数到 {ws_input.max_row}...")

    for row in range(2, ws_input.max_row + 1):
        if row % 100 == 0:
            print(f"处理到第 {row-1} 行...")
        if row % 1000 == 0:
            print(f"已处理 {row-1} 行,共 {total_rows} 行")

        # Y (helpfulness) and X1 (comment count), straight from the sheet.
        y_value = _to_float(ws_input.cell(row=row, column=helpfull_col).value) if helpfull_col else 0
        x1_value = _to_float(ws_input.cell(row=row, column=comment_count_col).value) if comment_count_col else 0

        # Score every non-empty comment cell of this post.
        scores = []
        for col in comment_cols:
            score = _score_comment(ws_input.cell(row=row, column=col).value)
            if score is not None:
                scores.append(score)

        if scores:
            # Transpose into per-metric sequences and average each of them.
            x2_value, x3_value, x5_value, x6_value = (
                _mean(component) for component in zip(*scores)
            )
        else:
            x2_value = x3_value = x5_value = x6_value = 0

        # X4 (readability) = X2 / X3, or 0 when X3 is 0.
        x4_value = x2_value / x3_value if x3_value > 0 else 0

        row_values = (y_value, x1_value, x2_value, x3_value, x4_value, x5_value, x6_value)
        for column, value in enumerate(row_values, 1):
            ws_output.cell(row=row, column=column, value=value)

    # A reused workbook may hold more rows than this input produces. Drop the
    # leftovers so the output matches the rows that were just written.
    stale_rows = ws_output.max_row - ws_input.max_row
    if stale_rows > 0:
        ws_output.delete_rows(ws_input.max_row + 1, stale_rows)

    print("\n保存文件...")
    wb_output.save(output_file)

    print(f"文件已成功保存: {output_file}")
    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
    print(f"处理完成,共 {total_rows} 行数据")

    # Re-open the saved workbook and print its shape and first rows.
    print("\n验证文件...")
    if os.path.exists(output_file):
        print("文件保存成功!")
        wb_check = openpyxl.load_workbook(output_file)
        ws_check = wb_check.active
        print(f"输出文件行数: {ws_check.max_row - 1}")
        print(f"输出文件列数: {ws_check.max_column}")

        print("\n前5行数据:")
        for row in range(1, min(6, ws_check.max_row + 1)):
            row_data = [
                ws_check.cell(row=row, column=col).value
                for col in range(1, ws_check.max_column + 1)
            ]
            print(f"行 {row}: {row_data}")
    else:
        print("文件保存失败!")

    print()
    print("========================================")
    print(" 任务完成")
    print("========================================")

except Exception as e:
    # Report the failure with its traceback instead of crashing.
    print(f"处理文件时出错: {str(e)}")
    traceback.print_exc()
|
||||
@ -1,190 +0,0 @@ |
|||||
import os |
|
||||
import openpyxl |
|
||||
import re |
|
||||
|
|
||||
# Input/output workbook locations (hard-coded local Windows paths).
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'

# Keyword lists for the naive sentiment score; built once instead of per cell.
POSITIVE_WORDS = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent']
NEGATIVE_WORDS = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor']


def _to_number(value):
    """Convert a cell value to float; empty or non-numeric cells become 0."""
    if not value:
        return 0
    try:
        return float(value)
    except (TypeError, ValueError):
        # A stray text cell must not abort the whole export.
        return 0


def _mean(values):
    """Arithmetic mean of a list, or 0 when the list is empty."""
    return sum(values) / len(values) if values else 0


def _sentiment(text):
    """Return 1 (positive), -1 (negative) or 0 (neutral) from keyword matches.

    Negative phrases are removed before the positive lookup so that a
    negative phrase containing a positive keyword (e.g. '不好' contains '好')
    is no longer scored as positive. A text with an independent positive
    keyword still scores 1, as before.
    """
    lowered = text.lower()
    has_negative = any(word in lowered for word in NEGATIVE_WORDS)
    remainder = lowered
    for word in NEGATIVE_WORDS:
        remainder = remainder.replace(word, ' ')
    if any(word in remainder for word in POSITIVE_WORDS):
        return 1
    if has_negative:
        return -1
    return 0


def _richness(text):
    """Score 0-3: one point each for a digit, a link, and an emoticon/symbol."""
    score = 0
    if re.search(r'\d', text):
        score += 1
    if re.search(r'http[s]?://', text):
        score += 1
    if re.search(r'[\u2600-\u27BF]|[:;][-]?[)D]', text):
        score += 1
    return score


print("========================================")
print(" 处理所有数据")
print("========================================")
print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()

# Stop early when the input workbook is missing.
if not os.path.exists(input_file):
    print("错误: 输入文件不存在!")
    raise SystemExit(1)

print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")

try:
    # Load the source workbook and use its active sheet.
    print("正在读取原始数据...")
    wb_input = openpyxl.load_workbook(input_file)
    ws_input = wb_input.active

    print(f"工作表名称: {ws_input.title}")
    print(f"最大行数: {ws_input.max_row}")
    print(f"最大列数: {ws_input.max_column}")

    # Locate the Y column, the comment-count column and the comment columns
    # by inspecting the header row.
    print("\n识别列...")
    helpfull_col = None
    comment_count_col = None
    comment_cols = []

    for col in range(1, ws_input.max_column + 1):
        header = ws_input.cell(row=1, column=col).value
        if not header:
            continue
        header_text = str(header)
        header_str = header_text.lower()
        if 'helpfull' in header_str or 'helpful' in header_str:
            helpfull_col = col
            print(f"找到 Y 列 (helpfull): 列 {col}")
        elif '评论总数' in header_text or '帖子评论总数' in header_text:
            comment_count_col = col
            print(f"找到 X1 列 (评论总数): 列 {col}")
        elif '评论' in header_text and any(str(i) in header_text for i in range(1, 6)):
            comment_cols.append(col)
            print(f"找到评论列 {len(comment_cols)}: 列 {col} - {header}")

    print(f"\n共找到 {len(comment_cols)} 个评论列")

    # Fresh workbook that receives only the regression variables.
    print("\n创建新的输出文件...")
    wb_output = openpyxl.Workbook()
    ws_output = wb_output.active

    headers_output = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
    for i, header in enumerate(headers_output, 1):
        ws_output.cell(row=1, column=i, value=header)

    print("\n计算并填充数据...")
    total_rows = ws_input.max_row - 1
    print(f"总数据行数: {total_rows}")

    for row in range(2, ws_input.max_row + 1):
        if row % 1000 == 0:
            print(f"处理到第 {row-1} 行...")

        # Y (usefulness) and X1 (comment count); 0 when the column is absent.
        y_value = _to_number(ws_input.cell(row=row, column=helpfull_col).value) if helpfull_col else 0
        ws_output.cell(row=row, column=1, value=y_value)

        x1_value = _to_number(ws_input.cell(row=row, column=comment_count_col).value) if comment_count_col else 0
        ws_output.cell(row=row, column=2, value=x1_value)

        # Per-comment metrics for every non-empty comment cell in this row.
        comment_lengths = []
        comment_complexities = []
        comment_sentiments = []
        comment_richness = []

        for col in comment_cols:
            content = str(ws_input.cell(row=row, column=col).value)
            if content and content != 'None' and content != 'nan':
                # Length: characters left after removing ASCII spaces.
                comment_lengths.append(len(content.replace(' ', '')))
                # Complexity: number of whitespace-separated tokens.
                comment_complexities.append(len(content.split()))
                comment_sentiments.append(_sentiment(content))
                comment_richness.append(_richness(content))

        # X2/X3/X5/X6 are row-level means; X4 is X2/X3 (0 when X3 is 0).
        x2_value = _mean(comment_lengths)
        ws_output.cell(row=row, column=3, value=x2_value)

        x3_value = _mean(comment_complexities)
        ws_output.cell(row=row, column=4, value=x3_value)

        x4_value = x2_value / x3_value if x3_value > 0 else 0
        ws_output.cell(row=row, column=5, value=x4_value)

        x5_value = _mean(comment_sentiments)
        ws_output.cell(row=row, column=6, value=x5_value)

        x6_value = _mean(comment_richness)
        ws_output.cell(row=row, column=7, value=x6_value)

    print("\n保存文件...")
    wb_output.save(output_file)

    print(f"文件已成功保存: {output_file}")
    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
    print(f"处理完成,共 {total_rows} 行数据")

    # Re-open the written workbook and echo its shape and first rows.
    print("\n验证文件...")
    if os.path.exists(output_file):
        print("文件保存成功!")
        wb_check = openpyxl.load_workbook(output_file)
        ws_check = wb_check.active
        print(f"输出文件行数: {ws_check.max_row - 1}")
        print(f"输出文件列数: {ws_check.max_column}")

        print("\n前5行数据:")
        for row in range(1, min(6, ws_check.max_row + 1)):
            row_data = []
            for col in range(1, ws_check.max_column + 1):
                value = ws_check.cell(row=row, column=col).value
                row_data.append(value)
            print(f"行 {row}: {row_data}")
    else:
        print("文件保存失败!")

    print()
    print("========================================")
    print(" 任务完成")
    print("========================================")

except Exception as e:
    # Top-level boundary: report the failure with a traceback.
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()
|
||||
@ -1,157 +0,0 @@ |
|||||
import os |
|
||||
import pandas as pd |
|
||||
import re |
|
||||
|
|
||||
# Banner.
print("=" * 60)
print(" 处理全部数据")
print("=" * 60)

# Source workbook and the workbook that will receive the added columns.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx'

print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()

# Load the whole sheet into memory.
print("读取全部数据...")
df = pd.read_excel(input_file, engine='openpyxl')
print(f"成功读取 {len(df)} 行数据")
print(f"原始列数: {len(df.columns)}")

# Classify columns by their header text.
print("\n识别列...")
helpfull_col = None
comment_count_col = None
comment_cols = []

for col in df.columns:
    label = str(col)
    lowered = label.lower()
    if 'helpfull' in lowered or 'helpful' in lowered:
        helpfull_col = col
        print(f"找到 Y 列 (helpfull): {col}")
        continue
    if '评论总数' in label or '帖子评论总数' in label:
        comment_count_col = col
        print(f"找到 X1 列 (评论总数): {col}")
        continue
    # Comment-content columns: contain '评论', a digit 1-5, and '内容'.
    if '评论' in label and any(digit in label for digit in '12345') and '内容' in label:
        comment_cols.append(col)

print(f"\n共找到 {len(comment_cols)} 个评论内容列")

print("\n添加回归数据列...")

# Y: usefulness, coerced to numeric with gaps filled by 0.
print("1. 添加 Y (UGC有用性)")
df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0) if helpfull_col else 0

# X1: total comment count, coerced the same way.
print("2. 添加 X1 (评论数量)")
df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0) if comment_count_col else 0
|
||||
|
|
||||
# 定义函数计算评论指标 |
|
||||
def calculate_comment_metrics(content):
    """Score a single comment cell.

    Args:
        content: Raw cell value; may be a string, NaN/None, or another scalar.

    Returns:
        A tuple ``(length, complexity, sentiment, richness)``:
        length is the character count after removing ASCII and full-width
        spaces; complexity is the number of whitespace-separated tokens;
        sentiment is 1, 0 or -1 from keyword matching; richness is 0-3 with
        one point each for a digit, a link and an emoji/emoticon.
        Missing or empty cells yield ``(0, 0, 0, 0)``.
    """
    if pd.isna(content) or str(content) in ['None', 'nan', '']:
        return 0, 0, 0, 0

    content = str(content)
    length = len(content.replace(' ', '').replace('\u3000', ''))
    complexity = len(content.split())

    positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like']
    negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike']

    lower_content = content.lower()
    has_negative = any(word in lower_content for word in negative_words)

    # Several negative phrases contain a positive keyword ('不好' has '好',
    # '不满意' has '满意', 'dislike' has 'like'). Blank the negative phrases
    # out before the positive lookup so they are not scored as positive.
    remainder = lower_content
    for word in negative_words:
        remainder = remainder.replace(word, ' ')

    if any(word in remainder for word in positive_words):
        sentiment = 1
    elif has_negative:
        sentiment = -1
    else:
        sentiment = 0

    richness = 0
    if re.search(r'\d', content):
        richness += 1
    if re.search(r'http[s]?://|www\.', content):
        richness += 1
    if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content):
        richness += 1

    return length, complexity, sentiment, richness
|
||||
|
|
||||
# Row-level comment metrics.
print("3. 计算评论相关指标...")
print(f"总数据行数: {len(df)}")

# Float placeholders; rows without any usable comment keep 0.0.
for metric_col in ('X2', 'X3', 'X5', 'X6'):
    df[metric_col] = 0.0

for i in range(len(df)):
    if i % 1000 == 0:
        print(f" 处理第 {i}/{len(df)} 行...")

    # Keep only comments whose stripped length is positive.
    scored = []
    for col in comment_cols:
        metrics = calculate_comment_metrics(df.iloc[i].get(col, ''))
        if metrics[0] > 0:
            scored.append(metrics)

    if scored:
        count = len(scored)
        lengths, complexities, sentiments, richness = zip(*scored)
        df.loc[i, 'X2'] = sum(lengths) / count
        df.loc[i, 'X3'] = sum(complexities) / count
        df.loc[i, 'X5'] = sum(sentiments) / count
        df.loc[i, 'X6'] = sum(richness) / count


def _readability(row):
    # X4 = X2 / X3, defined as 0 when X3 is not positive.
    if row['X3'] > 0:
        return row['X2'] / row['X3']
    return 0


print("4. 计算 X4 (评论可读性)")
df['X4'] = df.apply(_readability, axis=1)

# Force every regression column to finite numeric values.
print("\n5. 数据清洗...")
regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
infinities = [float('inf'), float('-inf')]
for col in regression_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
    df[col] = df[col].replace(infinities, 0)

# Summary statistics for a quick sanity check.
print("\n6. 验证数据...")
print(f"总行数: {len(df)}")
print(f"总列数: {len(df.columns)}")
print("\n回归数据列统计:")
print(df[regression_cols].describe())

# Write the original columns plus the new ones.
print("\n7. 保存文件...")
df.to_excel(output_file, index=False, engine='openpyxl')

# Read the file back to confirm its shape.
print("\n8. 验证文件...")
if not os.path.exists(output_file):
    print("文件保存失败!")
else:
    print(f"文件已成功保存: {output_file}")
    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
    df_check = pd.read_excel(output_file)
    print(f"输出文件行数: {len(df_check)}")
    print(f"输出文件列数: {len(df_check.columns)}")

print()
print("=" * 60)
print(" 任务完成")
print("=" * 60)
|
||||
@ -1,180 +0,0 @@ |
|||||
import os |
|
||||
import pandas as pd |
|
||||
import re |
|
||||
|
|
||||
# Banner.
print("=" * 60)
print(" 高效处理全部数据")
print("=" * 60)

# Source workbook and the workbook that will receive the added columns.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx'

print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()

# Read only the header row so columns can be classified up front.
print("1. 读取表头...")
df_header = pd.read_excel(input_file, engine='openpyxl', nrows=0)
print(f"总列数: {len(df_header.columns)}")

helpfull_col = None
comment_count_col = None
comment_cols = []

for col in df_header.columns:
    label = str(col)
    lowered = label.lower()
    if 'helpfull' in lowered or 'helpful' in lowered:
        helpfull_col = col
        print(f"找到 Y 列 (helpfull): {col}")
        continue
    if '评论总数' in label or '帖子评论总数' in label:
        comment_count_col = col
        print(f"找到 X1 列 (评论总数): {col}")
        continue
    # Comment-content columns: contain '评论', a digit 1-5, and '内容'.
    if '评论' in label and any(digit in label for digit in '12345') and '内容' in label:
        comment_cols.append(col)

print(f"共找到 {len(comment_cols)} 个评论内容列")
|
||||
|
|
||||
# 定义函数计算评论指标 |
|
||||
def calculate_comment_metrics(content):
    """Score a single comment cell.

    Args:
        content: Raw cell value; may be a string, NaN/None, or another scalar.

    Returns:
        A tuple ``(length, complexity, sentiment, richness)``:
        length is the character count after removing ASCII and full-width
        spaces; complexity is the number of whitespace-separated tokens;
        sentiment is 1, 0 or -1 from keyword matching; richness is 0-3 with
        one point each for a digit, a link and an emoji/emoticon.
        Missing or empty cells yield ``(0, 0, 0, 0)``.
    """
    if pd.isna(content) or str(content) in ['None', 'nan', '']:
        return 0, 0, 0, 0

    content = str(content)
    length = len(content.replace(' ', '').replace('\u3000', ''))
    complexity = len(content.split())

    positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like']
    negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike']

    lower_content = content.lower()
    has_negative = any(word in lower_content for word in negative_words)

    # Several negative phrases contain a positive keyword ('不好' has '好',
    # '不满意' has '满意', 'dislike' has 'like'). Blank the negative phrases
    # out before the positive lookup so they are not scored as positive.
    remainder = lower_content
    for word in negative_words:
        remainder = remainder.replace(word, ' ')

    if any(word in remainder for word in positive_words):
        sentiment = 1
    elif has_negative:
        sentiment = -1
    else:
        sentiment = 0

    richness = 0
    if re.search(r'\d', content):
        richness += 1
    if re.search(r'http[s]?://|www\.', content):
        richness += 1
    if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content):
        richness += 1

    return length, complexity, sentiment, richness
|
||||
|
|
||||
# Process the workbook in fixed-size batches of data rows.
print("\n2. 分批处理数据...")
batch_size = 5000
batch_num = 0
all_data = []
regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']

while True:
    # Later batches skip the header row plus all rows already consumed.
    skip_rows = batch_num * batch_size + 1 if batch_num > 0 else 0
    nrows = batch_size

    print(f" 处理批次 {batch_num + 1} (跳过 {skip_rows} 行,读取 {nrows} 行)...")

    try:
        if batch_num == 0:
            df_batch = pd.read_excel(input_file, engine='openpyxl', nrows=nrows)
        else:
            df_batch = pd.read_excel(input_file, engine='openpyxl', skiprows=skip_rows, nrows=nrows, header=None)
            # Only relabel non-empty batches: assigning the header to an empty
            # frame raises a length mismatch, which used to route a normal
            # end-of-data through the error branch below.
            if len(df_batch) > 0:
                df_batch.columns = df_header.columns
    except Exception as e:
        print(f" 读取完成或出错: {e}")
        break

    if len(df_batch) == 0:
        print(" 没有更多数据")
        break

    print(f" 读取了 {len(df_batch)} 行")

    # Y and X1 are copied from the detected columns, or 0 when absent.
    if helpfull_col:
        df_batch['Y'] = pd.to_numeric(df_batch[helpfull_col], errors='coerce').fillna(0)
    else:
        df_batch['Y'] = 0

    if comment_count_col:
        df_batch['X1'] = pd.to_numeric(df_batch[comment_count_col], errors='coerce').fillna(0)
    else:
        df_batch['X1'] = 0

    # Float placeholders; rows without any usable comment keep 0.0.
    df_batch['X2'] = 0.0
    df_batch['X3'] = 0.0
    df_batch['X5'] = 0.0
    df_batch['X6'] = 0.0

    for i in range(len(df_batch)):
        lengths = []
        complexities = []
        sentiments = []
        richness = []

        # Fetch the row once instead of once per comment column.
        record = df_batch.iloc[i]
        for col in comment_cols:
            length, complexity, sentiment, r = calculate_comment_metrics(record.get(col, ''))
            if length > 0:
                lengths.append(length)
                complexities.append(complexity)
                sentiments.append(sentiment)
                richness.append(r)

        if lengths:
            df_batch.loc[i, 'X2'] = sum(lengths) / len(lengths)
            df_batch.loc[i, 'X3'] = sum(complexities) / len(complexities)
            df_batch.loc[i, 'X5'] = sum(sentiments) / len(sentiments)
            df_batch.loc[i, 'X6'] = sum(richness) / len(richness)

    # X4 = X2 / X3, defined as 0 when X3 is not positive.
    df_batch['X4'] = df_batch.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1)

    # Force every regression column to finite numeric values.
    for col in regression_cols:
        df_batch[col] = pd.to_numeric(df_batch[col], errors='coerce').fillna(0)
        df_batch[col] = df_batch[col].replace([float('inf'), float('-inf')], 0)

    all_data.append(df_batch)
    batch_num += 1

    print(f" 批次 {batch_num} 完成,当前总行数: {sum(len(d) for d in all_data)}")

# pd.concat raises on an empty list; fail with a clear message instead.
if not all_data:
    raise SystemExit("没有读取到任何数据")

print("\n3. 合并数据...")
df_final = pd.concat(all_data, ignore_index=True)
print(f"合并后总行数: {len(df_final)}")

# Summary statistics for a quick sanity check.
print("\n4. 验证数据...")
print(f"总列数: {len(df_final.columns)}")
print(f"\n回归数据列统计:")
print(df_final[regression_cols].describe())

print("\n5. 保存文件...")
df_final.to_excel(output_file, index=False, engine='openpyxl')

# Read the file back to confirm its shape.
print("\n6. 验证文件...")
if os.path.exists(output_file):
    print(f"文件已成功保存: {output_file}")
    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
    df_check = pd.read_excel(output_file)
    print(f"输出文件行数: {len(df_check)}")
    print(f"输出文件列数: {len(df_check.columns)}")
else:
    print("文件保存失败!")

print()
print("=" * 60)
print(" 任务完成")
print("=" * 60)
|
||||
@ -1,177 +0,0 @@ |
|||||
import os |
|
||||
import pandas as pd |
|
||||
import re |
|
||||
|
|
||||
# Input/output workbook locations (hard-coded local Windows paths).
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'


def calculate_comment_metrics(content):
    """Score a single comment cell.

    Args:
        content: Raw cell value; may be a string, NaN/None, or another scalar.

    Returns:
        A tuple ``(length, complexity, sentiment, richness)``: character count
        without ASCII spaces, whitespace-separated token count, keyword
        sentiment (1 / 0 / -1), and a 0-3 richness score (digit, link,
        emoticon/symbol). Missing cells yield ``(0, 0, 0, 0)``.
    """
    if pd.isna(content) or str(content) in ['None', 'nan']:
        return 0, 0, 0, 0

    content = str(content)
    length = len(content.replace(' ', ''))
    complexity = len(content.split())

    positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent']
    negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor']

    lower_content = content.lower()
    has_negative = any(word in lower_content for word in negative_words)

    # Negative phrases such as '不好' contain a positive keyword ('好').
    # Blank them out before the positive lookup so they are not scored as
    # positive.
    remainder = lower_content
    for word in negative_words:
        remainder = remainder.replace(word, ' ')

    if any(word in remainder for word in positive_words):
        sentiment = 1
    elif has_negative:
        sentiment = -1
    else:
        sentiment = 0

    richness = 0
    if re.search(r'\d', content):
        richness += 1
    if re.search(r'http[s]?://', content):
        richness += 1
    if re.search(r'[\u2600-\u27BF]|[:;][-]?[)D]', content):
        richness += 1

    return length, complexity, sentiment, richness


print("========================================")
print(" 处理大型Excel文件")
print("========================================")
print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()

# Stop early when the input workbook is missing.
if not os.path.exists(input_file):
    print("错误: 输入文件不存在!")
    raise SystemExit(1)

print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")

try:
    print("正在读取原始数据...")
    df = pd.read_excel(input_file, engine='openpyxl')
    print(f"成功读取 {len(df)} 行数据")
    print(f"列名: {list(df.columns)}")

    # Classify columns by their header text.
    print("\n识别列...")
    helpfull_col = None
    comment_count_col = None
    comment_cols = []

    for col in df.columns:
        col_str = str(col).lower()
        if 'helpfull' in col_str or 'helpful' in col_str:
            helpfull_col = col
            print(f"找到 Y 列 (helpfull): {col}")
        elif '评论总数' in str(col) or '帖子评论总数' in str(col):
            comment_count_col = col
            print(f"找到 X1 列 (评论总数): {col}")
        elif '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6)):
            comment_cols.append(col)
            print(f"找到评论列 {len(comment_cols)}: {col}")

    print(f"\n共找到 {len(comment_cols)} 个评论列")

    print("\n创建回归数据...")
    # Share the input's index so the scalar fallbacks below fill every row;
    # on an index-less frame a scalar assignment creates an empty column.
    regression_data = pd.DataFrame(index=df.index)

    print("1. 计算 Y (UGC有用性)")
    if helpfull_col:
        regression_data['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0)
    else:
        regression_data['Y'] = 0

    print("2. 计算 X1 (评论数量)")
    if comment_count_col:
        regression_data['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0)
    else:
        regression_data['X1'] = 0

    print("3. 计算评论相关指标...")

    # Float placeholders: the means written below are floats, and assigning
    # them into integer columns forces a dtype upcast.
    regression_data['X2'] = 0.0  # comment length
    regression_data['X3'] = 0.0  # comment complexity
    regression_data['X5'] = 0.0  # sentiment
    regression_data['X6'] = 0.0  # information richness

    total_rows = len(df)
    for i in range(total_rows):
        if i % 1000 == 0:
            print(f"处理到第 {i} 行...")

        lengths = []
        complexities = []
        sentiments = []
        richness = []

        # Fetch the row once instead of once per comment column.
        record = df.iloc[i]
        for col in comment_cols:
            length, complexity, sentiment, r = calculate_comment_metrics(record.get(col, ''))
            if length > 0:
                lengths.append(length)
                complexities.append(complexity)
                sentiments.append(sentiment)
                richness.append(r)

        # Row-level means over the comments that had content.
        if lengths:
            regression_data.loc[i, 'X2'] = sum(lengths) / len(lengths)
            regression_data.loc[i, 'X3'] = sum(complexities) / len(complexities)
            regression_data.loc[i, 'X5'] = sum(sentiments) / len(sentiments)
            regression_data.loc[i, 'X6'] = sum(richness) / len(richness)

    # X4 = X2 / X3, defined as 0 when X3 is not positive.
    print("4. 计算 X4 (评论可读性)")
    regression_data['X4'] = regression_data.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1)

    print("\n5. 数据清洗...")
    for col in regression_data.columns:
        regression_data[col] = pd.to_numeric(regression_data[col], errors='coerce').fillna(0)

    print("\n6. 验证数据...")
    print(f"行数: {len(regression_data)}")
    print(f"列数: {len(regression_data.columns)}")
    print(f"列名: {list(regression_data.columns)}")
    print(f"\n前5行数据:")
    print(regression_data.head())

    print("\n7. 保存文件...")
    regression_data.to_excel(output_file, index=False)

    # Read the file back to confirm its shape.
    print("\n8. 验证文件...")
    if os.path.exists(output_file):
        print(f"文件已成功保存: {output_file}")
        print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
        df_check = pd.read_excel(output_file)
        print(f"输出文件行数: {len(df_check)}")
        print(f"输出文件列数: {len(df_check.columns)}")
    else:
        print("文件保存失败!")

    print()
    print("========================================")
    print(" 任务完成")
    print("========================================")

except Exception as e:
    # Top-level boundary: report the failure with a traceback.
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()
|
||||
@ -1,9 +0,0 @@ |
|||||
======================================== |
|
||||
在原表中添加回归数据列 |
|
||||
======================================== |
|
||||
输入文件: D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx |
|
||||
输出文件: D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx |
|
||||
|
|
||||
输入文件大小: 21607.43 KB |
|
||||
|
|
||||
正在读取原始数据... |
|
||||
@ -1,192 +0,0 @@ |
|||||
import os |
|
||||
import pandas as pd |
|
||||
import re |
|
||||
|
|
||||
# Input/output workbook locations (hard-coded local Windows paths).
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx'


def calculate_comment_metrics(content):
    """Score a single comment cell.

    Args:
        content: Raw cell value; may be a string, NaN/None, or another scalar.

    Returns:
        A tuple ``(length, complexity, sentiment, richness)``:
        length is the character count after removing ASCII and full-width
        spaces; complexity is the number of whitespace-separated tokens;
        sentiment is 1, 0 or -1 from keyword matching; richness is 0-3 with
        one point each for a digit, a link and an emoji/emoticon.
        Missing or empty cells yield ``(0, 0, 0, 0)``.
    """
    if pd.isna(content) or str(content) in ['None', 'nan', '']:
        return 0, 0, 0, 0

    content = str(content)
    # X2: character count with spaces removed.
    length = len(content.replace(' ', '').replace('\u3000', ''))
    # X3: whitespace-separated token count.
    complexity = len(content.split())

    # X5: keyword sentiment.
    positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like']
    negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike']

    lower_content = content.lower()
    has_negative = any(word in lower_content for word in negative_words)

    # Several negative phrases contain a positive keyword ('不好' has '好',
    # '不满意' has '满意', 'dislike' has 'like'). Blank the negative phrases
    # out before the positive lookup so they are not scored as positive.
    remainder = lower_content
    for word in negative_words:
        remainder = remainder.replace(word, ' ')

    if any(word in remainder for word in positive_words):
        sentiment = 1
    elif has_negative:
        sentiment = -1
    else:
        sentiment = 0

    # X6: one point each for a digit, a link, an emoji/emoticon.
    richness = 0
    if re.search(r'\d', content):
        richness += 1
    if re.search(r'http[s]?://|www\.', content):
        richness += 1
    if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content):
        richness += 1

    return length, complexity, sentiment, richness


print("========================================")
print(" 在原表中添加回归数据列")
print("========================================")
print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()

# Stop early when the input workbook is missing.
if not os.path.exists(input_file):
    print("错误: 输入文件不存在!")
    raise SystemExit(1)

print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")

try:
    print("\n正在读取原始数据...")
    df = pd.read_excel(input_file)
    print(f"成功读取 {len(df)} 行数据")
    print(f"原始列数: {len(df.columns)}")

    # Classify columns by their header text.
    print("\n识别列...")
    helpfull_col = None
    comment_count_col = None
    comment_cols = []

    for col in df.columns:
        col_str = str(col).lower()
        if 'helpfull' in col_str or 'helpful' in col_str:
            helpfull_col = col
            print(f"找到 Y 列 (helpfull): {col}")
        elif '评论总数' in str(col) or '帖子评论总数' in str(col):
            comment_count_col = col
            print(f"找到 X1 列 (评论总数): {col}")
        elif '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6)) and '内容' in str(col):
            comment_cols.append(col)
            print(f"找到评论列 {len(comment_cols)}: {col}")

    print(f"\n共找到 {len(comment_cols)} 个评论内容列")

    print("\n添加回归数据列...")

    # Y: copy of the usefulness column, coerced to numeric.
    print("1. 添加 Y (UGC有用性)")
    if helpfull_col:
        df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0)
    else:
        df['Y'] = 0

    # X1: copy of the total-comment-count column, coerced to numeric.
    print("2. 添加 X1 (评论数量)")
    if comment_count_col:
        df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0)
    else:
        df['X1'] = 0

    print("3. 计算评论相关指标...")

    # Float placeholders; rows without any usable comment keep 0.0.
    df['X2'] = 0.0  # comment length
    df['X3'] = 0.0  # comment complexity
    df['X5'] = 0.0  # sentiment
    df['X6'] = 0.0  # information richness

    total_rows = len(df)
    print(f"总数据行数: {total_rows}")

    for i in range(total_rows):
        if i % 1000 == 0:
            print(f" 处理第 {i}/{total_rows} 行...")

        lengths = []
        complexities = []
        sentiments = []
        richness = []

        # Fetch the row once instead of once per comment column.
        record = df.iloc[i]
        for col in comment_cols:
            length, complexity, sentiment, r = calculate_comment_metrics(record.get(col, ''))
            if length > 0:  # only comments with content count
                lengths.append(length)
                complexities.append(complexity)
                sentiments.append(sentiment)
                richness.append(r)

        # Row-level means; rows with no comments keep 0.
        if lengths:
            df.loc[i, 'X2'] = sum(lengths) / len(lengths)
            df.loc[i, 'X3'] = sum(complexities) / len(complexities)
            df.loc[i, 'X5'] = sum(sentiments) / len(sentiments)
            df.loc[i, 'X6'] = sum(richness) / len(richness)

    # X4 = X2 / X3, defined as 0 when X3 is not positive.
    print("4. 计算 X4 (评论可读性)")
    df['X4'] = df.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1)

    # Force every regression column to finite numeric values.
    print("\n5. 数据清洗...")
    regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
    for col in regression_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
        df[col] = df[col].replace([float('inf'), float('-inf')], 0)

    print("\n6. 验证数据...")
    print(f"总行数: {len(df)}")
    print(f"总列数: {len(df.columns)}")
    print(f"\n回归数据列统计:")
    print(df[regression_cols].describe())
    print(f"\n前5行回归数据:")
    print(df[regression_cols].head())

    # Report remaining nulls per regression column.
    print(f"\n空值检查:")
    for col in regression_cols:
        null_count = df[col].isnull().sum()
        print(f" {col}: {null_count} 个空值")

    print("\n7. 保存文件...")
    print(f"正在保存到: {output_file}")
    df.to_excel(output_file, index=False, engine='openpyxl')

    # Read the file back to confirm its shape.
    print("\n8. 验证文件...")
    if os.path.exists(output_file):
        print(f"文件已成功保存: {output_file}")
        print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
        df_check = pd.read_excel(output_file)
        print(f"输出文件行数: {len(df_check)}")
        print(f"输出文件列数: {len(df_check.columns)}")
        print(f"\n回归数据列: {[col for col in df_check.columns if col in regression_cols]}")
    else:
        print("文件保存失败!")

    print()
    print("========================================")
    print(" 任务完成")
    print("========================================")
    print(f"新文件已保存: {output_file}")
    print(f"包含原始数据的所有列以及新增的Y, X1-X6回归数据列")

except Exception as e:
    # Top-level boundary: report the failure with a traceback.
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()
|
||||
@ -1,202 +0,0 @@ |
|||||
import os |
|
||||
import pandas as pd |
|
||||
import re |
|
||||
|
|
||||
print("=" * 60) |
|
||||
print(" 使用CSV处理回归数据") |
|
||||
print("=" * 60) |
|
||||
|
|
||||
# 文件路径 |
|
||||
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx' |
|
||||
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx' |
|
||||
|
|
||||
print(f"输入文件: {input_file}") |
|
||||
print(f"输出文件: {output_file}") |
|
||||
print() |
|
||||
|
|
||||
# 检查文件是否存在 |
|
||||
if not os.path.exists(input_file): |
|
||||
print("错误: 输入文件不存在!") |
|
||||
exit(1) |
|
||||
|
|
||||
print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") |
|
||||
|
|
||||
# 读取原始数据 |
|
||||
print("\n正在读取原始数据...") |
|
||||
try: |
|
||||
df = pd.read_excel(input_file, engine='openpyxl') |
|
||||
print(f"成功读取 {len(df)} 行数据") |
|
||||
print(f"原始列数: {len(df.columns)}") |
|
||||
except Exception as e: |
|
||||
print(f"读取失败: {e}") |
|
||||
exit(1) |
|
||||
|
|
||||
# 识别列 |
|
||||
print("\n识别列...") |
|
||||
helpfull_col = None |
|
||||
comment_count_col = None |
|
||||
comment_cols = [] |
|
||||
|
|
||||
for col in df.columns: |
|
||||
col_str = str(col).lower() |
|
||||
if 'helpfull' in col_str or 'helpful' in col_str: |
|
||||
helpfull_col = col |
|
||||
print(f"找到 Y 列 (helpfull): {col}") |
|
||||
elif '评论总数' in str(col) or '帖子评论总数' in str(col): |
|
||||
comment_count_col = col |
|
||||
print(f"找到 X1 列 (评论总数): {col}") |
|
||||
elif '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6)) and '内容' in str(col): |
|
||||
comment_cols.append(col) |
|
||||
print(f"找到评论列 {len(comment_cols)}: {col}") |
|
||||
|
|
||||
print(f"\n共找到 {len(comment_cols)} 个评论内容列") |
|
||||
|
|
||||
# 添加回归数据列 |
|
||||
print("\n添加回归数据列...") |
|
||||
|
|
||||
# Y (UGC有用性) - 直接复制helpfull列 |
|
||||
print("1. 添加 Y (UGC有用性)") |
|
||||
if helpfull_col: |
|
||||
df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0) |
|
||||
else: |
|
||||
df['Y'] = 0 |
|
||||
|
|
||||
# X1 (评论数量) - 直接复制帖子评论总数列 |
|
||||
print("2. 添加 X1 (评论数量)") |
|
||||
if comment_count_col: |
|
||||
df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0) |
|
||||
else: |
|
||||
df['X1'] = 0 |
|
||||
|
|
||||
# 定义函数计算评论指标 |
|
||||
def calculate_comment_metrics(content):
    """Compute the regression metrics of a single comment.

    Args:
        content: Raw cell value of one comment column (text, NaN, None, ...).

    Returns:
        A tuple ``(length, complexity, sentiment, richness)``:
          * length     - number of characters after removing half-width and
                         full-width spaces (X2).
          * complexity - number of whitespace-separated tokens (X3).
          * sentiment  - 1 positive, 0 neutral, -1 negative (X5).
          * richness   - one point each for containing a digit, a link and an
                         emoji/emoticon, 0..3 (X6).
        A missing or empty comment yields ``(0, 0, 0, 0)``.
    """
    if pd.isna(content) or str(content) in ['None', 'nan', '']:
        return 0, 0, 0, 0

    content = str(content)

    # X2: comment length (characters left once spaces are stripped)
    length = len(content.replace(' ', '').replace('\u3000', ''))

    # X3: comment complexity (token count when splitting on whitespace)
    complexity = len(content.split())

    # X5: keyword-based sentiment (positive=1, neutral=0, negative=-1)
    positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like']
    negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike']

    lower_content = content.lower()
    has_negative = any(word in lower_content for word in negative_words)

    # Several negative phrases contain a positive keyword ("不好" contains "好",
    # "dislike" contains "like", "不满意" contains "满意"). Blank the negative
    # phrases out before looking for positive keywords so they are no longer
    # misread as praise. A comment holding both a genuine positive and a
    # negative keyword is still scored positive, as before.
    masked_content = lower_content
    for word in negative_words:
        masked_content = masked_content.replace(word, ' ')

    if any(word in masked_content for word in positive_words):
        sentiment = 1
    elif has_negative:
        sentiment = -1
    else:
        sentiment = 0

    # X6: information richness (digit / link / emoji, one point each)
    richness = 0
    if re.search(r'\d', content):  # contains a digit
        richness += 1
    if re.search(r'http[s]?://|www\.', content):  # contains a link
        richness += 1
    # Symbol and emoji code-point ranges, a UTF-16 surrogate pair, or an
    # ASCII emoticon such as ":)" / ";-D"
    if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content):
        richness += 1

    return length, complexity, sentiment, richness
|
||||
|
|
||||
# 计算评论相关指标 |
|
||||
print("3. 计算评论相关指标...") |
|
||||
|
|
||||
# Average the per-comment metrics over every comment column of each row.
# The comment columns are extracted once as plain lists and the results are
# written back as whole columns. The previous version read rows positionally
# (`df.iloc[i]`) but wrote them by label (`df.loc[i, ...]`), which only agree
# for a default RangeIndex, and it rebuilt a full row Series for every cell.
total_rows = len(df)
print(f"总数据行数: {total_rows}")

comment_values = [df[col].tolist() for col in comment_cols]
x2_values, x3_values, x5_values, x6_values = [], [], [], []

for i in range(total_rows):
    if i % 1000 == 0:
        print(f" 处理第 {i}/{total_rows} 行...")

    # Only comments with actual content (length > 0) take part in the averages
    row_metrics = [
        metrics
        for metrics in (calculate_comment_metrics(values[i]) for values in comment_values)
        if metrics[0] > 0
    ]

    if row_metrics:
        count = len(row_metrics)
        x2_values.append(sum(m[0] for m in row_metrics) / count)
        x3_values.append(sum(m[1] for m in row_metrics) / count)
        x5_values.append(sum(m[2] for m in row_metrics) / count)
        x6_values.append(sum(m[3] for m in row_metrics) / count)
    else:
        # No usable comment on this row: every metric is recorded as 0
        x2_values.append(0.0)
        x3_values.append(0.0)
        x5_values.append(0.0)
        x6_values.append(0.0)

df['X2'] = x2_values  # comment length
df['X3'] = x3_values  # comment complexity
df['X5'] = x5_values  # sentiment
df['X6'] = x6_values  # information richness
|
||||
|
|
||||
# X4: 评论可读性 = X2/X3(X3为0时记0,避免报错) |
|
||||
print("4. 计算 X4 (评论可读性)") |
|
||||
df['X4'] = df.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1) |
|
||||
|
|
||||
# 数据清洗 - 确保所有值都是纯数字,无文本、无空值、无错误 |
|
||||
print("\n5. 数据清洗...") |
|
||||
regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'] |
|
||||
for col in regression_cols: |
|
||||
# 转换为数字,错误值转为0 |
|
||||
df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0) |
|
||||
# 替换无穷大 |
|
||||
df[col] = df[col].replace([float('inf'), float('-inf')], 0) |
|
||||
|
|
||||
# 验证数据 |
|
||||
print("\n6. 验证数据...") |
|
||||
print(f"总行数: {len(df)}") |
|
||||
print(f"总列数: {len(df.columns)}") |
|
||||
print(f"\n回归数据列统计:") |
|
||||
print(df[regression_cols].describe()) |
|
||||
print(f"\n前5行回归数据:") |
|
||||
print(df[regression_cols].head()) |
|
||||
|
|
||||
# 检查是否有空值或错误值 |
|
||||
print(f"\n空值检查:") |
|
||||
for col in regression_cols: |
|
||||
null_count = df[col].isnull().sum() |
|
||||
print(f" {col}: {null_count} 个空值") |
|
||||
|
|
||||
# 保存为CSV中间文件 |
|
||||
print("\n7. 保存为CSV中间文件...") |
|
||||
csv_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\temp_regression.csv' |
|
||||
df.to_csv(csv_file, index=False, encoding='utf-8-sig') |
|
||||
print(f"CSV文件已保存: {csv_file}") |
|
||||
print(f"CSV文件大小: {os.path.getsize(csv_file) / 1024:.2f} KB") |
|
||||
|
|
||||
# 从CSV读取并保存为Excel |
|
||||
print("\n8. 转换为Excel文件...") |
|
||||
df_csv = pd.read_csv(csv_file, encoding='utf-8-sig') |
|
||||
df_csv.to_excel(output_file, index=False, engine='openpyxl') |
|
||||
|
|
||||
# 验证文件 |
|
||||
print("\n9. 验证文件...") |
|
||||
if os.path.exists(output_file): |
|
||||
print(f"文件已成功保存: {output_file}") |
|
||||
print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") |
|
||||
# 重新读取检查 |
|
||||
df_check = pd.read_excel(output_file) |
|
||||
print(f"输出文件行数: {len(df_check)}") |
|
||||
print(f"输出文件列数: {len(df_check.columns)}") |
|
||||
print(f"\n回归数据列: {[col for col in df_check.columns if col in regression_cols]}") |
|
||||
|
|
||||
# 删除临时CSV文件 |
|
||||
os.remove(csv_file) |
|
||||
print(f"\n临时CSV文件已删除") |
|
||||
else: |
|
||||
print("文件保存失败!") |
|
||||
|
|
||||
print() |
|
||||
print("=" * 60) |
|
||||
print(" 任务完成") |
|
||||
print("=" * 60) |
|
||||
print(f"新文件已保存: {output_file}") |
|
||||
print(f"包含原始数据的所有列以及新增的Y, X1-X6回归数据列") |
|
||||
@ -1,168 +0,0 @@ |
|||||
import os |
|
||||
import pandas as pd |
|
||||
import re |
|
||||
|
|
||||
# 文件路径 |
|
||||
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx' |
|
||||
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx' |
|
||||
|
|
||||
print("========================================") |
|
||||
print(" 使用pandas处理所有数据") |
|
||||
print("========================================") |
|
||||
print(f"输入文件: {input_file}") |
|
||||
print(f"输出文件: {output_file}") |
|
||||
print() |
|
||||
|
|
||||
# 检查文件是否存在 |
|
||||
if not os.path.exists(input_file): |
|
||||
print("错误: 输入文件不存在!") |
|
||||
exit(1) |
|
||||
|
|
||||
print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") |
|
||||
|
|
||||
# 读取原始数据 |
|
||||
try: |
|
||||
print("正在读取原始数据...") |
|
||||
df = pd.read_excel(input_file) |
|
||||
print(f"成功读取 {len(df)} 行数据") |
|
||||
print(f"列名: {list(df.columns)}") |
|
||||
|
|
||||
# 识别列 |
|
||||
print("\n识别列...") |
|
||||
helpfull_col = None |
|
||||
comment_count_col = None |
|
||||
comment_cols = [] |
|
||||
|
|
||||
for col in df.columns: |
|
||||
col_str = str(col).lower() |
|
||||
if 'helpfull' in col_str or 'helpful' in col_str: |
|
||||
helpfull_col = col |
|
||||
print(f"找到 Y 列 (helpfull): {col}") |
|
||||
elif '评论总数' in str(col) or '帖子评论总数' in str(col): |
|
||||
comment_count_col = col |
|
||||
print(f"找到 X1 列 (评论总数): {col}") |
|
||||
elif '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6)): |
|
||||
comment_cols.append(col) |
|
||||
print(f"找到评论列 {len(comment_cols)}: {col}") |
|
||||
|
|
||||
print(f"\n共找到 {len(comment_cols)} 个评论列") |
|
||||
|
|
||||
# 创建回归数据 |
|
||||
print("\n创建回归数据...") |
|
||||
regression_data = pd.DataFrame() |
|
||||
|
|
||||
# Y (UGC有用性) |
|
||||
print("1. 计算 Y (UGC有用性)") |
|
||||
if helpfull_col: |
|
||||
regression_data['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0) |
|
||||
else: |
|
||||
regression_data['Y'] = 0 |
|
||||
|
|
||||
# X1 (评论数量) |
|
||||
print("2. 计算 X1 (评论数量)") |
|
||||
if comment_count_col: |
|
||||
regression_data['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0) |
|
||||
else: |
|
||||
regression_data['X1'] = 0 |
|
||||
|
|
||||
# 定义函数计算评论指标 |
|
||||
def calculate_comment_metrics(row):
    """Collect the per-comment metrics of one data row.

    Reads every column listed in the enclosing script's ``comment_cols`` from
    ``row`` and measures each non-empty comment.

    Args:
        row: Mapping-like row (anything with ``.get``), e.g. the Series passed
            by ``DataFrame.apply(..., axis=1)``.

    Returns:
        A tuple of four parallel lists ``(lengths, complexities, sentiments,
        richness)`` with one entry per comment that has content.
    """
    lengths = []
    complexities = []
    sentiments = []
    richness = []

    positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent']
    negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor']

    for col in comment_cols:
        content = str(row.get(col, ''))
        if not content or content == 'None' or content == 'nan':
            continue

        # Comment length (spaces removed). A comment made only of spaces has
        # no content and must not drag the row averages towards zero.
        length = len(content.replace(' ', ''))
        if length == 0:
            continue
        lengths.append(length)

        # Comment complexity (whitespace-separated token count)
        complexities.append(len(content.split()))

        # Sentiment. Negative phrases such as "不好" / "不满意" contain a
        # positive keyword, so they are blanked out before the positive
        # lookup; otherwise they would be scored as praise.
        lower_content = content.lower()
        has_negative = any(word in lower_content for word in negative_words)
        masked_content = lower_content
        for word in negative_words:
            masked_content = masked_content.replace(word, ' ')

        if any(word in masked_content for word in positive_words):
            sentiment = 1
        elif has_negative:
            sentiment = -1
        else:
            sentiment = 0
        sentiments.append(sentiment)

        # Information richness: one point each for digit, link, emoji/emoticon
        r = 0
        if re.search(r'\d', content):
            r += 1
        if re.search(r'http[s]?://', content):
            r += 1
        if re.search(r'[\u2600-\u27BF]|[:;][-]?[)D]', content):
            r += 1
        richness.append(r)

    return lengths, complexities, sentiments, richness
|
||||
|
|
||||
# 计算评论相关指标 |
|
||||
print("3. 计算评论相关指标...") |
|
||||
comment_metrics = df.apply(calculate_comment_metrics, axis=1) |
|
||||
|
|
||||
# X2: 评论长度平均值 |
|
||||
print("4. 计算 X2 (评论长度)") |
|
||||
regression_data['X2'] = comment_metrics.apply(lambda x: sum(x[0]) / len(x[0]) if x[0] else 0) |
|
||||
|
|
||||
# X3: 评论复杂度平均值 |
|
||||
print("5. 计算 X3 (评论复杂度)") |
|
||||
regression_data['X3'] = comment_metrics.apply(lambda x: sum(x[1]) / len(x[1]) if x[1] else 0) |
|
||||
|
|
||||
# X4: 评论可读性 |
|
||||
print("6. 计算 X4 (评论可读性)") |
|
||||
regression_data['X4'] = regression_data.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1) |
|
||||
|
|
||||
# X5: 内容情感性平均值 |
|
||||
print("7. 计算 X5 (内容情感性)") |
|
||||
regression_data['X5'] = comment_metrics.apply(lambda x: sum(x[2]) / len(x[2]) if x[2] else 0) |
|
||||
|
|
||||
# X6: 信息丰富度平均值 |
|
||||
print("8. 计算 X6 (信息丰富度)") |
|
||||
regression_data['X6'] = comment_metrics.apply(lambda x: sum(x[3]) / len(x[3]) if x[3] else 0) |
|
||||
|
|
||||
# 数据清洗 |
|
||||
print("\n9. 数据清洗...") |
|
||||
for col in regression_data.columns: |
|
||||
regression_data[col] = pd.to_numeric(regression_data[col], errors='coerce').fillna(0) |
|
||||
|
|
||||
# 验证数据 |
|
||||
print("\n10. 验证数据...") |
|
||||
print(f"行数: {len(regression_data)}") |
|
||||
print(f"列数: {len(regression_data.columns)}") |
|
||||
print(f"列名: {list(regression_data.columns)}") |
|
||||
print(f"数据类型:") |
|
||||
print(regression_data.dtypes) |
|
||||
print(f"\n前5行数据:") |
|
||||
print(regression_data.head()) |
|
||||
|
|
||||
# 保存文件 |
|
||||
print("\n11. 保存文件...") |
|
||||
regression_data.to_excel(output_file, index=False) |
|
||||
|
|
||||
# 验证文件 |
|
||||
print("\n12. 验证文件...") |
|
||||
if os.path.exists(output_file): |
|
||||
print(f"文件已成功保存: {output_file}") |
|
||||
print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") |
|
||||
# 重新读取检查 |
|
||||
df_check = pd.read_excel(output_file) |
|
||||
print(f"输出文件行数: {len(df_check)}") |
|
||||
print(f"输出文件列数: {len(df_check.columns)}") |
|
||||
else: |
|
||||
print("文件保存失败!") |
|
||||
|
|
||||
print() |
|
||||
print("========================================") |
|
||||
print(" 任务完成") |
|
||||
print("========================================") |
|
||||
|
|
||||
except Exception as e: |
|
||||
print(f"处理文件时出错: {str(e)}") |
|
||||
import traceback |
|
||||
traceback.print_exc() |
|
||||
@ -1,83 +0,0 @@ |
|||||
import os |
|
||||
import pandas as pd |
|
||||
import re |
|
||||
|
|
||||
print("开始处理...") |
|
||||
|
|
||||
# 文件路径 |
|
||||
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx' |
|
||||
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx' |
|
||||
|
|
||||
# 读取数据 |
|
||||
print("读取数据...") |
|
||||
df = pd.read_excel(input_file) |
|
||||
print(f"读取完成: {len(df)} 行") |
|
||||
|
|
||||
# 识别列 |
|
||||
helpfull_col = [c for c in df.columns if 'helpfull' in str(c).lower()][0] if any('helpfull' in str(c).lower() for c in df.columns) else None |
|
||||
comment_count_col = [c for c in df.columns if '评论总数' in str(c)][0] if any('评论总数' in str(c) for c in df.columns) else None |
|
||||
comment_cols = [c for c in df.columns if '评论' in str(c) and any(str(i) in str(c) for i in range(1, 6)) and '内容' in str(c)] |
|
||||
|
|
||||
print(f"找到列: Y={helpfull_col}, X1={comment_count_col}, 评论列={len(comment_cols)}") |
|
||||
|
|
||||
# 添加Y和X1 |
|
||||
df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0) if helpfull_col else 0 |
|
||||
df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0) if comment_count_col else 0 |
|
||||
|
|
||||
# 计算评论指标 |
|
||||
print("计算评论指标...") |
|
||||
|
|
||||
def calc_metrics(content):
    """Compute the regression metrics of a single comment.

    Args:
        content: Raw cell value of one comment column (text, NaN, None, ...).

    Returns:
        A tuple ``(length, complexity, sentiment, richness)``: character count
        without half-/full-width spaces, whitespace-separated token count,
        keyword sentiment (1 / 0 / -1) and a 0..3 richness score (digit, link,
        emoji). A missing or empty comment yields ``(0, 0, 0, 0)``.
    """
    if pd.isna(content) or str(content) in ['None', 'nan', '']:
        return 0, 0, 0, 0

    content = str(content)
    length = len(content.replace(' ', '').replace('\u3000', ''))
    complexity = len(content.split())

    pos_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent']
    neg_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor']

    lowered = content.lower()
    has_negative = any(w in lowered for w in neg_words)

    # "不好" contains "好" and "不满意" contains "满意": blank the negative
    # phrases out before searching for positive keywords so that they are not
    # scored as praise. Genuinely mixed comments still count as positive.
    masked = lowered
    for w in neg_words:
        masked = masked.replace(w, ' ')

    if any(w in masked for w in pos_words):
        sentiment = 1
    elif has_negative:
        sentiment = -1
    else:
        sentiment = 0

    # One point each for containing a digit, a link and an emoji
    richness = 0
    if re.search(r'\d', content):
        richness += 1
    if re.search(r'http[s]?://|www\.', content):
        richness += 1
    if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]', content):
        richness += 1

    return length, complexity, sentiment, richness
|
||||
|
|
||||
# 批量计算 |
|
||||
x2_list, x3_list, x5_list, x6_list = [], [], [], [] |
|
||||
|
|
||||
for i in range(len(df)): |
|
||||
if i % 5000 == 0: |
|
||||
print(f"处理 {i}/{len(df)}") |
|
||||
|
|
||||
lengths, complexities, sentiments, richness = [], [], [], [] |
|
||||
|
|
||||
for col in comment_cols: |
|
||||
l, c, s, r = calc_metrics(df.iloc[i].get(col, '')) |
|
||||
if l > 0: |
|
||||
lengths.append(l) |
|
||||
complexities.append(c) |
|
||||
sentiments.append(s) |
|
||||
richness.append(r) |
|
||||
|
|
||||
x2_list.append(sum(lengths)/len(lengths) if lengths else 0) |
|
||||
x3_list.append(sum(complexities)/len(complexities) if complexities else 0) |
|
||||
x5_list.append(sum(sentiments)/len(sentiments) if sentiments else 0) |
|
||||
x6_list.append(sum(richness)/len(richness) if richness else 0) |
|
||||
|
|
||||
df['X2'] = x2_list |
|
||||
df['X3'] = x3_list |
|
||||
df['X5'] = x5_list |
|
||||
df['X6'] = x6_list |
|
||||
|
|
||||
# 计算X4 |
|
||||
df['X4'] = df.apply(lambda r: r['X2']/r['X3'] if r['X3']>0 else 0, axis=1) |
|
||||
|
|
||||
# 清洗数据 |
|
||||
for col in ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']: |
|
||||
df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).replace([float('inf'), float('-inf')], 0) |
|
||||
|
|
||||
print("保存文件...") |
|
||||
df.to_excel(output_file, index=False, engine='openpyxl') |
|
||||
|
|
||||
print(f"完成!文件大小: {os.path.getsize(output_file)/1024:.2f} KB") |
|
||||
print(f"行数: {len(df)}, 列数: {len(df.columns)}") |
|
||||
@ -1,54 +0,0 @@ |
|||||
import os |
|
||||
import openpyxl |
|
||||
|
|
||||
# Workbook to inspect
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'

print("========================================")
print(" 读取Excel测试")
print("========================================")
print(f"输入文件: {input_file}")
print()

# Abort early when the workbook is missing
if not os.path.exists(input_file):
    print("错误: 输入文件不存在!")
    exit(1)

print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")

# Open the workbook and dump its shape, header row and first data rows
try:
    print("正在读取Excel文件...")
    wb = openpyxl.load_workbook(input_file)
    ws = wb.active

    print(f"工作表名称: {ws.title}")
    print(f"最大行数: {ws.max_row}")
    print(f"最大列数: {ws.max_column}")

    # Header row: one numbered line per column
    print("\n表头:")
    headers = [ws.cell(row=1, column=idx).value for idx in range(1, ws.max_column + 1)]
    for position, title in enumerate(headers, start=1):
        print(f"{position}. {title}")

    # First three data rows (sheet rows 2-4), at most the first nine columns
    print("\n前3行数据:")
    last_row = min(5, ws.max_row + 1)
    last_col = min(10, ws.max_column + 1)
    for row in range(2, last_row):
        row_data = [ws.cell(row=row, column=idx).value for idx in range(1, last_col)]
        print(f"行 {row}: {row_data}")

    print("\n========================================")
    print(" 读取完成")
    print("========================================")

except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()
|
||||
@ -1,216 +0,0 @@ |
|||||
import os |
|
||||
import pandas as pd |
|
||||
import re |
|
||||
import sys |
|
||||
|
|
||||
# 重定向输出到文件和屏幕 |
|
||||
class Tee:
    """Write-only file-like object that mirrors output to several streams.

    Used to send everything printed to ``sys.stdout`` to both the console and
    a log file.
    """

    def __init__(self, *files):
        """Store the target streams; each must provide ``write`` and ``flush``."""
        self.files = files

    def write(self, obj):
        """Write ``obj`` to every target and flush it immediately.

        Returns:
            The number of characters written, as a text stream's ``write`` is
            expected to report (the previous version returned ``None``).
        """
        for f in self.files:
            f.write(obj)
            # Flush right away so the log stays current if the script dies
            f.flush()
        return len(obj)

    def flush(self):
        """Flush every target stream."""
        for f in self.files:
            f.flush()
|
||||
|
|
||||
log_file = open(r'D:\java\project\process_log.txt', 'w', encoding='utf-8') |
|
||||
original_stdout = sys.stdout |
|
||||
sys.stdout = Tee(original_stdout, log_file) |
|
||||
|
|
||||
print("========================================") |
|
||||
print(" 在原表中添加回归数据列") |
|
||||
print("========================================") |
|
||||
|
|
||||
# 文件路径 |
|
||||
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx' |
|
||||
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx' |
|
||||
|
|
||||
print(f"输入文件: {input_file}") |
|
||||
print(f"输出文件: {output_file}") |
|
||||
print() |
|
||||
|
|
||||
# 检查文件是否存在 |
|
||||
if not os.path.exists(input_file): |
|
||||
print("错误: 输入文件不存在!") |
|
||||
sys.stdout = original_stdout |
|
||||
log_file.close() |
|
||||
exit(1) |
|
||||
|
|
||||
print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") |
|
||||
|
|
||||
# 读取原始数据 |
|
||||
try: |
|
||||
print("\n正在读取原始数据...") |
|
||||
df = pd.read_excel(input_file) |
|
||||
print(f"成功读取 {len(df)} 行数据") |
|
||||
print(f"原始列数: {len(df.columns)}") |
|
||||
|
|
||||
# 识别列 |
|
||||
print("\n识别列...") |
|
||||
helpfull_col = None |
|
||||
comment_count_col = None |
|
||||
comment_cols = [] |
|
||||
|
|
||||
for col in df.columns: |
|
||||
col_str = str(col).lower() |
|
||||
if 'helpfull' in col_str or 'helpful' in col_str: |
|
||||
helpfull_col = col |
|
||||
print(f"找到 Y 列 (helpfull): {col}") |
|
||||
elif '评论总数' in str(col) or '帖子评论总数' in str(col): |
|
||||
comment_count_col = col |
|
||||
print(f"找到 X1 列 (评论总数): {col}") |
|
||||
elif '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6)) and '内容' in str(col): |
|
||||
comment_cols.append(col) |
|
||||
print(f"找到评论列 {len(comment_cols)}: {col}") |
|
||||
|
|
||||
print(f"\n共找到 {len(comment_cols)} 个评论内容列") |
|
||||
|
|
||||
# 添加回归数据列 |
|
||||
print("\n添加回归数据列...") |
|
||||
|
|
||||
# Y (UGC有用性) - 直接复制helpfull列 |
|
||||
print("1. 添加 Y (UGC有用性)") |
|
||||
if helpfull_col: |
|
||||
df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0) |
|
||||
else: |
|
||||
df['Y'] = 0 |
|
||||
|
|
||||
# X1 (评论数量) - 直接复制帖子评论总数列 |
|
||||
print("2. 添加 X1 (评论数量)") |
|
||||
if comment_count_col: |
|
||||
df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0) |
|
||||
else: |
|
||||
df['X1'] = 0 |
|
||||
|
|
||||
# 定义函数计算评论指标 |
|
||||
def calculate_comment_metrics(content):
    """Compute the regression metrics of a single comment.

    Args:
        content: Raw cell value of one comment column (text, NaN, None, ...).

    Returns:
        A tuple ``(length, complexity, sentiment, richness)``:
          * length     - number of characters after removing half-width and
                         full-width spaces (X2).
          * complexity - number of whitespace-separated tokens (X3).
          * sentiment  - 1 positive, 0 neutral, -1 negative (X5).
          * richness   - one point each for containing a digit, a link and an
                         emoji/emoticon, 0..3 (X6).
        A missing or empty comment yields ``(0, 0, 0, 0)``.
    """
    if pd.isna(content) or str(content) in ['None', 'nan', '']:
        return 0, 0, 0, 0

    content = str(content)

    # X2: comment length (characters left once spaces are stripped)
    length = len(content.replace(' ', '').replace('\u3000', ''))

    # X3: comment complexity (token count when splitting on whitespace)
    complexity = len(content.split())

    # X5: keyword-based sentiment (positive=1, neutral=0, negative=-1)
    positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like']
    negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike']

    lower_content = content.lower()
    has_negative = any(word in lower_content for word in negative_words)

    # Several negative phrases contain a positive keyword ("不好" contains "好",
    # "dislike" contains "like", "不满意" contains "满意"). Blank the negative
    # phrases out before looking for positive keywords so they are no longer
    # misread as praise. A comment holding both a genuine positive and a
    # negative keyword is still scored positive, as before.
    masked_content = lower_content
    for word in negative_words:
        masked_content = masked_content.replace(word, ' ')

    if any(word in masked_content for word in positive_words):
        sentiment = 1
    elif has_negative:
        sentiment = -1
    else:
        sentiment = 0

    # X6: information richness (digit / link / emoji, one point each)
    richness = 0
    if re.search(r'\d', content):  # contains a digit
        richness += 1
    if re.search(r'http[s]?://|www\.', content):  # contains a link
        richness += 1
    # Symbol and emoji code-point ranges, a UTF-16 surrogate pair, or an
    # ASCII emoticon such as ":)" / ";-D"
    if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content):
        richness += 1

    return length, complexity, sentiment, richness
|
||||
|
|
||||
# 计算评论相关指标 |
|
||||
print("3. 计算评论相关指标...") |
|
||||
|
|
||||
# Average the per-comment metrics over every comment column of each row.
# The comment columns are extracted once as plain lists and the results are
# written back as whole columns. The previous version read rows positionally
# (`df.iloc[i]`) but wrote them by label (`df.loc[i, ...]`), which only agree
# for a default RangeIndex, and it rebuilt a full row Series for every cell.
total_rows = len(df)
print(f"总数据行数: {total_rows}")

comment_values = [df[col].tolist() for col in comment_cols]
x2_values, x3_values, x5_values, x6_values = [], [], [], []

for i in range(total_rows):
    if i % 1000 == 0:
        print(f" 处理第 {i}/{total_rows} 行...")

    # Only comments with actual content (length > 0) take part in the averages
    row_metrics = [
        metrics
        for metrics in (calculate_comment_metrics(values[i]) for values in comment_values)
        if metrics[0] > 0
    ]

    if row_metrics:
        count = len(row_metrics)
        x2_values.append(sum(m[0] for m in row_metrics) / count)
        x3_values.append(sum(m[1] for m in row_metrics) / count)
        x5_values.append(sum(m[2] for m in row_metrics) / count)
        x6_values.append(sum(m[3] for m in row_metrics) / count)
    else:
        # No usable comment on this row: every metric is recorded as 0
        x2_values.append(0.0)
        x3_values.append(0.0)
        x5_values.append(0.0)
        x6_values.append(0.0)

df['X2'] = x2_values  # comment length
df['X3'] = x3_values  # comment complexity
df['X5'] = x5_values  # sentiment
df['X6'] = x6_values  # information richness
|
||||
|
|
||||
# X4: 评论可读性 = X2/X3(X3为0时记0,避免报错) |
|
||||
print("4. 计算 X4 (评论可读性)") |
|
||||
df['X4'] = df.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1) |
|
||||
|
|
||||
# 数据清洗 - 确保所有值都是纯数字,无文本、无空值、无错误 |
|
||||
print("\n5. 数据清洗...") |
|
||||
regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'] |
|
||||
for col in regression_cols: |
|
||||
# 转换为数字,错误值转为0 |
|
||||
df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0) |
|
||||
# 替换无穷大 |
|
||||
df[col] = df[col].replace([float('inf'), float('-inf')], 0) |
|
||||
|
|
||||
# 验证数据 |
|
||||
print("\n6. 验证数据...") |
|
||||
print(f"总行数: {len(df)}") |
|
||||
print(f"总列数: {len(df.columns)}") |
|
||||
print(f"\n回归数据列统计:") |
|
||||
print(df[regression_cols].describe()) |
|
||||
print(f"\n前5行回归数据:") |
|
||||
print(df[regression_cols].head()) |
|
||||
|
|
||||
# 检查是否有空值或错误值 |
|
||||
print(f"\n空值检查:") |
|
||||
for col in regression_cols: |
|
||||
null_count = df[col].isnull().sum() |
|
||||
print(f" {col}: {null_count} 个空值") |
|
||||
|
|
||||
# 保存文件 |
|
||||
print("\n7. 保存文件...") |
|
||||
print(f"正在保存到: {output_file}") |
|
||||
df.to_excel(output_file, index=False, engine='openpyxl') |
|
||||
|
|
||||
# 验证文件 |
|
||||
print("\n8. 验证文件...") |
|
||||
if os.path.exists(output_file): |
|
||||
print(f"文件已成功保存: {output_file}") |
|
||||
print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") |
|
||||
# 重新读取检查 |
|
||||
df_check = pd.read_excel(output_file) |
|
||||
print(f"输出文件行数: {len(df_check)}") |
|
||||
print(f"输出文件列数: {len(df_check.columns)}") |
|
||||
print(f"\n回归数据列: {[col for col in df_check.columns if col in regression_cols]}") |
|
||||
else: |
|
||||
print("文件保存失败!") |
|
||||
|
|
||||
print() |
|
||||
print("========================================") |
|
||||
print(" 任务完成") |
|
||||
print("========================================") |
|
||||
print(f"新文件已保存: {output_file}") |
|
||||
print(f"包含原始数据的所有列以及新增的Y, X1-X6回归数据列") |
|
||||
|
|
||||
except Exception as e: |
|
||||
print(f"处理文件时出错: {str(e)}") |
|
||||
import traceback |
|
||||
traceback.print_exc() |
|
||||
finally: |
|
||||
sys.stdout = original_stdout |
|
||||
log_file.close() |
|
||||
print("日志已保存到: D:\\java\\project\\process_log.txt") |
|
||||
@ -1,187 +0,0 @@ |
|||||
import os |
|
||||
import pandas as pd |
|
||||
import re |
|
||||
|
|
||||
print("=" * 60) |
|
||||
print(" 在原表中添加回归数据列") |
|
||||
print("=" * 60) |
|
||||
|
|
||||
# 文件路径 |
|
||||
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx' |
|
||||
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx' |
|
||||
|
|
||||
print(f"输入文件: {input_file}") |
|
||||
print(f"输出文件: {output_file}") |
|
||||
print() |
|
||||
|
|
||||
# 检查文件是否存在 |
|
||||
if not os.path.exists(input_file): |
|
||||
print("错误: 输入文件不存在!") |
|
||||
exit(1) |
|
||||
|
|
||||
print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") |
|
||||
|
|
||||
# 读取原始数据 |
|
||||
print("\n正在读取原始数据...") |
|
||||
df = pd.read_excel(input_file) |
|
||||
print(f"成功读取 {len(df)} 行数据") |
|
||||
print(f"原始列数: {len(df.columns)}") |
|
||||
|
|
||||
# 识别列 |
|
||||
print("\n识别列...") |
|
||||
helpfull_col = None |
|
||||
comment_count_col = None |
|
||||
comment_cols = [] |
|
||||
|
|
||||
for col in df.columns: |
|
||||
col_str = str(col).lower() |
|
||||
if 'helpfull' in col_str or 'helpful' in col_str: |
|
||||
helpfull_col = col |
|
||||
print(f"找到 Y 列 (helpfull): {col}") |
|
||||
elif '评论总数' in str(col) or '帖子评论总数' in str(col): |
|
||||
comment_count_col = col |
|
||||
print(f"找到 X1 列 (评论总数): {col}") |
|
||||
elif '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6)) and '内容' in str(col): |
|
||||
comment_cols.append(col) |
|
||||
print(f"找到评论列 {len(comment_cols)}: {col}") |
|
||||
|
|
||||
print(f"\n共找到 {len(comment_cols)} 个评论内容列") |
|
||||
|
|
||||
# 添加回归数据列 |
|
||||
print("\n添加回归数据列...") |
|
||||
|
|
||||
# Y (UGC有用性) - 直接复制helpfull列 |
|
||||
print("1. 添加 Y (UGC有用性)") |
|
||||
if helpfull_col: |
|
||||
df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0) |
|
||||
else: |
|
||||
df['Y'] = 0 |
|
||||
|
|
||||
# X1 (评论数量) - 直接复制帖子评论总数列 |
|
||||
print("2. 添加 X1 (评论数量)") |
|
||||
if comment_count_col: |
|
||||
df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0) |
|
||||
else: |
|
||||
df['X1'] = 0 |
|
||||
|
|
||||
# 定义函数计算评论指标 |
|
||||
def calculate_comment_metrics(content):
    """Compute the regression metrics of a single comment.

    Args:
        content: Raw cell value of one comment column (text, NaN, None, ...).

    Returns:
        A tuple ``(length, complexity, sentiment, richness)``:
          * length     - number of characters after removing half-width and
                         full-width spaces (X2).
          * complexity - number of whitespace-separated tokens (X3).
          * sentiment  - 1 positive, 0 neutral, -1 negative (X5).
          * richness   - one point each for containing a digit, a link and an
                         emoji/emoticon, 0..3 (X6).
        A missing or empty comment yields ``(0, 0, 0, 0)``.
    """
    if pd.isna(content) or str(content) in ['None', 'nan', '']:
        return 0, 0, 0, 0

    content = str(content)

    # X2: comment length (characters left once spaces are stripped)
    length = len(content.replace(' ', '').replace('\u3000', ''))

    # X3: comment complexity (token count when splitting on whitespace)
    complexity = len(content.split())

    # X5: keyword-based sentiment (positive=1, neutral=0, negative=-1)
    positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like']
    negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike']

    lower_content = content.lower()
    has_negative = any(word in lower_content for word in negative_words)

    # Several negative phrases contain a positive keyword ("不好" contains "好",
    # "dislike" contains "like", "不满意" contains "满意"). Blank the negative
    # phrases out before looking for positive keywords so they are no longer
    # misread as praise. A comment holding both a genuine positive and a
    # negative keyword is still scored positive, as before.
    masked_content = lower_content
    for word in negative_words:
        masked_content = masked_content.replace(word, ' ')

    if any(word in masked_content for word in positive_words):
        sentiment = 1
    elif has_negative:
        sentiment = -1
    else:
        sentiment = 0

    # X6: information richness (digit / link / emoji, one point each)
    richness = 0
    if re.search(r'\d', content):  # contains a digit
        richness += 1
    if re.search(r'http[s]?://|www\.', content):  # contains a link
        richness += 1
    # Symbol and emoji code-point ranges, a UTF-16 surrogate pair, or an
    # ASCII emoticon such as ":)" / ";-D"
    if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content):
        richness += 1

    return length, complexity, sentiment, richness
|
||||
|
|
||||
# 计算评论相关指标 |
|
||||
print("3. 计算评论相关指标...") |
|
||||
|
|
||||
# Average the per-comment metrics over every comment column of each row.
# The comment columns are extracted once as plain lists and the results are
# written back as whole columns. The previous version read rows positionally
# (`df.iloc[i]`) but wrote them by label (`df.loc[i, ...]`), which only agree
# for a default RangeIndex, and it rebuilt a full row Series for every cell.
total_rows = len(df)
print(f"总数据行数: {total_rows}")

comment_values = [df[col].tolist() for col in comment_cols]
x2_values, x3_values, x5_values, x6_values = [], [], [], []

for i in range(total_rows):
    if i % 1000 == 0:
        print(f" 处理第 {i}/{total_rows} 行...")

    # Only comments with actual content (length > 0) take part in the averages
    row_metrics = [
        metrics
        for metrics in (calculate_comment_metrics(values[i]) for values in comment_values)
        if metrics[0] > 0
    ]

    if row_metrics:
        count = len(row_metrics)
        x2_values.append(sum(m[0] for m in row_metrics) / count)
        x3_values.append(sum(m[1] for m in row_metrics) / count)
        x5_values.append(sum(m[2] for m in row_metrics) / count)
        x6_values.append(sum(m[3] for m in row_metrics) / count)
    else:
        # No usable comment on this row: every metric is recorded as 0
        x2_values.append(0.0)
        x3_values.append(0.0)
        x5_values.append(0.0)
        x6_values.append(0.0)

df['X2'] = x2_values  # comment length
df['X3'] = x3_values  # comment complexity
df['X5'] = x5_values  # sentiment
df['X6'] = x6_values  # information richness
|
||||
|
|
||||
# X4: 评论可读性 = X2/X3(X3为0时记0,避免报错) |
|
||||
print("4. 计算 X4 (评论可读性)") |
|
||||
df['X4'] = df.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1) |
|
||||
|
|
||||
# 数据清洗 - 确保所有值都是纯数字,无文本、无空值、无错误 |
|
||||
print("\n5. 数据清洗...") |
|
||||
regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'] |
|
||||
for col in regression_cols: |
|
||||
# 转换为数字,错误值转为0 |
|
||||
df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0) |
|
||||
# 替换无穷大 |
|
||||
df[col] = df[col].replace([float('inf'), float('-inf')], 0) |
|
||||
|
|
||||
# 验证数据 |
|
||||
print("\n6. 验证数据...") |
|
||||
print(f"总行数: {len(df)}") |
|
||||
print(f"总列数: {len(df.columns)}") |
|
||||
print(f"\n回归数据列统计:") |
|
||||
print(df[regression_cols].describe()) |
|
||||
print(f"\n前5行回归数据:") |
|
||||
print(df[regression_cols].head()) |
|
||||
|
|
||||
# 检查是否有空值或错误值 |
|
||||
print(f"\n空值检查:") |
|
||||
for col in regression_cols: |
|
||||
null_count = df[col].isnull().sum() |
|
||||
print(f" {col}: {null_count} 个空值") |
|
||||
|
|
||||
# 保存文件 |
|
||||
print("\n7. 保存文件...") |
|
||||
print(f"正在保存到: {output_file}") |
|
||||
df.to_excel(output_file, index=False, engine='openpyxl') |
|
||||
|
|
||||
# 验证文件 |
|
||||
print("\n8. 验证文件...") |
|
||||
if os.path.exists(output_file): |
|
||||
print(f"文件已成功保存: {output_file}") |
|
||||
print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") |
|
||||
# 重新读取检查 |
|
||||
df_check = pd.read_excel(output_file) |
|
||||
print(f"输出文件行数: {len(df_check)}") |
|
||||
print(f"输出文件列数: {len(df_check.columns)}") |
|
||||
print(f"\n回归数据列: {[col for col in df_check.columns if col in regression_cols]}") |
|
||||
else: |
|
||||
print("文件保存失败!") |
|
||||
|
|
||||
print() |
|
||||
print("=" * 60) |
|
||||
print(" 任务完成") |
|
||||
print("=" * 60) |
|
||||
print(f"新文件已保存: {output_file}") |
|
||||
print(f"包含原始数据的所有列以及新增的Y, X1-X6回归数据列") |
|
||||
@ -1,100 +0,0 @@ |
|||||
import os |
|
||||
import openpyxl |
|
||||
import re |
|
||||
|
|
||||
# Script: copy Y (helpfulness) and X1 (comment count) from the raw post
# workbook into the existing regression workbook; X2-X6 are zero-filled
# placeholders. Only the first rows are processed (test run).

# File locations: raw input workbook and the regression workbook to fill in.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'

print("========================================")
print(" 简单计算UGC回归数据")
print("========================================")
print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()

# Both workbooks must already exist; abort with a non-zero status otherwise.
# SystemExit is raised directly because the `exit` helper is injected by the
# `site` module and is not guaranteed to exist in every interpreter setup.
if not os.path.exists(input_file):
    print("错误: 输入文件不存在!")
    raise SystemExit(1)

if not os.path.exists(output_file):
    print("错误: 输出文件不存在!")
    raise SystemExit(1)

print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")

wb_input = None
wb_output = None
try:
    # Load the input workbook and report its dimensions.
    print("正在读取输入文件...")
    wb_input = openpyxl.load_workbook(input_file)
    ws_input = wb_input.active

    print(f"输入工作表名称: {ws_input.title}")
    print(f"输入文件最大行数: {ws_input.max_row}")
    print(f"输入文件最大列数: {ws_input.max_column}")

    # Load the output workbook that will receive the regression columns.
    print("\n正在读取输出文件...")
    wb_output = openpyxl.load_workbook(output_file)
    ws_output = wb_output.active

    print(f"输出工作表名称: {ws_output.title}")

    # Locate the source columns by scanning the header row. The indexes start
    # as None so that "not found" is an explicit state rather than a missing
    # variable that has to be probed through locals().
    print("\n识别列...")
    headers = []
    helpfull_col = None
    comment_count_col = None
    for col in range(1, ws_input.max_column + 1):
        header = ws_input.cell(row=1, column=col).value
        headers.append(header)
        if header and 'helpfull' in str(header):
            helpfull_col = col
            print(f"找到 helpfull 列: {col}")
        elif header and ('评论总数' in str(header) or '帖子评论总数' in str(header)):
            comment_count_col = col
            print(f"找到评论总数列: {col}")
        elif header and '评论' in str(header):
            print(f"找到评论列: {col} - {header}")

    # Compute and fill the regression columns.
    print("\n计算并填充数据...")
    max_rows = min(ws_input.max_row, 10)  # test run: only the first 10 sheet rows
    print(f"处理前 {max_rows - 1} 行数据")

    for row in range(2, max_rows + 1):
        print(f"处理行 {row}")

        # Y (UGC helpfulness): empty/falsy cells are stored as 0.
        if helpfull_col is not None:
            y_value = ws_input.cell(row=row, column=helpfull_col).value
            ws_output.cell(row=row, column=1, value=y_value if y_value else 0)
        else:
            ws_output.cell(row=row, column=1, value=0)

        # X1 (comment count): empty/falsy cells are stored as 0.
        if comment_count_col is not None:
            x1_value = ws_input.cell(row=row, column=comment_count_col).value
            ws_output.cell(row=row, column=2, value=x1_value if x1_value else 0)
        else:
            ws_output.cell(row=row, column=2, value=0)

        # X2-X6 are placeholders for now.
        for col in range(3, 8):
            ws_output.cell(row=row, column=col, value=0)

    # Persist the result in place.
    print("\n保存文件...")
    wb_output.save(output_file)

    print(f"文件已成功保存: {output_file}")
    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")

    print()
    print("========================================")
    print(" 任务完成")
    print("========================================")

except Exception as e:
    # Best-effort script: report the failure with a full traceback.
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()
finally:
    # Release the workbook handles whether or not processing succeeded.
    for _wb in (wb_input, wb_output):
        if _wb is not None:
            _wb.close()
|
||||
@ -1,41 +0,0 @@ |
|||||
import os |
|
||||
import shutil |
|
||||
|
|
||||
# Script: copy the raw post workbook to a new working file.

# Source workbook and the destination of the copy.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'

print("========================================")
print(" 简单文件复制脚本")
print("========================================")
print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()

# Abort with a non-zero status when the source is missing. SystemExit is
# raised directly because the `exit` helper comes from the `site` module and
# is not guaranteed to exist in every interpreter setup.
if not os.path.exists(input_file):
    print("错误: 输入文件不存在!")
    raise SystemExit(1)

print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
print(f"文件存在: {os.path.exists(input_file)}")

try:
    # copy2 copies the file contents and attempts to preserve metadata.
    print("正在复制文件...")
    shutil.copy2(input_file, output_file)

    # Confirm that the destination now exists and report its size.
    if os.path.exists(output_file):
        print(f"文件已成功复制到: {output_file}")
        print(f"复制文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
    else:
        print("错误: 文件复制失败,未找到输出文件")

    print()
    print("========================================")
    print(" 任务完成")
    print("========================================")

except Exception as e:
    # Best-effort script: report the failure and finish normally.
    print(f"处理文件时出错: {str(e)}")
|
||||
@ -1,54 +0,0 @@ |
|||||
import os |
|
||||
import pandas as pd |
|
||||
|
|
||||
# Script: smoke-test pandas Excel I/O by writing the first 100 rows of the
# working data to a scratch workbook and reading it back.

# File locations.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'

print("========================================")
print(" 简单数据测试")
print("========================================")
print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()

# Abort with a non-zero status when the input is missing. SystemExit is
# raised directly because the `exit` helper comes from the `site` module and
# is not guaranteed to exist in every interpreter setup.
if not os.path.exists(input_file):
    print("错误: 输入文件不存在!")
    raise SystemExit(1)

print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")

try:
    # Read the raw data and report its shape.
    print("正在读取原始数据...")
    df = pd.read_excel(input_file)
    print(f"成功读取 {len(df)} 行数据")
    print(f"列名: {list(df.columns)}")

    # Write a small sample (first 100 rows, all columns) to a scratch file.
    print("\n创建测试文件...")
    test_data = df.head(100)
    test_output = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\test_output.xlsx'
    test_data.to_excel(test_output, index=False)

    # Check existence before touching the file: querying its size first would
    # raise on a missing file and make the failure branch unreachable.
    if os.path.exists(test_output):
        print(f"测试文件已创建: {test_output}")
        print(f"测试文件大小: {os.path.getsize(test_output) / 1024:.2f} KB")

        # Read the scratch file back to verify it round-trips.
        df_test = pd.read_excel(test_output)
        print(f"测试文件行数: {len(df_test)}")
        print(f"测试文件列数: {len(df_test.columns)}")
    else:
        print("测试文件创建失败!")

    print()
    print("========================================")
    print(" 测试完成")
    print("========================================")

except Exception as e:
    # Best-effort script: report the failure with a full traceback.
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()
|
||||
@ -1,57 +0,0 @@ |
|||||
import os |
|
||||
import openpyxl |
|
||||
|
|
||||
# Script: create an empty regression workbook whose first row holds the
# column names Y, X1..X6.

# Destination workbook.
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'

print("========================================")
print(" 创建UGC回归数据文件")
print("========================================")
print(f"输出文件: {output_file}")
print()

# Make sure the destination directory exists.
output_dir = os.path.dirname(output_file)
print(f"输出目录: {output_dir}")
print(f"目录存在: {os.path.exists(output_dir)}")

# An empty dirname means "current directory"; os.makedirs('') would fail, so
# only attempt creation for a non-empty, missing directory.
if output_dir and not os.path.exists(output_dir):
    print("正在创建输出目录...")
    try:
        os.makedirs(output_dir)
        print("目录创建成功")
    except Exception as e:
        print(f"创建目录失败: {e}")
        # Raised directly: the `exit` helper comes from the `site` module and
        # is not guaranteed to exist in every interpreter setup.
        raise SystemExit(1)

try:
    # Build a fresh workbook with a single header row.
    print("\n创建新的Excel文件...")
    wb = openpyxl.Workbook()
    ws = wb.active

    # Header row: dependent variable followed by the six regressors.
    headers = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
    for i, header in enumerate(headers, 1):
        ws.cell(row=1, column=i, value=header)

    # Save to disk.
    print(f"保存文件到: {output_file}")
    wb.save(output_file)

    # Confirm that the file now exists and report its size.
    if os.path.exists(output_file):
        print(f"文件已成功创建: {output_file}")
        print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
    else:
        print("错误: 文件创建失败")

    print()
    print("========================================")
    print(" 任务完成")
    print("========================================")

except Exception as e:
    # Best-effort script: report the failure with a full traceback.
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()
|
||||
@ -1,22 +0,0 @@ |
|||||
import os |
|
||||
|
|
||||
# Script: minimal check that the raw input workbook is reachable.

# Workbook whose presence is being verified.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'

print("========================================")
print(" 简单测试")
print("========================================")
print(f"输入文件: {input_file}")
print()

# Report a missing file first; otherwise confirm presence and show the size.
if not os.path.exists(input_file):
    print("文件不存在!")
else:
    print("文件存在!")
    size_kb = os.path.getsize(input_file) / 1024
    print(f"文件大小: {size_kb:.2f} KB")

print()
print("========================================")
print(" 测试完成")
print("========================================")
|
||||
@ -1,49 +0,0 @@ |
|||||
import os |
|
||||
|
|
||||
# Script: diagnostics for file access - reports the working directory, whether
# the input/output workbooks exist, and the first entries of their directory.

# Paths probed by this script.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'

print("========================================")
print(" 测试文件访问")
print("========================================")
print(f"当前目录: {os.getcwd()}")
print()

# Input workbook: path, existence, then size or a "missing" notice.
print("检查输入文件:")
print(f"路径: {input_file}")
print(f"存在: {os.path.exists(input_file)}")
if not os.path.exists(input_file):
    print("文件不存在!")
else:
    input_kb = os.path.getsize(input_file) / 1024
    print(f"大小: {input_kb:.2f} KB")

# Output workbook: same report.
print("\n检查输出文件:")
print(f"路径: {output_file}")
print(f"存在: {os.path.exists(output_file)}")
if not os.path.exists(output_file):
    print("文件不存在!")
else:
    output_kb = os.path.getsize(output_file) / 1024
    print(f"大小: {output_kb:.2f} KB")

# Containing directory: list at most the first 10 entries with their sizes.
print("\n检查目录:")
dir_path = os.path.dirname(input_file)
print(f"目录: {dir_path}")
print(f"存在: {os.path.exists(dir_path)}")
if os.path.exists(dir_path):
    print("目录内容:")
    files = os.listdir(dir_path)
    shown = min(len(files), 10)
    for idx in range(shown):
        entry = files[idx]
        entry_kb = os.path.getsize(os.path.join(dir_path, entry)) / 1024
        print(f" {entry}: {entry_kb:.2f} KB")
    hidden = len(files) - 10
    if hidden > 0:
        print(f" ... 还有 {hidden} 个文件")

print()
print("========================================")
print(" 测试完成")
print("========================================")
|
||||
|
After Width: | Height: | Size: 98 KiB |
Binary file not shown.
@ -1,63 +0,0 @@ |
|||||
/**
 * A minimal bank account holding an immutable account number, a mutable
 * owner name and a running balance. Deposits and withdrawals report their
 * outcome on standard output instead of throwing.
 */
public class BankAccount {

    /** Account identifier; assigned once in the constructor. */
    private final String accountNumber;

    /** Name of the account holder. */
    private String ownerName;

    /** Current balance; starts at zero. */
    private double balance;

    /**
     * Creates an account with a zero balance.
     *
     * @param accountNumber the account identifier
     * @param ownerName the name of the account holder
     */
    public BankAccount(String accountNumber, String ownerName) {
        this.accountNumber = accountNumber;
        this.ownerName = ownerName;
        this.balance = 0.0;
    }

    /** @return the account identifier */
    public String getAccountNumber() {
        return this.accountNumber;
    }

    /** @return the name of the account holder */
    public String getOwnerName() {
        return this.ownerName;
    }

    /** @return the current balance */
    public double getBalance() {
        return this.balance;
    }

    /**
     * Replaces the name of the account holder.
     *
     * @param ownerName the new holder name
     */
    public void setOwnerName(String ownerName) {
        this.ownerName = ownerName;
    }

    /**
     * Adds {@code amount} to the balance when it is strictly positive;
     * otherwise prints a rejection message and leaves the balance unchanged.
     *
     * @param amount the amount to deposit
     */
    public void deposit(double amount) {
        // Negated "> 0" (rather than "<= 0") so NaN is rejected as well.
        if (!(amount > 0)) {
            System.out.println("存款金额必须大于 0");
            return;
        }
        balance = balance + amount;
        System.out.println("存款成功!当前余额:" + balance);
    }

    /**
     * Subtracts {@code amount} from the balance when it is strictly positive
     * and does not exceed the balance; otherwise prints the reason and leaves
     * the balance unchanged.
     *
     * @param amount the amount to withdraw
     */
    public void withdraw(double amount) {
        if (!(amount > 0)) {
            System.out.println("取款金额必须大于 0");
            return;
        }
        // Negated "<=" so a NaN balance is treated as insufficient funds.
        if (!(amount <= balance)) {
            System.out.println("余额不足,无法取款");
            return;
        }
        balance = balance - amount;
        System.out.println("取款成功!当前余额:" + balance);
    }

    /** Prints the account number, owner and balance followed by a blank line. */
    public void displayInfo() {
        System.out.println("账号:" + accountNumber);
        System.out.println("户主:" + ownerName);
        System.out.println("余额:" + balance);
        System.out.println();
    }
}
|
||||
@ -1,154 +0,0 @@ |
|||||
// Car.java

/**
 * A rental car identified by an immutable licence plate, with a brand, a
 * model, a daily rent and a rented/available flag. The class also counts how
 * many cars have been constructed.
 *
 * <p>State-changing operations report their outcome on standard output
 * instead of throwing.
 */
public class Car {

    /** Licence plate; assigned once in the constructor. */
    private final String licensePlate;

    private String brand;

    private String model;

    /** Rent per day; only ever assigned a strictly positive value. */
    private double dailyRent;

    /** {@code true} while the car is rented out. */
    private boolean isRented;

    /** Number of {@code Car} instances constructed so far. */
    private static int totalCars = 0;

    /**
     * Creates an available (not rented) car and increments the car counter.
     *
     * @param licensePlate the licence plate
     * @param brand the brand name
     * @param model the model name
     * @param dailyRent the rent per day; a non-positive value is rejected with
     *     a console message and the rent stays at its default of 0
     */
    public Car(String licensePlate, String brand, String model, double dailyRent) {
        this.licensePlate = licensePlate;
        this.brand = brand;
        this.model = model;
        // Routed through the setter so the positivity check is applied.
        setDailyRent(dailyRent);
        this.isRented = false;
        totalCars++;
    }

    /**
     * Creates an available car with the default daily rent of 300.
     *
     * @param licensePlate the licence plate
     * @param brand the brand name
     * @param model the model name
     */
    public Car(String licensePlate, String brand, String model) {
        this(licensePlate, brand, model, 300.0);
    }

    /** @return the licence plate */
    public String getLicensePlate() {
        return licensePlate;
    }

    /** @return the brand name */
    public String getBrand() {
        return brand;
    }

    /** @param brand the new brand name */
    public void setBrand(String brand) {
        this.brand = brand;
    }

    /** @return the model name */
    public String getModel() {
        return model;
    }

    /** @param model the new model name */
    public void setModel(String model) {
        this.model = model;
    }

    /** @return the rent per day */
    public double getDailyRent() {
        return dailyRent;
    }

    /**
     * Updates the daily rent when the new value is strictly positive;
     * otherwise prints a message and keeps the current rent.
     *
     * @param dailyRent the new rent per day
     */
    public void setDailyRent(double dailyRent) {
        if (dailyRent > 0) {
            this.dailyRent = dailyRent;
        } else {
            System.out.println("日租金必须大于0,设置失败!");
        }
    }

    /** @return {@code true} if the car is currently rented out */
    public boolean isRented() {
        return isRented;
    }

    /** Marks the car as rented, or prints a notice if it already is. */
    public void rentCar() {
        if (isRented) {
            System.out.println("车辆已租出,无法再次租用!");
        } else {
            isRented = true;
            System.out.println("车辆已成功租出!");
        }
    }

    /** Marks the car as available, or prints a notice if it was not rented. */
    public void returnCar() {
        if (!isRented) {
            System.out.println("车辆未被租用,无需归还!");
        } else {
            isRented = false;
            System.out.println("车辆已成功归还!");
        }
    }

    /**
     * Computes the total rent for a rental period.
     *
     * @param days the number of rental days
     * @return the daily rent multiplied by {@code days}
     */
    public double calculateRent(int days) {
        return dailyRent * days;
    }

    /** @return the number of cars constructed so far */
    public static int getTotalCars() {
        return totalCars;
    }

    /** Prints plate, brand, model, daily rent and rental status on one line. */
    public void displayInfo() {
        System.out.println("车牌号:" + licensePlate +
                " | 品牌:" + brand +
                " | 型号:" + model +
                " | 日租金:" + dailyRent +
                " | 状态:" + (isRented ? "已出租" : "可租"));
    }
}
|
||||
Binary file not shown.
@ -1,32 +0,0 @@ |
|||||
public class TestCar { |
|
||||
public static void main(String[] args) { |
|
||||
// 创建3个Car对象
|
|
||||
Car car1 = new Car("京A12345", "特斯拉", "Model 3", 280.0); |
|
||||
Car car2 = new Car("沪B67890", "比亚迪", "汉EV"); |
|
||||
Car car3 = new Car("粤C24680", "丰田", "凯美瑞", 220.0); |
|
||||
|
|
||||
// 输出车辆信息
|
|
||||
System.out.println("=== 所有车辆信息 ==="); |
|
||||
car1.displayInfo(); |
|
||||
car2.displayInfo(); |
|
||||
car3.displayInfo(); |
|
||||
System.out.println("总车辆数:" + Car.getTotalCars()); |
|
||||
|
|
||||
// 测试租车/还车
|
|
||||
System.out.println("\n=== 测试租车/还车 ==="); |
|
||||
car1.rentCar(); |
|
||||
car1.rentCar(); |
|
||||
car1.returnCar(); |
|
||||
car1.returnCar(); |
|
||||
|
|
||||
// 测试租金计算
|
|
||||
System.out.println("\n=== 测试租金计算 ==="); |
|
||||
double rent = car2.calculateRent(5); |
|
||||
System.out.println("租用5天总租金:" + rent + "元"); |
|
||||
|
|
||||
// 测试日租金校验
|
|
||||
System.out.println("\n=== 测试日租金校验 ==="); |
|
||||
car3.setDailyRent(-100.0); // 改为double类型字面量
|
|
||||
System.out.println("修改后日租金:" + car3.getDailyRent()); |
|
||||
} |
|
||||
} |
|
||||
Loading…
Reference in new issue