You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

169 lines
6.3 KiB

import os
import pandas as pd
import re
# File locations: the raw post/comment workbook to read and the regression
# workbook to write. Both paths are hard-coded absolute Windows paths.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'

# Banner describing the run.
print("========================================")
print(" 计算UGC回归数据")
print("========================================")
print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()

# Stop with exit status 1 when the input workbook is missing.
# SystemExit is raised directly because the bare `exit()` helper is injected
# by the `site` module and is not guaranteed to exist in every interpreter.
if not os.path.exists(input_file):
    print("错误: 输入文件不存在!")
    raise SystemExit(1)

print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
# Keyword lexicons for the naive sentiment score.
# Every entry MUST be non-empty: the empty string is a substring of every
# text, so a single '' entry would make every comment match that lexicon.
POSITIVE_WORDS = ('优秀', '喜欢', '满意', 'positive')
NEGATIVE_WORDS = ('糟糕', '不好', '失望', '不满', 'negative')

# Patterns for the information-richness score, compiled once.
_DIGIT_RE = re.compile(r'\d')
_LINK_RE = re.compile(r'http[s]?://')
# Symbols in U+2600..U+27BF or a simple ASCII smiley such as ":)" / ";-D".
_EMOTICON_RE = re.compile(r'[\u2600-\u27BF]|[:;][-]?[)D]')


def _is_comment_column(col):
    """Return True if the column name contains '评论' and a digit 1-5."""
    name = str(col)  # column labels are not guaranteed to be strings
    return '评论' in name and any(str(i) in name for i in range(1, 6))


def _comment_texts(row, columns):
    """Return the non-empty comment strings stored in *row* under *columns*.

    *row* is anything with a dict-like ``get`` (a pandas row Series or a
    plain dict). Missing cells (absent key, None, NaN, NA) are skipped;
    non-string cells are converted with ``str``.
    """
    texts = []
    for col in columns:
        value = row.get(col)
        if isinstance(value, str):
            text = value
        elif pd.isna(value):
            # Real missing value: skip it instead of counting "nan"/"None"
            # as if it were comment text.
            continue
        else:
            text = str(value)
        if text:
            texts.append(text)
    return texts


def _mean(values):
    """Return the arithmetic mean of *values*, or 0 when it is empty."""
    return sum(values) / len(values) if values else 0


def _sentiment_of(text):
    """Return 1, -1 or 0 for one comment; positive keywords win over negative."""
    lowered = text.lower()
    if any(word in lowered for word in POSITIVE_WORDS):
        return 1
    if any(word in lowered for word in NEGATIVE_WORDS):
        return -1
    return 0


def _richness_of(text):
    """Return 0-3: one point each for a digit, a link and an emoticon."""
    patterns = (_DIGIT_RE, _LINK_RE, _EMOTICON_RE)
    return sum(1 for pattern in patterns if pattern.search(text))


def calculate_comment_length(row, columns):
    """Mean character count per comment, ignoring ASCII spaces (X2)."""
    return _mean([len(text.replace(' ', ''))
                  for text in _comment_texts(row, columns)])


def calculate_comment_complexity(row, columns):
    """Mean number of whitespace-separated tokens per comment (X3)."""
    return _mean([len(text.split())
                  for text in _comment_texts(row, columns)])


def calculate_sentiment(row, columns):
    """Mean keyword sentiment per comment, each scored 1, -1 or 0 (X5)."""
    return _mean([_sentiment_of(text)
                  for text in _comment_texts(row, columns)])


def calculate_information_richness(row, columns):
    """Mean richness score (0-3) per comment (X6)."""
    return _mean([_richness_of(text)
                  for text in _comment_texts(row, columns)])


def build_regression_data(df):
    """Build the regression table (Y, X1..X6) from the raw posts frame.

    Args:
        df: DataFrame read from the input workbook. The columns used are
            'helpfull', the first column whose name contains '评论总数', and
            every column whose name contains '评论' plus a digit 1-5.

    Returns:
        A numeric DataFrame sharing ``df``'s index with columns
        Y, X1, X2, X3, X4, X5, X6. Progress messages are printed as a side
        effect.
    """
    # Identify the per-comment text columns.
    comment_columns = [col for col in df.columns if _is_comment_column(col)]
    print(f"\n找到评论列: {comment_columns}")

    # Give the frame its row index up front so the scalar fallbacks below
    # broadcast to every row instead of creating zero-length columns.
    regression_data = pd.DataFrame(index=df.index)

    # 1. Y (UGC helpfulness)
    print("\n1. 计算 Y (UGC有用性)")
    if 'helpfull' in df.columns:
        regression_data['Y'] = df['helpfull'].fillna(0).astype(float)
        print(f"成功提取 Y 列,共 {len(regression_data['Y'])} 个值")
    else:
        print("警告: 未找到 helpfull 列,使用默认值 0")
        regression_data['Y'] = 0

    # 2. X1 (number of comments); '评论总数' also matches '帖子评论总数'.
    print("\n2. 计算 X1 (评论数量)")
    comment_count_columns = [col for col in df.columns
                             if '评论总数' in str(col)]
    if comment_count_columns:
        count_column = comment_count_columns[0]
        regression_data['X1'] = df[count_column].fillna(0).astype(float)
        print(f"成功提取 X1 列,使用列: {count_column}")
    else:
        print("警告: 未找到评论总数列,使用默认值 0")
        regression_data['X1'] = 0

    # 3. X2 (comment length)
    print("\n3. 计算 X2 (评论长度)")
    regression_data['X2'] = df.apply(
        calculate_comment_length, axis=1, args=(comment_columns,))

    # 4. X3 (comment complexity)
    print("\n4. 计算 X3 (评论复杂度)")
    regression_data['X3'] = df.apply(
        calculate_comment_complexity, axis=1, args=(comment_columns,))

    # 5. X4 (comment readability) = X2 / X3, defined as 0 where X3 is 0.
    print("\n5. 计算 X4 (评论可读性)")
    lengths = regression_data['X2']
    tokens = regression_data['X3']
    regression_data['X4'] = (lengths / tokens).where(tokens > 0, 0)

    # 6. X5 (content sentiment)
    print("\n6. 计算 X5 (内容情感性)")
    regression_data['X5'] = df.apply(
        calculate_sentiment, axis=1, args=(comment_columns,))

    # 7. X6 (information richness)
    print("\n7. 计算 X6 (信息丰富度)")
    regression_data['X6'] = df.apply(
        calculate_information_richness, axis=1, args=(comment_columns,))

    # Data cleaning: force every column to numeric, unparsable values -> 0.
    print("\n8. 数据清洗")
    for col in regression_data.columns:
        regression_data[col] = pd.to_numeric(
            regression_data[col], errors='coerce').fillna(0)

    return regression_data


# Script body: read the raw workbook, derive the regression table, report
# on it and save it. This is the top-level boundary, so any failure is
# reported with a traceback instead of propagating.
try:
    print("正在读取原始数据...")
    df = pd.read_excel(input_file)
    print(f"成功读取 {len(df)} 行数据")
    print(f"列名: {list(df.columns)}")

    regression_data = build_regression_data(df)

    # Report the shape and a preview of the result.
    print("\n9. 数据验证")
    print(f"行数: {len(regression_data)}")
    print(f"列数: {len(regression_data.columns)}")
    print(f"列名: {list(regression_data.columns)}")
    print("数据类型:")
    print(regression_data.dtypes)
    print("\n前5行数据:")
    print(regression_data.head())

    # Save the workbook, then confirm it exists on disk.
    print("\n10. 保存文件")
    regression_data.to_excel(output_file, index=False)
    if os.path.exists(output_file):
        print(f"文件已成功保存到: {output_file}")
        print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
    else:
        print("错误: 文件保存失败")

    print()
    print("========================================")
    print(" 任务完成")
    print("========================================")
except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()