You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

177 lines
6.2 KiB

import os
import pandas as pd
import re
# File paths: the raw workbook to read and the regression workbook to write.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'

# Start-up banner echoing the configured paths.
print("========================================")
print(" 处理大型Excel文件")
print("========================================")
print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()

# Abort early, with exit status 1, when the input workbook is missing.
if not os.path.exists(input_file):
    print("错误: 输入文件不存在!")
    # Raise SystemExit directly: the bare `exit` name is a helper injected by
    # the `site` module for interactive sessions and is not guaranteed to
    # exist (e.g. under `python -S` or in frozen/embedded interpreters).
    raise SystemExit(1)

print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
# Keyword lists for the naive substring-based sentiment check.
# NOTE(review): the original lists also contained several empty-string entries
# (presumably words lost in a copy/encoding step -- restore them from the
# author's copy if available). They are dropped here because '' is a substring
# of every string, which made every non-empty comment score as positive and
# left the negative branch unreachable.
_POSITIVE_WORDS = ('优秀', '喜欢', '满意', 'positive', 'good', 'great', 'excellent')
_NEGATIVE_WORDS = ('糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor')

# Each pattern that matches adds one point of "information richness":
# any digit, an http(s) URL scheme, and a symbol in U+2600-U+27BF or a simple
# ASCII emoticon such as :) or ;-D. Compiled once instead of once per cell.
_RICHNESS_PATTERNS = (
    re.compile(r'\d'),
    re.compile(r'http[s]?://'),
    re.compile(r'[\u2600-\u27BF]|[:;][-]?[)D]'),
)


def calculate_comment_metrics(content):
    """Compute the metrics of a single comment cell.

    Args:
        content: The raw cell value. Missing values (anything ``pd.isna``
            reports as missing, or the literal text ``'None'`` / ``'nan'``)
            are treated as "no comment".

    Returns:
        A tuple ``(length, complexity, sentiment, richness)``:

        * length -- number of characters after removing ASCII spaces;
        * complexity -- number of whitespace-separated tokens;
        * sentiment -- 1 if any positive keyword occurs, otherwise -1 if any
          negative keyword occurs, otherwise 0 (positive takes precedence);
        * richness -- how many of the richness patterns match (0 to 3).

        A missing comment yields ``(0, 0, 0, 0)``.
    """
    if pd.isna(content) or str(content) in ('None', 'nan'):
        return 0, 0, 0, 0

    content = str(content)
    length = len(content.replace(' ', ''))
    complexity = len(content.split())

    # Keywords are matched case-insensitively as plain substrings.
    lower_content = content.lower()
    if any(word in lower_content for word in _POSITIVE_WORDS):
        sentiment = 1
    elif any(word in lower_content for word in _NEGATIVE_WORDS):
        sentiment = -1
    else:
        sentiment = 0

    richness = sum(1 for pattern in _RICHNESS_PATTERNS if pattern.search(content))
    return length, complexity, sentiment, richness


def _average_comment_metrics(contents):
    """Average the four metrics over the non-empty comments of one row.

    Comments whose length is 0 are ignored; a row without any non-empty
    comment yields all zeros.
    """
    scored = [
        metrics
        for metrics in map(calculate_comment_metrics, contents)
        if metrics[0] > 0
    ]
    if not scored:
        return 0.0, 0.0, 0.0, 0.0
    return tuple(sum(values) / len(scored) for values in zip(*scored))


try:
    # Read the raw data.
    print("正在读取原始数据...")
    # Read the workbook with pandas, using the openpyxl engine.
    df = pd.read_excel(input_file, engine='openpyxl')
    print(f"成功读取 {len(df)} 行数据")
    print(f"列名: {list(df.columns)}")

    # Identify the columns of interest from their names.
    print("\n识别列...")
    helpfull_col = None
    comment_count_col = None
    comment_cols = []
    for col in df.columns:
        col_name = str(col)
        col_lower = col_name.lower()
        if 'helpfull' in col_lower or 'helpful' in col_lower:
            helpfull_col = col
            print(f"找到 Y 列 (helpfull): {col}")
        elif '评论总数' in col_name or '帖子评论总数' in col_name:
            comment_count_col = col
            print(f"找到 X1 列 (评论总数): {col}")
        elif '评论' in col_name and any(str(i) in col_name for i in range(1, 6)):
            comment_cols.append(col)
            print(f"找到评论列 {len(comment_cols)}: {col}")
    print(f"\n共找到 {len(comment_cols)} 个评论列")

    # Build the regression table on the input's index so that the scalar
    # fallbacks below fill every row even when a source column is missing.
    print("\n创建回归数据...")
    regression_data = pd.DataFrame(index=df.index)

    # Y (UGC helpfulness)
    print("1. 计算 Y (UGC有用性)")
    if helpfull_col is not None:
        regression_data['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0)
    else:
        regression_data['Y'] = 0

    # X1 (number of comments)
    print("2. 计算 X1 (评论数量)")
    if comment_count_col is not None:
        regression_data['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0)
    else:
        regression_data['X1'] = 0

    # Per-row averages of the comment metrics.
    print("3. 计算评论相关指标...")
    # X2 = length, X3 = complexity, X5 = sentiment, X6 = information richness.
    metric_names = ('X2', 'X3', 'X5', 'X6')
    averages = {name: [] for name in metric_names}
    # Materialise the comment cells once; with no comment columns this is an
    # (n, 0) array, so every record still produces one (all-zero) result row.
    comment_values = df[comment_cols].to_numpy(dtype=object)
    for i, contents in enumerate(comment_values):
        if i % 1000 == 0:
            print(f"处理到第 {i} 行...")
        for name, value in zip(metric_names, _average_comment_metrics(contents)):
            averages[name].append(value)
    # Assign whole columns at once instead of writing float cells one by one
    # into integer-initialised columns.
    for name in metric_names:
        regression_data[name] = averages[name]

    # X4: readability = average length / average token count (0 when X3 is 0).
    print("4. 计算 X4 (评论可读性)")
    token_counts = regression_data['X3']
    regression_data['X4'] = (
        regression_data['X2'] / token_counts.where(token_counts > 0)
    ).fillna(0)

    # Data cleaning: force every column to numeric, replacing failures with 0.
    print("\n5. 数据清洗...")
    for col in regression_data.columns:
        regression_data[col] = pd.to_numeric(regression_data[col], errors='coerce').fillna(0)

    # Show a summary of the result.
    print("\n6. 验证数据...")
    print(f"行数: {len(regression_data)}")
    print(f"列数: {len(regression_data.columns)}")
    print(f"列名: {list(regression_data.columns)}")
    print("\n前5行数据:")
    print(regression_data.head())

    # Save the workbook.
    print("\n7. 保存文件...")
    regression_data.to_excel(output_file, index=False)

    # Verify the written file by reading it back.
    print("\n8. 验证文件...")
    if os.path.exists(output_file):
        print(f"文件已成功保存: {output_file}")
        print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
        df_check = pd.read_excel(output_file)
        print(f"输出文件行数: {len(df_check)}")
        print(f"输出文件列数: {len(df_check.columns)}")
    else:
        print("文件保存失败!")

    print()
    print("========================================")
    print(" 任务完成")
    print("========================================")
except Exception as e:
    # Top-level boundary of the script: report the failure with its traceback.
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()