You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
216 lines
7.8 KiB
216 lines
7.8 KiB
import os
|
|
import pandas as pd
|
|
import re
|
|
import sys
|
|
|
|
# 重定向输出到文件和屏幕
|
|
class Tee:
    """Write-only stream that duplicates everything written to it.

    Each call is forwarded to every stream passed to the constructor. The
    script installs an instance as ``sys.stdout`` so printed output reaches
    both the console and a log file.
    """

    def __init__(self, *files):
        """Remember the target streams.

        Args:
            *files: File-like objects that provide ``write`` and ``flush``.
        """
        self.files = files

    def write(self, obj):
        """Write ``obj`` to every target stream, flushing each one right away.

        Args:
            obj: The text to write.

        Returns:
            The number of characters in ``obj``, as text-stream ``write``
            methods are expected to report.
        """
        for f in self.files:
            f.write(obj)
            # Flush immediately so nothing sits in a buffer between writes.
            f.flush()
        return len(obj)

    def flush(self):
        """Flush every target stream."""
        for f in self.files:
            f.flush()
|
|
|
|
# Mirror everything printed from here on to a log file as well as the console.
log_file = open(r'D:\java\project\process_log.txt', 'w', encoding='utf-8')
original_stdout = sys.stdout
sys.stdout = Tee(original_stdout, log_file)

print("========================================")
print(" 在原表中添加回归数据列")
print("========================================")

# Source workbook and the workbook that will receive the extra regression columns.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx'

print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()

# Stop early when the input workbook is missing.
if not os.path.exists(input_file):
    print("错误: 输入文件不存在!")
    # Undo the stdout redirection and release the log file before leaving.
    sys.stdout = original_stdout
    log_file.close()
    # sys.exit is always available; the bare exit() helper is only injected
    # by the site module and is absent under `python -S` or in frozen builds.
    sys.exit(1)

print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
|
|
|
|
# 读取原始数据
|
|
# Keyword lists behind the naive sentiment score in calculate_comment_metrics.
_POSITIVE_WORDS = ('好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like')
_NEGATIVE_WORDS = ('差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike')

# One pattern per "information richness" signal: digit, link, emoji/emoticon.
_RICHNESS_PATTERNS = (
    re.compile(r'\d'),
    re.compile(r'http[s]?://|www\.'),
    re.compile(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]'),
)


def calculate_comment_metrics(content):
    """Compute the per-comment metrics for a single comment cell.

    Args:
        content: The raw cell value. Missing values and the strings
            ``'None'``, ``'nan'`` and ``''`` count as "no comment".

    Returns:
        A tuple ``(length, complexity, sentiment, richness)``:

        * length (X2): character count after removing ASCII spaces and
          ideographic spaces (U+3000).
        * complexity (X3): number of whitespace-separated tokens.
        * sentiment (X5): 1 if a positive keyword occurs, otherwise -1 if a
          negative keyword occurs, otherwise 0.
        * richness (X6): one point each for containing a digit, a link and
          an emoji/emoticon (0-3).

        ``(0, 0, 0, 0)`` is returned when there is no comment.
    """
    if pd.isna(content) or str(content) in ('None', 'nan', ''):
        return 0, 0, 0, 0

    content = str(content)
    length = len(content.replace(' ', '').replace('\u3000', ''))
    complexity = len(content.split())

    lower_content = content.lower()
    # Several negative keywords contain a positive one ('不好' contains '好',
    # 'dislike' contains 'like', '不满意' contains '满意'). Blank the negative
    # phrases out before looking for positive keywords so they are not
    # mistaken for praise. A positive keyword found elsewhere in the text
    # still takes priority over a negative one.
    masked = lower_content
    for word in _NEGATIVE_WORDS:
        masked = masked.replace(word, ' ')

    if any(word in masked for word in _POSITIVE_WORDS):
        sentiment = 1
    elif any(word in lower_content for word in _NEGATIVE_WORDS):
        sentiment = -1
    else:
        sentiment = 0

    richness = sum(1 for pattern in _RICHNESS_PATTERNS if pattern.search(content))

    return length, complexity, sentiment, richness


def _mean_comment_metrics(contents):
    """Average the four comment metrics over the non-empty comments of one row.

    Comments whose length is 0 are ignored; a row with no usable comment
    yields ``(0.0, 0.0, 0.0, 0.0)``.
    """
    scored = [m for m in map(calculate_comment_metrics, contents) if m[0] > 0]
    if not scored:
        return 0.0, 0.0, 0.0, 0.0
    count = len(scored)
    return tuple(sum(m[k] for m in scored) / count for k in range(4))


# Load the workbook, derive the regression columns Y and X1-X6, and save a copy.
try:
    print("\n正在读取原始数据...")
    df = pd.read_excel(input_file)
    print(f"成功读取 {len(df)} 行数据")
    print(f"原始列数: {len(df.columns)}")

    # Identify the source columns by the keywords in their headers.
    print("\n识别列...")
    helpfull_col = None
    comment_count_col = None
    comment_cols = []

    for col in df.columns:
        col_name = str(col)
        # 'helpful' also matches the 'helpfull' spelling; '评论总数' also
        # matches '帖子评论总数'.
        if 'helpful' in col_name.lower():
            helpfull_col = col
            print(f"找到 Y 列 (helpfull): {col}")
        elif '评论总数' in col_name:
            comment_count_col = col
            print(f"找到 X1 列 (评论总数): {col}")
        elif '评论' in col_name and any(str(i) in col_name for i in range(1, 6)) and '内容' in col_name:
            comment_cols.append(col)
            print(f"找到评论列 {len(comment_cols)}: {col}")

    print(f"\n共找到 {len(comment_cols)} 个评论内容列")

    print("\n添加回归数据列...")

    # Y (UGC helpfulness): numeric copy of the helpful column, 0 where missing.
    print("1. 添加 Y (UGC有用性)")
    if helpfull_col is not None:
        df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0)
    else:
        df['Y'] = 0

    # X1 (comment count): numeric copy of the total-comments column, 0 where missing.
    print("2. 添加 X1 (评论数量)")
    if comment_count_col is not None:
        df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0)
    else:
        df['X1'] = 0

    # X2, X3, X5, X6: per-row means of the per-comment metrics.
    print("3. 计算评论相关指标...")
    total_rows = len(df)
    print(f"总数据行数: {total_rows}")

    # Walk the comment cells positionally; this avoids building a Series per
    # row and does not depend on the index labels being 0..n-1.
    comment_values = df[comment_cols].to_numpy(dtype=object)
    row_means = []
    for i, contents in enumerate(comment_values):
        if i % 1000 == 0:
            print(f" 处理第 {i}/{total_rows} 行...")
        row_means.append(_mean_comment_metrics(contents))

    for position, name in enumerate(('X2', 'X3', 'X5', 'X6')):
        df[name] = pd.Series(
            [means[position] for means in row_means],
            index=df.index,
            dtype=float,
        )

    # X4 (readability) = X2 / X3, recorded as 0 where X3 is 0.
    print("4. 计算 X4 (评论可读性)")
    df['X4'] = (df['X2'] / df['X3']).where(df['X3'] > 0, 0.0)

    # Cleaning: force every regression column to plain finite numbers.
    print("\n5. 数据清洗...")
    regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
    for col in regression_cols:
        # Non-numeric and missing values become 0.
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
        # Infinities become 0 as well.
        df[col] = df[col].replace([float('inf'), float('-inf')], 0)

    # Report summary statistics for the new columns.
    print("\n6. 验证数据...")
    print(f"总行数: {len(df)}")
    print(f"总列数: {len(df.columns)}")
    print("\n回归数据列统计:")
    print(df[regression_cols].describe())
    print("\n前5行回归数据:")
    print(df[regression_cols].head())

    # Count remaining nulls per regression column.
    print("\n空值检查:")
    for col in regression_cols:
        null_count = df[col].isnull().sum()
        print(f" {col}: {null_count} 个空值")

    # Save the augmented table.
    print("\n7. 保存文件...")
    print(f"正在保存到: {output_file}")
    df.to_excel(output_file, index=False, engine='openpyxl')

    # Read the saved file back and report its shape.
    print("\n8. 验证文件...")
    if os.path.exists(output_file):
        print(f"文件已成功保存: {output_file}")
        print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
        df_check = pd.read_excel(output_file)
        print(f"输出文件行数: {len(df_check)}")
        print(f"输出文件列数: {len(df_check.columns)}")
        print(f"\n回归数据列: {[col for col in df_check.columns if col in regression_cols]}")
    else:
        print("文件保存失败!")

    print()
    print("========================================")
    print(" 任务完成")
    print("========================================")
    print(f"新文件已保存: {output_file}")
    print("包含原始数据的所有列以及新增的Y, X1-X6回归数据列")

except Exception as e:
    print(f"处理文件时出错: {e}")
    import traceback
    traceback.print_exc()
    # Only stdout is mirrored to the log; the traceback above goes to stderr,
    # so record it in the log file explicitly.
    log_file.write(traceback.format_exc())
finally:
    # Restore the real stdout before closing the log it was mirrored to.
    sys.stdout = original_stdout
    log_file.close()
    print("日志已保存到: D:\\java\\project\\process_log.txt")
|
|
|