You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

83 lines
3.4 KiB

import os
import pandas as pd
import re
print("开始处理...")
# 文件路径
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx'
# 读取数据
print("读取数据...")
df = pd.read_excel(input_file)
print(f"读取完成: {len(df)}")
# 识别列
helpfull_col = [c for c in df.columns if 'helpfull' in str(c).lower()][0] if any('helpfull' in str(c).lower() for c in df.columns) else None
comment_count_col = [c for c in df.columns if '评论总数' in str(c)][0] if any('评论总数' in str(c) for c in df.columns) else None
comment_cols = [c for c in df.columns if '评论' in str(c) and any(str(i) in str(c) for i in range(1, 6)) and '内容' in str(c)]
print(f"找到列: Y={helpfull_col}, X1={comment_count_col}, 评论列={len(comment_cols)}")
# 添加Y和X1
df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0) if helpfull_col else 0
df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0) if comment_count_col else 0
# 计算评论指标
print("计算评论指标...")
def calc_metrics(content):
if pd.isna(content) or str(content) in ['None', 'nan', '']:
return 0, 0, 0, 0
content = str(content)
length = len(content.replace(' ', '').replace('\u3000', ''))
complexity = len(content.split())
pos_words = ['', '', '优秀', '喜欢', '满意', '', 'positive', 'good', 'great', 'excellent']
neg_words = ['', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor']
sentiment = 1 if any(w in content.lower() for w in pos_words) else (-1 if any(w in content.lower() for w in neg_words) else 0)
richness = (1 if re.search(r'\d', content) else 0) + (1 if re.search(r'http[s]?://|www\.', content) else 0) + (1 if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]', content) else 0)
return length, complexity, sentiment, richness
# 批量计算
x2_list, x3_list, x5_list, x6_list = [], [], [], []
for i in range(len(df)):
if i % 5000 == 0:
print(f"处理 {i}/{len(df)}")
lengths, complexities, sentiments, richness = [], [], [], []
for col in comment_cols:
l, c, s, r = calc_metrics(df.iloc[i].get(col, ''))
if l > 0:
lengths.append(l)
complexities.append(c)
sentiments.append(s)
richness.append(r)
x2_list.append(sum(lengths)/len(lengths) if lengths else 0)
x3_list.append(sum(complexities)/len(complexities) if complexities else 0)
x5_list.append(sum(sentiments)/len(sentiments) if sentiments else 0)
x6_list.append(sum(richness)/len(richness) if richness else 0)
df['X2'] = x2_list
df['X3'] = x3_list
df['X5'] = x5_list
df['X6'] = x6_list
# 计算X4
df['X4'] = df.apply(lambda r: r['X2']/r['X3'] if r['X3']>0 else 0, axis=1)
# 清洗数据
for col in ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']:
df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).replace([float('inf'), float('-inf')], 0)
print("保存文件...")
df.to_excel(output_file, index=False, engine='openpyxl')
print(f"完成!文件大小: {os.path.getsize(output_file)/1024:.2f} KB")
print(f"行数: {len(df)}, 列数: {len(df.columns)}")