You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
83 lines
3.4 KiB
83 lines
3.4 KiB
import os
|
|
import pandas as pd
|
|
import re
|
|
|
|
print("开始处理...")
|
|
|
|
# 文件路径
|
|
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
|
|
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx'
|
|
|
|
# 读取数据
|
|
print("读取数据...")
|
|
df = pd.read_excel(input_file)
|
|
print(f"读取完成: {len(df)} 行")
|
|
|
|
# 识别列
|
|
helpfull_col = [c for c in df.columns if 'helpfull' in str(c).lower()][0] if any('helpfull' in str(c).lower() for c in df.columns) else None
|
|
comment_count_col = [c for c in df.columns if '评论总数' in str(c)][0] if any('评论总数' in str(c) for c in df.columns) else None
|
|
comment_cols = [c for c in df.columns if '评论' in str(c) and any(str(i) in str(c) for i in range(1, 6)) and '内容' in str(c)]
|
|
|
|
print(f"找到列: Y={helpfull_col}, X1={comment_count_col}, 评论列={len(comment_cols)}")
|
|
|
|
# 添加Y和X1
|
|
df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0) if helpfull_col else 0
|
|
df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0) if comment_count_col else 0
|
|
|
|
# 计算评论指标
|
|
print("计算评论指标...")
|
|
|
|
def calc_metrics(content):
|
|
if pd.isna(content) or str(content) in ['None', 'nan', '']:
|
|
return 0, 0, 0, 0
|
|
content = str(content)
|
|
length = len(content.replace(' ', '').replace('\u3000', ''))
|
|
complexity = len(content.split())
|
|
|
|
pos_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent']
|
|
neg_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor']
|
|
sentiment = 1 if any(w in content.lower() for w in pos_words) else (-1 if any(w in content.lower() for w in neg_words) else 0)
|
|
|
|
richness = (1 if re.search(r'\d', content) else 0) + (1 if re.search(r'http[s]?://|www\.', content) else 0) + (1 if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]', content) else 0)
|
|
|
|
return length, complexity, sentiment, richness
|
|
|
|
# 批量计算
|
|
x2_list, x3_list, x5_list, x6_list = [], [], [], []
|
|
|
|
for i in range(len(df)):
|
|
if i % 5000 == 0:
|
|
print(f"处理 {i}/{len(df)}")
|
|
|
|
lengths, complexities, sentiments, richness = [], [], [], []
|
|
|
|
for col in comment_cols:
|
|
l, c, s, r = calc_metrics(df.iloc[i].get(col, ''))
|
|
if l > 0:
|
|
lengths.append(l)
|
|
complexities.append(c)
|
|
sentiments.append(s)
|
|
richness.append(r)
|
|
|
|
x2_list.append(sum(lengths)/len(lengths) if lengths else 0)
|
|
x3_list.append(sum(complexities)/len(complexities) if complexities else 0)
|
|
x5_list.append(sum(sentiments)/len(sentiments) if sentiments else 0)
|
|
x6_list.append(sum(richness)/len(richness) if richness else 0)
|
|
|
|
df['X2'] = x2_list
|
|
df['X3'] = x3_list
|
|
df['X5'] = x5_list
|
|
df['X6'] = x6_list
|
|
|
|
# 计算X4
|
|
df['X4'] = df.apply(lambda r: r['X2']/r['X3'] if r['X3']>0 else 0, axis=1)
|
|
|
|
# 清洗数据
|
|
for col in ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']:
|
|
df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).replace([float('inf'), float('-inf')], 0)
|
|
|
|
print("保存文件...")
|
|
df.to_excel(output_file, index=False, engine='openpyxl')
|
|
|
|
print(f"完成!文件大小: {os.path.getsize(output_file)/1024:.2f} KB")
|
|
print(f"行数: {len(df)}, 列数: {len(df.columns)}")
|
|
|