import os
import pandas as pd
import re

print("开始处理...")

# Source workbook and the workbook the regression-ready table is written to
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）_回归.xlsx'

# Load the data
print("读取数据...")
df = pd.read_excel(input_file)
print(f"读取完成: {len(df)} 行")

# Identify columns by substring match on the header text.
# Headers are stringified once because they are not guaranteed to be str.
header_pairs = [(column, str(column)) for column in df.columns]

# First header containing "helpfull" (case-insensitive), or None if absent
helpfull_col = next(
    (column for column, name in header_pairs if 'helpfull' in name.lower()),
    None,
)
# First header containing the comment-count marker, or None if absent
comment_count_col = next(
    (column for column, name in header_pairs if '评论总数' in name),
    None,
)
# Every header that mentions a comment, one of the digits 1-5, and "content"
comment_cols = [
    column
    for column, name in header_pairs
    if '评论' in name
    and any(digit in name for digit in '12345')
    and '内容' in name
]

print(f"找到列: Y={helpfull_col}, X1={comment_count_col}, 评论列={len(comment_cols)}")

# Add Y and X1: non-numeric cells become 0; a missing source column yields
# a constant 0 column.
if helpfull_col:
    df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0)
else:
    df['Y'] = 0

if comment_count_col:
    df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0)
else:
    df['X1'] = 0

# Compute the per-comment metrics
print("计算评论指标...")

# Keyword lists for the substring-based sentiment heuristic (matched against
# the lower-cased comment text).
_POS_WORDS = ('好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent')
_NEG_WORDS = ('差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor')

# Patterns for the three "richness" signals; compiled once, not per comment.
_DIGIT_RE = re.compile(r'\d')
_URL_RE = re.compile(r'http[s]?://|www\.')
_EMOJI_RE = re.compile(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]')


def calc_metrics(content):
    """Compute (length, complexity, sentiment, richness) for one comment.

    Args:
        content: The comment cell value. Missing values (anything
            ``pd.isna`` reports as missing) and the strings ``'None'``,
            ``'nan'`` and ``''`` are treated as "no comment".

    Returns:
        A 4-tuple of ints:
        * length: character count after removing ASCII spaces and
          full-width (U+3000) spaces.
        * complexity: number of whitespace-separated tokens.
        * sentiment: 1 if a positive keyword is present, -1 if only
          negative keywords are present, 0 otherwise.
        * richness: 0-3, one point each for containing a digit, a URL,
          and an emoji/symbol character.
        All four are 0 for a missing comment.
    """
    if pd.isna(content) or str(content) in ['None', 'nan', '']:
        return 0, 0, 0, 0

    content = str(content)
    length = len(content.replace(' ', '').replace('\u3000', ''))
    complexity = len(content.split())

    lowered = content.lower()
    has_negative = any(word in lowered for word in _NEG_WORDS)

    # Mask negative phrases before looking for positive keywords; otherwise a
    # positive keyword embedded in a negative phrase ('好' inside '不好',
    # '满意' inside '不满意') would make the comment score as positive.
    # A space is used as the mask so neighbouring characters cannot fuse
    # into a new keyword.
    masked = lowered
    for word in _NEG_WORDS:
        masked = masked.replace(word, ' ')

    if any(word in masked for word in _POS_WORDS):
        sentiment = 1
    elif has_negative:
        sentiment = -1
    else:
        sentiment = 0

    richness = sum(
        1
        for pattern in (_DIGIT_RE, _URL_RE, _EMOJI_RE)
        if pattern.search(content)
    )

    return length, complexity, sentiment, richness

# Batch computation: for every row, average each metric over that row's
# non-empty comments.
def _mean(values):
    """Return the arithmetic mean of *values*, or 0 when it is empty."""
    return sum(values) / len(values) if values else 0


# Extract each comment column into a plain list once. Indexing these lists
# is far cheaper than materialising a row Series with df.iloc[i] for every
# (row, column) cell.
comment_values = [df[col].tolist() for col in comment_cols]
total_rows = len(df)

x2_list, x3_list, x5_list, x6_list = [], [], [], []

for i in range(total_rows):
    if i % 5000 == 0:
        print(f"处理 {i}/{total_rows}")

    # Keep only comments with a non-zero length; empty or missing comments
    # must not drag the row averages towards 0.
    scored = [
        metrics
        for metrics in (calc_metrics(values[i]) for values in comment_values)
        if metrics[0] > 0
    ]

    x2_list.append(_mean([m[0] for m in scored]))  # mean length
    x3_list.append(_mean([m[1] for m in scored]))  # mean complexity
    x5_list.append(_mean([m[2] for m in scored]))  # mean sentiment
    x6_list.append(_mean([m[3] for m in scored]))  # mean richness

df['X2'] = x2_list
df['X3'] = x3_list
df['X5'] = x5_list
df['X6'] = x6_list

# X4 = X2 / X3 (mean length per token); 0 when X3 is 0 to avoid dividing by zero
df['X4'] = [x2 / x3 if x3 > 0 else 0 for x2, x3 in zip(x2_list, x3_list)]

# Clean the data: force numeric dtype and replace NaN / +-inf with 0
for col in ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']:
    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).replace([float('inf'), float('-inf')], 0)

print("保存文件...")
df.to_excel(output_file, index=False, engine='openpyxl')

print(f"完成！文件大小: {os.path.getsize(output_file)/1024:.2f} KB")
print(f"行数: {len(df)}, 列数: {len(df.columns)}")