import os
import pandas as pd
import re

# Banner announcing the start of the run.
banner = "=" * 60
print(banner)
print("  处理全部数据")
print(banner)

# Source workbook and the workbook that receives the added regression columns.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）_回归.xlsx'

print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print()

# Load the whole sheet into a DataFrame.
print("读取全部数据...")
df = pd.read_excel(input_file, engine='openpyxl')
print(f"成功读取 {len(df)} 行数据")
print(f"原始列数: {len(df.columns)}")

# Classify columns by name: the helpfulness column (Y), the comment-count
# column (X1), and the comment-content columns used for X2..X6.
print("\n识别列...")
helpfull_col = None
comment_count_col = None
comment_cols = []

for column in df.columns:
    col_name = str(column)
    col_lower = col_name.lower()

    # Case-insensitive match; if several columns match, the last one wins.
    if any(key in col_lower for key in ('helpfull', 'helpful')):
        helpfull_col = column
        print(f"找到 Y 列 (helpfull): {column}")
        continue

    if any(key in col_name for key in ('评论总数', '帖子评论总数')):
        comment_count_col = column
        print(f"找到 X1 列 (评论总数): {column}")
        continue

    # A comment-content column mentions "评论", "内容" and a digit from 1 to 5.
    has_slot_digit = any(digit in col_name for digit in '12345')
    if '评论' in col_name and has_slot_digit and '内容' in col_name:
        comment_cols.append(column)

print(f"\n共找到 {len(comment_cols)} 个评论内容列")

# Add the regression columns.
print("\n添加回归数据列...")

# Y (UGC helpfulness): numeric view of the detected column, non-numeric
# cells become 0; the whole column is 0 when no such column was found.
print("1. 添加 Y (UGC有用性)")
df['Y'] = (
    pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0)
    if helpfull_col
    else 0
)

# X1 (number of comments): same treatment as Y.
print("2. 添加 X1 (评论数量)")
df['X1'] = (
    pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0)
    if comment_count_col
    else 0
)

# 定义函数计算评论指标
# Keyword lists for the substring-based sentiment heuristic.
_POSITIVE_WORDS = ('好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like')
_NEGATIVE_WORDS = ('差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike')

# Matches any negative keyword; used both to detect and to mask them.
_NEGATIVE_RE = re.compile('|'.join(re.escape(word) for word in _NEGATIVE_WORDS))

# Patterns that each contribute one point to the richness score.
_DIGIT_RE = re.compile(r'\d')
_URL_RE = re.compile(r'http[s]?://|www\.')
_EMOJI_RE = re.compile(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]')


def calculate_comment_metrics(content):
    """Compute simple text metrics for one comment.

    Args:
        content: The comment cell value. Missing values (anything
            ``pd.isna`` reports as missing) and the strings ``'None'``,
            ``'nan'`` and ``''`` are treated as "no comment".

    Returns:
        A 4-tuple ``(length, complexity, sentiment, richness)``:

        * length: character count after removing ASCII spaces and
          full-width (U+3000) spaces.
        * complexity: number of whitespace-separated tokens.
        * sentiment: 1 if a positive keyword is present, -1 if only
          negative keywords are present, otherwise 0.
        * richness: 0-3, one point each for containing a digit, a URL
          and an emoji/emoticon.

        ``(0, 0, 0, 0)`` is returned when there is no comment.
    """
    if pd.isna(content) or str(content) in ['None', 'nan', '']:
        return 0, 0, 0, 0

    content = str(content)
    length = len(content.replace(' ', '').replace('\u3000', ''))
    complexity = len(content.split())

    # Negative phrases are blanked out before looking for positive keywords,
    # otherwise a positive keyword embedded in a negative one ('好' in '不好',
    # 'like' in 'dislike', '满意' after '不满') would be scored as positive.
    # A space is used as the filler so the removal cannot glue the
    # surrounding text into a new keyword.
    lower_content = content.lower()
    masked_content, negative_hits = _NEGATIVE_RE.subn(' ', lower_content)

    sentiment = 0
    if any(word in masked_content for word in _POSITIVE_WORDS):
        # Positive keywords keep precedence when both kinds are present.
        sentiment = 1
    elif negative_hits:
        sentiment = -1

    richness = sum(
        1
        for pattern in (_DIGIT_RE, _URL_RE, _EMOJI_RE)
        if pattern.search(content)
    )

    return length, complexity, sentiment, richness

# Compute the comment-based metrics (X2, X3, X5, X6) for every row.
print("3. 计算评论相关指标...")
print(f"总数据行数: {len(df)}")

# Pull each comment column out once as a plain list. Going through
# df.iloc[i] inside the loop would rebuild a full row Series for every
# (row, column) pair, and writing results back with df.loc[i, ...] would use
# the loop counter as an index *label*, which is only correct while the
# index is the default 0..n-1.
row_count = len(df)
comment_values = [df[col].tolist() for col in comment_cols]

# Per-row averages; rows without any usable comment keep 0.0.
avg_lengths = [0.0] * row_count
avg_complexities = [0.0] * row_count
avg_sentiments = [0.0] * row_count
avg_richness = [0.0] * row_count

for i in range(row_count):
    if i % 1000 == 0:
        print(f"  处理第 {i}/{row_count} 行...")

    row_metrics = [calculate_comment_metrics(values[i]) for values in comment_values]
    # Only comments with at least one non-space character enter the averages.
    valid_metrics = [metrics for metrics in row_metrics if metrics[0] > 0]
    if not valid_metrics:
        continue

    valid_count = len(valid_metrics)
    avg_lengths[i] = sum(m[0] for m in valid_metrics) / valid_count
    avg_complexities[i] = sum(m[1] for m in valid_metrics) / valid_count
    avg_sentiments[i] = sum(m[2] for m in valid_metrics) / valid_count
    avg_richness[i] = sum(m[3] for m in valid_metrics) / valid_count

# Assign whole columns at once, positionally aligned with df's own index.
# The column order (X2, X3, X5, X6, then X4 below) is kept as before.
df['X2'] = pd.Series(avg_lengths, index=df.index, dtype='float64')
df['X3'] = pd.Series(avg_complexities, index=df.index, dtype='float64')
df['X5'] = pd.Series(avg_sentiments, index=df.index, dtype='float64')
df['X6'] = pd.Series(avg_richness, index=df.index, dtype='float64')

# X4: comment readability = average length per token, 0 where X3 is 0.
print("4. 计算 X4 (评论可读性)")
df['X4'] = (df['X2'] / df['X3']).where(df['X3'] > 0, 0.0)

# Data cleaning: force every regression column to be numeric and finite.
print("\n5. 数据清洗...")
regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
for col in regression_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
    df[col] = df[col].replace([float('inf'), float('-inf')], 0)

# Show summary statistics so the result can be checked by eye.
print("\n6. 验证数据...")
print(f"总行数: {len(df)}")
print(f"总列数: {len(df.columns)}")
print("\n回归数据列统计:")
print(df[regression_cols].describe())

# Write the original columns plus the regression columns to the output file.
print("\n7. 保存文件...")
df.to_excel(output_file, index=False, engine='openpyxl')

# Verify the output by checking it exists and reading it back.
print("\n8. 验证文件...")
if os.path.exists(output_file):
    print(f"文件已成功保存: {output_file}")
    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
    df_check = pd.read_excel(output_file)
    print(f"输出文件行数: {len(df_check)}")
    print(f"输出文件列数: {len(df_check.columns)}")
else:
    print("文件保存失败！")

print()
print("=" * 60)
print("  任务完成")
print("=" * 60)