import os
import pandas as pd

# Input and output file paths
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.csv'

# Substrings that mark a sentiment label as positive / negative. Matching is
# done on the lower-cased label, so the English keywords are case-insensitive.
_POSITIVE_KEYWORDS = ('积极', '正面', 'positive')
_NEGATIVE_KEYWORDS = ('消极', '负面', 'negative')


def normalize_sentiment(sentiment: object) -> str:
    """Map a raw sentiment label onto '积极', '消极' or '中性'.

    Missing values (NaN/None) and the empty string are treated as neutral.
    Otherwise the label is lower-cased and searched for the positive keywords
    first, then the negative ones; anything that matches neither is neutral.
    """
    if pd.isna(sentiment) or sentiment == '':
        return '中性'
    label = str(sentiment).lower()
    if any(keyword in label for keyword in _POSITIVE_KEYWORDS):
        return '积极'
    if any(keyword in label for keyword in _NEGATIVE_KEYWORDS):
        return '消极'
    return '中性'


def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of *df*; the input frame is not modified.

    Steps, in order:
      1. Replace every missing value with the empty string.
      2. For text columns, strip leading/trailing whitespace and collapse
         every internal whitespace run into a single space.
      3. If a '情感倾向' column exists, normalize it with
         ``normalize_sentiment``.
    """
    # 1. Handle missing values (fillna returns a new frame).
    df = df.fillna('')

    # 2. Remove redundant whitespace from text columns.
    for col in df.columns:
        column = df[col]
        # Besides plain object columns, also cover pandas' dedicated string
        # dtypes, which a bare `dtype == 'object'` comparison would skip.
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            text = column.astype(str).str.strip()
            df[col] = text.str.replace(r'\s+', ' ', regex=True)

    # 3. Normalize the sentiment column, when present.
    if '情感倾向' in df.columns:
        df['情感倾向'] = df['情感倾向'].apply(normalize_sentiment)

    return df


def main() -> int:
    """Read the Excel input, clean it and write it out as a UTF-8 (BOM) CSV.

    Returns:
        0 on success; 1 when the input file does not exist or when reading,
        cleaning or writing fails (the error message is printed).
    """
    print("========================================")
    print("  Python 数据清洗脚本")
    print("========================================")
    print(f"输入文件: {input_file}")
    print(f"输出文件: {output_file}")
    print()

    # Check that the input file exists before doing any work.
    if not os.path.exists(input_file):
        print("错误: 输入文件不存在！")
        return 1

    print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")

    try:
        # Read the Excel file.
        print("正在读取Excel文件...")
        df = pd.read_excel(input_file)
        print(f"成功读取 {len(df)} 行数据")

        # Clean the data.
        print("正在清洗数据...")
        df = clean_dataframe(df)

        # Make sure the output directory exists. dirname() is '' for a bare
        # file name, and makedirs('') would raise, so skip it in that case.
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Save as CSV; 'utf-8-sig' writes a BOM at the start of the file.
        print("正在保存清洗后的数据...")
        df.to_csv(output_file, index=False, encoding='utf-8-sig')

        print(f"数据已成功保存到: {output_file}")
        print(f"保存了 {len(df)} 行清洗后的数据")

        print()
        print("========================================")
        print("  数据清洗任务完成")
        print("========================================")

    except Exception as e:
        # Top-level boundary: report the problem and signal failure through
        # the exit status instead of silently finishing with status 0.
        print(f"处理文件时出错: {str(e)}")
        return 1

    return 0


if __name__ == "__main__":
    raise SystemExit(main())