"""Build the UGC regression dataset from the raw post/comment Excel sheet.

The script reads the input workbook, locates the helpfulness column, the
comment-count column and the individual comment-text columns by name, and
writes a workbook with these numeric columns:

    Y   UGC helpfulness            X1  number of comments
    X2  mean comment length        X3  mean comment complexity
    X5  mean comment sentiment     X6  mean information richness
    X4  readability (X2 / X3)

X4 is intentionally the last column so the output layout stays the same as
in earlier versions of this script.
"""

import os
import re
import sys
import traceback

import pandas as pd

# File paths
INPUT_FILE = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
OUTPUT_FILE = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'

# Keyword lists for the rule-based sentiment score.
POSITIVE_WORDS = ('好', '棒', '优秀', '喜欢', '满意', '赞',
                  'positive', 'good', 'great', 'excellent')
NEGATIVE_WORDS = ('差', '糟糕', '不好', '失望', '不满',
                  'negative', 'bad', 'terrible', 'poor')

# Patterns for the information-richness score, compiled once because they
# are applied to every comment cell.
DIGIT_RE = re.compile(r'\d')
URL_RE = re.compile(r'http[s]?://')
EMOJI_RE = re.compile(r'[\u2600-\u27BF]|[:;][-]?[)D]')

# Columns produced by averaging the per-comment metrics, in output order.
METRIC_COLUMNS = ['X2', 'X3', 'X5', 'X6']


def calculate_comment_metrics(content):
    """Score a single comment cell.

    Args:
        content: Raw cell value; may be missing (NaN/None) or non-string.

    Returns:
        A ``(length, complexity, sentiment, richness)`` tuple:

        * length: number of characters, ASCII spaces excluded.
        * complexity: number of whitespace-separated tokens.
        * sentiment: 1 if a positive keyword is present, otherwise -1 if a
          negative keyword is present, otherwise 0.
        * richness: 0-3, one point each for a digit, a URL, and an
          emoji/emoticon.

        Missing cells (and the literal strings 'None' / 'nan') score
        ``(0, 0, 0, 0)``.
    """
    if pd.isna(content) or str(content) in ('None', 'nan'):
        return 0, 0, 0, 0

    content = str(content)

    # Comment length
    length = len(content.replace(' ', ''))

    # Comment complexity
    complexity = len(content.split())

    # Sentiment. Negative phrases are blanked out before the positive lookup:
    # "不好" contains "好" and "不满意" contains "满意", so a plain substring
    # test would score those negative phrases as positive. A comment with a
    # genuinely separate positive keyword still scores positive.
    lowered = content.lower()
    has_negative = any(word in lowered for word in NEGATIVE_WORDS)
    masked = lowered
    for word in NEGATIVE_WORDS:
        masked = masked.replace(word, ' ')
    if any(word in masked for word in POSITIVE_WORDS):
        sentiment = 1
    elif has_negative:
        sentiment = -1
    else:
        sentiment = 0

    # Information richness
    richness = sum(
        1 for pattern in (DIGIT_RE, URL_RE, EMOJI_RE) if pattern.search(content)
    )

    return length, complexity, sentiment, richness


def identify_columns(columns):
    """Locate the source columns by name and report each match.

    Args:
        columns: Iterable of column labels from the input sheet.

    Returns:
        ``(helpful_col, comment_count_col, comment_cols)``. The first two are
        ``None`` when no column matches; if several match, the last one wins.
        ``comment_cols`` lists, in sheet order, every column whose name
        contains '评论' and one of the digits 1-5.
    """
    helpful_col = None
    comment_count_col = None
    comment_cols = []

    for col in columns:
        name = str(col)
        # 'helpful' also covers the 'helpfull' spelling.
        if 'helpful' in name.lower():
            helpful_col = col
            print(f"找到 Y 列 (helpfull): {col}")
        # '评论总数' also covers '帖子评论总数'.
        elif '评论总数' in name:
            comment_count_col = col
            print(f"找到 X1 列 (评论总数): {col}")
        elif '评论' in name and any(digit in name for digit in '12345'):
            comment_cols.append(col)
            print(f"找到评论列 {len(comment_cols)}: {col}")

    return helpful_col, comment_count_col, comment_cols


def numeric_column(df, col):
    """Return ``df[col]`` as numbers (unparseable -> 0), or zeros if ``col`` is None."""
    if col is None:
        return pd.Series(0, index=df.index)
    return pd.to_numeric(df[col], errors='coerce').fillna(0)


def average_comment_metrics(df, comment_cols):
    """Average the per-comment metrics over the non-empty comments of each row.

    Args:
        df: Source data.
        comment_cols: Labels of the comment-text columns in ``df``.

    Returns:
        A float DataFrame indexed like ``df`` with columns X2, X3, X5 and X6.
        Rows without any non-empty comment are all zeros.
    """
    # One object array for all comment cells, so no row Series has to be
    # built per cell; it still yields one (possibly empty) row per record
    # when there are no comment columns.
    cells = df[list(comment_cols)].to_numpy(dtype=object)

    averages = []
    for i, row in enumerate(cells):
        if i % 1000 == 0:
            print(f"处理到第 {i} 行...")

        # Only comments with a non-zero length take part in the average.
        scored = [m for m in map(calculate_comment_metrics, row) if m[0] > 0]
        if scored:
            averages.append([sum(part) / len(scored) for part in zip(*scored)])
        else:
            averages.append([0.0] * len(METRIC_COLUMNS))

    return pd.DataFrame(
        averages, index=df.index, columns=METRIC_COLUMNS, dtype=float
    )


def build_regression_data(df, helpful_col, comment_count_col, comment_cols):
    """Assemble the regression table (Y, X1, X2, X3, X5, X6, X4) from ``df``.

    Args:
        df: Source data.
        helpful_col: Label of the helpfulness column, or None.
        comment_count_col: Label of the comment-count column, or None.
        comment_cols: Labels of the comment-text columns.

    Returns:
        A numeric DataFrame with one row per row of ``df``.
    """
    # Start from the source index so constant columns get one value per row.
    regression_data = pd.DataFrame(index=df.index)

    # Y (UGC helpfulness)
    print("1. 计算 Y (UGC有用性)")
    regression_data['Y'] = numeric_column(df, helpful_col)

    # X1 (number of comments)
    print("2. 计算 X1 (评论数量)")
    regression_data['X1'] = numeric_column(df, comment_count_col)

    # X2 / X3 / X5 / X6 (length, complexity, sentiment, richness)
    print("3. 计算评论相关指标...")
    metrics = average_comment_metrics(df, comment_cols)
    for name in METRIC_COLUMNS:
        regression_data[name] = metrics[name]

    # X4: readability = length / complexity, 0 where complexity is 0.
    print("4. 计算 X4 (评论可读性)")
    complexity = regression_data['X3']
    regression_data['X4'] = (
        regression_data['X2'] / complexity.where(complexity > 0)
    ).fillna(0.0)

    # Data cleaning
    print("\n5. 数据清洗...")
    for col in regression_data.columns:
        regression_data[col] = pd.to_numeric(
            regression_data[col], errors='coerce'
        ).fillna(0)

    return regression_data


def main(input_file=INPUT_FILE, output_file=OUTPUT_FILE):
    """Run the conversion from ``input_file`` to ``output_file``.

    Exits with status 1 when the input file does not exist. Any error raised
    while processing is reported with a traceback rather than propagated.
    """
    print("========================================")
    print(" 处理大型Excel文件")
    print("========================================")
    print(f"输入文件: {input_file}")
    print(f"输出文件: {output_file}")
    print()

    # Check that the input file exists
    if not os.path.exists(input_file):
        print("错误: 输入文件不存在!")
        sys.exit(1)

    print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")

    try:
        # Read the raw data
        print("正在读取原始数据...")
        df = pd.read_excel(input_file, engine='openpyxl')
        print(f"成功读取 {len(df)} 行数据")
        print(f"列名: {list(df.columns)}")

        # Identify columns
        print("\n识别列...")
        helpful_col, comment_count_col, comment_cols = identify_columns(df.columns)
        print(f"\n共找到 {len(comment_cols)} 个评论列")

        # Build the regression data
        print("\n创建回归数据...")
        regression_data = build_regression_data(
            df, helpful_col, comment_count_col, comment_cols
        )

        # Inspect the result
        print("\n6. 验证数据...")
        print(f"行数: {len(regression_data)}")
        print(f"列数: {len(regression_data.columns)}")
        print(f"列名: {list(regression_data.columns)}")
        print("\n前5行数据:")
        print(regression_data.head())

        # Save the file
        print("\n7. 保存文件...")
        regression_data.to_excel(output_file, index=False)

        # Verify the file
        print("\n8. 验证文件...")
        if os.path.exists(output_file):
            print(f"文件已成功保存: {output_file}")
            print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")

            # Read it back as a sanity check
            df_check = pd.read_excel(output_file)
            print(f"输出文件行数: {len(df_check)}")
            print(f"输出文件列数: {len(df_check.columns)}")
        else:
            print("文件保存失败!")

        print()
        print("========================================")
        print(" 任务完成")
        print("========================================")

    except Exception as e:
        # Top-level boundary of the script: report the failure in full.
        print(f"处理文件时出错: {e}")
        traceback.print_exc()


if __name__ == "__main__":
    main()