"""Build the UGC regression dataset (Y, X1-X6) from the raw post workbook.

The script reads the raw Excel sheet, derives one dependent variable (Y) and
six explanatory variables (X1-X6) per post, coerces everything to numbers and
writes the result to a new Excel file.

Per-post comment features (X2, X3, X5, X6) are the mean of a per-comment
metric over every non-empty comment cell of the row; a row without any
comment text scores 0.
"""
import os
import re
import sys
import traceback

import pandas as pd

# File paths
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'

# Keyword lists for the simple lexicon-based sentiment score.
POSITIVE_WORDS = ('好', '棒', '优秀', '喜欢', '满意', '赞', 'positive')
NEGATIVE_WORDS = ('差', '糟糕', '不好', '失望', '不满', 'negative')

# Patterns used by the information-richness score (compiled once).
_DIGIT_RE = re.compile(r'\d')
_LINK_RE = re.compile(r'http[s]?://')
# NOTE(review): only matches U+2600-U+27BF symbols and ASCII smileys such as
# ":)" / ";-D"; emoji outside that range are not detected - confirm whether
# that is intended.
_EMOTICON_RE = re.compile(r'[\u2600-\u27BF]|[:;][-]?[)D]')

_BANNER = "========================================"


def find_comment_columns(columns):
    """Return the columns whose name contains '评论' and a digit from 1 to 5."""
    digits = [str(i) for i in range(1, 6)]
    # Column labels are stringified for matching so that non-string labels
    # (e.g. integers) do not raise; the original label object is returned.
    return [
        col for col in columns
        if '评论' in str(col) and any(d in str(col) for d in digits)
    ]


def find_comment_count_columns(columns):
    """Return the columns whose name contains '评论总数'.

    This also covers '帖子评论总数', which contains '评论总数' as a substring.
    """
    return [col for col in columns if '评论总数' in str(col)]


def _is_missing(value):
    """Return True when *value* is a scalar missing marker (None, NaN, NA, NaT)."""
    return bool(pd.api.types.is_scalar(value) and pd.isna(value))


def _comment_texts(row, comment_columns):
    """Return the non-empty comment cells of *row* as strings."""
    texts = []
    for col in comment_columns:
        value = row.get(col, '')
        if _is_missing(value):
            continue
        text = str(value)
        # 'nan' is still skipped so rows that were already stringified
        # behave as they did before.
        if text and text != 'nan':
            texts.append(text)
    return texts


def _mean_over_comments(row, comment_columns, metric):
    """Average *metric* over the row's comment texts; 0 when there are none."""
    values = [metric(text) for text in _comment_texts(row, comment_columns)]
    return sum(values) / len(values) if values else 0


def _text_length(text):
    """Number of characters after removing ASCII spaces."""
    return len(text.replace(' ', ''))


def _text_complexity(text):
    """Number of whitespace-separated tokens."""
    return len(text.split())


def _text_sentiment(text):
    """Return 1 (positive), -1 (negative) or 0 (neutral) for one comment.

    A positive keyword wins when both kinds occur, but negative phrases are
    masked out before the positive lookup. Without the mask, '不好' would be
    scored positive because it contains '好' (likewise '满意' in '不满意').
    """
    lowered = text.lower()
    masked = lowered
    for word in NEGATIVE_WORDS:
        # Replace with a space so the neighbours of a removed phrase cannot
        # join up into a new keyword.
        masked = masked.replace(word, ' ')
    if any(word in masked for word in POSITIVE_WORDS):
        return 1
    if any(word in lowered for word in NEGATIVE_WORDS):
        return -1
    return 0


def _text_richness(text):
    """Score 0-3: one point each for a digit, a link and an emoticon."""
    patterns = (_DIGIT_RE, _LINK_RE, _EMOTICON_RE)
    return sum(1 for pattern in patterns if pattern.search(text))


def calculate_comment_length(row, comment_columns):
    """X2: mean comment length (characters, ASCII spaces removed) for one row."""
    return _mean_over_comments(row, comment_columns, _text_length)


def calculate_comment_complexity(row, comment_columns):
    """X3: mean number of whitespace-separated tokens per comment for one row."""
    return _mean_over_comments(row, comment_columns, _text_complexity)


def calculate_sentiment(row, comment_columns):
    """X5: mean keyword sentiment (-1, 0 or 1 per comment) for one row."""
    return _mean_over_comments(row, comment_columns, _text_sentiment)


def calculate_information_richness(row, comment_columns):
    """X6: mean richness score (0-3 per comment) for one row."""
    return _mean_over_comments(row, comment_columns, _text_richness)


def calculate_readability(lengths, complexities):
    """X4: element-wise lengths / complexities, 0 where complexity is not > 0."""
    return lengths.div(complexities).where(complexities > 0, 0)


def build_regression_data(df, comment_columns=None):
    """Derive the Y / X1-X6 regression table from the raw data frame.

    Args:
        df: Raw data as read from the input workbook.
        comment_columns: Columns holding comment text. Detected with
            ``find_comment_columns`` when omitted.

    Returns:
        A numeric DataFrame with columns Y, X1..X6 on the same index as *df*.
    """
    if comment_columns is None:
        comment_columns = find_comment_columns(df.columns)
    row_args = (comment_columns,)

    # Start from df's index so the scalar fallbacks below fill every row
    # instead of depending on a later column to establish the index.
    regression_data = pd.DataFrame(index=df.index)

    # 1. Y (UGC helpfulness)
    print("\n1. 计算 Y (UGC有用性)")
    if 'helpfull' in df.columns:
        regression_data['Y'] = df['helpfull'].fillna(0).astype(float)
        print(f"成功提取 Y 列,共 {len(regression_data['Y'])} 个值")
    else:
        print("警告: 未找到 helpfull 列,使用默认值 0")
        regression_data['Y'] = 0.0

    # 2. X1 (number of comments)
    print("\n2. 计算 X1 (评论数量)")
    comment_count_columns = find_comment_count_columns(df.columns)
    if comment_count_columns:
        regression_data['X1'] = df[comment_count_columns[0]].fillna(0).astype(float)
        print(f"成功提取 X1 列,使用列: {comment_count_columns[0]}")
    else:
        print("警告: 未找到评论总数列,使用默认值 0")
        regression_data['X1'] = 0.0

    # 3. X2 (comment length)
    print("\n3. 计算 X2 (评论长度)")
    regression_data['X2'] = df.apply(calculate_comment_length, axis=1, args=row_args)

    # 4. X3 (comment complexity)
    print("\n4. 计算 X3 (评论复杂度)")
    regression_data['X3'] = df.apply(calculate_comment_complexity, axis=1, args=row_args)

    # 5. X4 (comment readability)
    print("\n5. 计算 X4 (评论可读性)")
    regression_data['X4'] = calculate_readability(
        regression_data['X2'], regression_data['X3']
    )

    # 6. X5 (sentiment)
    print("\n6. 计算 X5 (内容情感性)")
    regression_data['X5'] = df.apply(calculate_sentiment, axis=1, args=row_args)

    # 7. X6 (information richness)
    print("\n7. 计算 X6 (信息丰富度)")
    regression_data['X6'] = df.apply(
        calculate_information_richness, axis=1, args=row_args
    )

    # 8. Data cleaning: force every column to numeric, unparseable -> 0
    print("\n8. 数据清洗")
    for col in regression_data.columns:
        regression_data[col] = pd.to_numeric(
            regression_data[col], errors='coerce'
        ).fillna(0)

    return regression_data


def main():
    """Run the pipeline; return 0 on success and 1 on any failure."""
    print(_BANNER)
    print(" 计算UGC回归数据")
    print(_BANNER)
    print(f"输入文件: {input_file}")
    print(f"输出文件: {output_file}")
    print()

    # Check that the input file exists
    if not os.path.exists(input_file):
        print("错误: 输入文件不存在!")
        return 1

    print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")

    try:
        # Read the raw data
        print("正在读取原始数据...")
        df = pd.read_excel(input_file)
        print(f"成功读取 {len(df)} 行数据")
        print(f"列名: {list(df.columns)}")

        # Identify the comment columns
        comment_columns = find_comment_columns(df.columns)
        print(f"\n找到评论列: {comment_columns}")

        regression_data = build_regression_data(df, comment_columns)

        # 9. Validate the result
        print("\n9. 数据验证")
        print(f"行数: {len(regression_data)}")
        print(f"列数: {len(regression_data.columns)}")
        print(f"列名: {list(regression_data.columns)}")
        print("数据类型:")
        print(regression_data.dtypes)
        print("\n前5行数据:")
        print(regression_data.head())

        # 10. Save the file
        print("\n10. 保存文件")
        regression_data.to_excel(output_file, index=False)

        # Confirm the file was created
        if os.path.exists(output_file):
            print(f"文件已成功保存到: {output_file}")
            print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
        else:
            print("错误: 文件保存失败")

        print()
        print(_BANNER)
        print(" 任务完成")
        print(_BANNER)
    except Exception as e:
        # Top-level boundary: report the failure in full and signal it through
        # the exit status instead of finishing as if the run had succeeded.
        print(f"处理文件时出错: {str(e)}")
        traceback.print_exc()
        return 1

    return 0


if __name__ == "__main__":
    sys.exit(main())