"""Build the UGC regression dataset (Y, X1..X6) from the raw post workbook.

The script reads the first worksheet of ``input_file``, locates the
helpfulness column (Y), the comment-count column (X1) and the per-post comment
text columns, derives five text metrics from the comments (X2..X6) and writes
one output row per input row to ``output_file``.
"""
import os
import re
import sys
import traceback
from typing import Any, Iterable, List, Optional, Sequence, Tuple

# File paths
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'

OUTPUT_HEADERS = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']

# Keyword lists for the sentiment score (matched against lower-cased text).
POSITIVE_WORDS = ('好', '棒', '优秀', '喜欢', '满意', '赞',
                  'positive', 'good', 'great', 'excellent')
NEGATIVE_WORDS = ('差', '糟糕', '不好', '失望', '不满',
                  'negative', 'bad', 'terrible', 'poor')

# Patterns are compiled once here instead of once per comment cell.
_NEGATIVE_RE = re.compile('|'.join(re.escape(word) for word in NEGATIVE_WORDS))
_DIGIT_RE = re.compile(r'\d')
_URL_RE = re.compile(r'https?://')
# Symbol/dingbat block, the main emoji planes, and ASCII smileys such as :) ;-D
_EMOJI_RE = re.compile(r'[\u2600-\u27BF\U0001F300-\U0001FAFF]|[:;]-?[)D]')


def to_float(value: Any) -> float:
    """Convert a cell value to float, mapping empty or unparsable cells to 0.

    Empty cells were already treated as 0; non-numeric text is now handled the
    same way instead of aborting the whole run with ``ValueError``.
    """
    if not value:
        return 0.0
    try:
        return float(value)
    except (TypeError, ValueError):
        return 0.0


def find_columns(header_row: Sequence[Any]) -> Tuple[Optional[int], Optional[int], List[int]]:
    """Locate the Y, X1 and comment columns from the header row.

    Args:
        header_row: Header cell values in column order.

    Returns:
        ``(helpful_col, comment_count_col, comment_cols)`` using 1-based
        column numbers; the first two are ``None`` when no header matches.
        If several headers match Y or X1, the last one wins.
    """
    helpful_col = None
    comment_count_col = None
    comment_cols = []
    for col, header in enumerate(header_row, 1):
        if not header:
            continue
        text = str(header)
        # 'helpful' is a prefix of the misspelled 'helpfull', so one test covers both.
        if 'helpful' in text.lower():
            helpful_col = col
            print(f"找到 Y 列 (helpfull): 列 {col}")
        elif '评论总数' in text:
            comment_count_col = col
            print(f"找到 X1 列 (评论总数): 列 {col}")
        elif '评论' in text and any(digit in text for digit in '12345'):
            comment_cols.append(col)
            print(f"找到评论列 {len(comment_cols)}: 列 {col} - {header}")
    return helpful_col, comment_count_col, comment_cols


def score_sentiment(text: str) -> int:
    """Return 1 (positive), -1 (negative) or 0 (neutral) for one comment.

    Negative phrases are masked before positive keywords are searched, so a
    positive word that only occurs inside a negative phrase ('好' in '不好',
    '满意' in '不满意') no longer yields a positive score. Text holding an
    independent positive word still scores 1 even if a negative word is
    also present.
    """
    lowered = text.lower()
    # Replace with a space (not '') so removal cannot splice a new keyword together.
    masked, negative_hits = _NEGATIVE_RE.subn(' ', lowered)
    if any(word in masked for word in POSITIVE_WORDS):
        return 1
    if negative_hits:
        return -1
    return 0


def score_richness(text: str) -> int:
    """Return 0-3: one point each for a digit, a URL and an emoji/emoticon."""
    patterns = (_DIGIT_RE, _URL_RE, _EMOJI_RE)
    return sum(1 for pattern in patterns if pattern.search(text))


def _mean(values: Sequence[float]) -> float:
    """Arithmetic mean, or 0 for an empty sequence."""
    return sum(values) / len(values) if values else 0


def comment_metrics(comments: Iterable[Any]) -> Tuple[float, float, float, float, float]:
    """Compute (X2, X3, X4, X5, X6) for one post from its raw comment cells.

    Cells that are empty or whose text is 'None'/'nan' are ignored. Each
    metric is averaged over the remaining comments (0 when none remain):

    * X2 - length: character count after removing ASCII spaces.
    * X3 - complexity: number of whitespace-separated tokens.
    * X4 - readability: X2 / X3, or 0 when X3 is 0.
    * X5 - sentiment: see ``score_sentiment``.
    * X6 - richness: see ``score_richness``.
    """
    lengths = []
    complexities = []
    sentiments = []
    richness = []
    for raw in comments:
        text = str(raw)
        if text in ('', 'None', 'nan'):
            continue
        lengths.append(len(text.replace(' ', '')))
        complexities.append(len(text.split()))
        sentiments.append(score_sentiment(text))
        richness.append(score_richness(text))

    x2 = _mean(lengths)
    x3 = _mean(complexities)
    x4 = x2 / x3 if x3 > 0 else 0
    return x2, x3, x4, _mean(sentiments), _mean(richness)


def main() -> int:
    """Run the conversion; return 0 on success and 1 on any failure."""
    # Imported here so the pure scoring helpers above stay importable
    # (and testable) on machines without openpyxl installed.
    import openpyxl

    print("========================================")
    print(" 处理所有数据")
    print("========================================")
    print(f"输入文件: {input_file}")
    print(f"输出文件: {output_file}")
    print()

    # Make sure the input file exists
    if not os.path.exists(input_file):
        print("错误: 输入文件不存在!")
        return 1

    print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")

    try:
        # Read the raw data
        print("正在读取原始数据...")
        wb_input = openpyxl.load_workbook(input_file)
        ws_input = wb_input.active

        print(f"工作表名称: {ws_input.title}")
        print(f"最大行数: {ws_input.max_row}")
        print(f"最大列数: {ws_input.max_column}")

        # Identify the relevant columns
        print("\n识别列...")
        header_row = next(
            ws_input.iter_rows(min_row=1, max_row=1, values_only=True), ()
        )
        helpful_col, comment_count_col, comment_cols = find_columns(header_row)
        print(f"\n共找到 {len(comment_cols)} 个评论列")

        # Create the output workbook and write its header row
        print("\n创建新的输出文件...")
        wb_output = openpyxl.Workbook()
        ws_output = wb_output.active
        ws_output.append(OUTPUT_HEADERS)

        # Compute and fill in the data
        print("\n计算并填充数据...")
        total_rows = ws_input.max_row - 1
        print(f"总数据行数: {total_rows}")

        data_rows = ws_input.iter_rows(
            min_row=2, max_row=ws_input.max_row, values_only=True
        )
        for row_number, cells in enumerate(data_rows, 2):
            if row_number % 1000 == 0:
                print(f"处理到第 {row_number - 1} 行...")

            # Y (UGC helpfulness) and X1 (comment count); 0 when the column is absent
            y_value = to_float(cells[helpful_col - 1]) if helpful_col is not None else 0
            x1_value = (
                to_float(cells[comment_count_col - 1])
                if comment_count_col is not None
                else 0
            )
            metrics = comment_metrics(cells[col - 1] for col in comment_cols)

            # Rows are appended in input order, so output row N matches input row N.
            ws_output.append([y_value, x1_value, *metrics])

        # Save the file
        print("\n保存文件...")
        wb_output.save(output_file)
        print(f"文件已成功保存: {output_file}")
        print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
        print(f"处理完成,共 {total_rows} 行数据")

        # Verify the file by reopening it
        print("\n验证文件...")
        if os.path.exists(output_file):
            print("文件保存成功!")
            wb_check = openpyxl.load_workbook(output_file)
            ws_check = wb_check.active
            print(f"输出文件行数: {ws_check.max_row - 1}")
            print(f"输出文件列数: {ws_check.max_column}")

            # Show the first 5 rows
            print("\n前5行数据:")
            preview = ws_check.iter_rows(
                min_row=1, max_row=min(5, ws_check.max_row), values_only=True
            )
            for row_number, cells in enumerate(preview, 1):
                print(f"行 {row_number}: {list(cells)}")
        else:
            print("文件保存失败!")

        print()
        print("========================================")
        print(" 任务完成")
        print("========================================")
    except Exception as e:
        # Top-level boundary: report the failure and signal it via the exit code.
        print(f"处理文件时出错: {str(e)}")
        traceback.print_exc()
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())