"""Derive regression variables (Y, X1-X6) from an Excel sheet of posts and comments.

Reads ``input_file``, appends the columns Y, X1, X2, X3, X4, X5 and X6, and
writes the result to ``output_file``.

Columns added:
    Y   numeric value of the first column whose name contains "helpfull"
    X1  numeric value of the first column whose name contains "评论总数"
    X2  mean comment length over the non-empty comment columns of a row
    X3  mean comment complexity (whitespace-separated token count)
    X4  X2 / X3 (0 when X3 is 0)
    X5  mean comment sentiment (-1, 0 or 1 per comment)
    X6  mean comment richness (0-3 per comment)
"""
import os
import re

import pandas as pd

# File paths
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx'

# Sentiment keywords, matched as substrings of the lower-cased comment text.
POS_WORDS = ('好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent')
NEG_WORDS = ('差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor')

# Richness markers, compiled once instead of on every call.
_DIGIT_RE = re.compile(r'\d')
_URL_RE = re.compile(r'http[s]?://|www\.')
_EMOJI_RE = re.compile(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]')

# Columns produced by this script, in the order they are cleaned.
FEATURE_COLS = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']


def _sentiment(text):
    """Return 1, -1 or 0 for positive, negative or neutral ``text``.

    Negative phrases are blanked out before the positive keywords are tested,
    because some negative phrases contain a positive keyword ("不好" contains
    "好", "不满意" contains "满意"). Without the masking such comments would be
    scored as positive. A positive keyword that survives the masking still
    takes precedence over any negative hit.
    """
    lowered = text.lower()
    masked = lowered
    has_negative = False
    for word in NEG_WORDS:
        if word in lowered:
            has_negative = True
            # Replace with a space so the removal cannot glue neighbouring
            # characters together into a new keyword.
            masked = masked.replace(word, ' ')
    if any(word in masked for word in POS_WORDS):
        return 1
    return -1 if has_negative else 0


def calc_metrics(content):
    """Compute the metrics of a single comment cell.

    Args:
        content: Raw cell value; anything that is not missing is converted
            with ``str()``.

    Returns:
        A tuple ``(length, complexity, sentiment, richness)``:
        length is the character count excluding ASCII and ideographic spaces;
        complexity is the number of whitespace-separated tokens;
        sentiment is 1, -1 or 0 (see ``_sentiment``);
        richness is 0-3, one point each for a digit, a URL and an emoji/symbol.
        Missing or empty cells (NaN/None, or the strings 'None', 'nan', '')
        yield ``(0, 0, 0, 0)``.
    """
    if pd.isna(content) or str(content) in ['None', 'nan', '']:
        return 0, 0, 0, 0

    content = str(content)
    length = len(content.replace(' ', '').replace('\u3000', ''))
    complexity = len(content.split())
    sentiment = _sentiment(content)
    richness = sum(
        1 for pattern in (_DIGIT_RE, _URL_RE, _EMOJI_RE) if pattern.search(content)
    )
    return length, complexity, sentiment, richness


def _mean(values):
    """Return the arithmetic mean of ``values``, or 0 when it is empty."""
    return sum(values) / len(values) if values else 0


def _row_metrics(cells):
    """Aggregate one row's comment cells into ``(X2, X3, X5, X6, X4)``.

    Only comments with a positive length contribute to the means.
    """
    kept = [m for m in (calc_metrics(cell) for cell in cells) if m[0] > 0]
    x2 = _mean([m[0] for m in kept])
    x3 = _mean([m[1] for m in kept])
    x5 = _mean([m[2] for m in kept])
    x6 = _mean([m[3] for m in kept])
    # X3 can be 0 even for a kept comment (e.g. tab-only text), so guard it.
    x4 = x2 / x3 if x3 > 0 else 0
    return x2, x3, x5, x6, x4


def _identify_columns(columns):
    """Locate the Y column, the X1 column and the comment-content columns.

    Returns:
        ``(helpfull_col, comment_count_col, comment_cols)``; the first two are
        ``None`` when no column name matches.
    """
    helpfull_col = next((c for c in columns if 'helpfull' in str(c).lower()), None)
    comment_count_col = next((c for c in columns if '评论总数' in str(c)), None)
    comment_cols = [
        c for c in columns
        if '评论' in str(c)
        and any(str(i) in str(c) for i in range(1, 6))
        and '内容' in str(c)
    ]
    return helpfull_col, comment_count_col, comment_cols


def build_regression_frame(df):
    """Add the columns Y and X1-X6 to ``df`` in place and return it.

    Args:
        df: DataFrame read from the input workbook.

    Returns:
        The same DataFrame, with the seven feature columns added and cleaned
        (non-numeric, NaN and infinite values replaced by 0).
    """
    # Identify columns
    helpfull_col, comment_count_col, comment_cols = _identify_columns(df.columns)
    print(f"找到列: Y={helpfull_col}, X1={comment_count_col}, 评论列={len(comment_cols)}")

    # Add Y and X1
    df['Y'] = (
        pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0)
        if helpfull_col is not None else 0
    )
    df['X1'] = (
        pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0)
        if comment_count_col is not None else 0
    )

    # Compute comment metrics
    print("计算评论指标...")
    total = len(df)
    # Extract the comment cells once; building a row Series with df.iloc[i]
    # for every row and column is far slower on large sheets.
    cell_rows = df[comment_cols].to_numpy(dtype=object)
    per_row = []
    for i, cells in enumerate(cell_rows):
        if i % 5000 == 0:
            print(f"处理 {i}/{total}")
        per_row.append(_row_metrics(cells))

    # Same column insertion order as before: X2, X3, X5, X6, then X4.
    for position, name in enumerate(('X2', 'X3', 'X5', 'X6', 'X4')):
        df[name] = [row[position] for row in per_row]

    # Clean data
    for col in FEATURE_COLS:
        df[col] = (
            pd.to_numeric(df[col], errors='coerce')
            .fillna(0)
            .replace([float('inf'), float('-inf')], 0)
        )
    return df


def main():
    """Run the full pipeline: read the workbook, add features, save the result."""
    print("开始处理...")

    # Read data
    print("读取数据...")
    df = pd.read_excel(input_file)
    print(f"读取完成: {len(df)} 行")

    df = build_regression_frame(df)

    print("保存文件...")
    df.to_excel(output_file, index=False, engine='openpyxl')
    print(f"完成!文件大小: {os.path.getsize(output_file)/1024:.2f} KB")
    print(f"行数: {len(df)}, 列数: {len(df.columns)}")


if __name__ == "__main__":
    main()