import os import pandas as pd import re print("=" * 60) print(" 高效处理全部数据") print("=" * 60) # 文件路径 input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx' output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx' print(f"输入文件: {input_file}") print(f"输出文件: {output_file}") print() # 首先读取表头来识别列 print("1. 读取表头...") df_header = pd.read_excel(input_file, engine='openpyxl', nrows=0) print(f"总列数: {len(df_header.columns)}") # 识别列 helpfull_col = None comment_count_col = None comment_cols = [] for col in df_header.columns: col_str = str(col).lower() if 'helpfull' in col_str or 'helpful' in col_str: helpfull_col = col print(f"找到 Y 列 (helpfull): {col}") elif '评论总数' in str(col) or '帖子评论总数' in str(col): comment_count_col = col print(f"找到 X1 列 (评论总数): {col}") elif '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6)) and '内容' in str(col): comment_cols.append(col) print(f"共找到 {len(comment_cols)} 个评论内容列") # 定义函数计算评论指标 def calculate_comment_metrics(content): if pd.isna(content) or str(content) in ['None', 'nan', '']: return 0, 0, 0, 0 content = str(content) length = len(content.replace(' ', '').replace('\u3000', '')) complexity = len(content.split()) positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like'] negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike'] sentiment = 0 lower_content = content.lower() if any(word in lower_content for word in positive_words): sentiment = 1 elif any(word in lower_content for word in negative_words): sentiment = -1 richness = 0 if re.search(r'\d', content): richness += 1 if re.search(r'http[s]?://|www\.', content): richness += 1 if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content): richness += 1 return length, complexity, sentiment, richness # 分批处理数据 print("\n2. 分批处理数据...") batch_size = 5000 batch_num = 0 all_data = [] while True: skip_rows = batch_num * batch_size + 1 if batch_num > 0 else 0 nrows = batch_size print(f" 处理批次 {batch_num + 1} (跳过 {skip_rows} 行,读取 {nrows} 行)...") try: if batch_num == 0: df_batch = pd.read_excel(input_file, engine='openpyxl', nrows=nrows) else: df_batch = pd.read_excel(input_file, engine='openpyxl', skiprows=skip_rows, nrows=nrows, header=None) df_batch.columns = df_header.columns except Exception as e: print(f" 读取完成或出错: {e}") break if len(df_batch) == 0: print(" 没有更多数据") break print(f" 读取了 {len(df_batch)} 行") # 添加Y和X1 if helpfull_col: df_batch['Y'] = pd.to_numeric(df_batch[helpfull_col], errors='coerce').fillna(0) else: df_batch['Y'] = 0 if comment_count_col: df_batch['X1'] = pd.to_numeric(df_batch[comment_count_col], errors='coerce').fillna(0) else: df_batch['X1'] = 0 # 初始化X2-X6 df_batch['X2'] = 0.0 df_batch['X3'] = 0.0 df_batch['X5'] = 0.0 df_batch['X6'] = 0.0 # 计算评论指标 for i in range(len(df_batch)): lengths = [] complexities = [] sentiments = [] richness = [] for col in comment_cols: content = df_batch.iloc[i].get(col, '') length, complexity, sentiment, r = calculate_comment_metrics(content) if length > 0: lengths.append(length) complexities.append(complexity) sentiments.append(sentiment) richness.append(r) if lengths: df_batch.loc[i, 'X2'] = sum(lengths) / len(lengths) df_batch.loc[i, 'X3'] = sum(complexities) / len(complexities) df_batch.loc[i, 'X5'] = sum(sentiments) / len(sentiments) df_batch.loc[i, 'X6'] = sum(richness) / len(richness) # 计算X4 df_batch['X4'] = df_batch.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1) # 数据清洗 regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'] for col in regression_cols: df_batch[col] = pd.to_numeric(df_batch[col], errors='coerce').fillna(0) df_batch[col] = df_batch[col].replace([float('inf'), float('-inf')], 0) all_data.append(df_batch) batch_num += 1 print(f" 批次 {batch_num} 完成,当前总行数: {sum(len(d) for d in all_data)}") # 合并所有数据 print("\n3. 合并数据...") df_final = pd.concat(all_data, ignore_index=True) print(f"合并后总行数: {len(df_final)}") # 验证数据 print("\n4. 验证数据...") regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'] print(f"总列数: {len(df_final.columns)}") print(f"\n回归数据列统计:") print(df_final[regression_cols].describe()) # 保存文件 print("\n5. 保存文件...") df_final.to_excel(output_file, index=False, engine='openpyxl') # 验证文件 print("\n6. 验证文件...") if os.path.exists(output_file): print(f"文件已成功保存: {output_file}") print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") df_check = pd.read_excel(output_file) print(f"输出文件行数: {len(df_check)}") print(f"输出文件列数: {len(df_check.columns)}") else: print("文件保存失败!") print() print("=" * 60) print(" 任务完成") print("=" * 60)