# java/project/quick_process.py


								import os

								import pandas as pd

								import re


								print("开始处理...")

								# File locations: the source workbook and the regression-ready copy to write.
								input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
								output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）_回归.xlsx'

								# Load the workbook into a DataFrame.
								print("读取数据...")
								df = pd.read_excel(input_file)
								print(f"读取完成: {len(df)} 行")

								# Locate the columns by name: the first matching column wins, None when absent.
								helpfull_col = next(
								    (name for name in df.columns if 'helpfull' in str(name).lower()),
								    None,
								)
								comment_count_col = next(
								    (name for name in df.columns if '评论总数' in str(name)),
								    None,
								)
								# Comment-content columns must mention both '评论' and '内容' plus a digit 1-5.
								comment_cols = [
								    name
								    for name in df.columns
								    if all(token in str(name) for token in ('评论', '内容'))
								    and any(digit in str(name) for digit in '12345')
								]

								print(f"找到列: Y={helpfull_col}, X1={comment_count_col}, 评论列={len(comment_cols)}")

								# Build Y and X1 as numeric columns; unparseable cells become 0, and a
								# missing source column yields a constant 0 column.
								if helpfull_col:
								    df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0)
								else:
								    df['Y'] = 0

								if comment_count_col:
								    df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0)
								else:
								    df['X1'] = 0

								# Per-comment metrics are computed next.
								print("计算评论指标...")


								def calc_metrics(content):
								    """Score one comment on length, complexity, sentiment and richness.

								    Args:
								        content: Raw cell value for a single comment. May be a string,
								            NaN/None, or any other value that ``str()`` can render.

								    Returns:
								        A ``(length, complexity, sentiment, richness)`` tuple of ints:

								        * length -- character count after removing ASCII spaces and
								          full-width (U+3000) spaces.
								        * complexity -- number of whitespace-separated tokens.
								        * sentiment -- 1 if a positive keyword occurs outside any negative
								          keyword, otherwise -1 if a negative keyword occurs, otherwise 0.
								        * richness -- 0 to 3: one point each for containing a digit, a
								          URL, and an emoji/symbol character.

								        Missing or empty input (NaN, None, 'None', 'nan', '') yields
								        ``(0, 0, 0, 0)``.
								    """
								    if pd.isna(content) or str(content) in ('None', 'nan', ''):
								        return 0, 0, 0, 0

								    content = str(content)
								    length = len(content.replace(' ', '').replace('\u3000', ''))
								    complexity = len(content.split())

								    pos_words = ('好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent')
								    neg_words = ('差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor')

								    lowered = content.lower()
								    has_negative = any(word in lowered for word in neg_words)

								    # Negative phrases such as '不好' and '不满(意)' contain or overlap the
								    # positive keywords '好' and '满意'. Blank the negative keywords out
								    # before the positive check so those phrases are not scored as
								    # positive. A space is used as filler so that removing a keyword
								    # cannot join its neighbours into a new positive keyword.
								    masked = lowered
								    for word in neg_words:
								        masked = masked.replace(word, ' ')

								    if any(word in masked for word in pos_words):
								        sentiment = 1
								    elif has_negative:
								        sentiment = -1
								    else:
								        sentiment = 0

								    has_digit = re.search(r'\d', content) is not None
								    has_url = re.search(r'http[s]?://|www\.', content) is not None
								    has_emoji = re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]', content) is not None
								    richness = int(has_digit) + int(has_url) + int(has_emoji)

								    return length, complexity, sentiment, richness


								# Batch computation: for every row, average each metric over the row's
								# non-empty comments (X2 = length, X3 = complexity, X5 = sentiment,
								# X6 = richness). Rows without any usable comment get 0 for all four.
								x2_list, x3_list, x5_list, x6_list = [], [], [], []
								total_rows = len(df)

								# Extract the comment cells once as an object array. Calling df.iloc[i]
								# for every cell builds a throwaway Series each time, which dominates the
								# runtime on large sheets. dtype=object keeps each cell's own value
								# instead of coercing the columns to a common numeric dtype.
								comment_matrix = df[comment_cols].to_numpy(dtype=object)

								for i, row_values in enumerate(comment_matrix):
								    if i % 5000 == 0:
								        print(f"处理 {i}/{total_rows}")

								    # Comments whose length is 0 (missing or blank) are excluded from the averages.
								    scored = [metrics for metrics in map(calc_metrics, row_values) if metrics[0] > 0]
								    count = len(scored)

								    if count:
								        lengths, complexities, sentiments, richness = zip(*scored)
								        x2_list.append(sum(lengths) / count)
								        x3_list.append(sum(complexities) / count)
								        x5_list.append(sum(sentiments) / count)
								        x6_list.append(sum(richness) / count)
								    else:
								        x2_list.append(0)
								        x3_list.append(0)
								        x5_list.append(0)
								        x6_list.append(0)

								df['X2'] = x2_list
								df['X3'] = x3_list
								df['X5'] = x5_list
								df['X6'] = x6_list

								# X4 = X2 / X3, defined as 0 where X3 is not positive. The vectorized
								# form also works on an empty sheet, where a row-wise apply returns a
								# DataFrame that cannot be assigned to a single column.
								df['X4'] = (df['X2'] / df['X3']).where(df['X3'] > 0, 0)

								# Clean up: force every model variable to numeric and replace NaN/±inf with 0.
								for col in ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']:
								    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).replace([float('inf'), float('-inf')], 0)

								print("保存文件...")
								df.to_excel(output_file, index=False, engine='openpyxl')

								print(f"完成！文件大小: {os.path.getsize(output_file)/1024:.2f} KB")
								print(f"行数: {len(df)}, 列数: {len(df.columns)}")