java/project/process_efficient.py


								import os

								import pandas as pd

								import re


								print("=" * 60)

								print("  高效处理全部数据")

								print("=" * 60)


								# 文件路径

								input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'

								output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）_回归.xlsx'


								print(f"输入文件: {input_file}")

								print(f"输出文件: {output_file}")

								print()


								# 首先读取表头来识别列

								print("1. 读取表头...")

								df_header = pd.read_excel(input_file, engine='openpyxl', nrows=0)

								print(f"总列数: {len(df_header.columns)}")


								# 识别列

								helpfull_col = None

								comment_count_col = None

								comment_cols = []


								for col in df_header.columns:

								    col_str = str(col).lower()

								    if 'helpfull' in col_str or 'helpful' in col_str:

								        helpfull_col = col

								        print(f"找到 Y 列 (helpfull): {col}")

								    elif '评论总数' in str(col) or '帖子评论总数' in str(col):

								        comment_count_col = col

								        print(f"找到 X1 列 (评论总数): {col}")

								    elif '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6)) and '内容' in str(col):

								        comment_cols.append(col)


								print(f"共找到 {len(comment_cols)} 个评论内容列")


								# 定义函数计算评论指标

								def calculate_comment_metrics(content):

								    if pd.isna(content) or str(content) in ['None', 'nan', '']:

								        return 0, 0, 0, 0


								    content = str(content)

								    length = len(content.replace(' ', '').replace('\u3000', ''))

								    complexity = len(content.split())


								    positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like']

								    negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike']


								    sentiment = 0

								    lower_content = content.lower()

								    if any(word in lower_content for word in positive_words):

								        sentiment = 1

								    elif any(word in lower_content for word in negative_words):

								        sentiment = -1


								    richness = 0

								    if re.search(r'\d', content):

								        richness += 1

								    if re.search(r'http[s]?://|www\.', content):

								        richness += 1

								    if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content):

								        richness += 1


								    return length, complexity, sentiment, richness


								# 分批处理数据

								print("\n2. 分批处理数据...")

								batch_size = 5000

								batch_num = 0

								all_data = []


								while True:

								    skip_rows = batch_num * batch_size + 1 if batch_num > 0 else 0

								    nrows = batch_size


								    print(f"  处理批次 {batch_num + 1} (跳过 {skip_rows} 行，读取 {nrows} 行)...")


								    try:

								        if batch_num == 0:

								            df_batch = pd.read_excel(input_file, engine='openpyxl', nrows=nrows)

								        else:

								            df_batch = pd.read_excel(input_file, engine='openpyxl', skiprows=skip_rows, nrows=nrows, header=None)

								            df_batch.columns = df_header.columns

								    except Exception as e:

								        print(f"  读取完成或出错: {e}")

								        break


								    if len(df_batch) == 0:

								        print("  没有更多数据")

								        break


								    print(f"  读取了 {len(df_batch)} 行")


								    # 添加Y和X1

								    if helpfull_col:

								        df_batch['Y'] = pd.to_numeric(df_batch[helpfull_col], errors='coerce').fillna(0)

								    else:

								        df_batch['Y'] = 0


								    if comment_count_col:

								        df_batch['X1'] = pd.to_numeric(df_batch[comment_count_col], errors='coerce').fillna(0)

								    else:

								        df_batch['X1'] = 0


								    # 初始化X2-X6

								    df_batch['X2'] = 0.0

								    df_batch['X3'] = 0.0

								    df_batch['X5'] = 0.0

								    df_batch['X6'] = 0.0


								    # 计算评论指标

								    for i in range(len(df_batch)):

								        lengths = []

								        complexities = []

								        sentiments = []

								        richness = []


								        for col in comment_cols:

								            content = df_batch.iloc[i].get(col, '')

								            length, complexity, sentiment, r = calculate_comment_metrics(content)

								            if length > 0:

								                lengths.append(length)

								                complexities.append(complexity)

								                sentiments.append(sentiment)

								                richness.append(r)


								        if lengths:

								            df_batch.loc[i, 'X2'] = sum(lengths) / len(lengths)

								            df_batch.loc[i, 'X3'] = sum(complexities) / len(complexities)

								            df_batch.loc[i, 'X5'] = sum(sentiments) / len(sentiments)

								            df_batch.loc[i, 'X6'] = sum(richness) / len(richness)


								    # 计算X4

								    df_batch['X4'] = df_batch.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1)


								    # 数据清洗

								    regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']

								    for col in regression_cols:

								        df_batch[col] = pd.to_numeric(df_batch[col], errors='coerce').fillna(0)

								        df_batch[col] = df_batch[col].replace([float('inf'), float('-inf')], 0)


								    all_data.append(df_batch)

								    batch_num += 1


								    print(f"  批次 {batch_num} 完成，当前总行数: {sum(len(d) for d in all_data)}")


								# 合并所有数据

								print("\n3. 合并数据...")

								df_final = pd.concat(all_data, ignore_index=True)

								print(f"合并后总行数: {len(df_final)}")


								# 验证数据

								print("\n4. 验证数据...")

								regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']

								print(f"总列数: {len(df_final.columns)}")

								print(f"\n回归数据列统计:")

								print(df_final[regression_cols].describe())


								# 保存文件

								print("\n5. 保存文件...")

								df_final.to_excel(output_file, index=False, engine='openpyxl')


								# 验证文件

								print("\n6. 验证文件...")

								if os.path.exists(output_file):

								    print(f"文件已成功保存: {output_file}")

								    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")

								    df_check = pd.read_excel(output_file)

								    print(f"输出文件行数: {len(df_check)}")

								    print(f"输出文件列数: {len(df_check.columns)}")

								else:

								    print("文件保存失败！")


								print()

								print("=" * 60)

								print("  任务完成")

								print("=" * 60)