java/project/process_all_data.py


								import os

								import openpyxl

								import re


								# File paths: the workbook to read and the workbook this script writes.
								input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
								output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'

								# Start-up banner showing which files this run uses.
								print("========================================")
								print("  处理所有数据")
								print("========================================")
								print(f"输入文件: {input_file}")
								print(f"输出文件: {output_file}")
								print()

								# Stop early (exit status 1) when the input workbook is missing.
								if not os.path.exists(input_file):
								    print("错误: 输入文件不存在！")
								    # `exit` is a helper injected by the `site` module for interactive
								    # sessions and may be absent (e.g. `python -S`, frozen builds);
								    # raising SystemExit is always available and has the same effect.
								    raise SystemExit(1)

								print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")


								# ---------------------------------------------------------------------
								# Scoring rules, kept as small pure helpers so that each rule can be
								# checked without opening a workbook.
								# ---------------------------------------------------------------------

								# Keyword lists for the X5 sentiment score. They are matched as
								# substrings of the lower-cased comment text.
								POSITIVE_WORDS = ('好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent')
								NEGATIVE_WORDS = ('差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor')

								# Patterns for the X6 richness score, compiled once instead of being
								# looked up again for every comment.
								DIGIT_PATTERN = re.compile(r'\d')
								LINK_PATTERN = re.compile(r'http[s]?://')
								# Symbols/dingbats (U+2600-U+27BF), the supplementary-plane emoji blocks
								# (U+1F300-U+1FAFF) and simple ASCII smileys such as ":)" or ";-D".
								EMOJI_PATTERN = re.compile(r'[\u2600-\u27BF\U0001F300-\U0001FAFF]|[:;][-]?[)D]')


								def classify_header(header):
								    """Map a header cell value to its role.

								    Returns 'Y' for the helpfulness column, 'X1' for the total comment
								    count column, 'COMMENT' for a comment-text column (header contains
								    '评论' and one of the digits 1-5), or None for anything else,
								    including empty headers.
								    """
								    if not header:
								        return None
								    text = str(header)
								    # 'helpfull' contains 'helpful', so one substring test covers both spellings.
								    if 'helpful' in text.lower():
								        return 'Y'
								    # '帖子评论总数' contains '评论总数', so one substring test covers both.
								    if '评论总数' in text:
								        return 'X1'
								    if '评论' in text and any(digit in text for digit in '12345'):
								        return 'COMMENT'
								    return None


								def cell_number(value) -> float:
								    """Convert a numeric cell value to float; empty or falsy cells count as 0.

								    Raises:
								        ValueError: if the cell holds text that ``float`` cannot parse. This
								            is deliberately not swallowed so that bad input stops the run
								            instead of silently becoming 0.
								    """
								    return float(value) if value else 0


								def mean_or_zero(values) -> float:
								    """Return the arithmetic mean of *values*, or 0 when there are none."""
								    return sum(values) / len(values) if values else 0


								def sentiment_score(text: str) -> int:
								    """Return 1 (positive), -1 (negative) or 0 (neutral) for one comment.

								    Negative keywords are masked out before the positive keywords are
								    tested. Without that step '不好' always matches the positive keyword
								    '好' (and '不满意' matches '满意'), so such comments would be scored
								    as positive. A comment with a positive keyword outside any negative
								    keyword is still scored 1, even if it also contains negative keywords.
								    """
								    lowered = text.lower()
								    has_negative = any(word in lowered for word in NEGATIVE_WORDS)

								    remainder = lowered
								    for word in NEGATIVE_WORDS:
								        # Replace with a space so the fragments on either side cannot
								        # join up into a new keyword.
								        remainder = remainder.replace(word, ' ')

								    if any(word in remainder for word in POSITIVE_WORDS):
								        return 1
								    return -1 if has_negative else 0


								def richness_score(text: str) -> int:
								    """Return 0-3: one point each for a digit, a link and an emoji/smiley."""
								    patterns = (DIGIT_PATTERN, LINK_PATTERN, EMOJI_PATTERN)
								    return sum(1 for pattern in patterns if pattern.search(text))


								def comment_features(cell_values) -> tuple:
								    """Compute (X2, X3, X4, X5, X6) from the comment cells of one post.

								    Cells whose string form is empty, 'None' or 'nan' are treated as
								    missing and ignored. Each result is the mean over the remaining
								    comments and is 0 when no comment is left.

								    X2: characters per comment after removing ASCII spaces.
								    X3: whitespace-separated tokens per comment.
								    X4: X2 / X3, or 0 when X3 is 0.
								    X5: mean sentiment score (see ``sentiment_score``).
								    X6: mean richness score (see ``richness_score``).
								    """
								    lengths = []
								    complexities = []
								    sentiments = []
								    richness = []

								    for value in cell_values:
								        content = str(value)
								        if not content or content in ('None', 'nan'):
								            continue
								        lengths.append(len(content.replace(' ', '')))
								        complexities.append(len(content.split()))
								        sentiments.append(sentiment_score(content))
								        richness.append(richness_score(content))

								    x2 = mean_or_zero(lengths)
								    x3 = mean_or_zero(complexities)
								    x4 = x2 / x3 if x3 > 0 else 0
								    x5 = mean_or_zero(sentiments)
								    x6 = mean_or_zero(richness)
								    return x2, x3, x4, x5, x6


								# ---------------------------------------------------------------------
								# Main processing: read the input workbook, compute Y and X1-X6 for
								# every data row, write them to a new workbook and read it back.
								# ---------------------------------------------------------------------
								try:
								    print("正在读取原始数据...")
								    wb_input = openpyxl.load_workbook(input_file)
								    ws_input = wb_input.active

								    print(f"工作表名称: {ws_input.title}")
								    print(f"最大行数: {ws_input.max_row}")
								    print(f"最大列数: {ws_input.max_column}")

								    # Identify the relevant columns from the header row (row 1).
								    print("\n识别列...")
								    headers = []
								    helpfull_col = None
								    comment_count_col = None
								    comment_cols = []

								    for col in range(1, ws_input.max_column + 1):
								        header = ws_input.cell(row=1, column=col).value
								        headers.append(header)

								        role = classify_header(header)
								        if role == 'Y':
								            # If several headers match, the right-most one wins.
								            helpfull_col = col
								            print(f"找到 Y 列 (helpfull): 列 {col}")
								        elif role == 'X1':
								            comment_count_col = col
								            print(f"找到 X1 列 (评论总数): 列 {col}")
								        elif role == 'COMMENT':
								            comment_cols.append(col)
								            print(f"找到评论列 {len(comment_cols)}: 列 {col} - {header}")

								    print(f"\n共找到 {len(comment_cols)} 个评论列")

								    # A missing column is filled with 0 for every row; say so instead of
								    # silently producing an all-zero variable.
								    if helpfull_col is None:
								        print("警告: 未找到 Y 列 (helpfull)，Y 将全部记为 0")
								    if comment_count_col is None:
								        print("警告: 未找到 X1 列 (评论总数)，X1 将全部记为 0")

								    # Create the output workbook.
								    print("\n创建新的输出文件...")
								    wb_output = openpyxl.Workbook()
								    ws_output = wb_output.active

								    # Write the header row.
								    headers_output = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
								    for i, header in enumerate(headers_output, 1):
								        ws_output.cell(row=1, column=i, value=header)

								    # Compute and fill the data rows. Output row numbers mirror the
								    # input row numbers.
								    print("\n计算并填充数据...")
								    total_rows = ws_input.max_row - 1
								    print(f"总数据行数: {total_rows}")

								    for row in range(2, ws_input.max_row + 1):
								        if row % 1000 == 0:
								            print(f"处理到第 {row-1} 行...")

								        # Y (UGC helpfulness)
								        if helpfull_col is not None:
								            y_value = cell_number(ws_input.cell(row=row, column=helpfull_col).value)
								        else:
								            y_value = 0

								        # X1 (number of comments)
								        if comment_count_col is not None:
								            x1_value = cell_number(ws_input.cell(row=row, column=comment_count_col).value)
								        else:
								            x1_value = 0

								        # X2-X6 (per-comment metrics averaged over this row's comments)
								        comment_values = [ws_input.cell(row=row, column=col).value for col in comment_cols]
								        x2_value, x3_value, x4_value, x5_value, x6_value = comment_features(comment_values)

								        row_values = (y_value, x1_value, x2_value, x3_value, x4_value, x5_value, x6_value)
								        for i, value in enumerate(row_values, 1):
								            ws_output.cell(row=row, column=i, value=value)

								    # Save the output workbook.
								    print("\n保存文件...")
								    wb_output.save(output_file)

								    print(f"文件已成功保存: {output_file}")
								    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
								    print(f"处理完成，共 {total_rows} 行数据")

								    # Verify the result by reopening the saved workbook.
								    print("\n验证文件...")
								    if os.path.exists(output_file):
								        print("文件保存成功！")
								        wb_check = openpyxl.load_workbook(output_file)
								        ws_check = wb_check.active
								        print(f"输出文件行数: {ws_check.max_row - 1}")
								        print(f"输出文件列数: {ws_check.max_column}")

								        # Show the first rows (header row plus up to four data rows).
								        print("\n前5行数据:")
								        for row in range(1, min(6, ws_check.max_row + 1)):
								            row_data = [
								                ws_check.cell(row=row, column=col).value
								                for col in range(1, ws_check.max_column + 1)
								            ]
								            print(f"行 {row}: {row_data}")
								    else:
								        print("文件保存失败！")

								    print()
								    print("========================================")
								    print("  任务完成")
								    print("========================================")

								except Exception as e:
								    # Top-level boundary of the script: report the failure with a full
								    # traceback instead of letting the interpreter print it alone.
								    print(f"处理文件时出错: {str(e)}")
								    import traceback
								    traceback.print_exc()