# java/project/run_with_output.py


								import os

								import pandas as pd

								import re

								import sys


								# 重定向输出到文件和屏幕

								class Tee:
								    """Fan-out text stream: every write is mirrored to several file objects.

								    The script installs an instance as ``sys.stdout`` so that printed output
								    reaches both the console and a log file.
								    """

								    def __init__(self, *files):
								        """Remember the target file-like objects (each needs write/flush)."""
								        self.files = files

								    def write(self, obj):
								        """Write *obj* to every target and flush each one right away.

								        Returns the number of characters in *obj*, which is what text
								        streams return from ``write``; callers that check the result
								        therefore see the same value as with a plain stream.
								        """
								        for f in self.files:
								            f.write(obj)
								            # Flush per write so the log stays current even if the run dies.
								            f.flush()
								        return len(obj)

								    def flush(self):
								        """Flush every target."""
								        for f in self.files:
								            f.flush()


								# From here on everything printed goes to the console and to the log file.
								log_file = open(r'D:\java\project\process_log.txt', 'w', encoding='utf-8')
								original_stdout = sys.stdout
								sys.stdout = Tee(original_stdout, log_file)

								print("========================================")
								print("  在原表中添加回归数据列")
								print("========================================")

								# Source workbook and the workbook that receives the extra regression columns.
								input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
								output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）_回归.xlsx'

								print(f"输入文件: {input_file}")
								print(f"输出文件: {output_file}")
								print()

								# Bail out early when the source workbook is missing.
								if not os.path.exists(input_file):
								    print("错误: 输入文件不存在！")
								    # Undo the redirection and release the log before leaving.
								    sys.stdout = original_stdout
								    log_file.close()
								    # sys.exit is always available; the bare exit() helper is injected by
								    # the site module and is missing under `python -S` and some frozen builds.
								    sys.exit(1)

								print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")


								# 读取原始数据

								try:

								    print("\n正在读取原始数据...")
								    df = pd.read_excel(input_file)
								    row_total, column_total = len(df), len(df.columns)
								    print(f"成功读取 {row_total} 行数据")
								    print(f"原始列数: {column_total}")

								    # Classify columns by header text: the helpfulness column (Y), the
								    # total-comment-count column (X1), and the comment-content columns whose
								    # header also contains one of the digits 1-5.
								    print("\n识别列...")
								    helpfull_col = comment_count_col = None
								    comment_cols = []
								    slot_digits = [str(n) for n in range(1, 6)]

								    for col in df.columns:
								        header = str(col)
								        header_lower = header.lower()

								        if any(tag in header_lower for tag in ('helpfull', 'helpful')):
								            helpfull_col = col
								            print(f"找到 Y 列 (helpfull): {col}")
								            continue

								        if any(tag in header for tag in ('评论总数', '帖子评论总数')):
								            comment_count_col = col
								            print(f"找到 X1 列 (评论总数): {col}")
								            continue

								        mentions_slot = any(digit in header for digit in slot_digits)
								        if '评论' in header and mentions_slot and '内容' in header:
								            comment_cols.append(col)
								            print(f"找到评论列 {len(comment_cols)}: {col}")

								    print(f"\n共找到 {len(comment_cols)} 个评论内容列")


								    # Y and X1 are straight numeric copies of the detected source columns;
								    # unparseable cells become 0, and a missing source column yields all zeros.
								    print("\n添加回归数据列...")
								    direct_columns = (
								        ("1. 添加 Y (UGC有用性)", 'Y', helpfull_col),
								        ("2. 添加 X1 (评论数量)", 'X1', comment_count_col),
								    )
								    for banner, target, source in direct_columns:
								        print(banner)
								        if not source:
								            df[target] = 0
								            continue
								        df[target] = pd.to_numeric(df[source], errors='coerce').fillna(0)


								    # 定义函数计算评论指标

								    def calculate_comment_metrics(content):
								        """Score a single comment cell.

								        Args:
								            content: The cell value; anything non-missing is converted with
								                ``str()`` before scoring.

								        Returns:
								            A ``(length, complexity, sentiment, richness)`` tuple:
								            length is the character count after removing ASCII and
								            ideographic spaces, complexity is the number of
								            whitespace-separated tokens, sentiment is 1 / 0 / -1 from keyword
								            lookup, and richness is 0-3 (one point each for a digit, a link
								            and an emoji/emoticon). Missing or empty cells give
								            ``(0, 0, 0, 0)``.
								        """
								        if pd.isna(content) or str(content) in ['None', 'nan', '']:
								            return 0, 0, 0, 0

								        content = str(content)

								        # X2: length without ASCII / ideographic spaces.
								        length = len(content.replace(' ', '').replace('\u3000', ''))

								        # X3: number of whitespace-separated tokens.
								        complexity = len(content.split())

								        # X5: keyword sentiment (positive=1, neutral=0, negative=-1).
								        positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like']
								        negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike']

								        lower_content = content.lower()
								        has_negative = any(word in lower_content for word in negative_words)

								        # Several negative phrases contain a positive keyword ('不好' has '好',
								        # 'dislike' has 'like', '不满意' has '满意'). Blank the negative phrases
								        # out before looking for positives so they are not counted as praise.
								        # A space is used as filler so the neighbours cannot fuse into a new word.
								        positive_scope = lower_content
								        for word in negative_words:
								            positive_scope = positive_scope.replace(word, ' ')

								        if any(word in positive_scope for word in positive_words):
								            # A genuine positive keyword still wins over negatives, as before.
								            sentiment = 1
								        elif has_negative:
								            sentiment = -1
								        else:
								            sentiment = 0

								        # X6: one point each for a digit, a link, and an emoji/emoticon.
								        richness = 0
								        if re.search(r'\d', content):
								            richness += 1
								        if re.search(r'http[s]?://|www\.', content):
								            richness += 1
								        if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content):
								            richness += 1

								        return length, complexity, sentiment, richness


								    # Per-row averages of the comment metrics across the comment columns.
								    print("3. 计算评论相关指标...")

								    total_rows = len(df)
								    print(f"总数据行数: {total_rows}")

								    # Pull every comment column out once as a plain list. Looking cells up
								    # through df.iloc[i] would rebuild a full row Series for each cell, and
								    # writing back with df.loc[i, ...] would silently depend on the index
								    # labels being 0..n-1.
								    comment_values = [df[col].tolist() for col in comment_cols]

								    # One result list per output column, pre-filled with 0.0 so rows without
								    # any usable comment keep 0. Key order matches the metric tuple order:
								    # length -> X2, complexity -> X3, sentiment -> X5, richness -> X6.
								    metric_columns = {name: [0.0] * total_rows for name in ('X2', 'X3', 'X5', 'X6')}

								    for i in range(total_rows):
								        if i % 1000 == 0:
								            print(f"  处理第 {i}/{total_rows} 行...")

								        scored = [calculate_comment_metrics(values[i]) for values in comment_values]
								        # Only comments with actual content take part in the averages.
								        scored = [metrics for metrics in scored if metrics[0] > 0]
								        if not scored:
								            continue

								        for name, per_comment in zip(metric_columns, zip(*scored)):
								            metric_columns[name][i] = sum(per_comment) / len(per_comment)

								    # Positional assignment: correct whatever the frame's index looks like.
								    for name, values in metric_columns.items():
								        df[name] = values


								    # X4: readability = X2 / X3, recorded as 0 when X3 is not positive so the
								    # division can never fail.
								    print("4. 计算 X4 (评论可读性)")

								    def _length_per_token(row):
								        """Return X2 / X3 for one row, or 0 when X3 is not positive."""
								        if row['X3'] > 0:
								            return row['X2'] / row['X3']
								        return 0

								    df['X4'] = df.apply(_length_per_token, axis=1)

								    # Cleaning: force every regression column to plain numbers, turning
								    # unparseable values, missing values and infinities into 0.
								    print("\n5. 数据清洗...")
								    regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
								    infinities = [float('inf'), float('-inf')]
								    for col in regression_cols:
								        as_number = pd.to_numeric(df[col], errors='coerce').fillna(0)
								        df[col] = as_number.replace(infinities, 0)


								    # Validation report: frame size, summary statistics, a preview, and the
								    # number of nulls left in each regression column.
								    print("\n6. 验证数据...")
								    n_rows, n_cols = len(df), len(df.columns)
								    print(f"总行数: {n_rows}")
								    print(f"总列数: {n_cols}")

								    regression_view = df[regression_cols]
								    print("\n回归数据列统计:")
								    print(regression_view.describe())
								    print("\n前5行回归数据:")
								    print(regression_view.head())

								    print("\n空值检查:")
								    null_counts = {name: df[name].isnull().sum() for name in regression_cols}
								    for col, null_count in null_counts.items():
								        print(f"  {col}: {null_count} 个空值")


								    # Write the enriched frame to the output workbook.
								    print("\n7. 保存文件...")
								    print(f"正在保存到: {output_file}")
								    df.to_excel(output_file, index=False, engine='openpyxl')

								    # Confirm the workbook exists, then read it back and report its shape.
								    print("\n8. 验证文件...")
								    if not os.path.exists(output_file):
								        print("文件保存失败！")
								    else:
								        print(f"文件已成功保存: {output_file}")
								        size_kb = os.path.getsize(output_file) / 1024
								        print(f"文件大小: {size_kb:.2f} KB")

								        df_check = pd.read_excel(output_file)
								        print(f"输出文件行数: {len(df_check)}")
								        print(f"输出文件列数: {len(df_check.columns)}")
								        written_regression_cols = [col for col in df_check.columns if col in regression_cols]
								        print(f"\n回归数据列: {written_regression_cols}")

								    closing_lines = (
								        "",
								        "========================================",
								        "  任务完成",
								        "========================================",
								        f"新文件已保存: {output_file}",
								        "包含原始数据的所有列以及新增的Y, X1-X6回归数据列",
								    )
								    for line in closing_lines:
								        print(line)


								except Exception as e:

								    print(f"处理文件时出错: {str(e)}")

								    import traceback

								    traceback.print_exc()

								finally:

								    sys.stdout = original_stdout

								    log_file.close()

								    print("日志已保存到: D:\\java\\project\\process_log.txt")