# You cannot select more than 25 topics
# Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
# 112 lines
# 3.6 KiB
# 112 lines
# 3.6 KiB
"""Build the UGC regression workbook from the raw post-information workbook.

The script reads the raw Excel file, extracts the dependent variable ``Y``
(the ``helpfull`` column) and the regressor ``X1`` (the first column whose
name contains both "评论" and "总数"), adds zero-filled placeholder columns
``X2``..``X6``, coerces everything to numbers and writes the result to a new
Excel file.
"""
import os
import re
import sys
import traceback

import numpy as np
import pandas as pd

# File paths
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'

# (column name, label printed in the progress message) for the regressors
# that are not derived from the raw data yet; each is filled with 0.
PLACEHOLDER_FEATURES = (
    ('X2', '评论长度'),
    ('X3', '评论复杂度'),
    ('X4', '评论可读性'),
    ('X5', '内容情感性'),
    ('X6', '信息丰富度'),
)


def build_regression_data(df):
    """Return the regression frame (columns Y, X1..X6) derived from *df*.

    Args:
        df: Raw data as a ``pandas.DataFrame``.

    Returns:
        A new all-numeric ``pandas.DataFrame`` with one row per row of *df*.
        Missing or non-numeric cells become 0; a column that cannot be found
        in *df* is filled with 0 for every row.
    """
    # Seed the frame with the source index so that scalar defaults broadcast
    # to every row. Starting from an index-less frame would yield zero-length
    # columns whenever a source column is missing.
    regression_data = pd.DataFrame(index=df.index)

    # 1. Dependent variable Y (the "helpfull" column)
    print("1. 提取因变量Y (helpfull列)")
    if 'helpfull' in df.columns:
        regression_data['Y'] = df['helpfull'].fillna(0)
        print(f"成功提取 Y 列,共 {len(regression_data['Y'])} 个值")
    else:
        print("警告: 未找到 helpfull 列,使用默认值 0")
        regression_data['Y'] = 0

    # 2. X1 (total number of comments)
    print("\n2. 提取X1 (评论总数列)")
    # str() guards against non-string column labels (e.g. integer headers),
    # for which the ``in`` test would raise TypeError.
    comment_columns = [
        col for col in df.columns
        if '评论' in str(col) and '总数' in str(col)
    ]
    if comment_columns:
        regression_data['X1'] = df[comment_columns[0]].fillna(0)
        print(f"成功提取 X1 列,使用列: {comment_columns[0]}")
    else:
        print("警告: 未找到评论总数列,使用默认值 0")
        regression_data['X1'] = 0

    # 3. X2-X6: placeholders only, no value is computed from the data here.
    print("\n3. 计算X2-X6")
    for name, label in PLACEHOLDER_FEATURES:
        print(f" - 计算{name} ({label})")
        regression_data[name] = 0

    # 4. Data cleaning: make sure every value is numeric
    print("\n4. 数据清洗")
    for col in regression_data.columns:
        regression_data[col] = pd.to_numeric(
            regression_data[col], errors='coerce'
        ).fillna(0)

    return regression_data


def main():
    """Run the pipeline: read the input workbook, build and save the output.

    Exits with status 1 when the input file is missing or when processing
    raises an exception.
    """
    print("========================================")
    print(" 创建UGC回归数据文件")
    print("========================================")
    print(f"输入文件: {input_file}")
    print(f"输出文件: {output_file}")
    print()

    # Check that the input file exists
    if not os.path.exists(input_file):
        print("错误: 输入文件不存在!")
        sys.exit(1)

    print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")

    try:
        # Read the raw data
        print("正在读取原始数据...")
        df = pd.read_excel(input_file)
        print(f"成功读取 {len(df)} 行数据")
        print(f"列名: {list(df.columns)}")
        print()

        # Steps 1-4: build and clean the regression frame
        regression_data = build_regression_data(df)

        # 5. Validate the data
        print("\n5. 数据验证")
        print(f"行数: {len(regression_data)}")
        print(f"列数: {len(regression_data.columns)}")
        print(f"列名: {list(regression_data.columns)}")
        print("数据类型:")
        print(regression_data.dtypes)
        print("\n前5行数据:")
        print(regression_data.head())

        # 6. Save the file
        print("\n6. 保存文件")
        regression_data.to_excel(output_file, index=False)

        # Verify that the file was actually created
        if os.path.exists(output_file):
            print(f"文件已成功保存到: {output_file}")
            print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
        else:
            print("错误: 文件保存失败")

        print()
        print("========================================")
        print(" 任务完成")
        print("========================================")

    except Exception as e:
        # Top-level boundary: report the failure with its traceback and
        # signal it through the exit status, like the missing-input path.
        print(f"处理文件时出错: {str(e)}")
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()