You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
113 lines
3.6 KiB
113 lines
3.6 KiB
import os
import sys
import traceback

import openpyxl
import pandas as pd

# File paths: the raw post data that is read, and the regression workbook
# that is opened, filled in place and saved back to the same location.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'

# Separator line printed around the start and end banners.
BANNER = "========================================"

# Worksheet layout: row 1 is treated as the header row, so data starts at
# row 2. Y goes to column 1, X1 to column 2 and X2..X6 to columns 3..7.
FIRST_DATA_ROW = 2
Y_COLUMN = 1
X1_COLUMN = 2
# (worksheet column, label shown in the progress message) for the regressors
# that are not computed yet and are written as 0 for every row.
PLACEHOLDER_COLUMNS = (
    (3, "X2 (评论长度)"),
    (4, "X3 (评论复杂度)"),
    (5, "X4 (评论可读性)"),
    (6, "X5 (内容情感性)"),
    (7, "X6 (信息丰富度)"),
)
TOTAL_COLUMNS = 7


def cell_values(series):
    """Return the values of *series* as a list ready to be written to cells.

    Missing values (anything ``pd.isna`` reports as missing) become ``0``;
    every other value is converted with ``float``.

    Raises:
        ValueError / TypeError: if a non-missing value cannot be converted
            to ``float``.
    """
    return [0 if pd.isna(value) else float(value) for value in series]


def find_comment_column(columns):
    """Return the first column label containing '评论', or ``None``.

    Non-string labels (for example numeric Excel headers) are skipped instead
    of being tested with ``in``, which would raise ``TypeError`` on them.
    """
    for label in columns:
        if isinstance(label, str) and '评论' in label:
            return label
    return None


def fill_column(ws, column, values):
    """Write *values* into worksheet column *column*, one per row.

    The first value goes to ``FIRST_DATA_ROW``; *ws* only needs a
    ``cell(row=..., column=..., value=...)`` method.
    """
    for row, value in enumerate(values, FIRST_DATA_ROW):
        ws.cell(row=row, column=column, value=value)


def fill_regression_data(source_path, workbook_path):
    """Copy Y and X1 from the source sheet into the workbook and save it.

    Reads *source_path* with ``pd.read_excel``, opens *workbook_path* with
    openpyxl and writes into its active sheet:

    * column 1 (Y): the ``helpfull`` column, or 0 if that column is absent;
    * column 2 (X1): the first column whose name contains '评论', or 0 if
      there is none;
    * columns 3-7 (X2-X6): 0 for every row (placeholders, nothing is
      computed for them here).

    The workbook is saved only after every column has been filled, so an
    error while converting values leaves the file on disk unchanged.
    Any exception from pandas/openpyxl or from value conversion propagates
    to the caller.
    """
    print("正在读取原始数据...")
    df = pd.read_excel(source_path)
    row_count = len(df)
    print(f"成功读取 {row_count} 行数据")
    print(f"列名: {list(df.columns)}")

    # Open the output workbook.
    print("\n打开输出文件...")
    wb = openpyxl.load_workbook(workbook_path)
    ws = wb.active

    # Extract the data and fill the sheet.
    print("\n填充数据...")
    zeros = [0] * row_count

    # Y column (helpfull).
    print("1. 填充Y列 (helpfull)")
    if 'helpfull' in df.columns:
        fill_column(ws, Y_COLUMN, cell_values(df['helpfull']))
        print(f"成功填充 Y 列,共 {row_count} 行")
    else:
        print("警告: 未找到 helpfull 列,使用默认值 0")
        fill_column(ws, Y_COLUMN, zeros)

    # X1 column (total number of comments).
    print("\n2. 填充X1列 (评论总数)")
    comment_column = find_comment_column(df.columns)
    if comment_column is not None:
        fill_column(ws, X1_COLUMN, cell_values(df[comment_column]))
        print(f"成功填充 X1 列,使用列: {comment_column}")
    else:
        print("警告: 未找到评论列,使用默认值 0")
        fill_column(ws, X1_COLUMN, zeros)

    # X2-X6: written as 0 for every data row.
    print("\n3. 计算X2-X6")
    for column, label in PLACEHOLDER_COLUMNS:
        print(f" - 填充{label}")
        fill_column(ws, column, zeros)

    # Save the workbook back to the same path.
    print("\n4. 保存文件")
    wb.save(workbook_path)

    print(f"文件已成功保存: {workbook_path}")
    print(f"总行数: {row_count + 1} (包括表头)")
    print(f"总列数: {TOTAL_COLUMNS}")

    print()
    print(BANNER)
    print(" 任务完成")
    print(BANNER)


def main():
    """Run the fill job and return the process exit status.

    Returns 1 if either file is missing or if processing raises, 0 otherwise.
    Failures are reported on stdout (with a traceback for exceptions) rather
    than being re-raised, so the caller only sees the status code.
    """
    print(BANNER)
    print(" 填充UGC回归数据")
    print(BANNER)
    print(f"输入文件: {input_file}")
    print(f"输出文件: {output_file}")
    print()

    # Both files must already exist: the output workbook is updated in
    # place, not created.
    if not os.path.exists(input_file):
        print("错误: 输入文件不存在!")
        return 1

    if not os.path.exists(output_file):
        print("错误: 输出文件不存在!")
        return 1

    try:
        fill_regression_data(input_file, output_file)
    except Exception as e:
        # Top-level boundary: report the failure and signal it through the
        # exit status instead of finishing with status 0.
        print(f"处理文件时出错: {str(e)}")
        traceback.print_exc()
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())