diff --git a/project/AddRegressionColumns.java b/project/AddRegressionColumns.java deleted file mode 100644 index 60f682a..0000000 --- a/project/AddRegressionColumns.java +++ /dev/null @@ -1,224 +0,0 @@ -import org.apache.poi.ss.usermodel.*; -import org.apache.poi.xssf.usermodel.XSSFWorkbook; -import java.io.*; -import java.util.*; -import java.util.regex.*; - -public class AddRegressionColumns { - public static void main(String[] args) { - String inputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子实验数据(新).xlsx"; - String outputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子实验数据(新)_回归.xlsx"; - - System.out.println("========================================"); - System.out.println(" 在原表中添加回归数据列"); - System.out.println("========================================"); - System.out.println("输入文件: " + inputFile); - System.out.println("输出文件: " + outputFile); - System.out.println(); - - try { - // 读取输入文件 - System.out.println("读取输入文件..."); - FileInputStream fis = new FileInputStream(inputFile); - Workbook wb = new XSSFWorkbook(fis); - Sheet sheet = wb.getSheetAt(0); - - int totalRows = sheet.getLastRowNum(); - System.out.println("总行数: " + totalRows); - - // 获取表头行 - Row headerRow = sheet.getRow(0); - int totalCols = headerRow.getLastCellNum(); - System.out.println("总列数: " + totalCols); - - // 识别列 - int helpfullCol = -1; - int commentCountCol = -1; - List commentCols = new ArrayList<>(); - - for (int i = 0; i < totalCols; i++) { - Cell cell = headerRow.getCell(i); - if (cell != null) { - String header = cell.getStringCellValue().toLowerCase(); - if (header.contains("helpfull") || header.contains("helpful")) { - helpfullCol = i; - System.out.println("找到 Y 列 (helpfull): 列 " + i); - } else if (header.contains("评论总数") || header.contains("帖子评论总数")) { - commentCountCol = i; - System.out.println("找到 X1 列 (评论总数): 列 " + i); - } else if (header.contains("评论") && header.contains("内容")) { - for (int j = 1; j <= 5; j++) { - if (header.contains(String.valueOf(j))) { - commentCols.add(i); - 
System.out.println("找到评论列 " + commentCols.size() + ": 列 " + i + " - " + header); - break; - } - } - } - } - } - - System.out.println("\n共找到 " + commentCols.size() + " 个评论列"); - - // 添加新列的表头 - int yCol = totalCols; - int x1Col = totalCols + 1; - int x2Col = totalCols + 2; - int x3Col = totalCols + 3; - int x4Col = totalCols + 4; - int x5Col = totalCols + 5; - int x6Col = totalCols + 6; - - headerRow.createCell(yCol).setCellValue("Y"); - headerRow.createCell(x1Col).setCellValue("X1"); - headerRow.createCell(x2Col).setCellValue("X2"); - headerRow.createCell(x3Col).setCellValue("X3"); - headerRow.createCell(x4Col).setCellValue("X4"); - headerRow.createCell(x5Col).setCellValue("X5"); - headerRow.createCell(x6Col).setCellValue("X6"); - - // 处理每一行数据 - System.out.println("\n处理数据..."); - Pattern digitPattern = Pattern.compile("\\d"); - Pattern urlPattern = Pattern.compile("http[s]?://|www\\."); - Pattern emojiPattern = Pattern.compile("[\\u2600-\\u27BF\\uD83C-\\uDBFF\\uDC00-\\uDFFF]|[:;][-]?[)D]"); - - String[] positiveWords = {"好", "棒", "优秀", "喜欢", "满意", "赞", "positive", "good", "great", "excellent", "love", "like"}; - String[] negativeWords = {"差", "糟糕", "不好", "失望", "不满", "negative", "bad", "terrible", "poor", "hate", "dislike"}; - - for (int i = 1; i <= totalRows; i++) { - if (i % 1000 == 0) { - System.out.println("处理第 " + i + "/" + totalRows + " 行..."); - } - - Row row = sheet.getRow(i); - if (row == null) continue; - - // Y (UGC有用性) - double y = 0; - if (helpfullCol >= 0) { - Cell cell = row.getCell(helpfullCol); - if (cell != null) { - try { - y = cell.getNumericCellValue(); - } catch (Exception e) { - y = 0; - } - } - } - row.createCell(yCol).setCellValue(y); - - // X1 (评论数量) - double x1 = 0; - if (commentCountCol >= 0) { - Cell cell = row.getCell(commentCountCol); - if (cell != null) { - try { - x1 = cell.getNumericCellValue(); - } catch (Exception e) { - x1 = 0; - } - } - } - row.createCell(x1Col).setCellValue(x1); - - // 计算评论相关指标 - List lengths = new 
ArrayList<>(); - List complexities = new ArrayList<>(); - List sentiments = new ArrayList<>(); - List richnessList = new ArrayList<>(); - - for (int colIdx : commentCols) { - Cell cell = row.getCell(colIdx); - if (cell != null) { - String content = ""; - try { - content = cell.getStringCellValue(); - } catch (Exception e) { - try { - content = String.valueOf(cell.getNumericCellValue()); - } catch (Exception e2) { - content = ""; - } - } - - if (content != null && !content.isEmpty() && !content.equals("nan") && !content.equals("null")) { - // X2: 评论长度(剔空格后的字符数) - double length = content.replace(" ", "").replace("\u3000", "").length(); - lengths.add(length); - - // X3: 评论复杂度(按空格拆分的分词数) - double complexity = content.split("\\s+").length; - complexities.add(complexity); - - // X5: 情感分析 - double sentiment = 0; - String lowerContent = content.toLowerCase(); - for (String word : positiveWords) { - if (lowerContent.contains(word)) { - sentiment = 1; - break; - } - } - if (sentiment == 0) { - for (String word : negativeWords) { - if (lowerContent.contains(word)) { - sentiment = -1; - break; - } - } - } - sentiments.add(sentiment); - - // X6: 信息丰富度 - double richness = 0; - if (digitPattern.matcher(content).find()) richness += 1; - if (urlPattern.matcher(content).find()) richness += 1; - if (emojiPattern.matcher(content).find()) richness += 1; - richnessList.add(richness); - } - } - } - - // 计算平均值(无评论记0) - double x2 = lengths.isEmpty() ? 0 : lengths.stream().mapToDouble(Double::doubleValue).average().getAsDouble(); - double x3 = complexities.isEmpty() ? 0 : complexities.stream().mapToDouble(Double::doubleValue).average().getAsDouble(); - double x5 = sentiments.isEmpty() ? 0 : sentiments.stream().mapToDouble(Double::doubleValue).average().getAsDouble(); - double x6 = richnessList.isEmpty() ? 0 : richnessList.stream().mapToDouble(Double::doubleValue).average().getAsDouble(); - - // X4: 评论可读性 = X2/X3(X3为0时记0) - double x4 = (x3 > 0) ? 
x2 / x3 : 0; - - // 写入单元格 - row.createCell(x2Col).setCellValue(x2); - row.createCell(x3Col).setCellValue(x3); - row.createCell(x4Col).setCellValue(x4); - row.createCell(x5Col).setCellValue(x5); - row.createCell(x6Col).setCellValue(x6); - } - - // 保存文件 - System.out.println("\n保存文件..."); - FileOutputStream fos = new FileOutputStream(outputFile); - wb.write(fos); - fos.close(); - wb.close(); - fis.close(); - - // 验证文件 - File output = new File(outputFile); - if (output.exists()) { - System.out.println("文件保存成功!"); - System.out.println("文件大小: " + (output.length() / 1024) + " KB"); - } - - System.out.println("\n========================================"); - System.out.println(" 任务完成"); - System.out.println("========================================"); - - } catch (Exception e) { - System.out.println("错误: " + e.getMessage()); - e.printStackTrace(); - } - } -} diff --git a/project/DataCleaner.java b/project/DataCleaner.java deleted file mode 100644 index 53cafa3..0000000 --- a/project/DataCleaner.java +++ /dev/null @@ -1,99 +0,0 @@ -import java.util.ArrayList; -import java.util.List; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -public class DataCleaner { - - public static List cleanPosts(List rawPosts) { - List cleanedPosts = new ArrayList<>(); - - for (PostInfo post : rawPosts) { - PostInfo cleaned = cleanPost(post); - if (isValidPost(cleaned)) { - cleanedPosts.add(cleaned); - } - } - - System.out.println("数据清洗完成,有效数据: " + cleanedPosts.size() + " 条"); - return cleanedPosts; - } - - private static PostInfo cleanPost(PostInfo post) { - PostInfo cleaned = new PostInfo(); - - cleaned.setTitle(cleanText(post.getTitle())); - cleaned.setContent(cleanContent(post.getContent())); - cleaned.setAuthor(cleanText(post.getAuthor())); - cleaned.setPostDate(post.getPostDate()); - cleaned.setLikeCount(post.getLikeCount()); - cleaned.setCommentCount(post.getCommentCount()); - cleaned.setViewCount(post.getViewCount()); - cleaned.setTags(cleanText(post.getTags())); - 
cleaned.setSentiment(normalizeSentiment(post.getSentiment())); - - return cleaned; - } - - private static String cleanText(String text) { - if (text == null) { - return ""; - } - return text.trim().replaceAll("\\s+", " "); - } - - private static String cleanContent(String content) { - if (content == null) { - return ""; - } - return content.trim() - .replaceAll("\\s+", " ") - .replaceAll("[\\r\\n]+", " ") - .replaceAll("<[^>]+>", "") - .replaceAll("\\[.*?\\]", "") - .replaceAll("\\(.*?\\)", ""); - } - - private static String normalizeSentiment(String sentiment) { - if (sentiment == null || sentiment.isEmpty()) { - return "中性"; - } - - String lower = sentiment.toLowerCase(); - if (lower.contains("积极") || lower.contains("正面") || lower.contains("positive")) { - return "积极"; - } else if (lower.contains("消极") || lower.contains("负面") || lower.contains("negative")) { - return "消极"; - } else { - return "中性"; - } - } - - private static boolean isValidPost(PostInfo post) { - return post.getTitle() != null && !post.getTitle().isEmpty() && - post.getContent() != null && !post.getContent().isEmpty(); - } - - public static String[] extractKeywords(String content) { - if (content == null || content.isEmpty()) { - return new String[0]; - } - - String[] commonKeywords = { - "数据", "分析", "学习", "技术", "互联网", "发展", "趋势", - "工具", "方法", "实践", "经验", "案例", "应用", "创新", - "挑战", "机遇", "未来", "智能", "算法", "模型", "平台" - }; - - List keywords = new ArrayList<>(); - String lowerContent = content.toLowerCase(); - - for (String keyword : commonKeywords) { - if (lowerContent.contains(keyword.toLowerCase())) { - keywords.add(keyword); - } - } - - return keywords.toArray(new String[0]); - } -} diff --git a/project/DataCleaningScript.java b/project/DataCleaningScript.java deleted file mode 100644 index ffc1e96..0000000 --- a/project/DataCleaningScript.java +++ /dev/null @@ -1,226 +0,0 @@ -import java.io.*; -import java.time.LocalDate; -import java.time.format.DateTimeFormatter; -import java.util.ArrayList; 
-import java.util.List; -import java.util.Locale; - -public class DataCleaningScript { - - private static final DateTimeFormatter DATE_FORMATTER = DateTimeFormatter.ofPattern("yyyy-MM-dd", Locale.CHINA); - - public static void main(String[] args) { - String inputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子原始信息计量实验使用.xlsx"; - String outputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子实验数据(新).csv"; - - System.out.println("========================================"); - System.out.println(" 数据清洗脚本"); - System.out.println("========================================"); - System.out.println("输入文件: " + inputFile); - System.out.println("输出文件: " + outputFile); - System.out.println(); - - // 读取数据 - List rawPosts = readExcelData(inputFile); - System.out.println("读取数据完成,共 " + rawPosts.size() + " 条记录"); - - // 清洗数据 - List cleanedPosts = cleanPosts(rawPosts); - System.out.println("数据清洗完成,有效记录: " + cleanedPosts.size() + " 条"); - - // 保存清洗后的数据 - saveToCSV(cleanedPosts, outputFile); - System.out.println("数据保存完成!"); - System.out.println(); - System.out.println("========================================"); - System.out.println(" 数据清洗任务完成"); - System.out.println("========================================"); - } - - private static List readExcelData(String filePath) { - List posts = new ArrayList<>(); - - try (BufferedReader reader = new BufferedReader(new FileReader(filePath, java.nio.charset.StandardCharsets.UTF_8))) { - - String line; - boolean isFirstLine = true; - - while ((line = reader.readLine()) != null) { - if (isFirstLine) { - isFirstLine = false; - continue; - } - - String[] parts = parseCSVLine(line); - if (parts.length >= 9) { - PostInfo post = parsePostInfo(parts); - if (post != null) { - posts.add(post); - } - } - } - - } catch (IOException e) { - System.err.println("读取文件时出错: " + e.getMessage()); - } - - return posts; - } - - private static String[] parseCSVLine(String line) { - List fields = new ArrayList<>(); - StringBuilder currentField = new StringBuilder(); - 
boolean inQuotes = false; - - for (char c : line.toCharArray()) { - if (c == '"') { - inQuotes = !inQuotes; - } else if (c == ',' && !inQuotes) { - fields.add(currentField.toString().trim()); - currentField.setLength(0); - } else { - currentField.append(c); - } - } - - fields.add(currentField.toString().trim()); - return fields.toArray(new String[0]); - } - - private static PostInfo parsePostInfo(String[] parts) { - try { - PostInfo post = new PostInfo(); - - post.setTitle(parts[0]); - post.setContent(parts[1]); - post.setAuthor(parts[2]); - - if (!parts[3].isEmpty()) { - post.setPostDate(LocalDate.parse(parts[3], DATE_FORMATTER)); - } - - post.setLikeCount(parseInt(parts[4])); - post.setCommentCount(parseInt(parts[5])); - post.setViewCount(parseInt(parts[6])); - - post.setTags(parts[7]); - post.setSentiment(parts[8]); - - return post; - } catch (Exception e) { - return null; - } - } - - private static int parseInt(String value) { - try { - if (value == null || value.isEmpty()) { - return 0; - } - return Integer.parseInt(value); - } catch (NumberFormatException e) { - return 0; - } - } - - private static List cleanPosts(List rawPosts) { - List cleanedPosts = new ArrayList<>(); - - for (PostInfo post : rawPosts) { - PostInfo cleaned = cleanPost(post); - if (isValidPost(cleaned)) { - cleanedPosts.add(cleaned); - } - } - - return cleanedPosts; - } - - private static PostInfo cleanPost(PostInfo post) { - PostInfo cleaned = new PostInfo(); - - cleaned.setTitle(cleanText(post.getTitle())); - cleaned.setContent(cleanContent(post.getContent())); - cleaned.setAuthor(cleanText(post.getAuthor())); - cleaned.setPostDate(post.getPostDate()); - cleaned.setLikeCount(post.getLikeCount()); - cleaned.setCommentCount(post.getCommentCount()); - cleaned.setViewCount(post.getViewCount()); - cleaned.setTags(cleanText(post.getTags())); - cleaned.setSentiment(normalizeSentiment(post.getSentiment())); - - return cleaned; - } - - private static String cleanText(String text) { - if (text == 
null) { - return ""; - } - return text.trim().replaceAll("\\s+", " "); - } - - private static String cleanContent(String content) { - if (content == null) { - return ""; - } - return content.trim() - .replaceAll("\\s+", " ") - .replaceAll("[\\r\\n]+", " ") - .replaceAll("<[^>]+>", "") - .replaceAll("\\[.*?\\]", "") - .replaceAll("\\(.*?\\)", ""); - } - - private static String normalizeSentiment(String sentiment) { - if (sentiment == null || sentiment.isEmpty()) { - return "中性"; - } - - String lower = sentiment.toLowerCase(); - if (lower.contains("积极") || lower.contains("正面") || lower.contains("positive")) { - return "积极"; - } else if (lower.contains("消极") || lower.contains("负面") || lower.contains("negative")) { - return "消极"; - } else { - return "中性"; - } - } - - private static boolean isValidPost(PostInfo post) { - return post.getTitle() != null && !post.getTitle().isEmpty() && - post.getContent() != null && !post.getContent().isEmpty(); - } - - private static void saveToCSV(List posts, String filePath) { - if (posts == null || posts.isEmpty()) { - System.out.println("没有数据需要保存"); - return; - } - - try { - // 确保目录存在 - File file = new File(filePath); - File parentDir = file.getParentFile(); - if (parentDir != null && !parentDir.exists()) { - parentDir.mkdirs(); - } - - try (BufferedWriter writer = new BufferedWriter( - new FileWriter(file, java.nio.charset.StandardCharsets.UTF_8))) { - - writer.write("\uFEFF"); // BOM for UTF-8 - writer.write("标题,内容,作者,发布日期,点赞数,评论数,浏览量,标签,情感倾向\n"); - - for (PostInfo post : posts) { - writer.write(post.toCSV()); - writer.write("\n"); - } - } - - System.out.println("数据已保存到: " + filePath); - - } catch (IOException e) { - System.err.println("保存CSV文件时出错: " + e.getMessage()); - } - } -} diff --git a/project/DataStorage.java b/project/DataStorage.java deleted file mode 100644 index 134db6d..0000000 --- a/project/DataStorage.java +++ /dev/null @@ -1,121 +0,0 @@ -import java.io.BufferedWriter; -import java.io.FileWriter; -import 
java.io.IOException; -import java.nio.charset.StandardCharsets; -import java.nio.file.Files; -import java.nio.file.Paths; -import java.time.LocalDateTime; -import java.time.format.DateTimeFormatter; -import java.util.List; - -public class DataStorage { - - public static void saveToCSV(List posts, String directory) { - if (posts == null || posts.isEmpty()) { - System.out.println("没有数据需要保存"); - return; - } - - try { - java.nio.file.Path dirPath = Paths.get(directory); - if (!Files.exists(dirPath)) { - Files.createDirectories(dirPath); - } - - String timestamp = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss")); - String filename = "posts_" + timestamp + ".csv"; - java.nio.file.Path filePath = dirPath.resolve(filename); - - try (BufferedWriter writer = new BufferedWriter( - new FileWriter(filePath.toFile(), StandardCharsets.UTF_8))) { - - writer.write("\uFEFF"); - writer.write("标题,内容,作者,发布日期,点赞数,评论数,浏览量,标签,情感倾向\n"); - - for (PostInfo post : posts) { - writer.write(post.toCSV()); - writer.write("\n"); - } - } - - System.out.println("数据已保存到: " + filePath.toAbsolutePath()); - - } catch (IOException e) { - System.err.println("保存CSV文件时出错: " + e.getMessage()); - } - } - - public static void saveToJSON(List posts, String directory) { - if (posts == null || posts.isEmpty()) { - System.out.println("没有数据需要保存"); - return; - } - - try { - java.nio.file.Path dirPath = Paths.get(directory); - if (!Files.exists(dirPath)) { - Files.createDirectories(dirPath); - } - - String timestamp = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss")); - String filename = "posts_" + timestamp + ".json"; - java.nio.file.Path filePath = dirPath.resolve(filename); - - try (BufferedWriter writer = new BufferedWriter( - new FileWriter(filePath.toFile(), StandardCharsets.UTF_8))) { - - writer.write("[\n"); - for (int i = 0; i < posts.size(); i++) { - writer.write(postToJSON(posts.get(i))); - if (i < posts.size() - 1) { - writer.write(",\n"); - } else { - 
writer.write("\n"); - } - } - writer.write("]\n"); - } - - System.out.println("数据已保存到: " + filePath.toAbsolutePath()); - - } catch (IOException e) { - System.err.println("保存JSON文件时出错: " + e.getMessage()); - } - } - - private static String postToJSON(PostInfo post) { - return String.format( - " {\n" + - " \"title\": \"%s\",\n" + - " \"content\": \"%s\",\n" + - " \"author\": \"%s\",\n" + - " \"postDate\": \"%s\",\n" + - " \"likeCount\": %d,\n" + - " \"commentCount\": %d,\n" + - " \"viewCount\": %d,\n" + - " \"tags\": \"%s\",\n" + - " \"sentiment\": \"%s\"\n" + - " }", - escapeJSON(post.getTitle()), - escapeJSON(post.getContent()), - escapeJSON(post.getAuthor()), - post.getPostDate() != null ? post.getPostDate().toString() : "", - post.getLikeCount(), - post.getCommentCount(), - post.getViewCount(), - escapeJSON(post.getTags()), - escapeJSON(post.getSentiment()) - ); - } - - private static String escapeJSON(String text) { - if (text == null) { - return ""; - } - return text.replace("\\", "\\\\") - .replace("\"", "\\\"") - .replace("\n", "\\n") - .replace("\r", "\\r") - .replace("\t", "\\t"); - } -} diff --git a/project/DuoTai.java b/project/DuoTai.java deleted file mode 100644 index 3876a56..0000000 --- a/project/DuoTai.java +++ /dev/null @@ -1,3 +0,0 @@ -public class DuoTai { - -} diff --git a/project/ExcelReader.java b/project/ExcelReader.java deleted file mode 100644 index e6635bc..0000000 --- a/project/ExcelReader.java +++ /dev/null @@ -1,102 +0,0 @@ -import java.io.*; -import java.time.LocalDate; -import java.time.format.DateTimeFormatter; -import java.util.ArrayList; -import java.util.List; -import java.util.Locale; - -public class ExcelReader { - - private static final DateTimeFormatter DATE_FORMATTER = DateTimeFormatter.ofPattern("yyyy-MM-dd", Locale.CHINA); - - public static List readExcelData(String filePath, int maxRows) { - List posts = new ArrayList<>(); - - try (BufferedReader reader = new BufferedReader(new FileReader(filePath, 
java.nio.charset.StandardCharsets.UTF_8))) { - - String line; - boolean isFirstLine = true; - int rowCount = 0; - - while ((line = reader.readLine()) != null && rowCount < maxRows) { - if (isFirstLine) { - isFirstLine = false; - continue; - } - - String[] parts = parseCSVLine(line); - if (parts.length >= 9) { - PostInfo post = parsePostInfo(parts); - if (post != null) { - posts.add(post); - rowCount++; - } - } - } - - System.out.println("成功读取 " + posts.size() + " 条数据"); - - } catch (IOException e) { - System.err.println("读取文件时出错: " + e.getMessage()); - } - - return posts; - } - - private static String[] parseCSVLine(String line) { - List fields = new ArrayList<>(); - StringBuilder currentField = new StringBuilder(); - boolean inQuotes = false; - - for (char c : line.toCharArray()) { - if (c == '"') { - inQuotes = !inQuotes; - } else if (c == ',' && !inQuotes) { - fields.add(currentField.toString().trim()); - currentField.setLength(0); - } else { - currentField.append(c); - } - } - - fields.add(currentField.toString().trim()); - return fields.toArray(new String[0]); - } - - private static PostInfo parsePostInfo(String[] parts) { - try { - PostInfo post = new PostInfo(); - - post.setTitle(parts[0]); - post.setContent(parts[1]); - post.setAuthor(parts[2]); - - if (!parts[3].isEmpty()) { - post.setPostDate(LocalDate.parse(parts[3], DATE_FORMATTER)); - } - - post.setLikeCount(parseInt(parts[4])); - post.setCommentCount(parseInt(parts[5])); - post.setViewCount(parseInt(parts[6])); - - post.setTags(parts[7]); - post.setSentiment(parts[8]); - - return post; - } catch (Exception e) { - System.err.println("解析数据时出错: " + e.getMessage()); - return null; - } - } - - private static int parseInt(String value) { - try { - if (value == null || value.isEmpty()) { - return 0; - } - return Integer.parseInt(value); - } catch (NumberFormatException e) { - return 0; - } - } -} diff --git a/project/HTMLReportGenerator.java b/project/HTMLReportGenerator.java deleted file mode 100644 index 
7a6855e..0000000 --- a/project/HTMLReportGenerator.java +++ /dev/null @@ -1,214 +0,0 @@ -package com.project.report; - -import com.project.analyzer.PostAnalyzer; -import com.project.model.PostInfo; - -import java.io.BufferedWriter; -import java.io.FileWriter; -import java.io.IOException; -import java.nio.charset.StandardCharsets; -import java.nio.file.Files; -import java.nio.file.Paths; -import java.time.LocalDateTime; -import java.time.format.DateTimeFormatter; -import java.util.Map; - -public class HTMLReportGenerator { - - private static final String OUTPUT_DIR = "d:\\java\\project\\reports"; - - public static void generateReport(PostAnalyzer analyzer) { - try { - Files.createDirectories(Paths.get(OUTPUT_DIR)); - - String timestamp = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss")); - String filename = "report_" + timestamp + ".html"; - String filepath = OUTPUT_DIR + "/" + filename; - - try (BufferedWriter writer = new BufferedWriter( - new FileWriter(filepath, StandardCharsets.UTF_8))) { - - writer.write(generateHTMLContent(analyzer)); - } - - System.out.println("HTML报告已生成: " + filepath); - - } catch (IOException e) { - System.err.println("生成HTML报告时出错: " + e.getMessage()); - } - } - - private static String generateHTMLContent(PostAnalyzer analyzer) { - StringBuilder html = new StringBuilder(); - - html.append("\n"); - html.append("\n"); - html.append("\n"); - html.append(" \n"); - html.append(" \n"); - html.append(" 图文帖子数据分析报告\n"); - html.append(" \n"); - html.append("\n"); - html.append("\n"); - html.append("
\n"); - html.append("

图文帖子数据分析报告

\n"); - html.append("

生成时间: ").append(LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"))).append("

\n"); - - html.append(generateSummarySection(analyzer)); - html.append(generateSentimentSection(analyzer)); - html.append(generateEngagementSection(analyzer)); - html.append(generateAuthorSection(analyzer)); - html.append(generateChartsSection()); - - html.append("
\n"); - html.append("\n"); - html.append(""); - - return html.toString(); - } - - private static String generateSummarySection(PostAnalyzer analyzer) { - StringBuilder section = new StringBuilder(); - - int totalPosts = analyzer.getPosts().size(); - double avgLikes = analyzer.getPosts().stream() - .mapToInt(PostInfo::getLikeCount) - .average() - .orElse(0); - - section.append("
\n"); - section.append("
\n"); - section.append("

").append(totalPosts).append("

\n"); - section.append("

帖子总数

\n"); - section.append("
\n"); - section.append("
\n"); - section.append("

").append(String.format("%.1f", avgLikes)).append("

\n"); - section.append("

平均点赞

\n"); - section.append("
\n"); - section.append("
\n"); - - section.append("
\n"); - section.append("

分析摘要

\n"); - section.append("
    \n"); - section.append("
  • 本次分析共收集 ").append(totalPosts).append(" 条图文帖子数据
  • \n"); - section.append("
  • 数据来源:D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用
  • \n"); - section.append("
  • 分析内容包括情感倾向分布、互动指标、热门作者等多个维度
  • \n"); - section.append("
  • 通过数据可视化展示分析结果,便于直观理解
  • \n"); - section.append("
\n"); - section.append("
\n"); - - return section.toString(); - } - - private static String generateSentimentSection(PostAnalyzer analyzer) { - StringBuilder section = new StringBuilder(); - Map sentimentData = analyzer.getSentimentDistributionData(); - - section.append("
\n"); - section.append("

情感倾向分布分析

\n"); - section.append(" \n"); - section.append(" \n"); - - long total = sentimentData.values().stream().mapToLong(Long::longValue).sum(); - - for (Map.Entry entry : sentimentData.entrySet()) { - double percent = (entry.getValue() * 100.0) / total; - section.append(" \n"); - } - - section.append("
情感倾向帖子数量占比
").append(entry.getKey()) - .append("").append(entry.getValue()) - .append("").append(String.format("%.1f%%", percent)) - .append("
\n"); - section.append("
\n"); - - return section.toString(); - } - - private static String generateEngagementSection(PostAnalyzer analyzer) { - StringBuilder section = new StringBuilder(); - Map engagementData = analyzer.getEngagementData(); - - section.append("
\n"); - section.append("

互动指标分析

\n"); - section.append(" \n"); - section.append(" \n"); - - for (Map.Entry entry : engagementData.entrySet()) { - section.append(" \n"); - } - - section.append("
指标平均值
").append(entry.getKey()) - .append("").append(String.format("%.1f", entry.getValue())) - .append("
\n"); - section.append("
\n"); - - return section.toString(); - } - - private static String generateAuthorSection(PostAnalyzer analyzer) { - StringBuilder section = new StringBuilder(); - Map authorData = analyzer.getAuthorPostCount(); - - section.append("
\n"); - section.append("

热门作者排行TOP10

\n"); - section.append(" \n"); - section.append(" \n"); - - int rank = 1; - for (Map.Entry entry : authorData.entrySet()) { - section.append(" \n"); - } - - section.append("
排名作者帖子数量
").append(rank++) - .append("").append(entry.getKey()) - .append("").append(entry.getValue()) - .append("
\n"); - section.append("
\n"); - - return section.toString(); - } - - private static String generateChartsSection() { - StringBuilder section = new StringBuilder(); - - section.append("
\n"); - section.append("

数据可视化图表

\n"); - section.append("
\n"); - section.append("

情感倾向分布

\n"); - section.append(" \"情感倾向分布图\"\n"); - section.append("
\n"); - section.append("
\n"); - section.append("

互动指标分析

\n"); - section.append(" \"互动指标图\"\n"); - section.append("
\n"); - section.append("
\n"); - section.append("

热门作者排行

\n"); - section.append(" \"作者排行图\"\n"); - section.append("
\n"); - section.append("
\n"); - - return section.toString(); - } -} diff --git a/project/Main.java b/project/Main.java deleted file mode 100644 index 148520e..0000000 --- a/project/Main.java +++ /dev/null @@ -1,67 +0,0 @@ -package com.project; - -import com.project.analyzer.PostAnalyzer; -import com.project.chart.SimpleChartGenerator; -import com.project.model.PostInfo; -import com.project.reader.ExcelReader; -import com.project.report.HTMLReportGenerator; -import com.project.storage.DataStorage; -import com.project.util.DataCleaner; - -import java.util.List; -import java.util.Scanner; - -public class Main { - - public static void main(String[] args) { - System.out.println("========================================"); - System.out.println(" Java网络爬虫与数据分析系统"); - System.out.println("========================================\n"); - - String dataFilePath = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子原始信息计量实验使用.xlsx"; - String outputDir = "d:\\java\\project\\data"; - int maxRows = 300; - - try { - System.out.println("开始读取本地数据文件..."); - System.out.println("数据文件: " + dataFilePath); - System.out.println("读取前 " + maxRows + " 条数据"); - - List rawPosts = ExcelReader.readExcelData(dataFilePath, maxRows); - - if (rawPosts.isEmpty()) { - System.out.println("未获取到任何数据,程序退出"); - return; - } - - System.out.println("\n开始数据清洗..."); - List cleanedPosts = DataCleaner.cleanPosts(rawPosts); - - System.out.println("\n保存数据到文件..."); - DataStorage.saveToCSV(cleanedPosts, outputDir); - DataStorage.saveToJSON(cleanedPosts, outputDir); - - System.out.println("\n开始数据分析..."); - PostAnalyzer analyzer = new PostAnalyzer(cleanedPosts); - analyzer.analyzeAll(); - - System.out.println("\n生成图表..."); - SimpleChartGenerator.generateAllCharts(analyzer); - - System.out.println("\n生成HTML报告..."); - HTMLReportGenerator.generateReport(analyzer); - - System.out.println("\n========================================"); - System.out.println(" 程序执行完成!"); - System.out.println("========================================"); - 
System.out.println("\n输出文件位置:"); - System.out.println("- 数据文件: " + outputDir); - System.out.println("- 图表文件: d:\\java\\project\\charts"); - System.out.println("- 报告文件: d:\\java\\project\\reports"); - - } catch (Exception e) { - System.err.println("程序执行出错: " + e.getMessage()); - e.printStackTrace(); - } - } -} diff --git a/project/PostAnalyzer.java b/project/PostAnalyzer.java deleted file mode 100644 index 76a5216..0000000 --- a/project/PostAnalyzer.java +++ /dev/null @@ -1,200 +0,0 @@ -package com.project.analyzer; - -import com.project.model.PostInfo; - -import java.util.*; -import java.util.stream.Collectors; - -public class PostAnalyzer { - - private final List posts; - - public PostAnalyzer(List posts) { - this.posts = posts; - } - - public List getPosts() { - return posts; - } - - public void analyzeAll() { - System.out.println("\n========== 数据分析报告 ==========\n"); - - analyzeSentimentDistribution(); - analyzeEngagementMetrics(); - analyzePopularAuthors(); - analyzeContentLength(); - analyzeTemporalTrends(); - - System.out.println("\n========== 分析完成 ==========\n"); - } - - public void analyzeSentimentDistribution() { - System.out.println("【情感倾向分布分析】"); - System.out.println("----------------------------------------"); - - Map sentimentCounts = posts.stream() - .collect(Collectors.groupingBy( - PostInfo::getSentiment, - Collectors.counting() - )); - - System.out.printf("%-20s %s%n", "情感倾向", "帖子数量"); - System.out.println("----------------------------------------"); - - sentimentCounts.entrySet().stream() - .sorted(Map.Entry.comparingByValue().reversed()) - .forEach(entry -> System.out.printf("%-20s %d%n", entry.getKey(), entry.getValue())); - - System.out.println(); - } - - public void analyzeEngagementMetrics() { - System.out.println("【互动指标分析】"); - System.out.println("----------------------------------------"); - - double avgLikes = posts.stream() - .mapToInt(PostInfo::getLikeCount) - .average() - .orElse(0); - - double avgComments = posts.stream() - 
.mapToInt(PostInfo::getCommentCount) - .average() - .orElse(0); - - double avgViews = posts.stream() - .mapToInt(PostInfo::getViewCount) - .average() - .orElse(0); - - System.out.printf("平均点赞数: %.1f%n", avgLikes); - System.out.printf("平均评论数: %.1f%n", avgComments); - System.out.printf("平均浏览量: %.1f%n", avgViews); - - System.out.println(); - } - - public void analyzePopularAuthors() { - System.out.println("【热门作者排行】"); - System.out.println("----------------------------------------"); - System.out.printf("%-30s %10s %10s %10s%n", "作者", "帖子数", "总点赞", "总评论"); - System.out.println("----------------------------------------"); - - Map> authorPosts = posts.stream() - .collect(Collectors.groupingBy(PostInfo::getAuthor)); - - authorPosts.entrySet().stream() - .sorted(Map.Entry.>comparingByValue((a, b) -> b.size() - a.size())) - .limit(10) - .forEach(entry -> { - String author = entry.getKey(); - List authorPostList = entry.getValue(); - int postCount = authorPostList.size(); - int totalLikes = authorPostList.stream().mapToInt(PostInfo::getLikeCount).sum(); - int totalComments = authorPostList.stream().mapToInt(PostInfo::getCommentCount).sum(); - - System.out.printf("%-30s %10d %10d %10d%n", - author.length() > 28 ? 
author.substring(0, 28) : author, - postCount, totalLikes, totalComments); - }); - - System.out.println(); - } - - public void analyzeContentLength() { - System.out.println("【内容长度分析】"); - System.out.println("----------------------------------------"); - - double avgLength = posts.stream() - .mapToInt(post -> post.getContent().length()) - .average() - .orElse(0); - - int maxLength = posts.stream() - .mapToInt(post -> post.getContent().length()) - .max() - .orElse(0); - - int minLength = posts.stream() - .mapToInt(post -> post.getContent().length()) - .min() - .orElse(0); - - System.out.printf("平均内容长度: %.1f 字符%n", avgLength); - System.out.printf("最长内容: %d 字符%n", maxLength); - System.out.printf("最短内容: %d 字符%n", minLength); - - System.out.println(); - } - - public void analyzeTemporalTrends() { - System.out.println("【时间趋势分析】"); - System.out.println("----------------------------------------"); - - Map monthlyPosts = posts.stream() - .filter(post -> post.getPostDate() != null) - .collect(Collectors.groupingBy( - post -> post.getPostDate().format(java.time.format.DateTimeFormatter.ofPattern("yyyy-MM")), - Collectors.counting() - )); - - System.out.printf("%-10s %s%n", "月份", "帖子数量"); - System.out.println("----------------------------------------"); - - monthlyPosts.entrySet().stream() - .sorted(Map.Entry.comparingByKey()) - .forEach(entry -> System.out.printf("%-10s %d%n", entry.getKey(), entry.getValue())); - - System.out.println(); - } - - public Map getSentimentDistributionData() { - return posts.stream() - .collect(Collectors.groupingBy( - PostInfo::getSentiment, - Collectors.counting() - )); - } - - public Map getEngagementData() { - Map engagementData = new LinkedHashMap<>(); - - double avgLikes = posts.stream() - .mapToInt(PostInfo::getLikeCount) - .average() - .orElse(0); - - double avgComments = posts.stream() - .mapToInt(PostInfo::getCommentCount) - .average() - .orElse(0); - - double avgViews = posts.stream() - .mapToInt(PostInfo::getViewCount) - .average() - 
.orElse(0); - - engagementData.put("点赞", avgLikes); - engagementData.put("评论", avgComments); - engagementData.put("浏览", avgViews); - - return engagementData; - } - - public Map getAuthorPostCount() { - return posts.stream() - .collect(Collectors.groupingBy( - PostInfo::getAuthor, - Collectors.summingInt(post -> 1) - )).entrySet().stream() - .sorted(Map.Entry.comparingByValue().reversed()) - .limit(10) - .collect(Collectors.toMap( - Map.Entry::getKey, - Map.Entry::getValue, - (e1, e2) -> e1, - LinkedHashMap::new - )); - } -} diff --git a/project/PostInfo.java b/project/PostInfo.java deleted file mode 100644 index 831bfd7..0000000 --- a/project/PostInfo.java +++ /dev/null @@ -1,127 +0,0 @@ -import java.time.LocalDate; - -public class PostInfo { - private String title; - private String content; - private String author; - private LocalDate postDate; - private int likeCount; - private int commentCount; - private int viewCount; - private String tags; - private String sentiment; - - public PostInfo() { - } - - public PostInfo(String title, String content, String author, LocalDate postDate, - int likeCount, int commentCount, int viewCount, String tags, String sentiment) { - this.title = title; - this.content = content; - this.author = author; - this.postDate = postDate; - this.likeCount = likeCount; - this.commentCount = commentCount; - this.viewCount = viewCount; - this.tags = tags; - this.sentiment = sentiment; - } - - public String getTitle() { - return title; - } - - public void setTitle(String title) { - this.title = title; - } - - public String getContent() { - return content; - } - - public void setContent(String content) { - this.content = content; - } - - public String getAuthor() { - return author; - } - - public void setAuthor(String author) { - this.author = author; - } - - public LocalDate getPostDate() { - return postDate; - } - - public void setPostDate(LocalDate postDate) { - this.postDate = postDate; - } - - public int getLikeCount() { - return likeCount; 
- } - - public void setLikeCount(int likeCount) { - this.likeCount = likeCount; - } - - public int getCommentCount() { - return commentCount; - } - - public void setCommentCount(int commentCount) { - this.commentCount = commentCount; - } - - public int getViewCount() { - return viewCount; - } - - public void setViewCount(int viewCount) { - this.viewCount = viewCount; - } - - public String getTags() { - return tags; - } - - public void setTags(String tags) { - this.tags = tags; - } - - public String getSentiment() { - return sentiment; - } - - public void setSentiment(String sentiment) { - this.sentiment = sentiment; - } - - @Override - public String toString() { - return "PostInfo{" + - "title='" + title + '\'' + - ", author='" + author + '\'' + - ", postDate=" + postDate + - ", likeCount=" + likeCount + - ", commentCount=" + commentCount + - ", viewCount=" + viewCount + - ", sentiment='" + sentiment + '\'' + - '}'; - } - - public String toCSV() { - return String.format("\"%s\",\"%s\",\"%s\",\"%s\",%d,%d,%d,\"%s\",\"%s\"", - title != null ? title.replace("\"", "\"\"") : "", - content != null ? content.replace("\"", "\"\"").replace("\n", " ") : "", - author != null ? author.replace("\"", "\"\"") : "", - postDate != null ? postDate.toString() : "", - likeCount, - commentCount, - viewCount, - tags != null ? tags.replace("\"", "\"\"") : "", - sentiment != null ? 
sentiment.replace("\"", "\"\"") : ""); - } -} diff --git a/project/ProcessRegressionData.java b/project/ProcessRegressionData.java deleted file mode 100644 index 8e8a98d..0000000 --- a/project/ProcessRegressionData.java +++ /dev/null @@ -1,50 +0,0 @@ -import java.io.*; -import java.util.*; -import java.util.regex.*; - -public class ProcessRegressionData { - public static void main(String[] args) { - String inputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子实验数据(新).xlsx"; - String outputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子实验数据(新)_回归.xlsx"; - - System.out.println("========================================"); - System.out.println(" 处理回归数据"); - System.out.println("========================================"); - System.out.println("输入文件: " + inputFile); - System.out.println("输出文件: " + outputFile); - System.out.println(); - - // 检查文件是否存在 - File file = new File(inputFile); - if (!file.exists()) { - System.out.println("错误: 输入文件不存在!"); - return; - } - - System.out.println("输入文件大小: " + (file.length() / 1024) + " KB"); - System.out.println("\n注意: 这是一个简化版本,用于演示处理逻辑。"); - System.out.println("实际处理需要使用Apache POI库来读取和写入Excel文件。"); - System.out.println(); - System.out.println("处理逻辑:"); - System.out.println("1. 读取原始数据"); - System.out.println("2. 识别列: helpfull( Y ), 帖子评论总数( X1 ), 评论1-5内容列"); - System.out.println("3. 计算 X2-X6:"); - System.out.println(" - X2: 评论长度平均值(剔空格后的字符数)"); - System.out.println(" - X3: 评论复杂度平均值(按空格拆分的分词数)"); - System.out.println(" - X4: X2/X3(X3为0时记0)"); - System.out.println(" - X5: 情感性平均值(正面=1、中性=0、负面=-1)"); - System.out.println(" - X6: 信息丰富度平均值(含数字/链接/表情各1分)"); - System.out.println("4. 数据清洗: 确保所有值为纯数字,无空值/错误值"); - System.out.println("5. 
保存到新文件"); - System.out.println(); - System.out.println("由于数据量较大(3万+行),建议使用Python的pandas库处理。"); - System.out.println("请确保Python脚本能够完整执行,可能需要增加内存或分批处理。"); - System.out.println(); - System.out.println("========================================"); - System.out.println(" 建议使用以下Python命令运行"); - System.out.println("========================================"); - System.out.println("cd d:\\java\\project"); - System.out.println("python process_300_rows.py (测试前300行)"); - System.out.println("python process_all_rows.py (处理全部数据)"); - } -} diff --git a/project/README.md b/project/README.md deleted file mode 100644 index a8687f1..0000000 --- a/project/README.md +++ /dev/null @@ -1,2 +0,0 @@ -# java - diff --git a/project/SimpleChartGenerator.java b/project/SimpleChartGenerator.java deleted file mode 100644 index 5a14324..0000000 --- a/project/SimpleChartGenerator.java +++ /dev/null @@ -1,165 +0,0 @@ -package com.project.chart; - -import com.project.analyzer.PostAnalyzer; - -import java.awt.*; -import java.awt.image.BufferedImage; -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Paths; -import java.util.Map; -import javax.imageio.ImageIO; - -public class SimpleChartGenerator { - - private static final String OUTPUT_DIR = "d:\\java\\project\\charts"; - private static final int WIDTH = 800; - private static final int HEIGHT = 600; - - public static void generateAllCharts(PostAnalyzer analyzer) { - try { - Files.createDirectories(Paths.get(OUTPUT_DIR)); - - generateSentimentChart(analyzer); - generateEngagementChart(analyzer); - generateAuthorChart(analyzer); - - System.out.println("\n所有图表已生成,保存在: " + OUTPUT_DIR); - - } catch (IOException e) { - System.err.println("创建图表目录时出错: " + e.getMessage()); - } - } - - public static void generateSentimentChart(PostAnalyzer analyzer) { - Map data = analyzer.getSentimentDistributionData(); - - BufferedImage image = new BufferedImage(WIDTH, HEIGHT, BufferedImage.TYPE_INT_RGB); - Graphics2D g2d = 
image.createGraphics(); - - g2d.setColor(Color.WHITE); - g2d.fillRect(0, 0, WIDTH, HEIGHT); - - g2d.setColor(Color.BLACK); - g2d.setFont(new Font("宋体", Font.BOLD, 24)); - g2d.drawString("情感倾向分布", 300, 40); - - int barWidth = 150; - int startX = 200; - int startY = 500; - int maxHeight = 400; - - long maxValue = data.values().stream().max(Long::compare).orElse(1L); - - int index = 0; - for (Map.Entry entry : data.entrySet()) { - int barHeight = (int) ((entry.getValue() * 1.0 / maxValue) * maxHeight); - - g2d.setColor(new Color(70, 130, 180)); - g2d.fillRect(startX + index * (barWidth + 50), startY - barHeight, barWidth, barHeight); - - g2d.setColor(Color.BLACK); - g2d.setFont(new Font("宋体", Font.PLAIN, 14)); - g2d.drawString(entry.getKey(), startX + index * (barWidth + 50) + 50, startY + 20); - g2d.drawString(String.valueOf(entry.getValue()), startX + index * (barWidth + 50) + 60, startY - barHeight - 5); - - index++; - } - - g2d.dispose(); - saveImage(image, "sentiment_distribution.png"); - } - - public static void generateEngagementChart(PostAnalyzer analyzer) { - Map data = analyzer.getEngagementData(); - - BufferedImage image = new BufferedImage(WIDTH, HEIGHT, BufferedImage.TYPE_INT_RGB); - Graphics2D g2d = image.createGraphics(); - - g2d.setColor(Color.WHITE); - g2d.fillRect(0, 0, WIDTH, HEIGHT); - - g2d.setColor(Color.BLACK); - g2d.setFont(new Font("宋体", Font.BOLD, 24)); - g2d.drawString("互动指标分析", 300, 40); - - int barWidth = 150; - int startX = 200; - int startY = 500; - int maxHeight = 400; - - double maxValue = data.values().stream().max(Double::compare).orElse(1.0); - - int index = 0; - for (Map.Entry entry : data.entrySet()) { - int barHeight = (int) ((entry.getValue() / maxValue) * maxHeight); - - g2d.setColor(new Color(60, 179, 113)); - g2d.fillRect(startX + index * (barWidth + 50), startY - barHeight, barWidth, barHeight); - - g2d.setColor(Color.BLACK); - g2d.setFont(new Font("宋体", Font.PLAIN, 14)); - g2d.drawString(entry.getKey(), startX + index * 
(barWidth + 50) + 60, startY + 20); - g2d.drawString(String.format("%.1f", entry.getValue()), startX + index * (barWidth + 50) + 50, startY - barHeight - 5); - - index++; - } - - g2d.dispose(); - saveImage(image, "engagement_metrics.png"); - } - - public static void generateAuthorChart(PostAnalyzer analyzer) { - Map data = analyzer.getAuthorPostCount(); - - BufferedImage image = new BufferedImage(WIDTH, HEIGHT, BufferedImage.TYPE_INT_RGB); - Graphics2D g2d = image.createGraphics(); - - g2d.setColor(Color.WHITE); - g2d.fillRect(0, 0, WIDTH, HEIGHT); - - g2d.setColor(Color.BLACK); - g2d.setFont(new Font("宋体", Font.BOLD, 24)); - g2d.drawString("热门作者排行TOP10", 280, 40); - - int barHeight = 35; - int startY = 80; - int startX = 200; - int maxWidth = 500; - - int maxValue = data.values().stream().max(Integer::compare).orElse(1); - - int index = 0; - for (Map.Entry entry : data.entrySet()) { - int barWidth = (int) ((entry.getValue() * 1.0 / maxValue) * maxWidth); - - g2d.setColor(new Color(255, 140, 0)); - g2d.fillRect(startX, startY + index * (barHeight + 10), barWidth, barHeight); - - g2d.setColor(Color.BLACK); - g2d.setFont(new Font("宋体", Font.PLAIN, 12)); - String author = entry.getKey(); - if (author.length() > 15) { - author = author.substring(0, 15) + "..."; - } - g2d.drawString(author, 50, startY + index * (barHeight + 10) + 23); - g2d.drawString(String.valueOf(entry.getValue()), startX + barWidth + 10, startY + index * (barHeight + 10) + 23); - - index++; - } - - g2d.dispose(); - saveImage(image, "author_ranking.png"); - } - - private static void saveImage(BufferedImage image, String filename) { - try { - File file = new File(OUTPUT_DIR, filename); - ImageIO.write(image, "PNG", file); - System.out.println("图表已保存: " + file.getAbsolutePath()); - } catch (IOException e) { - System.err.println("保存图表失败: " + e.getMessage()); - } - } -} diff --git a/project/SimpleDataCleaner.java b/project/SimpleDataCleaner.java deleted file mode 100644 index c35cb2c..0000000 --- 
"""Append the regression columns Y and X1-X6 to the experiment workbook.

Y  : helpfulness (copied from the "helpfull"/"helpful" column)
X1 : total comment count
X2 : mean comment length (characters, spaces removed)
X3 : mean comment complexity (whitespace-separated token count)
X4 : readability = X2 / X3 (0 when X3 is 0)
X5 : mean sentiment (+1 / 0 / -1 per comment, keyword based)
X6 : mean information richness (digit / link / emoji, one point each)
"""
import os
import re
import sys
import traceback

import pandas as pd

# File locations.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx'

POSITIVE_WORDS = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like']
NEGATIVE_WORDS = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike']
REGRESSION_COLS = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
METRIC_COLS = ['X2', 'X3', 'X5', 'X6']

_DIGIT_RE = re.compile(r'\d')
_LINK_RE = re.compile(r'http[s]?://|www\.')
_EMOJI_RE = re.compile(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]')


def score_sentiment(text):
    """Return 1 (positive), -1 (negative) or 0 (neutral) for one comment.

    Several negative keywords contain a positive one ("不好" contains "好",
    "dislike" contains "like"), so negative keywords are blanked out before
    the positive lookup. A comment with a genuine positive keyword still
    scores 1 even if it also contains a negative one.
    """
    lowered = text.lower()
    has_negative = any(word in lowered for word in NEGATIVE_WORDS)
    remainder = lowered
    for word in NEGATIVE_WORDS:
        # Replace with a space so the neighbours cannot fuse into a new keyword.
        remainder = remainder.replace(word, ' ')
    if any(word in remainder for word in POSITIVE_WORDS):
        return 1
    return -1 if has_negative else 0


def calculate_comment_metrics(content):
    """Return (length, complexity, sentiment, richness) for one comment cell.

    Missing cells (NaN, None, '' and their string forms) yield (0, 0, 0, 0).
    """
    if pd.isna(content) or str(content) in ['None', 'nan', '']:
        return 0, 0, 0, 0

    content = str(content)
    # Length: characters left after removing ASCII and full-width spaces.
    length = len(content.replace(' ', '').replace('\u3000', ''))
    # Complexity: number of whitespace-separated tokens.
    complexity = len(content.split())
    sentiment = score_sentiment(content)
    # Richness: one point each for a digit, a link and an emoji/emoticon.
    richness = sum(1 for pattern in (_DIGIT_RE, _LINK_RE, _EMOJI_RE) if pattern.search(content))

    return length, complexity, sentiment, richness


def row_metrics(contents):
    """Average the metrics over the non-empty comments of one row.

    Returns (X2, X3, X5, X6); all zeros when the row has no usable comment.
    """
    scored = [calculate_comment_metrics(content) for content in contents]
    scored = [metrics for metrics in scored if metrics[0] > 0]
    if not scored:
        return 0.0, 0.0, 0.0, 0.0
    count = len(scored)
    return tuple(sum(metrics[k] for metrics in scored) / count for k in range(4))


def find_columns(columns):
    """Locate the Y column, the X1 column and the comment-content columns.

    Comment columns must mention "评论", a digit 1-5 and "内容", matching the
    other processing scripts in this project. If no column carries "内容",
    the looser "评论" + digit rule is used so differently named sheets still work.
    """
    helpfull_col = None
    comment_count_col = None
    numbered_comment_cols = []

    for col in columns:
        name = str(col)
        if 'helpful' in name.lower():  # also matches the "helpfull" spelling
            helpfull_col = col
        elif '评论总数' in name:
            comment_count_col = col
        elif '评论' in name and any(str(i) in name for i in range(1, 6)):
            numbered_comment_cols.append(col)

    content_cols = [col for col in numbered_comment_cols if '内容' in str(col)]
    return helpfull_col, comment_count_col, content_cols or numbered_comment_cols


def main():
    """Run the whole pipeline; returns a process exit code."""
    print("========================================")
    print(" 在原表中添加回归数据列")
    print("========================================")
    print(f"输入文件: {input_file}")
    print(f"输出文件: {output_file}")
    print()

    if not os.path.exists(input_file):
        print("错误: 输入文件不存在!")
        return 1

    print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")

    try:
        print("\n正在读取原始数据...")
        df = pd.read_excel(input_file)
        print(f"成功读取 {len(df)} 行数据")
        print(f"原始列名: {list(df.columns)}")

        print("\n识别列...")
        helpfull_col, comment_count_col, comment_cols = find_columns(df.columns)
        if helpfull_col is not None:
            print(f"找到 Y 列 (helpfull): {helpfull_col}")
        if comment_count_col is not None:
            print(f"找到 X1 列 (评论总数): {comment_count_col}")
        for position, col in enumerate(comment_cols, 1):
            print(f"找到评论列 {position}: {col}")
        print(f"\n共找到 {len(comment_cols)} 个评论列")

        print("\n添加回归数据列...")

        print("1. 添加 Y (UGC有用性)")
        # "is not None": a column label such as 0 is falsy but still valid.
        if helpfull_col is not None:
            df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0)
        else:
            df['Y'] = 0

        print("2. 添加 X1 (评论数量)")
        if comment_count_col is not None:
            df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0)
        else:
            df['X1'] = 0

        print("3. 计算评论相关指标...")
        for name in METRIC_COLS:
            df[name] = 0.0

        total_rows = len(df)
        if comment_cols:
            averaged = []
            # itertuples walks the rows once; no per-cell row Series and no reliance on the index labels.
            for i, contents in enumerate(df[comment_cols].itertuples(index=False, name=None)):
                if i % 1000 == 0:
                    print(f" 处理到第 {i}/{total_rows} 行...")
                averaged.append(row_metrics(contents))
            metrics = pd.DataFrame(averaged, columns=METRIC_COLS, index=df.index, dtype=float)
            for name in METRIC_COLS:
                df[name] = metrics[name]

        print("4. 计算 X4 (评论可读性)")
        df['X4'] = (df['X2'] / df['X3']).where(df['X3'] > 0, 0)

        # Cleaning: every regression value must be a finite number.
        print("\n5. 数据清洗...")
        for col in REGRESSION_COLS:
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
            df[col] = df[col].replace([float('inf'), float('-inf')], 0)

        print("\n6. 验证数据...")
        print(f"总行数: {len(df)}")
        print(f"总列数: {len(df.columns)}")
        print(f"\n回归数据列统计:")
        print(df[REGRESSION_COLS].describe())
        print(f"\n前5行回归数据:")
        print(df[REGRESSION_COLS].head())

        print(f"\n空值检查:")
        for col in REGRESSION_COLS:
            null_count = df[col].isnull().sum()
            print(f" {col}: {null_count} 个空值")

        print("\n7. 保存文件...")
        df.to_excel(output_file, index=False)

        print("\n8. 验证文件...")
        if os.path.exists(output_file):
            print(f"文件已成功保存: {output_file}")
            print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
            # Read the file back as a sanity check.
            df_check = pd.read_excel(output_file)
            print(f"输出文件行数: {len(df_check)}")
            print(f"输出文件列数: {len(df_check.columns)}")
            print(f"\n回归数据列: {[col for col in df_check.columns if col in REGRESSION_COLS]}")
        else:
            print("文件保存失败!")

        print()
        print("========================================")
        print(" 任务完成")
        print("========================================")
        print(f"新文件已保存: {output_file}")
        print(f"包含原始数据的所有列以及新增的Y, X1-X6回归数据列")

    except Exception as e:
        print(f"处理文件时出错: {str(e)}")
        traceback.print_exc()
        return 1

    return 0


if __name__ == "__main__":
    sys.exit(main())
-output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx' - -print(f"输入文件: {input_file}") -print() - -# 检查文件是否存在 -if not os.path.exists(input_file): - print("错误: 输入文件不存在!") - exit(1) - -print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") - -# 读取原始数据 -print("\n正在读取原始数据...") -try: - df = pd.read_excel(input_file, engine='openpyxl') - print(f"成功读取 {len(df)} 行数据") - print(f"原始列数: {len(df.columns)}") -except Exception as e: - print(f"读取失败: {e}") - import traceback - traceback.print_exc() - exit(1) - -# 识别列 -print("\n识别列...") -helpfull_col = None -comment_count_col = None -comment_cols = [] - -for col in df.columns: - col_str = str(col).lower() - if 'helpfull' in col_str or 'helpful' in col_str: - helpfull_col = col - print(f"找到 Y 列 (helpfull): {col}") - elif '评论总数' in str(col) or '帖子评论总数' in str(col): - comment_count_col = col - print(f"找到 X1 列 (评论总数): {col}") - elif '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6)) and '内容' in str(col): - comment_cols.append(col) - print(f"找到评论列 {len(comment_cols)}: {col}") - -print(f"\n共找到 {len(comment_cols)} 个评论内容列") - -# 添加回归数据列 -print("\n添加回归数据列...") - -# Y (UGC有用性) - 直接复制helpfull列 -print("1. 添加 Y (UGC有用性)") -if helpfull_col: - df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0) -else: - df['Y'] = 0 - -# X1 (评论数量) - 直接复制帖子评论总数列 -print("2. 
添加 X1 (评论数量)") -if comment_count_col: - df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0) -else: - df['X1'] = 0 - -# 定义函数计算评论指标 -def calculate_comment_metrics(content): - if pd.isna(content) or str(content) in ['None', 'nan', '']: - return 0, 0, 0, 0 - - content = str(content) - # X2: 评论长度(剔空格后的字符数) - length = len(content.replace(' ', '').replace('\u3000', '')) - # X3: 评论复杂度(按空格拆分的分词数) - complexity = len(content.split()) - # X5: 情感分析(正面=1、中性=0、负面=-1) - positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like'] - negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike'] - - sentiment = 0 - lower_content = content.lower() - if any(word in lower_content for word in positive_words): - sentiment = 1 - elif any(word in lower_content for word in negative_words): - sentiment = -1 - # X6: 信息丰富度(含数字/链接/表情各1分,满分3分) - richness = 0 - if re.search(r'\d', content): # 含数字 - richness += 1 - if re.search(r'http[s]?://|www\.', content): # 含链接 - richness += 1 - if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content): # 含表情 - richness += 1 - - return length, complexity, sentiment, richness - -# 计算评论相关指标 -print("3. 
计算评论相关指标...") - -# 初始化列 -df['X2'] = 0.0 # 评论长度 -df['X3'] = 0.0 # 评论复杂度 -df['X5'] = 0.0 # 情感性 -df['X6'] = 0.0 # 信息丰富度 - -# 逐行计算 -total_rows = len(df) -print(f"总数据行数: {total_rows}") - -batch_size = 5000 -num_batches = (total_rows + batch_size - 1) // batch_size - -for batch in range(num_batches): - start_idx = batch * batch_size - end_idx = min((batch + 1) * batch_size, total_rows) - print(f"处理批次 {batch + 1}/{num_batches} (行 {start_idx} 到 {end_idx})...") - - for i in range(start_idx, end_idx): - lengths = [] - complexities = [] - sentiments = [] - richness = [] - - for col in comment_cols: - content = df.iloc[i].get(col, '') - length, complexity, sentiment, r = calculate_comment_metrics(content) - if length > 0: # 只统计有内容的评论 - lengths.append(length) - complexities.append(complexity) - sentiments.append(sentiment) - richness.append(r) - - # 计算平均值(无评论记0) - if lengths: - df.loc[i, 'X2'] = sum(lengths) / len(lengths) - df.loc[i, 'X3'] = sum(complexities) / len(complexities) - df.loc[i, 'X5'] = sum(sentiments) / len(sentiments) - df.loc[i, 'X6'] = sum(richness) / len(richness) - - # 释放内存 - gc.collect() - -# X4: 评论可读性 = X2/X3(X3为0时记0,避免报错) -print("4. 计算 X4 (评论可读性)") -df['X4'] = df.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1) - -# 数据清洗 - 确保所有值都是纯数字,无文本、无空值、无错误 -print("\n5. 数据清洗...") -regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'] -for col in regression_cols: - # 转换为数字,错误值转为0 - df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0) - # 替换无穷大 - df[col] = df[col].replace([float('inf'), float('-inf')], 0) - -# 验证数据 -print("\n6. 验证数据...") -print(f"总行数: {len(df)}") -print(f"总列数: {len(df.columns)}") -print(f"\n回归数据列统计:") -print(df[regression_cols].describe()) -print(f"\n前5行回归数据:") -print(df[regression_cols].head()) - -# 检查是否有空值或错误值 -print(f"\n空值检查:") -for col in regression_cols: - null_count = df[col].isnull().sum() - print(f" {col}: {null_count} 个空值") - -# 保存文件 -print("\n7. 
"""Build a standalone workbook with the regression variables Y and X1-X6.

Reads the raw post workbook, derives the per-post comment metrics and writes
only the seven regression columns to a new file.
"""
import os
import re
import sys
import traceback

import pandas as pd

# File locations.
input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'

# Comment columns of the loaded sheet; filled in by main() and used as the
# default by the calculate_* functions when they are applied row by row.
comment_columns = []

POSITIVE_WORDS = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive']
NEGATIVE_WORDS = ['差', '糟糕', '不好', '失望', '不满', 'negative']

_DIGIT_RE = re.compile(r'\d')
_LINK_RE = re.compile(r'http[s]?://')
_EMOJI_RE = re.compile(r'[\u2600-\u27BF]|[:;][-]?[)D]')


def _strip_spaces(text):
    """Remove ASCII and full-width spaces."""
    return text.replace(' ', '').replace('\u3000', '')


def _comment_texts(row, columns):
    """Return the usable comment strings of one row.

    Missing cells, the placeholders 'None'/'nan' and whitespace-only cells
    are skipped so that they do not drag the averages towards zero.
    """
    columns = comment_columns if columns is None else columns
    texts = []
    for col in columns:
        value = row.get(col, '')
        if pd.isna(value):
            continue
        text = str(value)
        if text in ('None', 'nan') or not _strip_spaces(text).strip():
            continue
        texts.append(text)
    return texts


def _mean(values):
    """Arithmetic mean, 0 for an empty list."""
    return sum(values) / len(values) if values else 0


def _sentiment(text):
    """1 / -1 / 0 keyword sentiment of one comment.

    Negative keywords are blanked before the positive lookup because some of
    them contain a positive keyword ("不好" contains "好").
    """
    lowered = text.lower()
    has_negative = any(word in lowered for word in NEGATIVE_WORDS)
    remainder = lowered
    for word in NEGATIVE_WORDS:
        remainder = remainder.replace(word, ' ')
    if any(word in remainder for word in POSITIVE_WORDS):
        return 1
    return -1 if has_negative else 0


def calculate_comment_length(row, columns=None):
    """X2: mean character count (spaces removed) over the row's comments."""
    return _mean([len(_strip_spaces(text)) for text in _comment_texts(row, columns)])


def calculate_comment_complexity(row, columns=None):
    """X3: mean whitespace-separated token count over the row's comments."""
    return _mean([len(text.split()) for text in _comment_texts(row, columns)])


def calculate_sentiment(row, columns=None):
    """X5: mean keyword sentiment over the row's comments."""
    return _mean([_sentiment(text) for text in _comment_texts(row, columns)])


def calculate_information_richness(row, columns=None):
    """X6: mean richness (digit, link, emoji: one point each) over the row's comments."""
    return _mean([
        sum(1 for pattern in (_DIGIT_RE, _LINK_RE, _EMOJI_RE) if pattern.search(text))
        for text in _comment_texts(row, columns)
    ])


def main():
    """Run the whole pipeline; returns a process exit code."""
    print("========================================")
    print(" 计算UGC回归数据")
    print("========================================")
    print(f"输入文件: {input_file}")
    print(f"输出文件: {output_file}")
    print()

    if not os.path.exists(input_file):
        print("错误: 输入文件不存在!")
        return 1

    print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")

    try:
        print("正在读取原始数据...")
        df = pd.read_excel(input_file)
        print(f"成功读取 {len(df)} 行数据")
        print(f"列名: {list(df.columns)}")

        # str(col): column labels are not guaranteed to be strings.
        comment_columns[:] = [
            col for col in df.columns
            if '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6))
        ]
        print(f"\n找到评论列: {comment_columns}")

        # Sharing df's index lets scalar defaults broadcast to every row.
        regression_data = pd.DataFrame(index=df.index)

        print("\n1. 计算 Y (UGC有用性)")
        helpful_columns = [col for col in df.columns if 'helpful' in str(col).lower()]
        if helpful_columns:
            # to_numeric(coerce): stray text in the column becomes 0 instead of aborting the run.
            regression_data['Y'] = pd.to_numeric(df[helpful_columns[0]], errors='coerce').fillna(0).astype(float)
            print(f"成功提取 Y 列,共 {len(regression_data['Y'])} 个值")
        else:
            print("警告: 未找到 helpfull 列,使用默认值 0")
            regression_data['Y'] = 0

        print("\n2. 计算 X1 (评论数量)")
        comment_count_columns = [col for col in df.columns if '评论总数' in str(col)]
        if comment_count_columns:
            regression_data['X1'] = pd.to_numeric(df[comment_count_columns[0]], errors='coerce').fillna(0).astype(float)
            print(f"成功提取 X1 列,使用列: {comment_count_columns[0]}")
        else:
            print("警告: 未找到评论总数列,使用默认值 0")
            regression_data['X1'] = 0

        print("\n3. 计算 X2 (评论长度)")
        regression_data['X2'] = df.apply(calculate_comment_length, axis=1)

        print("\n4. 计算 X3 (评论复杂度)")
        regression_data['X3'] = df.apply(calculate_comment_complexity, axis=1)

        print("\n5. 计算 X4 (评论可读性)")
        regression_data['X4'] = (regression_data['X2'] / regression_data['X3']).where(regression_data['X3'] > 0, 0)

        print("\n6. 计算 X5 (内容情感性)")
        regression_data['X5'] = df.apply(calculate_sentiment, axis=1)

        print("\n7. 计算 X6 (信息丰富度)")
        regression_data['X6'] = df.apply(calculate_information_richness, axis=1)

        print("\n8. 数据清洗")
        for col in regression_data.columns:
            regression_data[col] = pd.to_numeric(regression_data[col], errors='coerce').fillna(0)

        print("\n9. 数据验证")
        print(f"行数: {len(regression_data)}")
        print(f"列数: {len(regression_data.columns)}")
        print(f"列名: {list(regression_data.columns)}")
        print(f"数据类型:")
        print(regression_data.dtypes)
        print(f"\n前5行数据:")
        print(regression_data.head())

        print("\n10. 保存文件")
        regression_data.to_excel(output_file, index=False)

        if os.path.exists(output_file):
            print(f"文件已成功保存到: {output_file}")
            print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
        else:
            print("错误: 文件保存失败")

        print()
        print("========================================")
        print(" 任务完成")
        print("========================================")

    except Exception as e:
        print(f"处理文件时出错: {str(e)}")
        traceback.print_exc()
        return 1

    return 0


if __name__ == "__main__":
    sys.exit(main())
{col}") - - print("\n前3行数据:") - print(df.head(3)) - - print("\n数据类型:") - print(df.dtypes) - - print("\n========================================") - print(" 数据结构检查完成") - print("========================================") - -except Exception as e: - print(f"处理文件时出错: {str(e)}") - import traceback - traceback.print_exc() diff --git a/project/check_excel_size.py b/project/check_excel_size.py deleted file mode 100644 index de8d514..0000000 --- a/project/check_excel_size.py +++ /dev/null @@ -1,53 +0,0 @@ -import os -import openpyxl - -# 文件路径 -input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx' -output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx' - -print("========================================") -print(" 检查Excel文件大小") -print("========================================") -print(f"输入文件: {input_file}") -print(f"输出文件: {output_file}") -print() - -# 检查输入文件 -if os.path.exists(input_file): - print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") - try: - wb = openpyxl.load_workbook(input_file) - ws = wb.active - print(f"输入文件行数: {ws.max_row}") - print(f"输入文件列数: {ws.max_column}") - except Exception as e: - print(f"读取输入文件出错: {e}") -else: - print("输入文件不存在!") - -# 检查输出文件 -if os.path.exists(output_file): - print(f"\n输出文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") - try: - wb = openpyxl.load_workbook(output_file) - ws = wb.active - print(f"输出文件行数: {ws.max_row}") - print(f"输出文件列数: {ws.max_column}") - - # 显示前10行数据 - print("\n前10行数据:") - for row in range(1, min(11, ws.max_row + 1)): - row_data = [] - for col in range(1, ws.max_column + 1): - value = ws.cell(row=row, column=col).value - row_data.append(value) - print(f"行 {row}: {row_data}") - except Exception as e: - print(f"读取输出文件出错: {e}") -else: - print("输出文件不存在!") - -print() -print("========================================") -print(" 检查完成") -print("========================================") diff --git a/project/create_and_fill_data.py b/project/create_and_fill_data.py deleted file mode 100644 index 
980417a..0000000 --- a/project/create_and_fill_data.py +++ /dev/null @@ -1,69 +0,0 @@ -import os -import csv - -# 文件路径 -output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.csv' - -print("========================================") -print(" 创建并填充UGC回归数据") -print("========================================") -print(f"输出文件: {output_file}") -print() - -# 检查输出目录是否存在 -output_dir = os.path.dirname(output_file) -print(f"输出目录: {output_dir}") -print(f"目录存在: {os.path.exists(output_dir)}") - -if not os.path.exists(output_dir): - print("正在创建输出目录...") - try: - os.makedirs(output_dir) - print("目录创建成功") - except Exception as e: - print(f"创建目录失败: {e}") - exit(1) - -# 创建并填充CSV文件 -try: - print("\n创建并填充CSV文件...") - with open(output_file, 'w', newline='', encoding='utf-8-sig') as f: - writer = csv.writer(f) - - # 写入表头 - headers = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'] - writer.writerow(headers) - - # 写入示例数据(前10行) - for i in range(1, 11): - row = [ - i * 0.5, # Y: UGC有用性 - i * 2, # X1: 评论数量 - i * 10, # X2: 评论长度 - i * 2, # X3: 评论复杂度 - 5.0, # X4: 评论可读性 - (i % 3) - 1, # X5: 内容情感性 - i * 0.3 # X6: 信息丰富度 - ] - writer.writerow(row) - - print(f"文件已成功创建: {output_file}") - print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") - - # 读取并显示文件内容 - print("\n文件内容:") - with open(output_file, 'r', encoding='utf-8-sig') as f: - reader = csv.reader(f) - for i, row in enumerate(reader): - if i < 5: - print(f"行 {i+1}: {row}") - - print() - print("========================================") - print(" 任务完成") - print("========================================") - -except Exception as e: - print(f"处理文件时出错: {str(e)}") - import traceback - traceback.print_exc() diff --git a/project/create_excel_with_data.py b/project/create_excel_with_data.py deleted file mode 100644 index a256d27..0000000 --- a/project/create_excel_with_data.py +++ /dev/null @@ -1,86 +0,0 @@ -import os -import openpyxl - -# 文件路径 -output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx' - 
-print("========================================") -print(" 创建Excel文件并填充数据") -print("========================================") -print(f"输出文件: {output_file}") -print() - -# 检查输出目录是否存在 -output_dir = os.path.dirname(output_file) -print(f"输出目录: {output_dir}") -print(f"目录存在: {os.path.exists(output_dir)}") - -if not os.path.exists(output_dir): - print("正在创建输出目录...") - try: - os.makedirs(output_dir) - print("目录创建成功") - except Exception as e: - print(f"创建目录失败: {e}") - exit(1) - -# 创建Excel文件 -try: - print("\n创建Excel文件...") - wb = openpyxl.Workbook() - ws = wb.active - - # 写入表头 - headers = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'] - for i, header in enumerate(headers, 1): - ws.cell(row=1, column=i, value=header) - - # 写入示例数据(前10行) - print("填充示例数据...") - for i in range(1, 11): - ws.cell(row=i+1, column=1, value=i * 0.5) # Y: UGC有用性 - ws.cell(row=i+1, column=2, value=i * 2) # X1: 评论数量 - ws.cell(row=i+1, column=3, value=i * 10) # X2: 评论长度 - ws.cell(row=i+1, column=4, value=i * 2) # X3: 评论复杂度 - ws.cell(row=i+1, column=5, value=5.0) # X4: 评论可读性 - ws.cell(row=i+1, column=6, value=(i % 3) - 1) # X5: 内容情感性 - ws.cell(row=i+1, column=7, value=i * 0.3) # X6: 信息丰富度 - - # 保存文件 - print("保存文件...") - wb.save(output_file) - - print(f"文件已成功创建: {output_file}") - print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") - - # 验证文件 - print("\n验证文件...") - if os.path.exists(output_file): - print("文件创建成功!") - # 重新打开文件读取内容 - wb_check = openpyxl.load_workbook(output_file) - ws_check = wb_check.active - print(f"工作表名称: {ws_check.title}") - print(f"行数: {ws_check.max_row}") - print(f"列数: {ws_check.max_column}") - - # 显示前5行 - print("\n前5行数据:") - for row in range(1, min(6, ws_check.max_row + 1)): - row_data = [] - for col in range(1, ws_check.max_column + 1): - value = ws_check.cell(row=row, column=col).value - row_data.append(value) - print(f"行 {row}: {row_data}") - else: - print("文件创建失败!") - - print() - print("========================================") - print(" 任务完成") - 
print("========================================") - -except Exception as e: - print(f"处理文件时出错: {str(e)}") - import traceback - traceback.print_exc() diff --git a/project/create_regression_data.py b/project/create_regression_data.py deleted file mode 100644 index 9100b20..0000000 --- a/project/create_regression_data.py +++ /dev/null @@ -1,112 +0,0 @@ -import os -import pandas as pd -import numpy as np -import re - -# 文件路径 -input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx' -output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx' - -print("========================================") -print(" 创建UGC回归数据文件") -print("========================================") -print(f"输入文件: {input_file}") -print(f"输出文件: {output_file}") -print() - -# 检查输入文件是否存在 -if not os.path.exists(input_file): - print("错误: 输入文件不存在!") - exit(1) - -print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") - -# 读取原始数据 -try: - print("正在读取原始数据...") - df = pd.read_excel(input_file) - print(f"成功读取 {len(df)} 行数据") - print(f"列名: {list(df.columns)}") - print() - - # 创建新的回归数据DataFrame - regression_data = pd.DataFrame() - - # 1. 提取因变量Y (helpfull列) - print("1. 提取因变量Y (helpfull列)") - if 'helpfull' in df.columns: - regression_data['Y'] = df['helpfull'].fillna(0) - print(f"成功提取 Y 列,共 {len(regression_data['Y'])} 个值") - else: - print("警告: 未找到 helpfull 列,使用默认值 0") - regression_data['Y'] = 0 - - # 2. 提取X1 (评论总数列) - print("\n2. 提取X1 (评论总数列)") - comment_columns = [col for col in df.columns if '评论' in col and '总数' in col] - if comment_columns: - regression_data['X1'] = df[comment_columns[0]].fillna(0) - print(f"成功提取 X1 列,使用列: {comment_columns[0]}") - else: - print("警告: 未找到评论总数列,使用默认值 0") - regression_data['X1'] = 0 - - # 3. 计算X2-X6 - print("\n3. 
计算X2-X6") - - # X2: 评论长度 - print(" - 计算X2 (评论长度)") - regression_data['X2'] = 0 - - # X3: 评论复杂度 - print(" - 计算X3 (评论复杂度)") - regression_data['X3'] = 0 - - # X4: 评论可读性 - print(" - 计算X4 (评论可读性)") - regression_data['X4'] = 0 - - # X5: 内容情感性 - print(" - 计算X5 (内容情感性)") - regression_data['X5'] = 0 - - # X6: 信息丰富度 - print(" - 计算X6 (信息丰富度)") - regression_data['X6'] = 0 - - # 4. 数据清洗 - print("\n4. 数据清洗") - # 确保所有值都是数字 - for col in regression_data.columns: - regression_data[col] = pd.to_numeric(regression_data[col], errors='coerce').fillna(0) - - # 5. 验证数据 - print("\n5. 数据验证") - print(f"行数: {len(regression_data)}") - print(f"列数: {len(regression_data.columns)}") - print(f"列名: {list(regression_data.columns)}") - print(f"数据类型:") - print(regression_data.dtypes) - print(f"\n前5行数据:") - print(regression_data.head()) - - # 6. 保存文件 - print("\n6. 保存文件") - regression_data.to_excel(output_file, index=False) - - # 验证文件是否创建成功 - if os.path.exists(output_file): - print(f"文件已成功保存到: {output_file}") - print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") - else: - print("错误: 文件保存失败") - - print() - print("========================================") - print(" 任务完成") - print("========================================") - -except Exception as e: - print(f"处理文件时出错: {str(e)}") - import traceback - traceback.print_exc() diff --git a/project/create_regression_data_v2.py b/project/create_regression_data_v2.py deleted file mode 100644 index 6e18bed..0000000 --- a/project/create_regression_data_v2.py +++ /dev/null @@ -1,142 +0,0 @@ -import os -import pandas as pd -import numpy as np - -# 文件路径 -input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx' -output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx' - -print("========================================") -print(" 创建UGC回归数据文件 v2") -print("========================================") -print(f"输入文件: {input_file}") -print(f"输出文件: {output_file}") -print() - -# 检查输入文件是否存在 -if not os.path.exists(input_file): - print("错误: 输入文件不存在!") - 
print(f"检查路径: {input_file}") - exit(1) - -print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") -print(f"文件存在: {os.path.exists(input_file)}") - -# 检查输出目录是否存在 -output_dir = os.path.dirname(output_file) -print(f"输出目录: {output_dir}") -print(f"目录存在: {os.path.exists(output_dir)}") - -if not os.path.exists(output_dir): - print("正在创建输出目录...") - try: - os.makedirs(output_dir) - print("目录创建成功") - except Exception as e: - print(f"创建目录失败: {e}") - exit(1) - -# 读取原始数据 -try: - print("\n正在读取原始数据...") - # 尝试读取文件 - df = pd.read_excel(input_file) - print(f"成功读取 {len(df)} 行数据") - print(f"列名: {list(df.columns)}") - - # 显示前几行数据以了解结构 - print("\n前3行数据:") - print(df.head(3)) - - # 创建新的回归数据DataFrame - regression_data = pd.DataFrame() - - # 1. 提取因变量Y (helpfull列) - print("\n1. 提取因变量Y (helpfull列)") - if 'helpfull' in df.columns: - regression_data['Y'] = df['helpfull'].fillna(0) - print(f"成功提取 Y 列,共 {len(regression_data['Y'])} 个值") - print(f"Y列前5个值: {list(regression_data['Y'].head())}") - else: - print("警告: 未找到 helpfull 列,使用默认值 0") - regression_data['Y'] = 0 - - # 2. 提取X1 (评论总数列) - print("\n2. 提取X1 (评论总数列)") - # 尝试找到评论相关的列 - comment_columns = [col for col in df.columns if '评论' in col] - print(f"找到评论相关列: {comment_columns}") - - if comment_columns: - regression_data['X1'] = df[comment_columns[0]].fillna(0) - print(f"成功提取 X1 列,使用列: {comment_columns[0]}") - print(f"X1列前5个值: {list(regression_data['X1'].head())}") - else: - print("警告: 未找到评论列,使用默认值 0") - regression_data['X1'] = 0 - - # 3. 计算X2-X6 - print("\n3. 计算X2-X6") - - # X2: 评论长度 - print(" - 计算X2 (评论长度)") - regression_data['X2'] = 0 - - # X3: 评论复杂度 - print(" - 计算X3 (评论复杂度)") - regression_data['X3'] = 0 - - # X4: 评论可读性 - print(" - 计算X4 (评论可读性)") - regression_data['X4'] = 0 - - # X5: 内容情感性 - print(" - 计算X5 (内容情感性)") - regression_data['X5'] = 0 - - # X6: 信息丰富度 - print(" - 计算X6 (信息丰富度)") - regression_data['X6'] = 0 - - # 4. 数据清洗 - print("\n4. 
数据清洗") - # 确保所有值都是数字 - for col in regression_data.columns: - regression_data[col] = pd.to_numeric(regression_data[col], errors='coerce').fillna(0) - - # 5. 验证数据 - print("\n5. 数据验证") - print(f"行数: {len(regression_data)}") - print(f"列数: {len(regression_data.columns)}") - print(f"列名: {list(regression_data.columns)}") - print(f"数据类型:") - print(regression_data.dtypes) - print(f"\n前5行数据:") - print(regression_data.head()) - - # 6. 保存文件 - print("\n6. 保存文件") - print(f"保存路径: {output_file}") - - try: - regression_data.to_excel(output_file, index=False) - print("文件保存成功") - except Exception as e: - print(f"保存文件失败: {e}") - - # 验证文件是否创建成功 - if os.path.exists(output_file): - print(f"文件已成功保存到: {output_file}") - print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") - else: - print("错误: 文件保存失败,未找到输出文件") - - print() - print("========================================") - print(" 任务完成") - print("========================================") - -except Exception as e: - print(f"处理文件时出错: {str(e)}") - import traceback - traceback.print_exc() diff --git a/project/d b/project/d deleted file mode 100644 index e69de29..0000000 diff --git a/project/data_cleaner.py b/project/data_cleaner.py deleted file mode 100644 index d9f2d42..0000000 --- a/project/data_cleaner.py +++ /dev/null @@ -1,73 +0,0 @@ -import os -import pandas as pd - -# 输入输出文件路径 -input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx' -output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).csv' - -print("========================================") -print(" Python 数据清洗脚本") -print("========================================") -print(f"输入文件: {input_file}") -print(f"输出文件: {output_file}") -print() - -# 检查文件是否存在 -if not os.path.exists(input_file): - print("错误: 输入文件不存在!") - exit(1) - -print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") - -# 读取Excel文件 -try: - print("正在读取Excel文件...") - df = pd.read_excel(input_file) - print(f"成功读取 {len(df)} 行数据") - - # 数据清洗 - print("正在清洗数据...") - - # 1. 
处理缺失值 - df = df.fillna('') - - # 2. 去除文本中的多余空格 - for col in df.columns: - if df[col].dtype == 'object': - df[col] = df[col].astype(str).str.strip() - df[col] = df[col].str.replace('\\s+', ' ', regex=True) - - # 3. 规范化情感倾向 - if '情感倾向' in df.columns: - def normalize_sentiment(sentiment): - if pd.isna(sentiment) or sentiment == '': - return '中性' - sentiment = str(sentiment).lower() - if any(keyword in sentiment for keyword in ['积极', '正面', 'positive']): - return '积极' - elif any(keyword in sentiment for keyword in ['消极', '负面', 'negative']): - return '消极' - else: - return '中性' - - df['情感倾向'] = df['情感倾向'].apply(normalize_sentiment) - - # 4. 确保输出目录存在 - output_dir = os.path.dirname(output_file) - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - # 保存为CSV文件 - print("正在保存清洗后的数据...") - df.to_csv(output_file, index=False, encoding='utf-8-sig') - - print(f"数据已成功保存到: {output_file}") - print(f"保存了 {len(df)} 行清洗后的数据") - - print() - print("========================================") - print(" 数据清洗任务完成") - print("========================================") - -except Exception as e: - print(f"处理文件时出错: {str(e)}") diff --git a/project/data_cleaner_v2.py b/project/data_cleaner_v2.py deleted file mode 100644 index a27eef6..0000000 --- a/project/data_cleaner_v2.py +++ /dev/null @@ -1,98 +0,0 @@ -import os -import pandas as pd - -# 输入输出文件路径 -input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx' -output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).csv' - -print("========================================") -print(" Python 数据清洗脚本 v2") -print("========================================") -print(f"输入文件: {input_file}") -print(f"输出文件: {output_file}") -print() - -# 检查文件是否存在 -if not os.path.exists(input_file): - print("错误: 输入文件不存在!") - print(f"检查路径: {input_file}") - exit(1) - -print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") -print(f"文件存在: {os.path.exists(input_file)}") - -# 读取Excel文件 -try: - print("正在读取Excel文件...") - # 尝试读取前10行数据 - df = 
pd.read_excel(input_file, nrows=10) - print(f"成功读取 {len(df)} 行示例数据") - print(f"列名: {list(df.columns)}") - - # 读取全部数据 - print("正在读取全部数据...") - df = pd.read_excel(input_file) - print(f"成功读取 {len(df)} 行完整数据") - - # 数据清洗 - print("正在清洗数据...") - - # 1. 处理缺失值 - print(f"清洗前 - 缺失值统计:") - print(df.isnull().sum()) - df = df.fillna('') - - # 2. 去除文本中的多余空格 - for col in df.columns: - if df[col].dtype == 'object': - df[col] = df[col].astype(str).str.strip() - df[col] = df[col].str.replace('\\s+', ' ', regex=True) - - # 3. 规范化情感倾向 - if '情感倾向' in df.columns: - def normalize_sentiment(sentiment): - if pd.isna(sentiment) or sentiment == '': - return '中性' - sentiment = str(sentiment).lower() - if any(keyword in sentiment for keyword in ['积极', '正面', 'positive']): - return '积极' - elif any(keyword in sentiment for keyword in ['消极', '负面', 'negative']): - return '消极' - else: - return '中性' - - df['情感倾向'] = df['情感倾向'].apply(normalize_sentiment) - print("情感倾向规范化完成") - - # 4. 确保输出目录存在 - output_dir = os.path.dirname(output_file) - print(f"输出目录: {output_dir}") - print(f"目录存在: {os.path.exists(output_dir)}") - - if not os.path.exists(output_dir): - print("正在创建输出目录...") - os.makedirs(output_dir) - - # 保存为CSV文件 - print("正在保存清洗后的数据...") - print(f"保存路径: {output_file}") - - df.to_csv(output_file, index=False, encoding='utf-8-sig') - - # 验证文件是否创建成功 - if os.path.exists(output_file): - print(f"数据已成功保存到: {output_file}") - print(f"保存文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") - print(f"保存了 {len(df)} 行清洗后的数据") - else: - print("错误: 文件保存失败,未找到输出文件") - - print() - print("========================================") - print(" 数据清洗任务完成") - print("========================================") - -except Exception as e: - print(f"处理文件时出错: {str(e)}") - import traceback - traceback.print_exc() diff --git a/project/debug_log.txt b/project/debug_log.txt deleted file mode 100644 index 743022f..0000000 --- a/project/debug_log.txt +++ /dev/null @@ -1,11 +0,0 @@ -开始调试... 
-当前目录: D:\java\project -pandas导入成功 -输入文件: D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx -文件存在: True -文件大小: 21607.43 KB -开始读取... -读取成功: 30308 行 -列数: 68 -前5列: ['作者', '作者链接', '标题', '内容', 'tag'] -调试结束 diff --git a/project/debug_process.py b/project/debug_process.py deleted file mode 100644 index 4edd81f..0000000 --- a/project/debug_process.py +++ /dev/null @@ -1,36 +0,0 @@ -import os -import sys - -# 重定向输出 -log_file = open(r'D:\java\project\debug_log.txt', 'w', encoding='utf-8') -original_stdout = sys.stdout -sys.stdout = log_file - -print("开始调试...") -print(f"当前目录: {os.getcwd()}") - -try: - import pandas as pd - print("pandas导入成功") - - input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx' - print(f"输入文件: {input_file}") - print(f"文件存在: {os.path.exists(input_file)}") - - if os.path.exists(input_file): - print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") - print("开始读取...") - df = pd.read_excel(input_file, engine='openpyxl') - print(f"读取成功: {len(df)} 行") - print(f"列数: {len(df.columns)}") - print(f"前5列: {list(df.columns)[:5]}") - -except Exception as e: - print(f"错误: {e}") - import traceback - traceback.print_exc() - -print("调试结束") -sys.stdout = original_stdout -log_file.close() -print("日志已保存") diff --git a/project/debug_script.py b/project/debug_script.py deleted file mode 100644 index 12d0b28..0000000 --- a/project/debug_script.py +++ /dev/null @@ -1,51 +0,0 @@ -import os -import sys - -print("========================================") -print(" 调试脚本") -print("========================================") -print(f"Python版本: {sys.version}") -print(f"当前目录: {os.getcwd()}") -print() - -# 检查pandas -print("检查pandas...") -try: - import pandas as pd - print(f"pandas版本: {pd.__version__}") -except ImportError as e: - print(f"pandas未安装: {e}") - exit(1) - -# 检查openpyxl -print("\n检查openpyxl...") -try: - import openpyxl - print(f"openpyxl版本: {openpyxl.__version__}") -except ImportError as e: - print(f"openpyxl未安装: {e}") - exit(1) - -# 检查文件 -input_file = 
r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx' -print(f"\n检查输入文件:") -print(f"路径: {input_file}") -print(f"存在: {os.path.exists(input_file)}") -if os.path.exists(input_file): - print(f"大小: {os.path.getsize(input_file) / 1024:.2f} KB") - - # 尝试读取 - print("\n尝试读取文件...") - try: - df = pd.read_excel(input_file, nrows=5) # 只读前5行 - print(f"成功读取 {len(df)} 行") - print(f"列名: {list(df.columns)}") - except Exception as e: - print(f"读取失败: {e}") - import traceback - traceback.print_exc() - -print() -print("========================================") -print(" 调试完成") -print("========================================") diff --git a/project/import_data.py b/project/import_data.py deleted file mode 100644 index 74b2473..0000000 --- a/project/import_data.py +++ /dev/null @@ -1,50 +0,0 @@ -import os -import pandas as pd - -# 文件路径 -input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx' -output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx' - -print("========================================") -print(" 数据导入操作") -print("========================================") -print(f"输入文件: {input_file}") -print(f"输出文件: {output_file}") -print() - -# 检查文件是否存在 -if not os.path.exists(input_file): - print("错误: 输入文件不存在!") - exit(1) - -print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") - -# 读取数据 -try: - print("正在读取数据...") - df = pd.read_excel(input_file) - print(f"成功读取 {len(df)} 行数据") - print(f"列名: {list(df.columns)}") - print(f"数据类型:") - print(df.dtypes) - - print("\n前5行数据:") - print(df.head()) - - # 写入到同一个文件 - print("\n写入数据到目标文件...") - df.to_excel(output_file, index=False) - - print(f"数据已成功导入到: {output_file}") - print(f"总行数: {len(df)}") - print(f"总列数: {len(df.columns)}") - - print() - print("========================================") - print(" 数据导入完成") - print("========================================") - -except Exception as e: - print(f"处理文件时出错: {str(e)}") - import traceback - traceback.print_exc() diff --git a/project/minimal_test.py b/project/minimal_test.py deleted file mode 
100644 index d62139b..0000000 --- a/project/minimal_test.py +++ /dev/null @@ -1,17 +0,0 @@ -import os -print("测试开始") -print(f"当前目录: {os.getcwd()}") - -input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx' -print(f"文件存在: {os.path.exists(input_file)}") - -if os.path.exists(input_file): - print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") - print("尝试读取...") - try: - import pandas as pd - df = pd.read_excel(input_file, nrows=10) - print(f"成功读取 {len(df)} 行") - print("测试完成") - except Exception as e: - print(f"错误: {e}") diff --git a/project/populate_regression_data.py b/project/populate_regression_data.py deleted file mode 100644 index 65cec2e..0000000 --- a/project/populate_regression_data.py +++ /dev/null @@ -1,113 +0,0 @@ -import os -import pandas as pd -import openpyxl - -# 文件路径 -input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx' -output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx' - -print("========================================") -print(" 填充UGC回归数据") -print("========================================") -print(f"输入文件: {input_file}") -print(f"输出文件: {output_file}") -print() - -# 检查文件是否存在 -if not os.path.exists(input_file): - print("错误: 输入文件不存在!") - exit(1) - -if not os.path.exists(output_file): - print("错误: 输出文件不存在!") - exit(1) - -# 读取原始数据 -try: - print("正在读取原始数据...") - df = pd.read_excel(input_file) - print(f"成功读取 {len(df)} 行数据") - print(f"列名: {list(df.columns)}") - - # 打开输出文件 - print("\n打开输出文件...") - wb = openpyxl.load_workbook(output_file) - ws = wb.active - - # 提取数据并填充 - print("\n填充数据...") - - # 提取Y列 (helpfull) - print("1. 
填充Y列 (helpfull)") - if 'helpfull' in df.columns: - for i, value in enumerate(df['helpfull'], 2): # 从第2行开始 - if pd.isna(value): - ws.cell(row=i, column=1, value=0) - else: - ws.cell(row=i, column=1, value=float(value)) - print(f"成功填充 Y 列,共 {len(df)} 行") - else: - print("警告: 未找到 helpfull 列,使用默认值 0") - for i in range(2, len(df) + 2): - ws.cell(row=i, column=1, value=0) - - # 提取X1列 (评论总数) - print("\n2. 填充X1列 (评论总数)") - comment_columns = [col for col in df.columns if '评论' in col] - if comment_columns: - for i, value in enumerate(df[comment_columns[0]], 2): - if pd.isna(value): - ws.cell(row=i, column=2, value=0) - else: - ws.cell(row=i, column=2, value=float(value)) - print(f"成功填充 X1 列,使用列: {comment_columns[0]}") - else: - print("警告: 未找到评论列,使用默认值 0") - for i in range(2, len(df) + 2): - ws.cell(row=i, column=2, value=0) - - # 计算X2-X6 - print("\n3. 计算X2-X6") - - # X2: 评论长度 - print(" - 填充X2 (评论长度)") - for i in range(2, len(df) + 2): - ws.cell(row=i, column=3, value=0) - - # X3: 评论复杂度 - print(" - 填充X3 (评论复杂度)") - for i in range(2, len(df) + 2): - ws.cell(row=i, column=4, value=0) - - # X4: 评论可读性 - print(" - 填充X4 (评论可读性)") - for i in range(2, len(df) + 2): - ws.cell(row=i, column=5, value=0) - - # X5: 内容情感性 - print(" - 填充X5 (内容情感性)") - for i in range(2, len(df) + 2): - ws.cell(row=i, column=6, value=0) - - # X6: 信息丰富度 - print(" - 填充X6 (信息丰富度)") - for i in range(2, len(df) + 2): - ws.cell(row=i, column=7, value=0) - - # 保存文件 - print("\n4. 
保存文件") - wb.save(output_file) - - print(f"文件已成功保存: {output_file}") - print(f"总行数: {len(df) + 1} (包括表头)") - print(f"总列数: 7") - - print() - print("========================================") - print(" 任务完成") - print("========================================") - -except Exception as e: - print(f"处理文件时出错: {str(e)}") - import traceback - traceback.print_exc() diff --git a/project/process_300_rows.py b/project/process_300_rows.py deleted file mode 100644 index 2bdb307..0000000 --- a/project/process_300_rows.py +++ /dev/null @@ -1,156 +0,0 @@ -import os -import pandas as pd -import re - -print("=" * 60) -print(" 处理前300行数据作为测试") -print("=" * 60) - -# 文件路径 -input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx' -output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归_300.xlsx' - -print(f"输入文件: {input_file}") -print(f"输出文件: {output_file}") -print() - -# 读取前300行 -print("读取前300行数据...") -df = pd.read_excel(input_file, engine='openpyxl', nrows=300) -print(f"成功读取 {len(df)} 行数据") -print(f"原始列数: {len(df.columns)}") - -# 识别列 -print("\n识别列...") -helpfull_col = None -comment_count_col = None -comment_cols = [] - -for col in df.columns: - col_str = str(col).lower() - if 'helpfull' in col_str or 'helpful' in col_str: - helpfull_col = col - print(f"找到 Y 列 (helpfull): {col}") - elif '评论总数' in str(col) or '帖子评论总数' in str(col): - comment_count_col = col - print(f"找到 X1 列 (评论总数): {col}") - elif '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6)) and '内容' in str(col): - comment_cols.append(col) - print(f"找到评论列 {len(comment_cols)}: {col}") - -print(f"\n共找到 {len(comment_cols)} 个评论内容列") - -# 添加回归数据列 -print("\n添加回归数据列...") - -# Y (UGC有用性) -print("1. 添加 Y (UGC有用性)") -if helpfull_col: - df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0) -else: - df['Y'] = 0 - -# X1 (评论数量) -print("2. 
添加 X1 (评论数量)") -if comment_count_col: - df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0) -else: - df['X1'] = 0 - -# 定义函数计算评论指标 -def calculate_comment_metrics(content): - if pd.isna(content) or str(content) in ['None', 'nan', '']: - return 0, 0, 0, 0 - - content = str(content) - length = len(content.replace(' ', '').replace('\u3000', '')) - complexity = len(content.split()) - - positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like'] - negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike'] - - sentiment = 0 - lower_content = content.lower() - if any(word in lower_content for word in positive_words): - sentiment = 1 - elif any(word in lower_content for word in negative_words): - sentiment = -1 - - richness = 0 - if re.search(r'\d', content): - richness += 1 - if re.search(r'http[s]?://|www\.', content): - richness += 1 - if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content): - richness += 1 - - return length, complexity, sentiment, richness - -# 计算评论相关指标 -print("3. 计算评论相关指标...") - -df['X2'] = 0.0 -df['X3'] = 0.0 -df['X5'] = 0.0 -df['X6'] = 0.0 - -for i in range(len(df)): - lengths = [] - complexities = [] - sentiments = [] - richness = [] - - for col in comment_cols: - content = df.iloc[i].get(col, '') - length, complexity, sentiment, r = calculate_comment_metrics(content) - if length > 0: - lengths.append(length) - complexities.append(complexity) - sentiments.append(sentiment) - richness.append(r) - - if lengths: - df.loc[i, 'X2'] = sum(lengths) / len(lengths) - df.loc[i, 'X3'] = sum(complexities) / len(complexities) - df.loc[i, 'X5'] = sum(sentiments) / len(sentiments) - df.loc[i, 'X6'] = sum(richness) / len(richness) - -# X4: 评论可读性 -print("4. 计算 X4 (评论可读性)") -df['X4'] = df.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1) - -# 数据清洗 -print("\n5. 
数据清洗...") -regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'] -for col in regression_cols: - df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0) - df[col] = df[col].replace([float('inf'), float('-inf')], 0) - -# 验证数据 -print("\n6. 验证数据...") -print(f"总行数: {len(df)}") -print(f"总列数: {len(df.columns)}") -print(f"\n回归数据列统计:") -print(df[regression_cols].describe()) -print(f"\n前5行回归数据:") -print(df[regression_cols].head()) - -# 保存文件 -print("\n7. 保存文件...") -df.to_excel(output_file, index=False, engine='openpyxl') - -# 验证文件 -print("\n8. 验证文件...") -if os.path.exists(output_file): - print(f"文件已成功保存: {output_file}") - print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") - df_check = pd.read_excel(output_file) - print(f"输出文件行数: {len(df_check)}") - print(f"输出文件列数: {len(df_check.columns)}") -else: - print("文件保存失败!") - -print() -print("=" * 60) -print(" 任务完成") -print("=" * 60) diff --git a/project/process_actual_data.py b/project/process_actual_data.py deleted file mode 100644 index ddc09d0..0000000 --- a/project/process_actual_data.py +++ /dev/null @@ -1,200 +0,0 @@ -import os -import openpyxl -import re - -# 文件路径 -input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx' -output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx' - -print("========================================") -print(" 根据实际原始数据计算回归数据") -print("========================================") -print(f"输入文件: {input_file}") -print(f"输出文件: {output_file}") -print() - -# 检查文件是否存在 -if not os.path.exists(input_file): - print("错误: 输入文件不存在!") - exit(1) - -print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") - -# 读取原始数据 -try: - print("正在读取原始数据...") - wb_input = openpyxl.load_workbook(input_file) - ws_input = wb_input.active - - print(f"工作表名称: {ws_input.title}") - print(f"最大行数: {ws_input.max_row}") - print(f"最大列数: {ws_input.max_column}") - - # 识别列 - print("\n识别列...") - headers = [] - helpfull_col = None - comment_count_col = None - comment_cols = [] - - for col in range(1, 
ws_input.max_column + 1): - header = ws_input.cell(row=1, column=col).value - headers.append(header) - - if header: - header_str = str(header).lower() - if 'helpfull' in header_str or 'helpful' in header_str: - helpfull_col = col - print(f"找到 Y 列 (helpfull): 列 {col}") - elif '评论总数' in str(header) or '帖子评论总数' in str(header): - comment_count_col = col - print(f"找到 X1 列 (评论总数): 列 {col}") - elif '评论' in str(header) and any(str(i) in str(header) for i in range(1, 6)): - comment_cols.append(col) - print(f"找到评论列 {len(comment_cols)}: 列 {col} - {header}") - - print(f"\n共找到 {len(comment_cols)} 个评论列") - - # 创建或打开输出文件 - if os.path.exists(output_file): - print("\n打开现有输出文件...") - wb_output = openpyxl.load_workbook(output_file) - ws_output = wb_output.active - else: - print("\n创建新的输出文件...") - wb_output = openpyxl.Workbook() - ws_output = wb_output.active - # 写入表头 - headers_output = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'] - for i, header in enumerate(headers_output, 1): - ws_output.cell(row=1, column=i, value=header) - - # 计算并填充数据 - print("\n计算并填充数据...") - total_rows = ws_input.max_row - 1 - print(f"总数据行数: {total_rows}") - - # 确保输出文件有足够的行 - if ws_output.max_row < ws_input.max_row: - print(f"扩展输出文件行数到 {ws_input.max_row}...") - - for row in range(2, ws_input.max_row + 1): - if row % 100 == 0: - print(f"处理到第 {row-1} 行...") - if row % 1000 == 0: - print(f"已处理 {row-1} 行,共 {total_rows} 行") - - # Y (UGC有用性) - if helpfull_col: - y_value = ws_input.cell(row=row, column=helpfull_col).value - y_value = float(y_value) if y_value else 0 - else: - y_value = 0 - ws_output.cell(row=row, column=1, value=y_value) - - # X1 (评论数量) - if comment_count_col: - x1_value = ws_input.cell(row=row, column=comment_count_col).value - x1_value = float(x1_value) if x1_value else 0 - else: - x1_value = 0 - ws_output.cell(row=row, column=2, value=x1_value) - - # 计算评论相关指标 - comment_lengths = [] - comment_complexities = [] - comment_sentiments = [] - comment_richness = [] - - for col in comment_cols: - content = 
str(ws_input.cell(row=row, column=col).value) - if content and content != 'None' and content != 'nan': - # X2: 评论长度(剔空格后的字符数) - length = len(content.replace(' ', '')) - comment_lengths.append(length) - - # X3: 评论复杂度(按空格拆分的分词数) - complexity = len(content.split()) - comment_complexities.append(complexity) - - # X5: 内容情感性(正面=1、中性=0、负面=-1) - positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent'] - negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor'] - - sentiment = 0 - lower_content = content.lower() - - if any(word in lower_content for word in positive_words): - sentiment = 1 - elif any(word in lower_content for word in negative_words): - sentiment = -1 - comment_sentiments.append(sentiment) - - # X6: 信息丰富度(含数字/链接/表情各1分,满分3分) - richness = 0 - if re.search(r'\d', content): - richness += 1 - if re.search(r'http[s]?://', content): - richness += 1 - if re.search(r'[\u2600-\u27BF]|[:;][-]?[)D]', content): - richness += 1 - comment_richness.append(richness) - - # X2: 评论长度平均值 - x2_value = sum(comment_lengths) / len(comment_lengths) if comment_lengths else 0 - ws_output.cell(row=row, column=3, value=x2_value) - - # X3: 评论复杂度平均值 - x3_value = sum(comment_complexities) / len(comment_complexities) if comment_complexities else 0 - ws_output.cell(row=row, column=4, value=x3_value) - - # X4: 评论可读性(X2/X3,X3为0时记0) - x4_value = x2_value / x3_value if x3_value > 0 else 0 - ws_output.cell(row=row, column=5, value=x4_value) - - # X5: 内容情感性平均值 - x5_value = sum(comment_sentiments) / len(comment_sentiments) if comment_sentiments else 0 - ws_output.cell(row=row, column=6, value=x5_value) - - # X6: 信息丰富度平均值 - x6_value = sum(comment_richness) / len(comment_richness) if comment_richness else 0 - ws_output.cell(row=row, column=7, value=x6_value) - - # 保存文件 - print("\n保存文件...") - wb_output.save(output_file) - - print(f"文件已成功保存: {output_file}") - print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") - print(f"处理完成,共 
{total_rows} 行数据") - - # 验证文件 - print("\n验证文件...") - if os.path.exists(output_file): - print("文件保存成功!") - # 重新打开文件检查 - wb_check = openpyxl.load_workbook(output_file) - ws_check = wb_check.active - print(f"输出文件行数: {ws_check.max_row - 1}") - print(f"输出文件列数: {ws_check.max_column}") - - # 显示前5行数据 - print("\n前5行数据:") - for row in range(1, min(6, ws_check.max_row + 1)): - row_data = [] - for col in range(1, ws_check.max_column + 1): - value = ws_check.cell(row=row, column=col).value - row_data.append(value) - print(f"行 {row}: {row_data}") - else: - print("文件保存失败!") - - print() - print("========================================") - print(" 任务完成") - print("========================================") - -except Exception as e: - print(f"处理文件时出错: {str(e)}") - import traceback - traceback.print_exc() diff --git a/project/process_all_data.py b/project/process_all_data.py deleted file mode 100644 index e7db13c..0000000 --- a/project/process_all_data.py +++ /dev/null @@ -1,190 +0,0 @@ -import os -import openpyxl -import re - -# 文件路径 -input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx' -output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx' - -print("========================================") -print(" 处理所有数据") -print("========================================") -print(f"输入文件: {input_file}") -print(f"输出文件: {output_file}") -print() - -# 检查文件是否存在 -if not os.path.exists(input_file): - print("错误: 输入文件不存在!") - exit(1) - -print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") - -# 读取原始数据 -try: - print("正在读取原始数据...") - wb_input = openpyxl.load_workbook(input_file) - ws_input = wb_input.active - - print(f"工作表名称: {ws_input.title}") - print(f"最大行数: {ws_input.max_row}") - print(f"最大列数: {ws_input.max_column}") - - # 识别列 - print("\n识别列...") - headers = [] - helpfull_col = None - comment_count_col = None - comment_cols = [] - - for col in range(1, ws_input.max_column + 1): - header = ws_input.cell(row=1, column=col).value - headers.append(header) - - if header: - 
header_str = str(header).lower() - if 'helpfull' in header_str or 'helpful' in header_str: - helpfull_col = col - print(f"找到 Y 列 (helpfull): 列 {col}") - elif '评论总数' in str(header) or '帖子评论总数' in str(header): - comment_count_col = col - print(f"找到 X1 列 (评论总数): 列 {col}") - elif '评论' in str(header) and any(str(i) in str(header) for i in range(1, 6)): - comment_cols.append(col) - print(f"找到评论列 {len(comment_cols)}: 列 {col} - {header}") - - print(f"\n共找到 {len(comment_cols)} 个评论列") - - # 创建新的输出文件 - print("\n创建新的输出文件...") - wb_output = openpyxl.Workbook() - ws_output = wb_output.active - - # 写入表头 - headers_output = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'] - for i, header in enumerate(headers_output, 1): - ws_output.cell(row=1, column=i, value=header) - - # 计算并填充数据 - print("\n计算并填充数据...") - total_rows = ws_input.max_row - 1 - print(f"总数据行数: {total_rows}") - - for row in range(2, ws_input.max_row + 1): - if row % 1000 == 0: - print(f"处理到第 {row-1} 行...") - - # Y (UGC有用性) - if helpfull_col: - y_value = ws_input.cell(row=row, column=helpfull_col).value - y_value = float(y_value) if y_value else 0 - else: - y_value = 0 - ws_output.cell(row=row, column=1, value=y_value) - - # X1 (评论数量) - if comment_count_col: - x1_value = ws_input.cell(row=row, column=comment_count_col).value - x1_value = float(x1_value) if x1_value else 0 - else: - x1_value = 0 - ws_output.cell(row=row, column=2, value=x1_value) - - # 计算评论相关指标 - comment_lengths = [] - comment_complexities = [] - comment_sentiments = [] - comment_richness = [] - - for col in comment_cols: - content = str(ws_input.cell(row=row, column=col).value) - if content and content != 'None' and content != 'nan': - # X2: 评论长度(剔空格后的字符数) - length = len(content.replace(' ', '')) - comment_lengths.append(length) - - # X3: 评论复杂度(按空格拆分的分词数) - complexity = len(content.split()) - comment_complexities.append(complexity) - - # X5: 内容情感性(正面=1、中性=0、负面=-1) - positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent'] - 
negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor'] - - sentiment = 0 - lower_content = content.lower() - - if any(word in lower_content for word in positive_words): - sentiment = 1 - elif any(word in lower_content for word in negative_words): - sentiment = -1 - comment_sentiments.append(sentiment) - - # X6: 信息丰富度(含数字/链接/表情各1分,满分3分) - richness = 0 - if re.search(r'\d', content): - richness += 1 - if re.search(r'http[s]?://', content): - richness += 1 - if re.search(r'[\u2600-\u27BF]|[:;][-]?[)D]', content): - richness += 1 - comment_richness.append(richness) - - # X2: 评论长度平均值 - x2_value = sum(comment_lengths) / len(comment_lengths) if comment_lengths else 0 - ws_output.cell(row=row, column=3, value=x2_value) - - # X3: 评论复杂度平均值 - x3_value = sum(comment_complexities) / len(comment_complexities) if comment_complexities else 0 - ws_output.cell(row=row, column=4, value=x3_value) - - # X4: 评论可读性(X2/X3,X3为0时记0) - x4_value = x2_value / x3_value if x3_value > 0 else 0 - ws_output.cell(row=row, column=5, value=x4_value) - - # X5: 内容情感性平均值 - x5_value = sum(comment_sentiments) / len(comment_sentiments) if comment_sentiments else 0 - ws_output.cell(row=row, column=6, value=x5_value) - - # X6: 信息丰富度平均值 - x6_value = sum(comment_richness) / len(comment_richness) if comment_richness else 0 - ws_output.cell(row=row, column=7, value=x6_value) - - # 保存文件 - print("\n保存文件...") - wb_output.save(output_file) - - print(f"文件已成功保存: {output_file}") - print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") - print(f"处理完成,共 {total_rows} 行数据") - - # 验证文件 - print("\n验证文件...") - if os.path.exists(output_file): - print("文件保存成功!") - # 重新打开文件检查 - wb_check = openpyxl.load_workbook(output_file) - ws_check = wb_check.active - print(f"输出文件行数: {ws_check.max_row - 1}") - print(f"输出文件列数: {ws_check.max_column}") - - # 显示前5行数据 - print("\n前5行数据:") - for row in range(1, min(6, ws_check.max_row + 1)): - row_data = [] - for col in range(1, ws_check.max_column + 1): - value = 
ws_check.cell(row=row, column=col).value - row_data.append(value) - print(f"行 {row}: {row_data}") - else: - print("文件保存失败!") - - print() - print("========================================") - print(" 任务完成") - print("========================================") - -except Exception as e: - print(f"处理文件时出错: {str(e)}") - import traceback - traceback.print_exc() diff --git a/project/process_all_rows.py b/project/process_all_rows.py deleted file mode 100644 index 62d277c..0000000 --- a/project/process_all_rows.py +++ /dev/null @@ -1,157 +0,0 @@ -import os -import pandas as pd -import re - -print("=" * 60) -print(" 处理全部数据") -print("=" * 60) - -# 文件路径 -input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx' -output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx' - -print(f"输入文件: {input_file}") -print(f"输出文件: {output_file}") -print() - -# 读取全部数据 -print("读取全部数据...") -df = pd.read_excel(input_file, engine='openpyxl') -print(f"成功读取 {len(df)} 行数据") -print(f"原始列数: {len(df.columns)}") - -# 识别列 -print("\n识别列...") -helpfull_col = None -comment_count_col = None -comment_cols = [] - -for col in df.columns: - col_str = str(col).lower() - if 'helpfull' in col_str or 'helpful' in col_str: - helpfull_col = col - print(f"找到 Y 列 (helpfull): {col}") - elif '评论总数' in str(col) or '帖子评论总数' in str(col): - comment_count_col = col - print(f"找到 X1 列 (评论总数): {col}") - elif '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6)) and '内容' in str(col): - comment_cols.append(col) - -print(f"\n共找到 {len(comment_cols)} 个评论内容列") - -# 添加回归数据列 -print("\n添加回归数据列...") - -# Y (UGC有用性) -print("1. 添加 Y (UGC有用性)") -if helpfull_col: - df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0) -else: - df['Y'] = 0 - -# X1 (评论数量) -print("2. 
添加 X1 (评论数量)") -if comment_count_col: - df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0) -else: - df['X1'] = 0 - -# 定义函数计算评论指标 -def calculate_comment_metrics(content): - if pd.isna(content) or str(content) in ['None', 'nan', '']: - return 0, 0, 0, 0 - - content = str(content) - length = len(content.replace(' ', '').replace('\u3000', '')) - complexity = len(content.split()) - - positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like'] - negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike'] - - sentiment = 0 - lower_content = content.lower() - if any(word in lower_content for word in positive_words): - sentiment = 1 - elif any(word in lower_content for word in negative_words): - sentiment = -1 - - richness = 0 - if re.search(r'\d', content): - richness += 1 - if re.search(r'http[s]?://|www\.', content): - richness += 1 - if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content): - richness += 1 - - return length, complexity, sentiment, richness - -# 计算评论相关指标 -print("3. 计算评论相关指标...") -print(f"总数据行数: {len(df)}") - -df['X2'] = 0.0 -df['X3'] = 0.0 -df['X5'] = 0.0 -df['X6'] = 0.0 - -for i in range(len(df)): - if i % 1000 == 0: - print(f" 处理第 {i}/{len(df)} 行...") - - lengths = [] - complexities = [] - sentiments = [] - richness = [] - - for col in comment_cols: - content = df.iloc[i].get(col, '') - length, complexity, sentiment, r = calculate_comment_metrics(content) - if length > 0: - lengths.append(length) - complexities.append(complexity) - sentiments.append(sentiment) - richness.append(r) - - if lengths: - df.loc[i, 'X2'] = sum(lengths) / len(lengths) - df.loc[i, 'X3'] = sum(complexities) / len(complexities) - df.loc[i, 'X5'] = sum(sentiments) / len(sentiments) - df.loc[i, 'X6'] = sum(richness) / len(richness) - -# X4: 评论可读性 -print("4. 
计算 X4 (评论可读性)") -df['X4'] = df.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1) - -# 数据清洗 -print("\n5. 数据清洗...") -regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'] -for col in regression_cols: - df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0) - df[col] = df[col].replace([float('inf'), float('-inf')], 0) - -# 验证数据 -print("\n6. 验证数据...") -print(f"总行数: {len(df)}") -print(f"总列数: {len(df.columns)}") -print(f"\n回归数据列统计:") -print(df[regression_cols].describe()) - -# 保存文件 -print("\n7. 保存文件...") -df.to_excel(output_file, index=False, engine='openpyxl') - -# 验证文件 -print("\n8. 验证文件...") -if os.path.exists(output_file): - print(f"文件已成功保存: {output_file}") - print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") - df_check = pd.read_excel(output_file) - print(f"输出文件行数: {len(df_check)}") - print(f"输出文件列数: {len(df_check.columns)}") -else: - print("文件保存失败!") - -print() -print("=" * 60) -print(" 任务完成") -print("=" * 60) diff --git a/project/process_efficient.py b/project/process_efficient.py deleted file mode 100644 index f78f977..0000000 --- a/project/process_efficient.py +++ /dev/null @@ -1,180 +0,0 @@ -import os -import pandas as pd -import re - -print("=" * 60) -print(" 高效处理全部数据") -print("=" * 60) - -# 文件路径 -input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx' -output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx' - -print(f"输入文件: {input_file}") -print(f"输出文件: {output_file}") -print() - -# 首先读取表头来识别列 -print("1. 
读取表头...") -df_header = pd.read_excel(input_file, engine='openpyxl', nrows=0) -print(f"总列数: {len(df_header.columns)}") - -# 识别列 -helpfull_col = None -comment_count_col = None -comment_cols = [] - -for col in df_header.columns: - col_str = str(col).lower() - if 'helpfull' in col_str or 'helpful' in col_str: - helpfull_col = col - print(f"找到 Y 列 (helpfull): {col}") - elif '评论总数' in str(col) or '帖子评论总数' in str(col): - comment_count_col = col - print(f"找到 X1 列 (评论总数): {col}") - elif '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6)) and '内容' in str(col): - comment_cols.append(col) - -print(f"共找到 {len(comment_cols)} 个评论内容列") - -# 定义函数计算评论指标 -def calculate_comment_metrics(content): - if pd.isna(content) or str(content) in ['None', 'nan', '']: - return 0, 0, 0, 0 - - content = str(content) - length = len(content.replace(' ', '').replace('\u3000', '')) - complexity = len(content.split()) - - positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like'] - negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike'] - - sentiment = 0 - lower_content = content.lower() - if any(word in lower_content for word in positive_words): - sentiment = 1 - elif any(word in lower_content for word in negative_words): - sentiment = -1 - - richness = 0 - if re.search(r'\d', content): - richness += 1 - if re.search(r'http[s]?://|www\.', content): - richness += 1 - if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content): - richness += 1 - - return length, complexity, sentiment, richness - -# 分批处理数据 -print("\n2. 
分批处理数据...") -batch_size = 5000 -batch_num = 0 -all_data = [] - -while True: - skip_rows = batch_num * batch_size + 1 if batch_num > 0 else 0 - nrows = batch_size - - print(f" 处理批次 {batch_num + 1} (跳过 {skip_rows} 行,读取 {nrows} 行)...") - - try: - if batch_num == 0: - df_batch = pd.read_excel(input_file, engine='openpyxl', nrows=nrows) - else: - df_batch = pd.read_excel(input_file, engine='openpyxl', skiprows=skip_rows, nrows=nrows, header=None) - df_batch.columns = df_header.columns - except Exception as e: - print(f" 读取完成或出错: {e}") - break - - if len(df_batch) == 0: - print(" 没有更多数据") - break - - print(f" 读取了 {len(df_batch)} 行") - - # 添加Y和X1 - if helpfull_col: - df_batch['Y'] = pd.to_numeric(df_batch[helpfull_col], errors='coerce').fillna(0) - else: - df_batch['Y'] = 0 - - if comment_count_col: - df_batch['X1'] = pd.to_numeric(df_batch[comment_count_col], errors='coerce').fillna(0) - else: - df_batch['X1'] = 0 - - # 初始化X2-X6 - df_batch['X2'] = 0.0 - df_batch['X3'] = 0.0 - df_batch['X5'] = 0.0 - df_batch['X6'] = 0.0 - - # 计算评论指标 - for i in range(len(df_batch)): - lengths = [] - complexities = [] - sentiments = [] - richness = [] - - for col in comment_cols: - content = df_batch.iloc[i].get(col, '') - length, complexity, sentiment, r = calculate_comment_metrics(content) - if length > 0: - lengths.append(length) - complexities.append(complexity) - sentiments.append(sentiment) - richness.append(r) - - if lengths: - df_batch.loc[i, 'X2'] = sum(lengths) / len(lengths) - df_batch.loc[i, 'X3'] = sum(complexities) / len(complexities) - df_batch.loc[i, 'X5'] = sum(sentiments) / len(sentiments) - df_batch.loc[i, 'X6'] = sum(richness) / len(richness) - - # 计算X4 - df_batch['X4'] = df_batch.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1) - - # 数据清洗 - regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'] - for col in regression_cols: - df_batch[col] = pd.to_numeric(df_batch[col], errors='coerce').fillna(0) - df_batch[col] = 
df_batch[col].replace([float('inf'), float('-inf')], 0) - - all_data.append(df_batch) - batch_num += 1 - - print(f" 批次 {batch_num} 完成,当前总行数: {sum(len(d) for d in all_data)}") - -# 合并所有数据 -print("\n3. 合并数据...") -df_final = pd.concat(all_data, ignore_index=True) -print(f"合并后总行数: {len(df_final)}") - -# 验证数据 -print("\n4. 验证数据...") -regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'] -print(f"总列数: {len(df_final.columns)}") -print(f"\n回归数据列统计:") -print(df_final[regression_cols].describe()) - -# 保存文件 -print("\n5. 保存文件...") -df_final.to_excel(output_file, index=False, engine='openpyxl') - -# 验证文件 -print("\n6. 验证文件...") -if os.path.exists(output_file): - print(f"文件已成功保存: {output_file}") - print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") - df_check = pd.read_excel(output_file) - print(f"输出文件行数: {len(df_check)}") - print(f"输出文件列数: {len(df_check.columns)}") -else: - print("文件保存失败!") - -print() -print("=" * 60) -print(" 任务完成") -print("=" * 60) diff --git a/project/process_large_file.py b/project/process_large_file.py deleted file mode 100644 index 304be6d..0000000 --- a/project/process_large_file.py +++ /dev/null @@ -1,177 +0,0 @@ -import os -import pandas as pd -import re - -# 文件路径 -input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx' -output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx' - -print("========================================") -print(" 处理大型Excel文件") -print("========================================") -print(f"输入文件: {input_file}") -print(f"输出文件: {output_file}") -print() - -# 检查文件是否存在 -if not os.path.exists(input_file): - print("错误: 输入文件不存在!") - exit(1) - -print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") - -# 读取原始数据 -try: - print("正在读取原始数据...") - # 使用pandas读取Excel文件,设置引擎为openpyxl - df = pd.read_excel(input_file, engine='openpyxl') - print(f"成功读取 {len(df)} 行数据") - print(f"列名: {list(df.columns)}") - - # 识别列 - print("\n识别列...") - helpfull_col = None - comment_count_col = None - comment_cols = [] - - for col in 
df.columns: - col_str = str(col).lower() - if 'helpfull' in col_str or 'helpful' in col_str: - helpfull_col = col - print(f"找到 Y 列 (helpfull): {col}") - elif '评论总数' in str(col) or '帖子评论总数' in str(col): - comment_count_col = col - print(f"找到 X1 列 (评论总数): {col}") - elif '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6)): - comment_cols.append(col) - print(f"找到评论列 {len(comment_cols)}: {col}") - - print(f"\n共找到 {len(comment_cols)} 个评论列") - - # 创建回归数据 - print("\n创建回归数据...") - regression_data = pd.DataFrame() - - # Y (UGC有用性) - print("1. 计算 Y (UGC有用性)") - if helpfull_col: - regression_data['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0) - else: - regression_data['Y'] = 0 - - # X1 (评论数量) - print("2. 计算 X1 (评论数量)") - if comment_count_col: - regression_data['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0) - else: - regression_data['X1'] = 0 - - # 定义函数计算评论指标 - def calculate_comment_metrics(content): - if pd.isna(content) or str(content) in ['None', 'nan']: - return 0, 0, 0, 0 - - content = str(content) - # 评论长度 - length = len(content.replace(' ', '')) - # 评论复杂度 - complexity = len(content.split()) - # 情感分析 - positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent'] - negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor'] - - sentiment = 0 - lower_content = content.lower() - if any(word in lower_content for word in positive_words): - sentiment = 1 - elif any(word in lower_content for word in negative_words): - sentiment = -1 - # 信息丰富度 - richness = 0 - if re.search(r'\d', content): - richness += 1 - if re.search(r'http[s]?://', content): - richness += 1 - if re.search(r'[\u2600-\u27BF]|[:;][-]?[)D]', content): - richness += 1 - - return length, complexity, sentiment, richness - - # 计算评论相关指标 - print("3. 
计算评论相关指标...") - - # 初始化列 - regression_data['X2'] = 0 # 评论长度 - regression_data['X3'] = 0 # 评论复杂度 - regression_data['X5'] = 0 # 情感性 - regression_data['X6'] = 0 # 信息丰富度 - - # 逐行计算 - total_rows = len(df) - for i in range(total_rows): - if i % 1000 == 0: - print(f"处理到第 {i} 行...") - - lengths = [] - complexities = [] - sentiments = [] - richness = [] - - for col in comment_cols: - content = df.iloc[i].get(col, '') - length, complexity, sentiment, r = calculate_comment_metrics(content) - if length > 0: - lengths.append(length) - complexities.append(complexity) - sentiments.append(sentiment) - richness.append(r) - - # 计算平均值 - if lengths: - regression_data.loc[i, 'X2'] = sum(lengths) / len(lengths) - regression_data.loc[i, 'X3'] = sum(complexities) / len(complexities) - regression_data.loc[i, 'X5'] = sum(sentiments) / len(sentiments) - regression_data.loc[i, 'X6'] = sum(richness) / len(richness) - - # X4: 评论可读性 - print("4. 计算 X4 (评论可读性)") - regression_data['X4'] = regression_data.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1) - - # 数据清洗 - print("\n5. 数据清洗...") - for col in regression_data.columns: - regression_data[col] = pd.to_numeric(regression_data[col], errors='coerce').fillna(0) - - # 验证数据 - print("\n6. 验证数据...") - print(f"行数: {len(regression_data)}") - print(f"列数: {len(regression_data.columns)}") - print(f"列名: {list(regression_data.columns)}") - print(f"\n前5行数据:") - print(regression_data.head()) - - # 保存文件 - print("\n7. 保存文件...") - regression_data.to_excel(output_file, index=False) - - # 验证文件 - print("\n8. 
验证文件...") - if os.path.exists(output_file): - print(f"文件已成功保存: {output_file}") - print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") - # 重新读取检查 - df_check = pd.read_excel(output_file) - print(f"输出文件行数: {len(df_check)}") - print(f"输出文件列数: {len(df_check.columns)}") - else: - print("文件保存失败!") - - print() - print("========================================") - print(" 任务完成") - print("========================================") - -except Exception as e: - print(f"处理文件时出错: {str(e)}") - import traceback - traceback.print_exc() diff --git a/project/process_log.txt b/project/process_log.txt deleted file mode 100644 index afe1ed8..0000000 --- a/project/process_log.txt +++ /dev/null @@ -1,9 +0,0 @@ -======================================== - 在原表中添加回归数据列 -======================================== -输入文件: D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx -输出文件: D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx - -输入文件大小: 21607.43 KB - -正在读取原始数据... diff --git a/project/process_regression_final.py b/project/process_regression_final.py deleted file mode 100644 index cca17c2..0000000 --- a/project/process_regression_final.py +++ /dev/null @@ -1,192 +0,0 @@ -import os -import pandas as pd -import re - -# 文件路径 -input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx' -output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx' - -print("========================================") -print(" 在原表中添加回归数据列") -print("========================================") -print(f"输入文件: {input_file}") -print(f"输出文件: {output_file}") -print() - -# 检查文件是否存在 -if not os.path.exists(input_file): - print("错误: 输入文件不存在!") - exit(1) - -print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") - -# 读取原始数据 -try: - print("\n正在读取原始数据...") - df = pd.read_excel(input_file) - print(f"成功读取 {len(df)} 行数据") - print(f"原始列数: {len(df.columns)}") - - # 识别列 - print("\n识别列...") - helpfull_col = None - comment_count_col = None - comment_cols = [] - - for col in df.columns: - col_str = str(col).lower() 
- if 'helpfull' in col_str or 'helpful' in col_str: - helpfull_col = col - print(f"找到 Y 列 (helpfull): {col}") - elif '评论总数' in str(col) or '帖子评论总数' in str(col): - comment_count_col = col - print(f"找到 X1 列 (评论总数): {col}") - elif '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6)) and '内容' in str(col): - comment_cols.append(col) - print(f"找到评论列 {len(comment_cols)}: {col}") - - print(f"\n共找到 {len(comment_cols)} 个评论内容列") - - # 添加回归数据列 - print("\n添加回归数据列...") - - # Y (UGC有用性) - 直接复制helpfull列 - print("1. 添加 Y (UGC有用性)") - if helpfull_col: - df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0) - else: - df['Y'] = 0 - - # X1 (评论数量) - 直接复制帖子评论总数列 - print("2. 添加 X1 (评论数量)") - if comment_count_col: - df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0) - else: - df['X1'] = 0 - - # 定义函数计算评论指标 - def calculate_comment_metrics(content): - if pd.isna(content) or str(content) in ['None', 'nan', '']: - return 0, 0, 0, 0 - - content = str(content) - # X2: 评论长度(剔空格后的字符数) - length = len(content.replace(' ', '').replace('\u3000', '')) - # X3: 评论复杂度(按空格拆分的分词数) - complexity = len(content.split()) - # X5: 情感分析(正面=1、中性=0、负面=-1) - positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like'] - negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike'] - - sentiment = 0 - lower_content = content.lower() - if any(word in lower_content for word in positive_words): - sentiment = 1 - elif any(word in lower_content for word in negative_words): - sentiment = -1 - # X6: 信息丰富度(含数字/链接/表情各1分,满分3分) - richness = 0 - if re.search(r'\d', content): # 含数字 - richness += 1 - if re.search(r'http[s]?://|www\.', content): # 含链接 - richness += 1 - if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content): # 含表情 - richness += 1 - - return length, complexity, sentiment, richness - - # 计算评论相关指标 - print("3. 
计算评论相关指标...") - - # 初始化列 - df['X2'] = 0.0 # 评论长度 - df['X3'] = 0.0 # 评论复杂度 - df['X5'] = 0.0 # 情感性 - df['X6'] = 0.0 # 信息丰富度 - - # 逐行计算 - total_rows = len(df) - print(f"总数据行数: {total_rows}") - - for i in range(total_rows): - if i % 1000 == 0: - print(f" 处理第 {i}/{total_rows} 行...") - - lengths = [] - complexities = [] - sentiments = [] - richness = [] - - for col in comment_cols: - content = df.iloc[i].get(col, '') - length, complexity, sentiment, r = calculate_comment_metrics(content) - if length > 0: # 只统计有内容的评论 - lengths.append(length) - complexities.append(complexity) - sentiments.append(sentiment) - richness.append(r) - - # 计算平均值(无评论记0) - if lengths: - df.loc[i, 'X2'] = sum(lengths) / len(lengths) - df.loc[i, 'X3'] = sum(complexities) / len(complexities) - df.loc[i, 'X5'] = sum(sentiments) / len(sentiments) - df.loc[i, 'X6'] = sum(richness) / len(richness) - - # X4: 评论可读性 = X2/X3(X3为0时记0,避免报错) - print("4. 计算 X4 (评论可读性)") - df['X4'] = df.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1) - - # 数据清洗 - 确保所有值都是纯数字,无文本、无空值、无错误 - print("\n5. 数据清洗...") - regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'] - for col in regression_cols: - # 转换为数字,错误值转为0 - df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0) - # 替换无穷大 - df[col] = df[col].replace([float('inf'), float('-inf')], 0) - - # 验证数据 - print("\n6. 验证数据...") - print(f"总行数: {len(df)}") - print(f"总列数: {len(df.columns)}") - print(f"\n回归数据列统计:") - print(df[regression_cols].describe()) - print(f"\n前5行回归数据:") - print(df[regression_cols].head()) - - # 检查是否有空值或错误值 - print(f"\n空值检查:") - for col in regression_cols: - null_count = df[col].isnull().sum() - print(f" {col}: {null_count} 个空值") - - # 保存文件 - print("\n7. 保存文件...") - print(f"正在保存到: {output_file}") - df.to_excel(output_file, index=False, engine='openpyxl') - - # 验证文件 - print("\n8. 
验证文件...") - if os.path.exists(output_file): - print(f"文件已成功保存: {output_file}") - print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") - # 重新读取检查 - df_check = pd.read_excel(output_file) - print(f"输出文件行数: {len(df_check)}") - print(f"输出文件列数: {len(df_check.columns)}") - print(f"\n回归数据列: {[col for col in df_check.columns if col in regression_cols]}") - else: - print("文件保存失败!") - - print() - print("========================================") - print(" 任务完成") - print("========================================") - print(f"新文件已保存: {output_file}") - print(f"包含原始数据的所有列以及新增的Y, X1-X6回归数据列") - -except Exception as e: - print(f"处理文件时出错: {str(e)}") - import traceback - traceback.print_exc() diff --git a/project/process_with_csv.py b/project/process_with_csv.py deleted file mode 100644 index f2f6797..0000000 --- a/project/process_with_csv.py +++ /dev/null @@ -1,202 +0,0 @@ -import os -import pandas as pd -import re - -print("=" * 60) -print(" 使用CSV处理回归数据") -print("=" * 60) - -# 文件路径 -input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx' -output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx' - -print(f"输入文件: {input_file}") -print(f"输出文件: {output_file}") -print() - -# 检查文件是否存在 -if not os.path.exists(input_file): - print("错误: 输入文件不存在!") - exit(1) - -print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") - -# 读取原始数据 -print("\n正在读取原始数据...") -try: - df = pd.read_excel(input_file, engine='openpyxl') - print(f"成功读取 {len(df)} 行数据") - print(f"原始列数: {len(df.columns)}") -except Exception as e: - print(f"读取失败: {e}") - exit(1) - -# 识别列 -print("\n识别列...") -helpfull_col = None -comment_count_col = None -comment_cols = [] - -for col in df.columns: - col_str = str(col).lower() - if 'helpfull' in col_str or 'helpful' in col_str: - helpfull_col = col - print(f"找到 Y 列 (helpfull): {col}") - elif '评论总数' in str(col) or '帖子评论总数' in str(col): - comment_count_col = col - print(f"找到 X1 列 (评论总数): {col}") - elif '评论' in str(col) and any(str(i) in str(col) for i in 
range(1, 6)) and '内容' in str(col): - comment_cols.append(col) - print(f"找到评论列 {len(comment_cols)}: {col}") - -print(f"\n共找到 {len(comment_cols)} 个评论内容列") - -# 添加回归数据列 -print("\n添加回归数据列...") - -# Y (UGC有用性) - 直接复制helpfull列 -print("1. 添加 Y (UGC有用性)") -if helpfull_col: - df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0) -else: - df['Y'] = 0 - -# X1 (评论数量) - 直接复制帖子评论总数列 -print("2. 添加 X1 (评论数量)") -if comment_count_col: - df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0) -else: - df['X1'] = 0 - -# 定义函数计算评论指标 -def calculate_comment_metrics(content): - if pd.isna(content) or str(content) in ['None', 'nan', '']: - return 0, 0, 0, 0 - - content = str(content) - # X2: 评论长度(剔空格后的字符数) - length = len(content.replace(' ', '').replace('\u3000', '')) - # X3: 评论复杂度(按空格拆分的分词数) - complexity = len(content.split()) - # X5: 情感分析(正面=1、中性=0、负面=-1) - positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like'] - negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike'] - - sentiment = 0 - lower_content = content.lower() - if any(word in lower_content for word in positive_words): - sentiment = 1 - elif any(word in lower_content for word in negative_words): - sentiment = -1 - # X6: 信息丰富度(含数字/链接/表情各1分,满分3分) - richness = 0 - if re.search(r'\d', content): # 含数字 - richness += 1 - if re.search(r'http[s]?://|www\.', content): # 含链接 - richness += 1 - if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content): # 含表情 - richness += 1 - - return length, complexity, sentiment, richness - -# 计算评论相关指标 -print("3. 
计算评论相关指标...") - -# 初始化列 -df['X2'] = 0.0 # 评论长度 -df['X3'] = 0.0 # 评论复杂度 -df['X5'] = 0.0 # 情感性 -df['X6'] = 0.0 # 信息丰富度 - -# 逐行计算 -total_rows = len(df) -print(f"总数据行数: {total_rows}") - -for i in range(total_rows): - if i % 1000 == 0: - print(f" 处理第 {i}/{total_rows} 行...") - - lengths = [] - complexities = [] - sentiments = [] - richness = [] - - for col in comment_cols: - content = df.iloc[i].get(col, '') - length, complexity, sentiment, r = calculate_comment_metrics(content) - if length > 0: # 只统计有内容的评论 - lengths.append(length) - complexities.append(complexity) - sentiments.append(sentiment) - richness.append(r) - - # 计算平均值(无评论记0) - if lengths: - df.loc[i, 'X2'] = sum(lengths) / len(lengths) - df.loc[i, 'X3'] = sum(complexities) / len(complexities) - df.loc[i, 'X5'] = sum(sentiments) / len(sentiments) - df.loc[i, 'X6'] = sum(richness) / len(richness) - -# X4: 评论可读性 = X2/X3(X3为0时记0,避免报错) -print("4. 计算 X4 (评论可读性)") -df['X4'] = df.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1) - -# 数据清洗 - 确保所有值都是纯数字,无文本、无空值、无错误 -print("\n5. 数据清洗...") -regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'] -for col in regression_cols: - # 转换为数字,错误值转为0 - df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0) - # 替换无穷大 - df[col] = df[col].replace([float('inf'), float('-inf')], 0) - -# 验证数据 -print("\n6. 验证数据...") -print(f"总行数: {len(df)}") -print(f"总列数: {len(df.columns)}") -print(f"\n回归数据列统计:") -print(df[regression_cols].describe()) -print(f"\n前5行回归数据:") -print(df[regression_cols].head()) - -# 检查是否有空值或错误值 -print(f"\n空值检查:") -for col in regression_cols: - null_count = df[col].isnull().sum() - print(f" {col}: {null_count} 个空值") - -# 保存为CSV中间文件 -print("\n7. 保存为CSV中间文件...") -csv_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\temp_regression.csv' -df.to_csv(csv_file, index=False, encoding='utf-8-sig') -print(f"CSV文件已保存: {csv_file}") -print(f"CSV文件大小: {os.path.getsize(csv_file) / 1024:.2f} KB") - -# 从CSV读取并保存为Excel -print("\n8. 
转换为Excel文件...") -df_csv = pd.read_csv(csv_file, encoding='utf-8-sig') -df_csv.to_excel(output_file, index=False, engine='openpyxl') - -# 验证文件 -print("\n9. 验证文件...") -if os.path.exists(output_file): - print(f"文件已成功保存: {output_file}") - print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") - # 重新读取检查 - df_check = pd.read_excel(output_file) - print(f"输出文件行数: {len(df_check)}") - print(f"输出文件列数: {len(df_check.columns)}") - print(f"\n回归数据列: {[col for col in df_check.columns if col in regression_cols]}") - - # 删除临时CSV文件 - os.remove(csv_file) - print(f"\n临时CSV文件已删除") -else: - print("文件保存失败!") - -print() -print("=" * 60) -print(" 任务完成") -print("=" * 60) -print(f"新文件已保存: {output_file}") -print(f"包含原始数据的所有列以及新增的Y, X1-X6回归数据列") diff --git a/project/process_with_pandas.py b/project/process_with_pandas.py deleted file mode 100644 index 5a09d25..0000000 --- a/project/process_with_pandas.py +++ /dev/null @@ -1,168 +0,0 @@ -import os -import pandas as pd -import re - -# 文件路径 -input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx' -output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx' - -print("========================================") -print(" 使用pandas处理所有数据") -print("========================================") -print(f"输入文件: {input_file}") -print(f"输出文件: {output_file}") -print() - -# 检查文件是否存在 -if not os.path.exists(input_file): - print("错误: 输入文件不存在!") - exit(1) - -print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") - -# 读取原始数据 -try: - print("正在读取原始数据...") - df = pd.read_excel(input_file) - print(f"成功读取 {len(df)} 行数据") - print(f"列名: {list(df.columns)}") - - # 识别列 - print("\n识别列...") - helpfull_col = None - comment_count_col = None - comment_cols = [] - - for col in df.columns: - col_str = str(col).lower() - if 'helpfull' in col_str or 'helpful' in col_str: - helpfull_col = col - print(f"找到 Y 列 (helpfull): {col}") - elif '评论总数' in str(col) or '帖子评论总数' in str(col): - comment_count_col = col - print(f"找到 X1 列 (评论总数): {col}") - elif '评论' in str(col) 
and any(str(i) in str(col) for i in range(1, 6)): - comment_cols.append(col) - print(f"找到评论列 {len(comment_cols)}: {col}") - - print(f"\n共找到 {len(comment_cols)} 个评论列") - - # 创建回归数据 - print("\n创建回归数据...") - regression_data = pd.DataFrame() - - # Y (UGC有用性) - print("1. 计算 Y (UGC有用性)") - if helpfull_col: - regression_data['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0) - else: - regression_data['Y'] = 0 - - # X1 (评论数量) - print("2. 计算 X1 (评论数量)") - if comment_count_col: - regression_data['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0) - else: - regression_data['X1'] = 0 - - # 定义函数计算评论指标 - def calculate_comment_metrics(row): - lengths = [] - complexities = [] - sentiments = [] - richness = [] - - for col in comment_cols: - content = str(row.get(col, '')) - if content and content != 'None' and content != 'nan': - # 评论长度 - lengths.append(len(content.replace(' ', ''))) - # 评论复杂度 - complexities.append(len(content.split())) - # 情感分析 - positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent'] - negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor'] - - sentiment = 0 - lower_content = content.lower() - if any(word in lower_content for word in positive_words): - sentiment = 1 - elif any(word in lower_content for word in negative_words): - sentiment = -1 - sentiments.append(sentiment) - # 信息丰富度 - r = 0 - if re.search(r'\d', content): - r += 1 - if re.search(r'http[s]?://', content): - r += 1 - if re.search(r'[\u2600-\u27BF]|[:;][-]?[)D]', content): - r += 1 - richness.append(r) - - return lengths, complexities, sentiments, richness - - # 计算评论相关指标 - print("3. 计算评论相关指标...") - comment_metrics = df.apply(calculate_comment_metrics, axis=1) - - # X2: 评论长度平均值 - print("4. 计算 X2 (评论长度)") - regression_data['X2'] = comment_metrics.apply(lambda x: sum(x[0]) / len(x[0]) if x[0] else 0) - - # X3: 评论复杂度平均值 - print("5. 
计算 X3 (评论复杂度)") - regression_data['X3'] = comment_metrics.apply(lambda x: sum(x[1]) / len(x[1]) if x[1] else 0) - - # X4: 评论可读性 - print("6. 计算 X4 (评论可读性)") - regression_data['X4'] = regression_data.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1) - - # X5: 内容情感性平均值 - print("7. 计算 X5 (内容情感性)") - regression_data['X5'] = comment_metrics.apply(lambda x: sum(x[2]) / len(x[2]) if x[2] else 0) - - # X6: 信息丰富度平均值 - print("8. 计算 X6 (信息丰富度)") - regression_data['X6'] = comment_metrics.apply(lambda x: sum(x[3]) / len(x[3]) if x[3] else 0) - - # 数据清洗 - print("\n9. 数据清洗...") - for col in regression_data.columns: - regression_data[col] = pd.to_numeric(regression_data[col], errors='coerce').fillna(0) - - # 验证数据 - print("\n10. 验证数据...") - print(f"行数: {len(regression_data)}") - print(f"列数: {len(regression_data.columns)}") - print(f"列名: {list(regression_data.columns)}") - print(f"数据类型:") - print(regression_data.dtypes) - print(f"\n前5行数据:") - print(regression_data.head()) - - # 保存文件 - print("\n11. 保存文件...") - regression_data.to_excel(output_file, index=False) - - # 验证文件 - print("\n12. 
验证文件...") - if os.path.exists(output_file): - print(f"文件已成功保存: {output_file}") - print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") - # 重新读取检查 - df_check = pd.read_excel(output_file) - print(f"输出文件行数: {len(df_check)}") - print(f"输出文件列数: {len(df_check.columns)}") - else: - print("文件保存失败!") - - print() - print("========================================") - print(" 任务完成") - print("========================================") - -except Exception as e: - print(f"处理文件时出错: {str(e)}") - import traceback - traceback.print_exc() diff --git a/project/quick_process.py b/project/quick_process.py deleted file mode 100644 index 2d6ce03..0000000 --- a/project/quick_process.py +++ /dev/null @@ -1,83 +0,0 @@ -import os -import pandas as pd -import re - -print("开始处理...") - -# 文件路径 -input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx' -output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx' - -# 读取数据 -print("读取数据...") -df = pd.read_excel(input_file) -print(f"读取完成: {len(df)} 行") - -# 识别列 -helpfull_col = [c for c in df.columns if 'helpfull' in str(c).lower()][0] if any('helpfull' in str(c).lower() for c in df.columns) else None -comment_count_col = [c for c in df.columns if '评论总数' in str(c)][0] if any('评论总数' in str(c) for c in df.columns) else None -comment_cols = [c for c in df.columns if '评论' in str(c) and any(str(i) in str(c) for i in range(1, 6)) and '内容' in str(c)] - -print(f"找到列: Y={helpfull_col}, X1={comment_count_col}, 评论列={len(comment_cols)}") - -# 添加Y和X1 -df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0) if helpfull_col else 0 -df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0) if comment_count_col else 0 - -# 计算评论指标 -print("计算评论指标...") - -def calc_metrics(content): - if pd.isna(content) or str(content) in ['None', 'nan', '']: - return 0, 0, 0, 0 - content = str(content) - length = len(content.replace(' ', '').replace('\u3000', '')) - complexity = len(content.split()) - - pos_words = ['好', '棒', '优秀', 
'喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent'] - neg_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor'] - sentiment = 1 if any(w in content.lower() for w in pos_words) else (-1 if any(w in content.lower() for w in neg_words) else 0) - - richness = (1 if re.search(r'\d', content) else 0) + (1 if re.search(r'http[s]?://|www\.', content) else 0) + (1 if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]', content) else 0) - - return length, complexity, sentiment, richness - -# 批量计算 -x2_list, x3_list, x5_list, x6_list = [], [], [], [] - -for i in range(len(df)): - if i % 5000 == 0: - print(f"处理 {i}/{len(df)}") - - lengths, complexities, sentiments, richness = [], [], [], [] - - for col in comment_cols: - l, c, s, r = calc_metrics(df.iloc[i].get(col, '')) - if l > 0: - lengths.append(l) - complexities.append(c) - sentiments.append(s) - richness.append(r) - - x2_list.append(sum(lengths)/len(lengths) if lengths else 0) - x3_list.append(sum(complexities)/len(complexities) if complexities else 0) - x5_list.append(sum(sentiments)/len(sentiments) if sentiments else 0) - x6_list.append(sum(richness)/len(richness) if richness else 0) - -df['X2'] = x2_list -df['X3'] = x3_list -df['X5'] = x5_list -df['X6'] = x6_list - -# 计算X4 -df['X4'] = df.apply(lambda r: r['X2']/r['X3'] if r['X3']>0 else 0, axis=1) - -# 清洗数据 -for col in ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']: - df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).replace([float('inf'), float('-inf')], 0) - -print("保存文件...") -df.to_excel(output_file, index=False, engine='openpyxl') - -print(f"完成!文件大小: {os.path.getsize(output_file)/1024:.2f} KB") -print(f"行数: {len(df)}, 列数: {len(df.columns)}") diff --git a/project/read_excel_test.py b/project/read_excel_test.py deleted file mode 100644 index 08e509f..0000000 --- a/project/read_excel_test.py +++ /dev/null @@ -1,54 +0,0 @@ -import os -import openpyxl - -# 文件路径 -input_file = 
r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx' - -print("========================================") -print(" 读取Excel测试") -print("========================================") -print(f"输入文件: {input_file}") -print() - -# 检查文件是否存在 -if not os.path.exists(input_file): - print("错误: 输入文件不存在!") - exit(1) - -print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") - -# 读取Excel文件 -try: - print("正在读取Excel文件...") - wb = openpyxl.load_workbook(input_file) - ws = wb.active - - print(f"工作表名称: {ws.title}") - print(f"最大行数: {ws.max_row}") - print(f"最大列数: {ws.max_column}") - - # 读取表头 - print("\n表头:") - headers = [] - for col in range(1, ws.max_column + 1): - header = ws.cell(row=1, column=col).value - headers.append(header) - print(f"{col}. {header}") - - # 读取前3行数据 - print("\n前3行数据:") - for row in range(2, min(5, ws.max_row + 1)): - row_data = [] - for col in range(1, min(10, ws.max_column + 1)): - value = ws.cell(row=row, column=col).value - row_data.append(value) - print(f"行 {row}: {row_data}") - - print("\n========================================") - print(" 读取完成") - print("========================================") - -except Exception as e: - print(f"处理文件时出错: {str(e)}") - import traceback - traceback.print_exc() diff --git a/project/run_with_output.py b/project/run_with_output.py deleted file mode 100644 index 6555dc4..0000000 --- a/project/run_with_output.py +++ /dev/null @@ -1,216 +0,0 @@ -import os -import pandas as pd -import re -import sys - -# 重定向输出到文件和屏幕 -class Tee: - def __init__(self, *files): - self.files = files - def write(self, obj): - for f in self.files: - f.write(obj) - f.flush() - def flush(self): - for f in self.files: - f.flush() - -log_file = open(r'D:\java\project\process_log.txt', 'w', encoding='utf-8') -original_stdout = sys.stdout -sys.stdout = Tee(original_stdout, log_file) - -print("========================================") -print(" 在原表中添加回归数据列") -print("========================================") - -# 文件路径 -input_file = 
r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx' -output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx' - -print(f"输入文件: {input_file}") -print(f"输出文件: {output_file}") -print() - -# 检查文件是否存在 -if not os.path.exists(input_file): - print("错误: 输入文件不存在!") - sys.stdout = original_stdout - log_file.close() - exit(1) - -print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") - -# 读取原始数据 -try: - print("\n正在读取原始数据...") - df = pd.read_excel(input_file) - print(f"成功读取 {len(df)} 行数据") - print(f"原始列数: {len(df.columns)}") - - # 识别列 - print("\n识别列...") - helpfull_col = None - comment_count_col = None - comment_cols = [] - - for col in df.columns: - col_str = str(col).lower() - if 'helpfull' in col_str or 'helpful' in col_str: - helpfull_col = col - print(f"找到 Y 列 (helpfull): {col}") - elif '评论总数' in str(col) or '帖子评论总数' in str(col): - comment_count_col = col - print(f"找到 X1 列 (评论总数): {col}") - elif '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6)) and '内容' in str(col): - comment_cols.append(col) - print(f"找到评论列 {len(comment_cols)}: {col}") - - print(f"\n共找到 {len(comment_cols)} 个评论内容列") - - # 添加回归数据列 - print("\n添加回归数据列...") - - # Y (UGC有用性) - 直接复制helpfull列 - print("1. 添加 Y (UGC有用性)") - if helpfull_col: - df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0) - else: - df['Y'] = 0 - - # X1 (评论数量) - 直接复制帖子评论总数列 - print("2. 
添加 X1 (评论数量)") - if comment_count_col: - df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0) - else: - df['X1'] = 0 - - # 定义函数计算评论指标 - def calculate_comment_metrics(content): - if pd.isna(content) or str(content) in ['None', 'nan', '']: - return 0, 0, 0, 0 - - content = str(content) - # X2: 评论长度(剔空格后的字符数) - length = len(content.replace(' ', '').replace('\u3000', '')) - # X3: 评论复杂度(按空格拆分的分词数) - complexity = len(content.split()) - # X5: 情感分析(正面=1、中性=0、负面=-1) - positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like'] - negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike'] - - sentiment = 0 - lower_content = content.lower() - if any(word in lower_content for word in positive_words): - sentiment = 1 - elif any(word in lower_content for word in negative_words): - sentiment = -1 - # X6: 信息丰富度(含数字/链接/表情各1分,满分3分) - richness = 0 - if re.search(r'\d', content): # 含数字 - richness += 1 - if re.search(r'http[s]?://|www\.', content): # 含链接 - richness += 1 - if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content): # 含表情 - richness += 1 - - return length, complexity, sentiment, richness - - # 计算评论相关指标 - print("3. 
计算评论相关指标...") - - # 初始化列 - df['X2'] = 0.0 # 评论长度 - df['X3'] = 0.0 # 评论复杂度 - df['X5'] = 0.0 # 情感性 - df['X6'] = 0.0 # 信息丰富度 - - # 逐行计算 - total_rows = len(df) - print(f"总数据行数: {total_rows}") - - for i in range(total_rows): - if i % 1000 == 0: - print(f" 处理第 {i}/{total_rows} 行...") - - lengths = [] - complexities = [] - sentiments = [] - richness = [] - - for col in comment_cols: - content = df.iloc[i].get(col, '') - length, complexity, sentiment, r = calculate_comment_metrics(content) - if length > 0: # 只统计有内容的评论 - lengths.append(length) - complexities.append(complexity) - sentiments.append(sentiment) - richness.append(r) - - # 计算平均值(无评论记0) - if lengths: - df.loc[i, 'X2'] = sum(lengths) / len(lengths) - df.loc[i, 'X3'] = sum(complexities) / len(complexities) - df.loc[i, 'X5'] = sum(sentiments) / len(sentiments) - df.loc[i, 'X6'] = sum(richness) / len(richness) - - # X4: 评论可读性 = X2/X3(X3为0时记0,避免报错) - print("4. 计算 X4 (评论可读性)") - df['X4'] = df.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1) - - # 数据清洗 - 确保所有值都是纯数字,无文本、无空值、无错误 - print("\n5. 数据清洗...") - regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'] - for col in regression_cols: - # 转换为数字,错误值转为0 - df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0) - # 替换无穷大 - df[col] = df[col].replace([float('inf'), float('-inf')], 0) - - # 验证数据 - print("\n6. 验证数据...") - print(f"总行数: {len(df)}") - print(f"总列数: {len(df.columns)}") - print(f"\n回归数据列统计:") - print(df[regression_cols].describe()) - print(f"\n前5行回归数据:") - print(df[regression_cols].head()) - - # 检查是否有空值或错误值 - print(f"\n空值检查:") - for col in regression_cols: - null_count = df[col].isnull().sum() - print(f" {col}: {null_count} 个空值") - - # 保存文件 - print("\n7. 保存文件...") - print(f"正在保存到: {output_file}") - df.to_excel(output_file, index=False, engine='openpyxl') - - # 验证文件 - print("\n8. 
验证文件...") - if os.path.exists(output_file): - print(f"文件已成功保存: {output_file}") - print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") - # 重新读取检查 - df_check = pd.read_excel(output_file) - print(f"输出文件行数: {len(df_check)}") - print(f"输出文件列数: {len(df_check.columns)}") - print(f"\n回归数据列: {[col for col in df_check.columns if col in regression_cols]}") - else: - print("文件保存失败!") - - print() - print("========================================") - print(" 任务完成") - print("========================================") - print(f"新文件已保存: {output_file}") - print(f"包含原始数据的所有列以及新增的Y, X1-X6回归数据列") - -except Exception as e: - print(f"处理文件时出错: {str(e)}") - import traceback - traceback.print_exc() -finally: - sys.stdout = original_stdout - log_file.close() - print("日志已保存到: D:\\java\\project\\process_log.txt") diff --git a/project/simple_add_columns.py b/project/simple_add_columns.py deleted file mode 100644 index fb4663b..0000000 --- a/project/simple_add_columns.py +++ /dev/null @@ -1,187 +0,0 @@ -import os -import pandas as pd -import re - -print("=" * 60) -print(" 在原表中添加回归数据列") -print("=" * 60) - -# 文件路径 -input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx' -output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新)_回归.xlsx' - -print(f"输入文件: {input_file}") -print(f"输出文件: {output_file}") -print() - -# 检查文件是否存在 -if not os.path.exists(input_file): - print("错误: 输入文件不存在!") - exit(1) - -print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") - -# 读取原始数据 -print("\n正在读取原始数据...") -df = pd.read_excel(input_file) -print(f"成功读取 {len(df)} 行数据") -print(f"原始列数: {len(df.columns)}") - -# 识别列 -print("\n识别列...") -helpfull_col = None -comment_count_col = None -comment_cols = [] - -for col in df.columns: - col_str = str(col).lower() - if 'helpfull' in col_str or 'helpful' in col_str: - helpfull_col = col - print(f"找到 Y 列 (helpfull): {col}") - elif '评论总数' in str(col) or '帖子评论总数' in str(col): - comment_count_col = col - print(f"找到 X1 列 (评论总数): {col}") - elif '评论' in str(col) and 
any(str(i) in str(col) for i in range(1, 6)) and '内容' in str(col): - comment_cols.append(col) - print(f"找到评论列 {len(comment_cols)}: {col}") - -print(f"\n共找到 {len(comment_cols)} 个评论内容列") - -# 添加回归数据列 -print("\n添加回归数据列...") - -# Y (UGC有用性) - 直接复制helpfull列 -print("1. 添加 Y (UGC有用性)") -if helpfull_col: - df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0) -else: - df['Y'] = 0 - -# X1 (评论数量) - 直接复制帖子评论总数列 -print("2. 添加 X1 (评论数量)") -if comment_count_col: - df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0) -else: - df['X1'] = 0 - -# 定义函数计算评论指标 -def calculate_comment_metrics(content): - if pd.isna(content) or str(content) in ['None', 'nan', '']: - return 0, 0, 0, 0 - - content = str(content) - # X2: 评论长度(剔空格后的字符数) - length = len(content.replace(' ', '').replace('\u3000', '')) - # X3: 评论复杂度(按空格拆分的分词数) - complexity = len(content.split()) - # X5: 情感分析(正面=1、中性=0、负面=-1) - positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like'] - negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike'] - - sentiment = 0 - lower_content = content.lower() - if any(word in lower_content for word in positive_words): - sentiment = 1 - elif any(word in lower_content for word in negative_words): - sentiment = -1 - # X6: 信息丰富度(含数字/链接/表情各1分,满分3分) - richness = 0 - if re.search(r'\d', content): # 含数字 - richness += 1 - if re.search(r'http[s]?://|www\.', content): # 含链接 - richness += 1 - if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content): # 含表情 - richness += 1 - - return length, complexity, sentiment, richness - -# 计算评论相关指标 -print("3. 
计算评论相关指标...") - -# 初始化列 -df['X2'] = 0.0 # 评论长度 -df['X3'] = 0.0 # 评论复杂度 -df['X5'] = 0.0 # 情感性 -df['X6'] = 0.0 # 信息丰富度 - -# 逐行计算 -total_rows = len(df) -print(f"总数据行数: {total_rows}") - -for i in range(total_rows): - if i % 1000 == 0: - print(f" 处理第 {i}/{total_rows} 行...") - - lengths = [] - complexities = [] - sentiments = [] - richness = [] - - for col in comment_cols: - content = df.iloc[i].get(col, '') - length, complexity, sentiment, r = calculate_comment_metrics(content) - if length > 0: # 只统计有内容的评论 - lengths.append(length) - complexities.append(complexity) - sentiments.append(sentiment) - richness.append(r) - - # 计算平均值(无评论记0) - if lengths: - df.loc[i, 'X2'] = sum(lengths) / len(lengths) - df.loc[i, 'X3'] = sum(complexities) / len(complexities) - df.loc[i, 'X5'] = sum(sentiments) / len(sentiments) - df.loc[i, 'X6'] = sum(richness) / len(richness) - -# X4: 评论可读性 = X2/X3(X3为0时记0,避免报错) -print("4. 计算 X4 (评论可读性)") -df['X4'] = df.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1) - -# 数据清洗 - 确保所有值都是纯数字,无文本、无空值、无错误 -print("\n5. 数据清洗...") -regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'] -for col in regression_cols: - # 转换为数字,错误值转为0 - df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0) - # 替换无穷大 - df[col] = df[col].replace([float('inf'), float('-inf')], 0) - -# 验证数据 -print("\n6. 验证数据...") -print(f"总行数: {len(df)}") -print(f"总列数: {len(df.columns)}") -print(f"\n回归数据列统计:") -print(df[regression_cols].describe()) -print(f"\n前5行回归数据:") -print(df[regression_cols].head()) - -# 检查是否有空值或错误值 -print(f"\n空值检查:") -for col in regression_cols: - null_count = df[col].isnull().sum() - print(f" {col}: {null_count} 个空值") - -# 保存文件 -print("\n7. 保存文件...") -print(f"正在保存到: {output_file}") -df.to_excel(output_file, index=False, engine='openpyxl') - -# 验证文件 -print("\n8. 
验证文件...") -if os.path.exists(output_file): - print(f"文件已成功保存: {output_file}") - print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") - # 重新读取检查 - df_check = pd.read_excel(output_file) - print(f"输出文件行数: {len(df_check)}") - print(f"输出文件列数: {len(df_check.columns)}") - print(f"\n回归数据列: {[col for col in df_check.columns if col in regression_cols]}") -else: - print("文件保存失败!") - -print() -print("=" * 60) -print(" 任务完成") -print("=" * 60) -print(f"新文件已保存: {output_file}") -print(f"包含原始数据的所有列以及新增的Y, X1-X6回归数据列") diff --git a/project/simple_calculate.py b/project/simple_calculate.py deleted file mode 100644 index 3b4161c..0000000 --- a/project/simple_calculate.py +++ /dev/null @@ -1,100 +0,0 @@ -import os -import openpyxl -import re - -# 文件路径 -input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx' -output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx' - -print("========================================") -print(" 简单计算UGC回归数据") -print("========================================") -print(f"输入文件: {input_file}") -print(f"输出文件: {output_file}") -print() - -# 检查文件是否存在 -if not os.path.exists(input_file): - print("错误: 输入文件不存在!") - exit(1) - -if not os.path.exists(output_file): - print("错误: 输出文件不存在!") - exit(1) - -print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") - -# 读取输入文件 -try: - print("正在读取输入文件...") - wb_input = openpyxl.load_workbook(input_file) - ws_input = wb_input.active - - print(f"输入工作表名称: {ws_input.title}") - print(f"输入文件最大行数: {ws_input.max_row}") - print(f"输入文件最大列数: {ws_input.max_column}") - - # 读取输出文件 - print("\n正在读取输出文件...") - wb_output = openpyxl.load_workbook(output_file) - ws_output = wb_output.active - - print(f"输出工作表名称: {ws_output.title}") - - # 识别列 - print("\n识别列...") - headers = [] - for col in range(1, ws_input.max_column + 1): - header = ws_input.cell(row=1, column=col).value - headers.append(header) - if header and 'helpfull' in str(header): - helpfull_col = col - print(f"找到 helpfull 列: {col}") - elif header and ('评论总数' in 
str(header) or '帖子评论总数' in str(header)): - comment_count_col = col - print(f"找到评论总数列: {col}") - elif header and '评论' in str(header): - print(f"找到评论列: {col} - {header}") - - # 计算并填充数据 - print("\n计算并填充数据...") - max_rows = min(ws_input.max_row, 10) # 只处理前10行用于测试 - print(f"处理前 {max_rows - 1} 行数据") - - for row in range(2, max_rows + 1): - print(f"处理行 {row}") - - # Y (UGC有用性) - if 'helpfull_col' in locals(): - y_value = ws_input.cell(row=row, column=helpfull_col).value - ws_output.cell(row=row, column=1, value=y_value if y_value else 0) - else: - ws_output.cell(row=row, column=1, value=0) - - # X1 (评论数量) - if 'comment_count_col' in locals(): - x1_value = ws_input.cell(row=row, column=comment_count_col).value - ws_output.cell(row=row, column=2, value=x1_value if x1_value else 0) - else: - ws_output.cell(row=row, column=2, value=0) - - # X2-X6 暂时设为0 - for col in range(3, 8): - ws_output.cell(row=row, column=col, value=0) - - # 保存文件 - print("\n保存文件...") - wb_output.save(output_file) - - print(f"文件已成功保存: {output_file}") - print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") - - print() - print("========================================") - print(" 任务完成") - print("========================================") - -except Exception as e: - print(f"处理文件时出错: {str(e)}") - import traceback - traceback.print_exc() diff --git a/project/simple_copy.py b/project/simple_copy.py deleted file mode 100644 index 9077e92..0000000 --- a/project/simple_copy.py +++ /dev/null @@ -1,41 +0,0 @@ -import os -import shutil - -# 输入输出文件路径 -input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx' -output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx' - -print("========================================") -print(" 简单文件复制脚本") -print("========================================") -print(f"输入文件: {input_file}") -print(f"输出文件: {output_file}") -print() - -# 检查文件是否存在 -if not os.path.exists(input_file): - print("错误: 输入文件不存在!") - exit(1) - -print(f"文件大小: {os.path.getsize(input_file) / 
1024:.2f} KB") -print(f"文件存在: {os.path.exists(input_file)}") - -# 复制文件 -try: - print("正在复制文件...") - shutil.copy2(input_file, output_file) - - # 验证文件是否创建成功 - if os.path.exists(output_file): - print(f"文件已成功复制到: {output_file}") - print(f"复制文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") - else: - print("错误: 文件复制失败,未找到输出文件") - - print() - print("========================================") - print(" 任务完成") - print("========================================") - -except Exception as e: - print(f"处理文件时出错: {str(e)}") diff --git a/project/simple_data_test.py b/project/simple_data_test.py deleted file mode 100644 index b45c1b2..0000000 --- a/project/simple_data_test.py +++ /dev/null @@ -1,54 +0,0 @@ -import os -import pandas as pd - -# 文件路径 -input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx' -output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx' - -print("========================================") -print(" 简单数据测试") -print("========================================") -print(f"输入文件: {input_file}") -print(f"输出文件: {output_file}") -print() - -# 检查文件是否存在 -if not os.path.exists(input_file): - print("错误: 输入文件不存在!") - exit(1) - -print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") - -# 读取原始数据 -try: - print("正在读取原始数据...") - df = pd.read_excel(input_file) - print(f"成功读取 {len(df)} 行数据") - print(f"列名: {list(df.columns)}") - - # 简单处理:创建一个只包含前5列的新文件 - print("\n创建测试文件...") - test_data = df.head(100) # 只取前100行 - test_output = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\test_output.xlsx' - test_data.to_excel(test_output, index=False) - - print(f"测试文件已创建: {test_output}") - print(f"测试文件大小: {os.path.getsize(test_output) / 1024:.2f} KB") - - # 验证测试文件 - if os.path.exists(test_output): - df_test = pd.read_excel(test_output) - print(f"测试文件行数: {len(df_test)}") - print(f"测试文件列数: {len(df_test.columns)}") - else: - print("测试文件创建失败!") - - print() - print("========================================") - print(" 测试完成") - print("========================================") - -except 
Exception as e: - print(f"处理文件时出错: {str(e)}") - import traceback - traceback.print_exc() diff --git a/project/simple_excel_create.py b/project/simple_excel_create.py deleted file mode 100644 index 7538502..0000000 --- a/project/simple_excel_create.py +++ /dev/null @@ -1,57 +0,0 @@ -import os -import openpyxl - -# 文件路径 -output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx' - -print("========================================") -print(" 创建UGC回归数据文件") -print("========================================") -print(f"输出文件: {output_file}") -print() - -# 检查输出目录是否存在 -output_dir = os.path.dirname(output_file) -print(f"输出目录: {output_dir}") -print(f"目录存在: {os.path.exists(output_dir)}") - -if not os.path.exists(output_dir): - print("正在创建输出目录...") - try: - os.makedirs(output_dir) - print("目录创建成功") - except Exception as e: - print(f"创建目录失败: {e}") - exit(1) - -# 创建新的Excel文件 -try: - print("\n创建新的Excel文件...") - wb = openpyxl.Workbook() - ws = wb.active - - # 设置第一行列名 - headers = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'] - for i, header in enumerate(headers, 1): - ws.cell(row=1, column=i, value=header) - - # 保存文件 - print(f"保存文件到: {output_file}") - wb.save(output_file) - - # 验证文件是否创建成功 - if os.path.exists(output_file): - print(f"文件已成功创建: {output_file}") - print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB") - else: - print("错误: 文件创建失败") - - print() - print("========================================") - print(" 任务完成") - print("========================================") - -except Exception as e: - print(f"处理文件时出错: {str(e)}") - import traceback - traceback.print_exc() diff --git a/project/simple_test.py b/project/simple_test.py deleted file mode 100644 index d1889f5..0000000 --- a/project/simple_test.py +++ /dev/null @@ -1,22 +0,0 @@ -import os - -# 测试基本文件操作 -input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx' - -print("========================================") -print(" 简单测试") -print("========================================") -print(f"输入文件: {input_file}") 
-print() - -# 检查文件是否存在 -if os.path.exists(input_file): - print("文件存在!") - print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB") -else: - print("文件不存在!") - -print() -print("========================================") -print(" 测试完成") -print("========================================") diff --git a/project/test_file_access.py b/project/test_file_access.py deleted file mode 100644 index f46f67a..0000000 --- a/project/test_file_access.py +++ /dev/null @@ -1,49 +0,0 @@ -import os - -# 测试文件路径 -input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据(新).xlsx' -output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx' - -print("========================================") -print(" 测试文件访问") -print("========================================") -print(f"当前目录: {os.getcwd()}") -print() - -# 检查输入文件 -print("检查输入文件:") -print(f"路径: {input_file}") -print(f"存在: {os.path.exists(input_file)}") -if os.path.exists(input_file): - print(f"大小: {os.path.getsize(input_file) / 1024:.2f} KB") -else: - print("文件不存在!") - -# 检查输出文件 -print("\n检查输出文件:") -print(f"路径: {output_file}") -print(f"存在: {os.path.exists(output_file)}") -if os.path.exists(output_file): - print(f"大小: {os.path.getsize(output_file) / 1024:.2f} KB") -else: - print("文件不存在!") - -# 检查目录 -print("\n检查目录:") -dir_path = os.path.dirname(input_file) -print(f"目录: {dir_path}") -print(f"存在: {os.path.exists(dir_path)}") -if os.path.exists(dir_path): - print("目录内容:") - files = os.listdir(dir_path) - for file in files[:10]: # 只显示前10个文件 - file_path = os.path.join(dir_path, file) - size = os.path.getsize(file_path) / 1024 - print(f" {file}: {size:.2f} KB") - if len(files) > 10: - print(f" ... 还有 {len(files) - 10} 个文件") - -print() -print("========================================") -print(" 测试完成") -print("========================================")