import java.io.*; import java.time.LocalDate; import java.time.format.DateTimeFormatter; import java.util.ArrayList; import java.util.List; import java.util.Locale; public class DataCleaningScript { private static final DateTimeFormatter DATE_FORMATTER = DateTimeFormatter.ofPattern("yyyy-MM-dd", Locale.CHINA); public static void main(String[] args) { String inputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子原始信息计量实验使用.xlsx"; String outputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子实验数据(新).csv"; System.out.println("========================================"); System.out.println(" 数据清洗脚本"); System.out.println("========================================"); System.out.println("输入文件: " + inputFile); System.out.println("输出文件: " + outputFile); System.out.println(); // 读取数据 List rawPosts = readExcelData(inputFile); System.out.println("读取数据完成,共 " + rawPosts.size() + " 条记录"); // 清洗数据 List cleanedPosts = cleanPosts(rawPosts); System.out.println("数据清洗完成,有效记录: " + cleanedPosts.size() + " 条"); // 保存清洗后的数据 saveToCSV(cleanedPosts, outputFile); System.out.println("数据保存完成!"); System.out.println(); System.out.println("========================================"); System.out.println(" 数据清洗任务完成"); System.out.println("========================================"); } private static List readExcelData(String filePath) { List posts = new ArrayList<>(); try (BufferedReader reader = new BufferedReader(new FileReader(filePath, java.nio.charset.StandardCharsets.UTF_8))) { String line; boolean isFirstLine = true; while ((line = reader.readLine()) != null) { if (isFirstLine) { isFirstLine = false; continue; } String[] parts = parseCSVLine(line); if (parts.length >= 9) { PostInfo post = parsePostInfo(parts); if (post != null) { posts.add(post); } } } } catch (IOException e) { System.err.println("读取文件时出错: " + e.getMessage()); } return posts; } private static String[] parseCSVLine(String line) { List fields = new ArrayList<>(); StringBuilder currentField = new StringBuilder(); boolean inQuotes = false; for (char c : line.toCharArray()) { if (c == '"') { inQuotes = !inQuotes; } else if (c == ',' && !inQuotes) { fields.add(currentField.toString().trim()); currentField.setLength(0); } else { currentField.append(c); } } fields.add(currentField.toString().trim()); return fields.toArray(new String[0]); } private static PostInfo parsePostInfo(String[] parts) { try { PostInfo post = new PostInfo(); post.setTitle(parts[0]); post.setContent(parts[1]); post.setAuthor(parts[2]); if (!parts[3].isEmpty()) { post.setPostDate(LocalDate.parse(parts[3], DATE_FORMATTER)); } post.setLikeCount(parseInt(parts[4])); post.setCommentCount(parseInt(parts[5])); post.setViewCount(parseInt(parts[6])); post.setTags(parts[7]); post.setSentiment(parts[8]); return post; } catch (Exception e) { return null; } } private static int parseInt(String value) { try { if (value == null || value.isEmpty()) { return 0; } return Integer.parseInt(value); } catch (NumberFormatException e) { return 0; } } private static List cleanPosts(List rawPosts) { List cleanedPosts = new ArrayList<>(); for (PostInfo post : rawPosts) { PostInfo cleaned = cleanPost(post); if (isValidPost(cleaned)) { cleanedPosts.add(cleaned); } } return cleanedPosts; } private static PostInfo cleanPost(PostInfo post) { PostInfo cleaned = new PostInfo(); cleaned.setTitle(cleanText(post.getTitle())); cleaned.setContent(cleanContent(post.getContent())); cleaned.setAuthor(cleanText(post.getAuthor())); cleaned.setPostDate(post.getPostDate()); cleaned.setLikeCount(post.getLikeCount()); cleaned.setCommentCount(post.getCommentCount()); cleaned.setViewCount(post.getViewCount()); cleaned.setTags(cleanText(post.getTags())); cleaned.setSentiment(normalizeSentiment(post.getSentiment())); return cleaned; } private static String cleanText(String text) { if (text == null) { return ""; } return text.trim().replaceAll("\\s+", " "); } private static String cleanContent(String content) { if (content == null) { return ""; } return content.trim() .replaceAll("\\s+", " ") .replaceAll("[\\r\\n]+", " ") .replaceAll("<[^>]+>", "") .replaceAll("\\[.*?\\]", "") .replaceAll("\\(.*?\\)", ""); } private static String normalizeSentiment(String sentiment) { if (sentiment == null || sentiment.isEmpty()) { return "中性"; } String lower = sentiment.toLowerCase(); if (lower.contains("积极") || lower.contains("正面") || lower.contains("positive")) { return "积极"; } else if (lower.contains("消极") || lower.contains("负面") || lower.contains("negative")) { return "消极"; } else { return "中性"; } } private static boolean isValidPost(PostInfo post) { return post.getTitle() != null && !post.getTitle().isEmpty() && post.getContent() != null && !post.getContent().isEmpty(); } private static void saveToCSV(List posts, String filePath) { if (posts == null || posts.isEmpty()) { System.out.println("没有数据需要保存"); return; } try { // 确保目录存在 File file = new File(filePath); File parentDir = file.getParentFile(); if (parentDir != null && !parentDir.exists()) { parentDir.mkdirs(); } try (BufferedWriter writer = new BufferedWriter( new FileWriter(file, java.nio.charset.StandardCharsets.UTF_8))) { writer.write("\uFEFF"); // BOM for UTF-8 writer.write("标题,内容,作者,发布日期,点赞数,评论数,浏览量,标签,情感倾向\n"); for (PostInfo post : posts) { writer.write(post.toCSV()); writer.write("\n"); } } System.out.println("数据已保存到: " + filePath); } catch (IOException e) { System.err.println("保存CSV文件时出错: " + e.getMessage()); } } }