package com.project.util;

import com.project.model.PostInfo;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Static helpers that normalise raw {@link PostInfo} records and extract simple keywords.
 *
 * <p>Cleaning collapses whitespace in the text fields, strips markup-like fragments from the
 * content, maps the sentiment label onto one of three fixed values, and drops posts whose
 * cleaned title or content is empty.
 */
public class DataCleaner {

    /** One or more whitespace characters (as defined by the regex class {@code \s}). */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /** Anything that looks like a tag: {@code <...>} with at least one character inside. */
    private static final Pattern ANGLE_TAG = Pattern.compile("<[^>]+>");

    /** Shortest run enclosed in square brackets, e.g. {@code [image]}. */
    private static final Pattern SQUARE_BRACKETED = Pattern.compile("\\[.*?\\]");

    /** Shortest run enclosed in ASCII parentheses, e.g. {@code (source)}. */
    private static final Pattern PARENTHESIZED = Pattern.compile("\\(.*?\\)");

    /** Fixed vocabulary that {@link #extractKeywords(String)} searches for. */
    private static final String[] COMMON_KEYWORDS = {
        "数据", "分析", "学习", "技术", "互联网", "发展", "趋势",
        "工具", "方法", "实践", "经验", "案例", "应用", "创新",
        "挑战", "机遇", "未来", "智能", "算法", "模型", "平台"
    };

    /**
     * Cleans every post in {@code rawPosts} and keeps only those that are still valid afterwards.
     *
     * <p>A summary line with the number of retained posts is printed to standard output.
     *
     * @param rawPosts posts to clean; a {@code null} list is treated as empty and {@code null}
     *     elements are skipped
     * @return a new mutable list holding a cleaned copy of each valid post, in input order
     */
    public static List<PostInfo> cleanPosts(List<PostInfo> rawPosts) {
        List<PostInfo> cleanedPosts = new ArrayList<>();
        if (rawPosts != null) {
            for (PostInfo post : rawPosts) {
                if (post == null) {
                    continue;
                }
                PostInfo cleaned = cleanPost(post);
                if (isValidPost(cleaned)) {
                    cleanedPosts.add(cleaned);
                }
            }
        }
        System.out.println("数据清洗完成,有效数据: " + cleanedPosts.size() + " 条");
        return cleanedPosts;
    }

    /**
     * Builds a cleaned copy of {@code post}; the argument itself is not modified.
     *
     * <p>Title, author and tags go through {@link #cleanText(String)}, content through
     * {@link #cleanContent(String)}, sentiment through {@link #normalizeSentiment(String)};
     * the date and the three counters are copied unchanged.
     */
    private static PostInfo cleanPost(PostInfo post) {
        PostInfo cleaned = new PostInfo();
        cleaned.setTitle(cleanText(post.getTitle()));
        cleaned.setContent(cleanContent(post.getContent()));
        cleaned.setAuthor(cleanText(post.getAuthor()));
        cleaned.setPostDate(post.getPostDate());
        cleaned.setLikeCount(post.getLikeCount());
        cleaned.setCommentCount(post.getCommentCount());
        cleaned.setViewCount(post.getViewCount());
        cleaned.setTags(cleanText(post.getTags()));
        cleaned.setSentiment(normalizeSentiment(post.getSentiment()));
        return cleaned;
    }

    /**
     * Trims {@code text} and collapses each whitespace run into a single space.
     *
     * @return the normalised text, or the empty string when {@code text} is {@code null}
     */
    private static String cleanText(String text) {
        if (text == null) {
            return "";
        }
        return WHITESPACE.matcher(text.trim()).replaceAll(" ");
    }

    /**
     * Normalises post content: collapses whitespace, then removes {@code <...>} tags,
     * {@code [...]} fragments and {@code (...)} fragments.
     *
     * @return the cleaned content, or the empty string when {@code content} is {@code null}
     */
    private static String cleanContent(String content) {
        if (content == null) {
            return "";
        }
        // Collapse whitespace first so line breaks are already spaces when the bracket
        // patterns run ('.' does not match line terminators by default).
        String text = WHITESPACE.matcher(content.trim()).replaceAll(" ");
        text = ANGLE_TAG.matcher(text).replaceAll("");
        text = SQUARE_BRACKETED.matcher(text).replaceAll("");
        text = PARENTHESIZED.matcher(text).replaceAll("");
        // Removing fragments can leave doubled or dangling spaces (e.g. "<p> </p>" -> " "),
        // which would otherwise let blank content pass the validity check.
        return WHITESPACE.matcher(text).replaceAll(" ").trim();
    }

    /**
     * Maps a free-form sentiment label onto {@code "积极"}, {@code "消极"} or {@code "中性"}.
     *
     * <p>Matching is by substring and case-insensitive; positive markers are checked before
     * negative ones. {@code null}, empty and unrecognised labels all map to {@code "中性"}.
     */
    private static String normalizeSentiment(String sentiment) {
        if (sentiment == null || sentiment.isEmpty()) {
            return "中性";
        }
        // Locale.ROOT keeps the ASCII comparison stable regardless of the default locale
        // (a Turkish default locale lower-cases 'I' to a dotless 'ı').
        String lower = sentiment.toLowerCase(Locale.ROOT);
        if (lower.contains("积极") || lower.contains("正面") || lower.contains("positive")) {
            return "积极";
        }
        if (lower.contains("消极") || lower.contains("负面") || lower.contains("negative")) {
            return "消极";
        }
        return "中性";
    }

    /** Returns {@code true} when both the title and the content are non-null and non-empty. */
    private static boolean isValidPost(PostInfo post) {
        return post.getTitle() != null
            && !post.getTitle().isEmpty()
            && post.getContent() != null
            && !post.getContent().isEmpty();
    }

    /**
     * Returns the entries of the built-in keyword vocabulary that occur in {@code content}.
     *
     * <p>Matching is by case-insensitive substring search; results follow vocabulary order,
     * not the order of appearance in the text.
     *
     * @param content text to scan; may be {@code null}
     * @return the matched keywords, or an empty array when {@code content} is {@code null},
     *     empty, or contains none of them
     */
    public static String[] extractKeywords(String content) {
        if (content == null || content.isEmpty()) {
            return new String[0];
        }
        List<String> keywords = new ArrayList<>();
        String lowerContent = content.toLowerCase(Locale.ROOT);
        for (String keyword : COMMON_KEYWORDS) {
            if (lowerContent.contains(keyword.toLowerCase(Locale.ROOT))) {
                keywords.add(keyword);
            }
        }
        return keywords.toArray(new String[0]);
    }
}