You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

99 lines
3.3 KiB

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Static helpers that normalize {@link PostInfo} records: whitespace is collapsed,
 * markup-like fragments are stripped from the content, the sentiment label is mapped
 * onto a fixed set of values, and posts without a title or content are dropped.
 */
public class DataCleaner {

    /** One or more whitespace characters (as defined by the regex class {@code \s}). */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /** Anything that looks like an HTML/XML tag: {@code <...>}. */
    private static final Pattern HTML_TAG = Pattern.compile("<[^>]+>");

    /** Shortest run of text enclosed in ASCII square brackets. */
    private static final Pattern SQUARE_BRACKETED = Pattern.compile("\\[.*?\\]");

    /** Shortest run of text enclosed in ASCII parentheses. */
    private static final Pattern PARENTHESIZED = Pattern.compile("\\(.*?\\)");

    /** Fixed vocabulary that {@link #extractKeywords(String)} looks for. */
    private static final String[] COMMON_KEYWORDS = {
        "数据", "分析", "学习", "技术", "互联网", "发展", "趋势",
        "工具", "方法", "实践", "经验", "案例", "应用", "创新",
        "挑战", "机遇", "未来", "智能", "算法", "模型", "平台"
    };

    /**
     * Cleans every post in the list and keeps only those that still have a non-empty
     * title and non-empty content afterwards. Prints a summary line to standard output.
     *
     * @param rawPosts posts to clean; a {@code null} list is treated as empty and
     *                 {@code null} elements are skipped
     * @return a new list holding the cleaned, valid posts (never {@code null})
     */
    public static List<PostInfo> cleanPosts(List<PostInfo> rawPosts) {
        List<PostInfo> cleanedPosts = new ArrayList<>();
        if (rawPosts != null) {
            for (PostInfo post : rawPosts) {
                if (post == null) {
                    // A missing record carries no data; skip it instead of failing the batch.
                    continue;
                }
                PostInfo cleaned = cleanPost(post);
                if (isValidPost(cleaned)) {
                    cleanedPosts.add(cleaned);
                }
            }
        }
        System.out.println("数据清洗完成,有效数据: " + cleanedPosts.size() + " 条");
        return cleanedPosts;
    }

    /**
     * Builds a new {@link PostInfo} from {@code post}: text fields are cleaned, the
     * sentiment is normalized, and the date and counters are copied unchanged.
     * The input object is not modified.
     */
    private static PostInfo cleanPost(PostInfo post) {
        PostInfo cleaned = new PostInfo();
        cleaned.setTitle(cleanText(post.getTitle()));
        cleaned.setContent(cleanContent(post.getContent()));
        cleaned.setAuthor(cleanText(post.getAuthor()));
        cleaned.setPostDate(post.getPostDate());
        cleaned.setLikeCount(post.getLikeCount());
        cleaned.setCommentCount(post.getCommentCount());
        cleaned.setViewCount(post.getViewCount());
        cleaned.setTags(cleanText(post.getTags()));
        cleaned.setSentiment(normalizeSentiment(post.getSentiment()));
        return cleaned;
    }

    /**
     * Trims {@code text} and collapses each whitespace run into a single space.
     *
     * @return the cleaned text, or {@code ""} when {@code text} is {@code null}
     */
    private static String cleanText(String text) {
        if (text == null) {
            return "";
        }
        return WHITESPACE.matcher(text.trim()).replaceAll(" ");
    }

    /**
     * Cleans post content: collapses whitespace, then removes tag-like {@code <...>}
     * fragments and text enclosed in {@code [...]} or {@code (...)}.
     *
     * @return the cleaned content, or {@code ""} when {@code content} is {@code null}
     */
    private static String cleanContent(String content) {
        if (content == null) {
            return "";
        }
        // Collapse whitespace first: this turns line breaks into spaces, so the bracket
        // patterns below (whose '.' does not match line terminators) also remove
        // fragments that originally spanned several lines.
        String text = WHITESPACE.matcher(content.trim()).replaceAll(" ");
        text = HTML_TAG.matcher(text).replaceAll("");
        text = SQUARE_BRACKETED.matcher(text).replaceAll("");
        text = PARENTHESIZED.matcher(text).replaceAll("");
        // Removing fragments can leave doubled or leading/trailing spaces behind
        // (e.g. "a [x] b" -> "a  b", "<p> </p>" -> " "); normalize once more so that
        // content consisting only of markup ends up empty and is rejected as invalid.
        return WHITESPACE.matcher(text).replaceAll(" ").trim();
    }

    /**
     * Maps a free-form sentiment label onto one of "积极", "消极" or "中性".
     * Matching is by substring and case-insensitive; the positive markers are checked
     * first. {@code null}, empty and unrecognized labels yield "中性".
     */
    private static String normalizeSentiment(String sentiment) {
        if (sentiment == null || sentiment.isEmpty()) {
            return "中性";
        }
        // Locale.ROOT keeps the lower-casing independent of the JVM default locale
        // (e.g. under a Turkish locale "POSITIVE" would not lower-case to "positive").
        String lower = sentiment.toLowerCase(Locale.ROOT);
        if (lower.contains("积极") || lower.contains("正面") || lower.contains("positive")) {
            return "积极";
        } else if (lower.contains("消极") || lower.contains("负面") || lower.contains("negative")) {
            return "消极";
        } else {
            return "中性";
        }
    }

    /** Returns {@code true} when the post has both a non-empty title and non-empty content. */
    private static boolean isValidPost(PostInfo post) {
        return post.getTitle() != null && !post.getTitle().isEmpty()
                && post.getContent() != null && !post.getContent().isEmpty();
    }

    /**
     * Returns the entries of the built-in keyword vocabulary that occur in
     * {@code content} (case-insensitive substring match), in vocabulary order.
     *
     * @param content text to scan; may be {@code null}
     * @return the matching keywords; an empty array when {@code content} is
     *         {@code null} or empty, or when nothing matches
     */
    public static String[] extractKeywords(String content) {
        if (content == null || content.isEmpty()) {
            return new String[0];
        }
        List<String> keywords = new ArrayList<>();
        String lowerContent = content.toLowerCase(Locale.ROOT);
        for (String keyword : COMMON_KEYWORDS) {
            if (lowerContent.contains(keyword.toLowerCase(Locale.ROOT))) {
                keywords.add(keyword);
            }
        }
        return keywords.toArray(new String[0]);
    }
}