You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
103 lines
3.4 KiB
103 lines
3.4 KiB
package com.project.util;
|
|
|
|
import com.project.model.PostInfo;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
|
|
|
|
public class DataCleaner {
|
|
|
|
public static List<PostInfo> cleanPosts(List<PostInfo> rawPosts) {
|
|
List<PostInfo> cleanedPosts = new ArrayList<>();
|
|
|
|
for (PostInfo post : rawPosts) {
|
|
PostInfo cleaned = cleanPost(post);
|
|
if (isValidPost(cleaned)) {
|
|
cleanedPosts.add(cleaned);
|
|
}
|
|
}
|
|
|
|
System.out.println("数据清洗完成,有效数据: " + cleanedPosts.size() + " 条");
|
|
return cleanedPosts;
|
|
}
|
|
|
|
private static PostInfo cleanPost(PostInfo post) {
|
|
PostInfo cleaned = new PostInfo();
|
|
|
|
cleaned.setTitle(cleanText(post.getTitle()));
|
|
cleaned.setContent(cleanContent(post.getContent()));
|
|
cleaned.setAuthor(cleanText(post.getAuthor()));
|
|
cleaned.setPostDate(post.getPostDate());
|
|
cleaned.setLikeCount(post.getLikeCount());
|
|
cleaned.setCommentCount(post.getCommentCount());
|
|
cleaned.setViewCount(post.getViewCount());
|
|
cleaned.setTags(cleanText(post.getTags()));
|
|
cleaned.setSentiment(normalizeSentiment(post.getSentiment()));
|
|
|
|
return cleaned;
|
|
}
|
|
|
|
private static String cleanText(String text) {
|
|
if (text == null) {
|
|
return "";
|
|
}
|
|
return text.trim().replaceAll("\\s+", " ");
|
|
}
|
|
|
|
private static String cleanContent(String content) {
|
|
if (content == null) {
|
|
return "";
|
|
}
|
|
return content.trim()
|
|
.replaceAll("\\s+", " ")
|
|
.replaceAll("[\\r\\n]+", " ")
|
|
.replaceAll("<[^>]+>", "")
|
|
.replaceAll("\\[.*?\\]", "")
|
|
.replaceAll("\\(.*?\\)", "");
|
|
}
|
|
|
|
private static String normalizeSentiment(String sentiment) {
|
|
if (sentiment == null || sentiment.isEmpty()) {
|
|
return "中性";
|
|
}
|
|
|
|
String lower = sentiment.toLowerCase();
|
|
if (lower.contains("积极") || lower.contains("正面") || lower.contains("positive")) {
|
|
return "积极";
|
|
} else if (lower.contains("消极") || lower.contains("负面") || lower.contains("negative")) {
|
|
return "消极";
|
|
} else {
|
|
return "中性";
|
|
}
|
|
}
|
|
|
|
private static boolean isValidPost(PostInfo post) {
|
|
return post.getTitle() != null && !post.getTitle().isEmpty() &&
|
|
post.getContent() != null && !post.getContent().isEmpty();
|
|
}
|
|
|
|
public static String[] extractKeywords(String content) {
|
|
if (content == null || content.isEmpty()) {
|
|
return new String[0];
|
|
}
|
|
|
|
String[] commonKeywords = {
|
|
"数据", "分析", "学习", "技术", "互联网", "发展", "趋势",
|
|
"工具", "方法", "实践", "经验", "案例", "应用", "创新",
|
|
"挑战", "机遇", "未来", "智能", "算法", "模型", "平台"
|
|
};
|
|
|
|
List<String> keywords = new ArrayList<>();
|
|
String lowerContent = content.toLowerCase();
|
|
|
|
for (String keyword : commonKeywords) {
|
|
if (lowerContent.contains(keyword.toLowerCase())) {
|
|
keywords.add(keyword);
|
|
}
|
|
}
|
|
|
|
return keywords.toArray(new String[0]);
|
|
}
|
|
}
|
|
|