You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
226 lines
7.9 KiB
226 lines
7.9 KiB
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
|
|
|
|
public class DataCleaningScript {
|
|
|
|
private static final DateTimeFormatter DATE_FORMATTER = DateTimeFormatter.ofPattern("yyyy-MM-dd", Locale.CHINA);
|
|
|
|
public static void main(String[] args) {
|
|
String inputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子原始信息计量实验使用.xlsx";
|
|
String outputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子实验数据(新).csv";
|
|
|
|
System.out.println("========================================");
|
|
System.out.println(" 数据清洗脚本");
|
|
System.out.println("========================================");
|
|
System.out.println("输入文件: " + inputFile);
|
|
System.out.println("输出文件: " + outputFile);
|
|
System.out.println();
|
|
|
|
// 读取数据
|
|
List<PostInfo> rawPosts = readExcelData(inputFile);
|
|
System.out.println("读取数据完成,共 " + rawPosts.size() + " 条记录");
|
|
|
|
// 清洗数据
|
|
List<PostInfo> cleanedPosts = cleanPosts(rawPosts);
|
|
System.out.println("数据清洗完成,有效记录: " + cleanedPosts.size() + " 条");
|
|
|
|
// 保存清洗后的数据
|
|
saveToCSV(cleanedPosts, outputFile);
|
|
System.out.println("数据保存完成!");
|
|
System.out.println();
|
|
System.out.println("========================================");
|
|
System.out.println(" 数据清洗任务完成");
|
|
System.out.println("========================================");
|
|
}
|
|
|
|
private static List<PostInfo> readExcelData(String filePath) {
|
|
List<PostInfo> posts = new ArrayList<>();
|
|
|
|
try (BufferedReader reader = new BufferedReader(new FileReader(filePath, java.nio.charset.StandardCharsets.UTF_8))) {
|
|
|
|
String line;
|
|
boolean isFirstLine = true;
|
|
|
|
while ((line = reader.readLine()) != null) {
|
|
if (isFirstLine) {
|
|
isFirstLine = false;
|
|
continue;
|
|
}
|
|
|
|
String[] parts = parseCSVLine(line);
|
|
if (parts.length >= 9) {
|
|
PostInfo post = parsePostInfo(parts);
|
|
if (post != null) {
|
|
posts.add(post);
|
|
}
|
|
}
|
|
}
|
|
|
|
} catch (IOException e) {
|
|
System.err.println("读取文件时出错: " + e.getMessage());
|
|
}
|
|
|
|
return posts;
|
|
}
|
|
|
|
private static String[] parseCSVLine(String line) {
|
|
List<String> fields = new ArrayList<>();
|
|
StringBuilder currentField = new StringBuilder();
|
|
boolean inQuotes = false;
|
|
|
|
for (char c : line.toCharArray()) {
|
|
if (c == '"') {
|
|
inQuotes = !inQuotes;
|
|
} else if (c == ',' && !inQuotes) {
|
|
fields.add(currentField.toString().trim());
|
|
currentField.setLength(0);
|
|
} else {
|
|
currentField.append(c);
|
|
}
|
|
}
|
|
|
|
fields.add(currentField.toString().trim());
|
|
return fields.toArray(new String[0]);
|
|
}
|
|
|
|
private static PostInfo parsePostInfo(String[] parts) {
|
|
try {
|
|
PostInfo post = new PostInfo();
|
|
|
|
post.setTitle(parts[0]);
|
|
post.setContent(parts[1]);
|
|
post.setAuthor(parts[2]);
|
|
|
|
if (!parts[3].isEmpty()) {
|
|
post.setPostDate(LocalDate.parse(parts[3], DATE_FORMATTER));
|
|
}
|
|
|
|
post.setLikeCount(parseInt(parts[4]));
|
|
post.setCommentCount(parseInt(parts[5]));
|
|
post.setViewCount(parseInt(parts[6]));
|
|
|
|
post.setTags(parts[7]);
|
|
post.setSentiment(parts[8]);
|
|
|
|
return post;
|
|
} catch (Exception e) {
|
|
return null;
|
|
}
|
|
}
|
|
|
|
private static int parseInt(String value) {
|
|
try {
|
|
if (value == null || value.isEmpty()) {
|
|
return 0;
|
|
}
|
|
return Integer.parseInt(value);
|
|
} catch (NumberFormatException e) {
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
private static List<PostInfo> cleanPosts(List<PostInfo> rawPosts) {
|
|
List<PostInfo> cleanedPosts = new ArrayList<>();
|
|
|
|
for (PostInfo post : rawPosts) {
|
|
PostInfo cleaned = cleanPost(post);
|
|
if (isValidPost(cleaned)) {
|
|
cleanedPosts.add(cleaned);
|
|
}
|
|
}
|
|
|
|
return cleanedPosts;
|
|
}
|
|
|
|
private static PostInfo cleanPost(PostInfo post) {
|
|
PostInfo cleaned = new PostInfo();
|
|
|
|
cleaned.setTitle(cleanText(post.getTitle()));
|
|
cleaned.setContent(cleanContent(post.getContent()));
|
|
cleaned.setAuthor(cleanText(post.getAuthor()));
|
|
cleaned.setPostDate(post.getPostDate());
|
|
cleaned.setLikeCount(post.getLikeCount());
|
|
cleaned.setCommentCount(post.getCommentCount());
|
|
cleaned.setViewCount(post.getViewCount());
|
|
cleaned.setTags(cleanText(post.getTags()));
|
|
cleaned.setSentiment(normalizeSentiment(post.getSentiment()));
|
|
|
|
return cleaned;
|
|
}
|
|
|
|
private static String cleanText(String text) {
|
|
if (text == null) {
|
|
return "";
|
|
}
|
|
return text.trim().replaceAll("\\s+", " ");
|
|
}
|
|
|
|
private static String cleanContent(String content) {
|
|
if (content == null) {
|
|
return "";
|
|
}
|
|
return content.trim()
|
|
.replaceAll("\\s+", " ")
|
|
.replaceAll("[\\r\\n]+", " ")
|
|
.replaceAll("<[^>]+>", "")
|
|
.replaceAll("\\[.*?\\]", "")
|
|
.replaceAll("\\(.*?\\)", "");
|
|
}
|
|
|
|
private static String normalizeSentiment(String sentiment) {
|
|
if (sentiment == null || sentiment.isEmpty()) {
|
|
return "中性";
|
|
}
|
|
|
|
String lower = sentiment.toLowerCase();
|
|
if (lower.contains("积极") || lower.contains("正面") || lower.contains("positive")) {
|
|
return "积极";
|
|
} else if (lower.contains("消极") || lower.contains("负面") || lower.contains("negative")) {
|
|
return "消极";
|
|
} else {
|
|
return "中性";
|
|
}
|
|
}
|
|
|
|
private static boolean isValidPost(PostInfo post) {
|
|
return post.getTitle() != null && !post.getTitle().isEmpty() &&
|
|
post.getContent() != null && !post.getContent().isEmpty();
|
|
}
|
|
|
|
private static void saveToCSV(List<PostInfo> posts, String filePath) {
|
|
if (posts == null || posts.isEmpty()) {
|
|
System.out.println("没有数据需要保存");
|
|
return;
|
|
}
|
|
|
|
try {
|
|
// 确保目录存在
|
|
File file = new File(filePath);
|
|
File parentDir = file.getParentFile();
|
|
if (parentDir != null && !parentDir.exists()) {
|
|
parentDir.mkdirs();
|
|
}
|
|
|
|
try (BufferedWriter writer = new BufferedWriter(
|
|
new FileWriter(file, java.nio.charset.StandardCharsets.UTF_8))) {
|
|
|
|
writer.write("\uFEFF"); // BOM for UTF-8
|
|
writer.write("标题,内容,作者,发布日期,点赞数,评论数,浏览量,标签,情感倾向\n");
|
|
|
|
for (PostInfo post : posts) {
|
|
writer.write(post.toCSV());
|
|
writer.write("\n");
|
|
}
|
|
}
|
|
|
|
System.out.println("数据已保存到: " + filePath);
|
|
|
|
} catch (IOException e) {
|
|
System.err.println("保存CSV文件时出错: " + e.getMessage());
|
|
}
|
|
}
|
|
}
|
|
|