You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

226 lines
7.9 KiB

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
/**
 * Command-line script that reads post records from a UTF-8 CSV text file,
 * cleans each record, and writes the valid ones to a UTF-8 CSV file.
 *
 * <p>Each input record is expected to carry at least nine columns in this
 * order: title, content, author, post date ({@code yyyy-MM-dd}), like count,
 * comment count, view count, tags, sentiment. The first record of the input
 * is treated as a header and skipped.
 *
 * <p>Usage: {@code java DataCleaningScript [inputFile [outputFile]]}. When an
 * argument is omitted the corresponding built-in default path is used.
 */
public class DataCleaningScript {

    private static final DateTimeFormatter DATE_FORMATTER =
            DateTimeFormatter.ofPattern("yyyy-MM-dd", Locale.CHINA);

    /** Input path used when no first command-line argument is given. */
    private static final String DEFAULT_INPUT_FILE =
            "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子原始信息计量实验使用.xlsx";

    /** Output path used when no second command-line argument is given. */
    private static final String DEFAULT_OUTPUT_FILE =
            "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子实验数据(新).csv";

    /** Minimum number of columns a record needs before it is parsed. */
    private static final int EXPECTED_COLUMNS = 9;

    // Compiled once; these are applied to every record.
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");
    private static final Pattern HTML_TAG = Pattern.compile("<[^>]+>");
    private static final Pattern SQUARE_BRACKETED = Pattern.compile("\\[.*?\\]");
    private static final Pattern PARENTHESIZED = Pattern.compile("\\(.*?\\)");

    /**
     * Runs the read / clean / save pipeline and prints progress to standard output.
     *
     * @param args optional overrides: {@code args[0]} is the input file and
     *             {@code args[1]} is the output file; missing entries fall back
     *             to the built-in default paths
     */
    public static void main(String[] args) {
        String inputFile = args != null && args.length > 0 ? args[0] : DEFAULT_INPUT_FILE;
        String outputFile = args != null && args.length > 1 ? args[1] : DEFAULT_OUTPUT_FILE;

        System.out.println("========================================");
        System.out.println(" 数据清洗脚本");
        System.out.println("========================================");
        System.out.println("输入文件: " + inputFile);
        System.out.println("输出文件: " + outputFile);
        System.out.println();

        // Read the raw records.
        List<PostInfo> rawPosts = readExcelData(inputFile);
        System.out.println("读取数据完成,共 " + rawPosts.size() + " 条记录");

        // Clean them and drop the invalid ones.
        List<PostInfo> cleanedPosts = cleanPosts(rawPosts);
        System.out.println("数据清洗完成,有效记录: " + cleanedPosts.size() + " 条");

        // Persist the cleaned records.
        saveToCSV(cleanedPosts, outputFile);
        System.out.println("数据保存完成!");
        System.out.println();
        System.out.println("========================================");
        System.out.println(" 数据清洗任务完成");
        System.out.println("========================================");
    }

    /**
     * Reads post records from a UTF-8 CSV text file.
     *
     * <p>Despite the method name, the file is parsed as delimited text, not as
     * an Excel workbook; a warning is printed when the path carries an Excel
     * extension. The first record is skipped as the header. A quoted field may
     * span several physical lines; such lines are joined with {@code '\n'}
     * before parsing. Records that have fewer than nine columns or that fail
     * to parse are skipped and counted, and the count is reported on standard
     * error.
     *
     * @param filePath path of the file to read
     * @return the parsed posts; empty when the file cannot be read
     */
    private static List<PostInfo> readExcelData(String filePath) {
        List<PostInfo> posts = new ArrayList<>();

        String lowerPath = filePath.toLowerCase(Locale.ROOT);
        if (lowerPath.endsWith(".xlsx") || lowerPath.endsWith(".xls")) {
            // A real workbook is a binary archive and cannot be decoded as CSV text.
            System.err.println("警告: 输入文件扩展名为 Excel 工作簿, 但本脚本按 UTF-8 CSV 文本解析; 请先将其另存为 CSV");
        }

        int skipped = 0;
        try (BufferedReader reader =
                new BufferedReader(new FileReader(filePath, StandardCharsets.UTF_8))) {
            StringBuilder record = new StringBuilder();
            boolean insideQuotes = false;
            boolean headerSkipped = false;
            String line;
            while ((line = reader.readLine()) != null) {
                if (insideQuotes) {
                    // Continuation of a quoted field: restore the line break readLine() removed.
                    record.append('\n');
                }
                record.append(line);
                // An odd number of quote characters on this line flips the quoted state;
                // escaped quotes ("") come in pairs and therefore do not affect parity.
                if (hasOddQuoteCount(line)) {
                    insideQuotes = !insideQuotes;
                }
                if (insideQuotes) {
                    continue;
                }
                String text = record.toString();
                record.setLength(0);
                if (!headerSkipped) {
                    headerSkipped = true;
                    continue;
                }
                if (text.trim().isEmpty()) {
                    continue; // blank lines carry no record and are not counted as failures
                }
                if (!addRecord(text, posts)) {
                    skipped++;
                }
            }
            if (insideQuotes && headerSkipped) {
                // Unterminated quote at end of file: parse what was collected, best effort.
                if (!addRecord(record.toString(), posts)) {
                    skipped++;
                }
            }
        } catch (IOException e) {
            System.err.println("读取文件时出错: " + e.getMessage());
        }

        if (skipped > 0) {
            System.err.println("跳过无法解析的记录: " + skipped + " 条");
        }
        return posts;
    }

    /** Returns whether {@code line} contains an odd number of double-quote characters. */
    private static boolean hasOddQuoteCount(String line) {
        boolean odd = false;
        for (int i = 0; i < line.length(); i++) {
            if (line.charAt(i) == '"') {
                odd = !odd;
            }
        }
        return odd;
    }

    /**
     * Parses one logical CSV record and appends the resulting post to {@code posts}.
     *
     * @return {@code true} when a post was added, {@code false} when the record
     *         had too few columns or could not be converted
     */
    private static boolean addRecord(String record, List<PostInfo> posts) {
        String[] parts = parseCSVLine(record);
        if (parts.length < EXPECTED_COLUMNS) {
            return false;
        }
        PostInfo post = parsePostInfo(parts);
        if (post == null) {
            return false;
        }
        posts.add(post);
        return true;
    }

    /**
     * Splits one CSV record into its fields.
     *
     * <p>Commas inside double-quoted sections do not split fields. Inside a
     * quoted section, two consecutive double quotes produce one literal double
     * quote; every other double quote only toggles the quoted state and is not
     * copied to the output. Each field is trimmed.
     *
     * @param line the record text
     * @return the trimmed fields, always at least one element
     */
    private static String[] parseCSVLine(String line) {
        List<String> fields = new ArrayList<>();
        StringBuilder currentField = new StringBuilder();
        boolean inQuotes = false;
        for (int i = 0; i < line.length(); i++) {
            char c = line.charAt(i);
            if (c == '"') {
                if (inQuotes && i + 1 < line.length() && line.charAt(i + 1) == '"') {
                    // Escaped quote inside a quoted field.
                    currentField.append('"');
                    i++;
                } else {
                    inQuotes = !inQuotes;
                }
            } else if (c == ',' && !inQuotes) {
                fields.add(currentField.toString().trim());
                currentField.setLength(0);
            } else {
                currentField.append(c);
            }
        }
        fields.add(currentField.toString().trim());
        return fields.toArray(new String[0]);
    }

    /**
     * Builds a {@code PostInfo} from the first nine fields of a record.
     *
     * <p>An empty date field leaves the post date unset. Count fields that are
     * empty or not integers become {@code 0}.
     *
     * @param parts the record fields; must contain at least nine elements
     * @return the populated post, or {@code null} when conversion fails
     *         (for example, a date that is not in {@code yyyy-MM-dd} form)
     */
    private static PostInfo parsePostInfo(String[] parts) {
        try {
            PostInfo post = new PostInfo();
            post.setTitle(parts[0]);
            post.setContent(parts[1]);
            post.setAuthor(parts[2]);
            if (!parts[3].isEmpty()) {
                post.setPostDate(LocalDate.parse(parts[3], DATE_FORMATTER));
            }
            post.setLikeCount(parseInt(parts[4]));
            post.setCommentCount(parseInt(parts[5]));
            post.setViewCount(parseInt(parts[6]));
            post.setTags(parts[7]);
            post.setSentiment(parts[8]);
            return post;
        } catch (Exception e) {
            // Deliberately broad: one bad record must not abort the whole run.
            // The reason is reported instead of being discarded silently.
            System.err.println("解析记录失败, 已跳过: " + e.getMessage());
            return null;
        }
    }

    /**
     * Parses a decimal integer leniently.
     *
     * @param value the text to parse; may be {@code null}
     * @return the parsed value, or {@code 0} when the text is {@code null},
     *         blank, or not a valid {@code int}
     */
    private static int parseInt(String value) {
        if (value == null) {
            return 0;
        }
        String trimmed = value.trim();
        if (trimmed.isEmpty()) {
            return 0;
        }
        try {
            return Integer.parseInt(trimmed);
        } catch (NumberFormatException e) {
            return 0;
        }
    }

    /**
     * Cleans every post and keeps only those that are still valid afterwards.
     *
     * @param rawPosts the posts as read from the input
     * @return a new list holding the cleaned, valid posts in input order
     */
    private static List<PostInfo> cleanPosts(List<PostInfo> rawPosts) {
        List<PostInfo> cleanedPosts = new ArrayList<>();
        for (PostInfo post : rawPosts) {
            PostInfo cleaned = cleanPost(post);
            if (isValidPost(cleaned)) {
                cleanedPosts.add(cleaned);
            }
        }
        return cleanedPosts;
    }

    /**
     * Returns a cleaned copy of {@code post}; the argument itself is not modified
     * by this method.
     *
     * <p>Text fields are whitespace-normalised, the content additionally has
     * markup removed, the sentiment is mapped to one of the three canonical
     * labels, and the date and counters are copied unchanged.
     */
    private static PostInfo cleanPost(PostInfo post) {
        PostInfo cleaned = new PostInfo();
        cleaned.setTitle(cleanText(post.getTitle()));
        cleaned.setContent(cleanContent(post.getContent()));
        cleaned.setAuthor(cleanText(post.getAuthor()));
        cleaned.setPostDate(post.getPostDate());
        cleaned.setLikeCount(post.getLikeCount());
        cleaned.setCommentCount(post.getCommentCount());
        cleaned.setViewCount(post.getViewCount());
        cleaned.setTags(cleanText(post.getTags()));
        cleaned.setSentiment(normalizeSentiment(post.getSentiment()));
        return cleaned;
    }

    /**
     * Trims the text and collapses every run of whitespace into a single space.
     *
     * @param text the text to clean; may be {@code null}
     * @return the cleaned text, or an empty string for {@code null}
     */
    private static String cleanText(String text) {
        if (text == null) {
            return "";
        }
        return WHITESPACE.matcher(text.trim()).replaceAll(" ");
    }

    /**
     * Cleans post content: collapses whitespace, then removes {@code <...>} tags,
     * {@code [...]} spans and {@code (...)} spans.
     *
     * <p>Whitespace is normalised a second time after the removals, because
     * deleting a span can leave doubled, leading, or trailing spaces. Without
     * that step, content consisting only of markup would come out as {@code " "}
     * and be accepted as non-empty.
     *
     * @param content the content to clean; may be {@code null}
     * @return the cleaned content, or an empty string for {@code null}
     */
    private static String cleanContent(String content) {
        if (content == null) {
            return "";
        }
        // Collapse first so that line breaks become spaces; '.' in the bracket
        // patterns does not match line terminators.
        String text = WHITESPACE.matcher(content.trim()).replaceAll(" ");
        text = HTML_TAG.matcher(text).replaceAll("");
        text = SQUARE_BRACKETED.matcher(text).replaceAll("");
        text = PARENTHESIZED.matcher(text).replaceAll("");
        return WHITESPACE.matcher(text).replaceAll(" ").trim();
    }

    /**
     * Maps a free-form sentiment label to "积极", "消极" or "中性".
     *
     * <p>Matching is by substring and ignores case; positive keywords are
     * checked before negative ones. Anything unrecognised, {@code null} or
     * empty maps to "中性".
     */
    private static String normalizeSentiment(String sentiment) {
        if (sentiment == null || sentiment.isEmpty()) {
            return "中性";
        }
        // Locale.ROOT keeps the ASCII keyword match independent of the default locale.
        String lower = sentiment.toLowerCase(Locale.ROOT);
        if (lower.contains("积极") || lower.contains("正面") || lower.contains("positive")) {
            return "积极";
        } else if (lower.contains("消极") || lower.contains("负面") || lower.contains("negative")) {
            return "消极";
        } else {
            return "中性";
        }
    }

    /** Returns whether the post has both a non-empty title and non-empty content. */
    private static boolean isValidPost(PostInfo post) {
        return post.getTitle() != null && !post.getTitle().isEmpty()
                && post.getContent() != null && !post.getContent().isEmpty();
    }

    /**
     * Writes the posts to a UTF-8 CSV file with a leading byte-order mark and a
     * header row, creating the parent directory when it is missing.
     *
     * <p>Nothing is written when {@code posts} is {@code null} or empty. I/O
     * failures are reported on standard error and not rethrown.
     *
     * @param posts    the posts to write
     * @param filePath destination file path
     */
    private static void saveToCSV(List<PostInfo> posts, String filePath) {
        if (posts == null || posts.isEmpty()) {
            System.out.println("没有数据需要保存");
            return;
        }
        try {
            // Make sure the target directory exists.
            File file = new File(filePath);
            File parentDir = file.getParentFile();
            if (parentDir != null && !parentDir.exists()) {
                // mkdirs() returns false both on failure and when another process
                // created the directory in the meantime; only the former is an error.
                if (!parentDir.mkdirs() && !parentDir.isDirectory()) {
                    throw new IOException("无法创建目录: " + parentDir);
                }
            }
            try (BufferedWriter writer = new BufferedWriter(
                    new FileWriter(file, StandardCharsets.UTF_8))) {
                writer.write("\uFEFF"); // BOM for UTF-8
                writer.write("标题,内容,作者,发布日期,点赞数,评论数,浏览量,标签,情感倾向\n");
                for (PostInfo post : posts) {
                    writer.write(post.toCSV());
                    writer.write("\n");
                }
            }
            System.out.println("数据已保存到: " + filePath);
        } catch (IOException e) {
            System.err.println("保存CSV文件时出错: " + e.getMessage());
        }
    }
}