Compare commits
2 Commits
d8eb0be7d4
...
078b3d0ea2
| Author | SHA1 | Date |
|---|---|---|
|
|
078b3d0ea2 | 3 weeks ago |
|
|
331187713e | 3 weeks ago |
10 changed files with 1111 additions and 1 deletions
@ -0,0 +1,103 @@ |
|||
package com.project.util; |
|||
|
|||
import com.project.model.PostInfo; |
|||
|
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
import java.util.regex.Matcher; |
|||
import java.util.regex.Pattern; |
|||
|
|||
public class DataCleaner { |
|||
|
|||
public static List<PostInfo> cleanPosts(List<PostInfo> rawPosts) { |
|||
List<PostInfo> cleanedPosts = new ArrayList<>(); |
|||
|
|||
for (PostInfo post : rawPosts) { |
|||
PostInfo cleaned = cleanPost(post); |
|||
if (isValidPost(cleaned)) { |
|||
cleanedPosts.add(cleaned); |
|||
} |
|||
} |
|||
|
|||
System.out.println("数据清洗完成,有效数据: " + cleanedPosts.size() + " 条"); |
|||
return cleanedPosts; |
|||
} |
|||
|
|||
private static PostInfo cleanPost(PostInfo post) { |
|||
PostInfo cleaned = new PostInfo(); |
|||
|
|||
cleaned.setTitle(cleanText(post.getTitle())); |
|||
cleaned.setContent(cleanContent(post.getContent())); |
|||
cleaned.setAuthor(cleanText(post.getAuthor())); |
|||
cleaned.setPostDate(post.getPostDate()); |
|||
cleaned.setLikeCount(post.getLikeCount()); |
|||
cleaned.setCommentCount(post.getCommentCount()); |
|||
cleaned.setViewCount(post.getViewCount()); |
|||
cleaned.setTags(cleanText(post.getTags())); |
|||
cleaned.setSentiment(normalizeSentiment(post.getSentiment())); |
|||
|
|||
return cleaned; |
|||
} |
|||
|
|||
private static String cleanText(String text) { |
|||
if (text == null) { |
|||
return ""; |
|||
} |
|||
return text.trim().replaceAll("\\s+", " "); |
|||
} |
|||
|
|||
private static String cleanContent(String content) { |
|||
if (content == null) { |
|||
return ""; |
|||
} |
|||
return content.trim() |
|||
.replaceAll("\\s+", " ") |
|||
.replaceAll("[\\r\\n]+", " ") |
|||
.replaceAll("<[^>]+>", "") |
|||
.replaceAll("\\[.*?\\]", "") |
|||
.replaceAll("\\(.*?\\)", ""); |
|||
} |
|||
|
|||
private static String normalizeSentiment(String sentiment) { |
|||
if (sentiment == null || sentiment.isEmpty()) { |
|||
return "中性"; |
|||
} |
|||
|
|||
String lower = sentiment.toLowerCase(); |
|||
if (lower.contains("积极") || lower.contains("正面") || lower.contains("positive")) { |
|||
return "积极"; |
|||
} else if (lower.contains("消极") || lower.contains("负面") || lower.contains("negative")) { |
|||
return "消极"; |
|||
} else { |
|||
return "中性"; |
|||
} |
|||
} |
|||
|
|||
private static boolean isValidPost(PostInfo post) { |
|||
return post.getTitle() != null && !post.getTitle().isEmpty() && |
|||
post.getContent() != null && !post.getContent().isEmpty(); |
|||
} |
|||
|
|||
public static String[] extractKeywords(String content) { |
|||
if (content == null || content.isEmpty()) { |
|||
return new String[0]; |
|||
} |
|||
|
|||
String[] commonKeywords = { |
|||
"数据", "分析", "学习", "技术", "互联网", "发展", "趋势", |
|||
"工具", "方法", "实践", "经验", "案例", "应用", "创新", |
|||
"挑战", "机遇", "未来", "智能", "算法", "模型", "平台" |
|||
}; |
|||
|
|||
List<String> keywords = new ArrayList<>(); |
|||
String lowerContent = content.toLowerCase(); |
|||
|
|||
for (String keyword : commonKeywords) { |
|||
if (lowerContent.contains(keyword.toLowerCase())) { |
|||
keywords.add(keyword); |
|||
} |
|||
} |
|||
|
|||
return keywords.toArray(new String[0]); |
|||
} |
|||
} |
|||
@ -0,0 +1,125 @@ |
|||
package com.project.storage; |
|||
|
|||
import com.project.model.PostInfo; |
|||
|
|||
import java.io.BufferedWriter; |
|||
import java.io.FileWriter; |
|||
import java.io.IOException; |
|||
import java.nio.charset.StandardCharsets; |
|||
import java.nio.file.Files; |
|||
import java.nio.file.Paths; |
|||
import java.time.LocalDateTime; |
|||
import java.time.format.DateTimeFormatter; |
|||
import java.util.List; |
|||
|
|||
public class DataStorage { |
|||
|
|||
public static void saveToCSV(List<PostInfo> posts, String directory) { |
|||
if (posts == null || posts.isEmpty()) { |
|||
System.out.println("没有数据需要保存"); |
|||
return; |
|||
} |
|||
|
|||
try { |
|||
java.nio.file.Path dirPath = Paths.get(directory); |
|||
if (!Files.exists(dirPath)) { |
|||
Files.createDirectories(dirPath); |
|||
} |
|||
|
|||
String timestamp = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss")); |
|||
String filename = "posts_" + timestamp + ".csv"; |
|||
java.nio.file.Path filePath = dirPath.resolve(filename); |
|||
|
|||
try (BufferedWriter writer = new BufferedWriter( |
|||
new FileWriter(filePath.toFile(), StandardCharsets.UTF_8))) { |
|||
|
|||
writer.write("\uFEFF"); |
|||
writer.write("标题,内容,作者,发布日期,点赞数,评论数,浏览量,标签,情感倾向\n"); |
|||
|
|||
for (PostInfo post : posts) { |
|||
writer.write(post.toCSV()); |
|||
writer.write("\n"); |
|||
} |
|||
} |
|||
|
|||
System.out.println("数据已保存到: " + filePath.toAbsolutePath()); |
|||
|
|||
} catch (IOException e) { |
|||
System.err.println("保存CSV文件时出错: " + e.getMessage()); |
|||
} |
|||
} |
|||
|
|||
public static void saveToJSON(List<PostInfo> posts, String directory) { |
|||
if (posts == null || posts.isEmpty()) { |
|||
System.out.println("没有数据需要保存"); |
|||
return; |
|||
} |
|||
|
|||
try { |
|||
java.nio.file.Path dirPath = Paths.get(directory); |
|||
if (!Files.exists(dirPath)) { |
|||
Files.createDirectories(dirPath); |
|||
} |
|||
|
|||
String timestamp = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss")); |
|||
String filename = "posts_" + timestamp + ".json"; |
|||
java.nio.file.Path filePath = dirPath.resolve(filename); |
|||
|
|||
try (BufferedWriter writer = new BufferedWriter( |
|||
new FileWriter(filePath.toFile(), StandardCharsets.UTF_8))) { |
|||
|
|||
writer.write("[\n"); |
|||
for (int i = 0; i < posts.size(); i++) { |
|||
writer.write(postToJSON(posts.get(i))); |
|||
if (i < posts.size() - 1) { |
|||
writer.write(",\n"); |
|||
} else { |
|||
writer.write("\n"); |
|||
} |
|||
} |
|||
writer.write("]\n"); |
|||
} |
|||
|
|||
System.out.println("数据已保存到: " + filePath.toAbsolutePath()); |
|||
|
|||
} catch (IOException e) { |
|||
System.err.println("保存JSON文件时出错: " + e.getMessage()); |
|||
} |
|||
} |
|||
|
|||
private static String postToJSON(PostInfo post) { |
|||
return String.format( |
|||
" {\n" + |
|||
" \"title\": \"%s\",\n" + |
|||
" \"content\": \"%s\",\n" + |
|||
" \"author\": \"%s\",\n" + |
|||
" \"postDate\": \"%s\",\n" + |
|||
" \"likeCount\": %d,\n" + |
|||
" \"commentCount\": %d,\n" + |
|||
" \"viewCount\": %d,\n" + |
|||
" \"tags\": \"%s\",\n" + |
|||
" \"sentiment\": \"%s\"\n" + |
|||
" }", |
|||
escapeJSON(post.getTitle()), |
|||
escapeJSON(post.getContent()), |
|||
escapeJSON(post.getAuthor()), |
|||
post.getPostDate() != null ? post.getPostDate().toString() : "", |
|||
post.getLikeCount(), |
|||
post.getCommentCount(), |
|||
post.getViewCount(), |
|||
escapeJSON(post.getTags()), |
|||
escapeJSON(post.getSentiment()) |
|||
); |
|||
} |
|||
|
|||
private static String escapeJSON(String text) { |
|||
if (text == null) { |
|||
return ""; |
|||
} |
|||
return text.replace("\\", "\\\\") |
|||
.replace("\"", "\\\"") |
|||
.replace("\n", "\\n") |
|||
.replace("\r", "\\r") |
|||
.replace("\t", "\\t"); |
|||
} |
|||
} |
|||
@ -0,0 +1,106 @@ |
|||
package com.project.reader; |
|||
|
|||
import com.project.model.PostInfo; |
|||
|
|||
import java.io.*; |
|||
import java.time.LocalDate; |
|||
import java.time.format.DateTimeFormatter; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
import java.util.Locale; |
|||
|
|||
public class ExcelReader { |
|||
|
|||
private static final DateTimeFormatter DATE_FORMATTER = DateTimeFormatter.ofPattern("yyyy-MM-dd", Locale.CHINA); |
|||
|
|||
public static List<PostInfo> readExcelData(String filePath, int maxRows) { |
|||
List<PostInfo> posts = new ArrayList<>(); |
|||
|
|||
try (BufferedReader reader = new BufferedReader(new FileReader(filePath, java.nio.charset.StandardCharsets.UTF_8))) { |
|||
|
|||
String line; |
|||
boolean isFirstLine = true; |
|||
int rowCount = 0; |
|||
|
|||
while ((line = reader.readLine()) != null && rowCount < maxRows) { |
|||
if (isFirstLine) { |
|||
isFirstLine = false; |
|||
continue; |
|||
} |
|||
|
|||
String[] parts = parseCSVLine(line); |
|||
if (parts.length >= 9) { |
|||
PostInfo post = parsePostInfo(parts); |
|||
if (post != null) { |
|||
posts.add(post); |
|||
rowCount++; |
|||
} |
|||
} |
|||
} |
|||
|
|||
System.out.println("成功读取 " + posts.size() + " 条数据"); |
|||
|
|||
} catch (IOException e) { |
|||
System.err.println("读取文件时出错: " + e.getMessage()); |
|||
} |
|||
|
|||
return posts; |
|||
} |
|||
|
|||
private static String[] parseCSVLine(String line) { |
|||
List<String> fields = new ArrayList<>(); |
|||
StringBuilder currentField = new StringBuilder(); |
|||
boolean inQuotes = false; |
|||
|
|||
for (char c : line.toCharArray()) { |
|||
if (c == '"') { |
|||
inQuotes = !inQuotes; |
|||
} else if (c == ',' && !inQuotes) { |
|||
fields.add(currentField.toString().trim()); |
|||
currentField.setLength(0); |
|||
} else { |
|||
currentField.append(c); |
|||
} |
|||
} |
|||
|
|||
fields.add(currentField.toString().trim()); |
|||
return fields.toArray(new String[0]); |
|||
} |
|||
|
|||
private static PostInfo parsePostInfo(String[] parts) { |
|||
try { |
|||
PostInfo post = new PostInfo(); |
|||
|
|||
post.setTitle(parts[0]); |
|||
post.setContent(parts[1]); |
|||
post.setAuthor(parts[2]); |
|||
|
|||
if (!parts[3].isEmpty()) { |
|||
post.setPostDate(LocalDate.parse(parts[3], DATE_FORMATTER)); |
|||
} |
|||
|
|||
post.setLikeCount(parseInt(parts[4])); |
|||
post.setCommentCount(parseInt(parts[5])); |
|||
post.setViewCount(parseInt(parts[6])); |
|||
|
|||
post.setTags(parts[7]); |
|||
post.setSentiment(parts[8]); |
|||
|
|||
return post; |
|||
} catch (Exception e) { |
|||
System.err.println("解析数据时出错: " + e.getMessage()); |
|||
return null; |
|||
} |
|||
} |
|||
|
|||
private static int parseInt(String value) { |
|||
try { |
|||
if (value == null || value.isEmpty()) { |
|||
return 0; |
|||
} |
|||
return Integer.parseInt(value); |
|||
} catch (NumberFormatException e) { |
|||
return 0; |
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,214 @@ |
|||
package com.project.report; |
|||
|
|||
import com.project.analyzer.PostAnalyzer; |
|||
import com.project.model.PostInfo; |
|||
|
|||
import java.io.BufferedWriter; |
|||
import java.io.FileWriter; |
|||
import java.io.IOException; |
|||
import java.nio.charset.StandardCharsets; |
|||
import java.nio.file.Files; |
|||
import java.nio.file.Paths; |
|||
import java.time.LocalDateTime; |
|||
import java.time.format.DateTimeFormatter; |
|||
import java.util.Map; |
|||
|
|||
public class HTMLReportGenerator { |
|||
|
|||
private static final String OUTPUT_DIR = "d:\\java\\project\\reports"; |
|||
|
|||
public static void generateReport(PostAnalyzer analyzer) { |
|||
try { |
|||
Files.createDirectories(Paths.get(OUTPUT_DIR)); |
|||
|
|||
String timestamp = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss")); |
|||
String filename = "report_" + timestamp + ".html"; |
|||
String filepath = OUTPUT_DIR + "/" + filename; |
|||
|
|||
try (BufferedWriter writer = new BufferedWriter( |
|||
new FileWriter(filepath, StandardCharsets.UTF_8))) { |
|||
|
|||
writer.write(generateHTMLContent(analyzer)); |
|||
} |
|||
|
|||
System.out.println("HTML报告已生成: " + filepath); |
|||
|
|||
} catch (IOException e) { |
|||
System.err.println("生成HTML报告时出错: " + e.getMessage()); |
|||
} |
|||
} |
|||
|
|||
private static String generateHTMLContent(PostAnalyzer analyzer) { |
|||
StringBuilder html = new StringBuilder(); |
|||
|
|||
html.append("<!DOCTYPE html>\n"); |
|||
html.append("<html lang=\"zh-CN\">\n"); |
|||
html.append("<head>\n"); |
|||
html.append(" <meta charset=\"UTF-8\">\n"); |
|||
html.append(" <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n"); |
|||
html.append(" <title>图文帖子数据分析报告</title>\n"); |
|||
html.append(" <style>\n"); |
|||
html.append(" * { margin: 0; padding: 0; box-sizing: border-box; }\n"); |
|||
html.append(" body { font-family: 'Microsoft YaHei', Arial, sans-serif; background: #f5f5f5; padding: 20px; }\n"); |
|||
html.append(" .container { max-width: 1200px; margin: 0 auto; background: white; padding: 30px; border-radius: 10px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); }\n"); |
|||
html.append(" h1 { color: #333; text-align: center; margin-bottom: 10px; }\n"); |
|||
html.append(" .subtitle { color: #666; text-align: center; margin-bottom: 30px; font-size: 14px; }\n"); |
|||
html.append(" .section { margin-bottom: 40px; }\n"); |
|||
html.append(" .section h2 { color: #2c3e50; border-bottom: 3px solid #3498db; padding-bottom: 10px; margin-bottom: 20px; }\n"); |
|||
html.append(" table { width: 100%; border-collapse: collapse; margin-bottom: 20px; }\n"); |
|||
html.append(" th, td { padding: 12px; text-align: left; border-bottom: 1px solid #ddd; }\n"); |
|||
html.append(" th { background: #3498db; color: white; font-weight: bold; }\n"); |
|||
html.append(" tr:hover { background: #f8f9fa; }\n"); |
|||
html.append(" .stat-card { display: inline-block; width: 200px; padding: 20px; margin: 10px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; border-radius: 10px; text-align: center; }\n"); |
|||
html.append(" .stat-card h3 { font-size: 36px; margin-bottom: 10px; }\n"); |
|||
html.append(" .stat-card p { font-size: 14px; opacity: 0.9; }\n"); |
|||
html.append(" .chart-container { text-align: center; margin: 20px 0; }\n"); |
|||
html.append(" .chart-container img { max-width: 100%; height: auto; border: 1px solid #ddd; border-radius: 5px; }\n"); |
|||
html.append(" .summary { background: #e8f4f8; padding: 20px; border-radius: 10px; margin-bottom: 30px; }\n"); |
|||
html.append(" .summary h3 { color: #2c3e50; margin-bottom: 15px; }\n"); |
|||
html.append(" .summary ul { list-style-position: inside; color: #555; }\n"); |
|||
html.append(" .summary li { margin: 8px 0; }\n"); |
|||
html.append(" </style>\n"); |
|||
html.append("</head>\n"); |
|||
html.append("<body>\n"); |
|||
html.append(" <div class=\"container\">\n"); |
|||
html.append(" <h1>图文帖子数据分析报告</h1>\n"); |
|||
html.append(" <p class=\"subtitle\">生成时间: ").append(LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"))).append("</p>\n"); |
|||
|
|||
html.append(generateSummarySection(analyzer)); |
|||
html.append(generateSentimentSection(analyzer)); |
|||
html.append(generateEngagementSection(analyzer)); |
|||
html.append(generateAuthorSection(analyzer)); |
|||
html.append(generateChartsSection()); |
|||
|
|||
html.append(" </div>\n"); |
|||
html.append("</body>\n"); |
|||
html.append("</html>"); |
|||
|
|||
return html.toString(); |
|||
} |
|||
|
|||
private static String generateSummarySection(PostAnalyzer analyzer) { |
|||
StringBuilder section = new StringBuilder(); |
|||
|
|||
int totalPosts = analyzer.getPosts().size(); |
|||
double avgLikes = analyzer.getPosts().stream() |
|||
.mapToInt(PostInfo::getLikeCount) |
|||
.average() |
|||
.orElse(0); |
|||
|
|||
section.append(" <div class=\"section\">\n"); |
|||
section.append(" <div class=\"stat-card\">\n"); |
|||
section.append(" <h3>").append(totalPosts).append("</h3>\n"); |
|||
section.append(" <p>帖子总数</p>\n"); |
|||
section.append(" </div>\n"); |
|||
section.append(" <div class=\"stat-card\">\n"); |
|||
section.append(" <h3>").append(String.format("%.1f", avgLikes)).append("</h3>\n"); |
|||
section.append(" <p>平均点赞</p>\n"); |
|||
section.append(" </div>\n"); |
|||
section.append(" </div>\n"); |
|||
|
|||
section.append(" <div class=\"summary\">\n"); |
|||
section.append(" <h3>分析摘要</h3>\n"); |
|||
section.append(" <ul>\n"); |
|||
section.append(" <li>本次分析共收集 ").append(totalPosts).append(" 条图文帖子数据</li>\n"); |
|||
section.append(" <li>数据来源:D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用</li>\n"); |
|||
section.append(" <li>分析内容包括情感倾向分布、互动指标、热门作者等多个维度</li>\n"); |
|||
section.append(" <li>通过数据可视化展示分析结果,便于直观理解</li>\n"); |
|||
section.append(" </ul>\n"); |
|||
section.append(" </div>\n"); |
|||
|
|||
return section.toString(); |
|||
} |
|||
|
|||
private static String generateSentimentSection(PostAnalyzer analyzer) { |
|||
StringBuilder section = new StringBuilder(); |
|||
Map<String, Long> sentimentData = analyzer.getSentimentDistributionData(); |
|||
|
|||
section.append(" <div class=\"section\">\n"); |
|||
section.append(" <h2>情感倾向分布分析</h2>\n"); |
|||
section.append(" <table>\n"); |
|||
section.append(" <tr><th>情感倾向</th><th>帖子数量</th><th>占比</th></tr>\n"); |
|||
|
|||
long total = sentimentData.values().stream().mapToLong(Long::longValue).sum(); |
|||
|
|||
for (Map.Entry<String, Long> entry : sentimentData.entrySet()) { |
|||
double percent = (entry.getValue() * 100.0) / total; |
|||
section.append(" <tr><td>").append(entry.getKey()) |
|||
.append("</td><td>").append(entry.getValue()) |
|||
.append("</td><td>").append(String.format("%.1f%%", percent)) |
|||
.append("</td></tr>\n"); |
|||
} |
|||
|
|||
section.append(" </table>\n"); |
|||
section.append(" </div>\n"); |
|||
|
|||
return section.toString(); |
|||
} |
|||
|
|||
private static String generateEngagementSection(PostAnalyzer analyzer) { |
|||
StringBuilder section = new StringBuilder(); |
|||
Map<String, Double> engagementData = analyzer.getEngagementData(); |
|||
|
|||
section.append(" <div class=\"section\">\n"); |
|||
section.append(" <h2>互动指标分析</h2>\n"); |
|||
section.append(" <table>\n"); |
|||
section.append(" <tr><th>指标</th><th>平均值</th></tr>\n"); |
|||
|
|||
for (Map.Entry<String, Double> entry : engagementData.entrySet()) { |
|||
section.append(" <tr><td>").append(entry.getKey()) |
|||
.append("</td><td>").append(String.format("%.1f", entry.getValue())) |
|||
.append("</td></tr>\n"); |
|||
} |
|||
|
|||
section.append(" </table>\n"); |
|||
section.append(" </div>\n"); |
|||
|
|||
return section.toString(); |
|||
} |
|||
|
|||
private static String generateAuthorSection(PostAnalyzer analyzer) { |
|||
StringBuilder section = new StringBuilder(); |
|||
Map<String, Integer> authorData = analyzer.getAuthorPostCount(); |
|||
|
|||
section.append(" <div class=\"section\">\n"); |
|||
section.append(" <h2>热门作者排行TOP10</h2>\n"); |
|||
section.append(" <table>\n"); |
|||
section.append(" <tr><th>排名</th><th>作者</th><th>帖子数量</th></tr>\n"); |
|||
|
|||
int rank = 1; |
|||
for (Map.Entry<String, Integer> entry : authorData.entrySet()) { |
|||
section.append(" <tr><td>").append(rank++) |
|||
.append("</td><td>").append(entry.getKey()) |
|||
.append("</td><td>").append(entry.getValue()) |
|||
.append("</td></tr>\n"); |
|||
} |
|||
|
|||
section.append(" </table>\n"); |
|||
section.append(" </div>\n"); |
|||
|
|||
return section.toString(); |
|||
} |
|||
|
|||
private static String generateChartsSection() { |
|||
StringBuilder section = new StringBuilder(); |
|||
|
|||
section.append(" <div class=\"section\">\n"); |
|||
section.append(" <h2>数据可视化图表</h2>\n"); |
|||
section.append(" <div class=\"chart-container\">\n"); |
|||
section.append(" <h3>情感倾向分布</h3>\n"); |
|||
section.append(" <img src=\"../charts/sentiment_distribution.png\" alt=\"情感倾向分布图\">\n"); |
|||
section.append(" </div>\n"); |
|||
section.append(" <div class=\"chart-container\">\n"); |
|||
section.append(" <h3>互动指标分析</h3>\n"); |
|||
section.append(" <img src=\"../charts/engagement_metrics.png\" alt=\"互动指标图\">\n"); |
|||
section.append(" </div>\n"); |
|||
section.append(" <div class=\"chart-container\">\n"); |
|||
section.append(" <h3>热门作者排行</h3>\n"); |
|||
section.append(" <img src=\"../charts/author_ranking.png\" alt=\"作者排行图\">\n"); |
|||
section.append(" </div>\n"); |
|||
section.append(" </div>\n"); |
|||
|
|||
return section.toString(); |
|||
} |
|||
} |
|||
@ -0,0 +1,67 @@ |
|||
package com.project; |
|||
|
|||
import com.project.analyzer.PostAnalyzer; |
|||
import com.project.chart.SimpleChartGenerator; |
|||
import com.project.model.PostInfo; |
|||
import com.project.reader.ExcelReader; |
|||
import com.project.report.HTMLReportGenerator; |
|||
import com.project.storage.DataStorage; |
|||
import com.project.util.DataCleaner; |
|||
|
|||
import java.util.List; |
|||
import java.util.Scanner; |
|||
|
|||
public class Main { |
|||
|
|||
public static void main(String[] args) { |
|||
System.out.println("========================================"); |
|||
System.out.println(" Java网络爬虫与数据分析系统"); |
|||
System.out.println("========================================\n"); |
|||
|
|||
String dataFilePath = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子原始信息计量实验使用.xlsx"; |
|||
String outputDir = "d:\\java\\project\\data"; |
|||
int maxRows = 300; |
|||
|
|||
try { |
|||
System.out.println("开始读取本地数据文件..."); |
|||
System.out.println("数据文件: " + dataFilePath); |
|||
System.out.println("读取前 " + maxRows + " 条数据"); |
|||
|
|||
List<PostInfo> rawPosts = ExcelReader.readExcelData(dataFilePath, maxRows); |
|||
|
|||
if (rawPosts.isEmpty()) { |
|||
System.out.println("未获取到任何数据,程序退出"); |
|||
return; |
|||
} |
|||
|
|||
System.out.println("\n开始数据清洗..."); |
|||
List<PostInfo> cleanedPosts = DataCleaner.cleanPosts(rawPosts); |
|||
|
|||
System.out.println("\n保存数据到文件..."); |
|||
DataStorage.saveToCSV(cleanedPosts, outputDir); |
|||
DataStorage.saveToJSON(cleanedPosts, outputDir); |
|||
|
|||
System.out.println("\n开始数据分析..."); |
|||
PostAnalyzer analyzer = new PostAnalyzer(cleanedPosts); |
|||
analyzer.analyzeAll(); |
|||
|
|||
System.out.println("\n生成图表..."); |
|||
SimpleChartGenerator.generateAllCharts(analyzer); |
|||
|
|||
System.out.println("\n生成HTML报告..."); |
|||
HTMLReportGenerator.generateReport(analyzer); |
|||
|
|||
System.out.println("\n========================================"); |
|||
System.out.println(" 程序执行完成!"); |
|||
System.out.println("========================================"); |
|||
System.out.println("\n输出文件位置:"); |
|||
System.out.println("- 数据文件: " + outputDir); |
|||
System.out.println("- 图表文件: d:\\java\\project\\charts"); |
|||
System.out.println("- 报告文件: d:\\java\\project\\reports"); |
|||
|
|||
} catch (Exception e) { |
|||
System.err.println("程序执行出错: " + e.getMessage()); |
|||
e.printStackTrace(); |
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,200 @@ |
|||
package com.project.analyzer; |
|||
|
|||
import com.project.model.PostInfo; |
|||
|
|||
import java.util.*; |
|||
import java.util.stream.Collectors; |
|||
|
|||
public class PostAnalyzer { |
|||
|
|||
private final List<PostInfo> posts; |
|||
|
|||
public PostAnalyzer(List<PostInfo> posts) { |
|||
this.posts = posts; |
|||
} |
|||
|
|||
public List<PostInfo> getPosts() { |
|||
return posts; |
|||
} |
|||
|
|||
public void analyzeAll() { |
|||
System.out.println("\n========== 数据分析报告 ==========\n"); |
|||
|
|||
analyzeSentimentDistribution(); |
|||
analyzeEngagementMetrics(); |
|||
analyzePopularAuthors(); |
|||
analyzeContentLength(); |
|||
analyzeTemporalTrends(); |
|||
|
|||
System.out.println("\n========== 分析完成 ==========\n"); |
|||
} |
|||
|
|||
public void analyzeSentimentDistribution() { |
|||
System.out.println("【情感倾向分布分析】"); |
|||
System.out.println("----------------------------------------"); |
|||
|
|||
Map<String, Long> sentimentCounts = posts.stream() |
|||
.collect(Collectors.groupingBy( |
|||
PostInfo::getSentiment, |
|||
Collectors.counting() |
|||
)); |
|||
|
|||
System.out.printf("%-20s %s%n", "情感倾向", "帖子数量"); |
|||
System.out.println("----------------------------------------"); |
|||
|
|||
sentimentCounts.entrySet().stream() |
|||
.sorted(Map.Entry.<String, Long>comparingByValue().reversed()) |
|||
.forEach(entry -> System.out.printf("%-20s %d%n", entry.getKey(), entry.getValue())); |
|||
|
|||
System.out.println(); |
|||
} |
|||
|
|||
public void analyzeEngagementMetrics() { |
|||
System.out.println("【互动指标分析】"); |
|||
System.out.println("----------------------------------------"); |
|||
|
|||
double avgLikes = posts.stream() |
|||
.mapToInt(PostInfo::getLikeCount) |
|||
.average() |
|||
.orElse(0); |
|||
|
|||
double avgComments = posts.stream() |
|||
.mapToInt(PostInfo::getCommentCount) |
|||
.average() |
|||
.orElse(0); |
|||
|
|||
double avgViews = posts.stream() |
|||
.mapToInt(PostInfo::getViewCount) |
|||
.average() |
|||
.orElse(0); |
|||
|
|||
System.out.printf("平均点赞数: %.1f%n", avgLikes); |
|||
System.out.printf("平均评论数: %.1f%n", avgComments); |
|||
System.out.printf("平均浏览量: %.1f%n", avgViews); |
|||
|
|||
System.out.println(); |
|||
} |
|||
|
|||
public void analyzePopularAuthors() { |
|||
System.out.println("【热门作者排行】"); |
|||
System.out.println("----------------------------------------"); |
|||
System.out.printf("%-30s %10s %10s %10s%n", "作者", "帖子数", "总点赞", "总评论"); |
|||
System.out.println("----------------------------------------"); |
|||
|
|||
Map<String, List<PostInfo>> authorPosts = posts.stream() |
|||
.collect(Collectors.groupingBy(PostInfo::getAuthor)); |
|||
|
|||
authorPosts.entrySet().stream() |
|||
.sorted(Map.Entry.<String, List<PostInfo>>comparingByValue((a, b) -> b.size() - a.size())) |
|||
.limit(10) |
|||
.forEach(entry -> { |
|||
String author = entry.getKey(); |
|||
List<PostInfo> authorPostList = entry.getValue(); |
|||
int postCount = authorPostList.size(); |
|||
int totalLikes = authorPostList.stream().mapToInt(PostInfo::getLikeCount).sum(); |
|||
int totalComments = authorPostList.stream().mapToInt(PostInfo::getCommentCount).sum(); |
|||
|
|||
System.out.printf("%-30s %10d %10d %10d%n", |
|||
author.length() > 28 ? author.substring(0, 28) : author, |
|||
postCount, totalLikes, totalComments); |
|||
}); |
|||
|
|||
System.out.println(); |
|||
} |
|||
|
|||
public void analyzeContentLength() { |
|||
System.out.println("【内容长度分析】"); |
|||
System.out.println("----------------------------------------"); |
|||
|
|||
double avgLength = posts.stream() |
|||
.mapToInt(post -> post.getContent().length()) |
|||
.average() |
|||
.orElse(0); |
|||
|
|||
int maxLength = posts.stream() |
|||
.mapToInt(post -> post.getContent().length()) |
|||
.max() |
|||
.orElse(0); |
|||
|
|||
int minLength = posts.stream() |
|||
.mapToInt(post -> post.getContent().length()) |
|||
.min() |
|||
.orElse(0); |
|||
|
|||
System.out.printf("平均内容长度: %.1f 字符%n", avgLength); |
|||
System.out.printf("最长内容: %d 字符%n", maxLength); |
|||
System.out.printf("最短内容: %d 字符%n", minLength); |
|||
|
|||
System.out.println(); |
|||
} |
|||
|
|||
public void analyzeTemporalTrends() { |
|||
System.out.println("【时间趋势分析】"); |
|||
System.out.println("----------------------------------------"); |
|||
|
|||
Map<String, Long> monthlyPosts = posts.stream() |
|||
.filter(post -> post.getPostDate() != null) |
|||
.collect(Collectors.groupingBy( |
|||
post -> post.getPostDate().format(java.time.format.DateTimeFormatter.ofPattern("yyyy-MM")), |
|||
Collectors.counting() |
|||
)); |
|||
|
|||
System.out.printf("%-10s %s%n", "月份", "帖子数量"); |
|||
System.out.println("----------------------------------------"); |
|||
|
|||
monthlyPosts.entrySet().stream() |
|||
.sorted(Map.Entry.comparingByKey()) |
|||
.forEach(entry -> System.out.printf("%-10s %d%n", entry.getKey(), entry.getValue())); |
|||
|
|||
System.out.println(); |
|||
} |
|||
|
|||
public Map<String, Long> getSentimentDistributionData() { |
|||
return posts.stream() |
|||
.collect(Collectors.groupingBy( |
|||
PostInfo::getSentiment, |
|||
Collectors.counting() |
|||
)); |
|||
} |
|||
|
|||
public Map<String, Double> getEngagementData() { |
|||
Map<String, Double> engagementData = new LinkedHashMap<>(); |
|||
|
|||
double avgLikes = posts.stream() |
|||
.mapToInt(PostInfo::getLikeCount) |
|||
.average() |
|||
.orElse(0); |
|||
|
|||
double avgComments = posts.stream() |
|||
.mapToInt(PostInfo::getCommentCount) |
|||
.average() |
|||
.orElse(0); |
|||
|
|||
double avgViews = posts.stream() |
|||
.mapToInt(PostInfo::getViewCount) |
|||
.average() |
|||
.orElse(0); |
|||
|
|||
engagementData.put("点赞", avgLikes); |
|||
engagementData.put("评论", avgComments); |
|||
engagementData.put("浏览", avgViews); |
|||
|
|||
return engagementData; |
|||
} |
|||
|
|||
public Map<String, Integer> getAuthorPostCount() { |
|||
return posts.stream() |
|||
.collect(Collectors.groupingBy( |
|||
PostInfo::getAuthor, |
|||
Collectors.summingInt(post -> 1) |
|||
)).entrySet().stream() |
|||
.sorted(Map.Entry.<String, Integer>comparingByValue().reversed()) |
|||
.limit(10) |
|||
.collect(Collectors.toMap( |
|||
Map.Entry::getKey, |
|||
Map.Entry::getValue, |
|||
(e1, e2) -> e1, |
|||
LinkedHashMap::new |
|||
)); |
|||
} |
|||
} |
|||
@ -0,0 +1,129 @@ |
|||
package com.project.model; |
|||
|
|||
import java.time.LocalDate; |
|||
|
|||
/**
 * Mutable data holder for one post: text fields, publication date,
 * engagement counters, tags and sentiment label.
 */
public class PostInfo {
    private String title;
    private String content;
    private String author;
    private LocalDate postDate;
    private int likeCount;
    private int commentCount;
    private int viewCount;
    private String tags;
    private String sentiment;

    /** Creates an empty post: text fields and date are {@code null}, counters are 0. */
    public PostInfo() {
    }

    /** Creates a post with every field set; the values are stored as given. */
    public PostInfo(String title, String content, String author, LocalDate postDate,
                    int likeCount, int commentCount, int viewCount, String tags, String sentiment) {
        this.title = title;
        this.content = content;
        this.author = author;
        this.postDate = postDate;
        this.likeCount = likeCount;
        this.commentCount = commentCount;
        this.viewCount = viewCount;
        this.tags = tags;
        this.sentiment = sentiment;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public String getAuthor() {
        return author;
    }

    public void setAuthor(String author) {
        this.author = author;
    }

    public LocalDate getPostDate() {
        return postDate;
    }

    public void setPostDate(LocalDate postDate) {
        this.postDate = postDate;
    }

    public int getLikeCount() {
        return likeCount;
    }

    public void setLikeCount(int likeCount) {
        this.likeCount = likeCount;
    }

    public int getCommentCount() {
        return commentCount;
    }

    public void setCommentCount(int commentCount) {
        this.commentCount = commentCount;
    }

    public int getViewCount() {
        return viewCount;
    }

    public void setViewCount(int viewCount) {
        this.viewCount = viewCount;
    }

    public String getTags() {
        return tags;
    }

    public void setTags(String tags) {
        this.tags = tags;
    }

    public String getSentiment() {
        return sentiment;
    }

    public void setSentiment(String sentiment) {
        this.sentiment = sentiment;
    }

    /** Debug representation; content and tags are intentionally left out. */
    @Override
    public String toString() {
        return "PostInfo{" +
                "title='" + title + '\'' +
                ", author='" + author + '\'' +
                ", postDate=" + postDate +
                ", likeCount=" + likeCount +
                ", commentCount=" + commentCount +
                ", viewCount=" + viewCount +
                ", sentiment='" + sentiment + '\'' +
                '}';
    }

    /**
     * Renders this post as one CSV record without a trailing line break.
     *
     * <p>Field order: title, content, author, date, likes, comments, views,
     * tags, sentiment. Text fields are quoted, embedded quotes are doubled,
     * and any run of line breaks inside a text field becomes a single space
     * so that the record always occupies exactly one line.
     *
     * @return the CSV record; {@code null} text fields and a {@code null}
     *         date are written as empty quoted fields
     */
    public String toCSV() {
        // Locale.ROOT keeps %d as plain ASCII digits whatever the default locale is.
        return String.format(java.util.Locale.ROOT,
                "\"%s\",\"%s\",\"%s\",\"%s\",%d,%d,%d,\"%s\",\"%s\"",
                csvField(title),
                csvField(content),
                csvField(author),
                postDate != null ? postDate.toString() : "",
                likeCount,
                commentCount,
                viewCount,
                csvField(tags),
                csvField(sentiment));
    }

    /** Doubles embedded quotes and flattens CR/LF runs; {@code null} becomes {@code ""}. */
    private static String csvField(String value) {
        if (value == null) {
            return "";
        }
        return value.replace("\"", "\"\"").replaceAll("[\\r\\n]+", " ");
    }
}
|||
@ -0,0 +1,2 @@ |
|||
# java |
|||
|
|||
@ -0,0 +1,165 @@ |
|||
package com.project.chart; |
|||
|
|||
import com.project.analyzer.PostAnalyzer; |
|||
|
|||
import java.awt.*; |
|||
import java.awt.image.BufferedImage; |
|||
import java.io.File; |
|||
import java.io.IOException; |
|||
import java.nio.file.Files; |
|||
import java.nio.file.Paths; |
|||
import java.util.Map; |
|||
import javax.imageio.ImageIO; |
|||
|
|||
public class SimpleChartGenerator { |
|||
|
|||
private static final String OUTPUT_DIR = "d:\\java\\project\\charts"; |
|||
private static final int WIDTH = 800; |
|||
private static final int HEIGHT = 600; |
|||
|
|||
public static void generateAllCharts(PostAnalyzer analyzer) { |
|||
try { |
|||
Files.createDirectories(Paths.get(OUTPUT_DIR)); |
|||
|
|||
generateSentimentChart(analyzer); |
|||
generateEngagementChart(analyzer); |
|||
generateAuthorChart(analyzer); |
|||
|
|||
System.out.println("\n所有图表已生成,保存在: " + OUTPUT_DIR); |
|||
|
|||
} catch (IOException e) { |
|||
System.err.println("创建图表目录时出错: " + e.getMessage()); |
|||
} |
|||
} |
|||
|
|||
public static void generateSentimentChart(PostAnalyzer analyzer) { |
|||
Map<String, Long> data = analyzer.getSentimentDistributionData(); |
|||
|
|||
BufferedImage image = new BufferedImage(WIDTH, HEIGHT, BufferedImage.TYPE_INT_RGB); |
|||
Graphics2D g2d = image.createGraphics(); |
|||
|
|||
g2d.setColor(Color.WHITE); |
|||
g2d.fillRect(0, 0, WIDTH, HEIGHT); |
|||
|
|||
g2d.setColor(Color.BLACK); |
|||
g2d.setFont(new Font("宋体", Font.BOLD, 24)); |
|||
g2d.drawString("情感倾向分布", 300, 40); |
|||
|
|||
int barWidth = 150; |
|||
int startX = 200; |
|||
int startY = 500; |
|||
int maxHeight = 400; |
|||
|
|||
long maxValue = data.values().stream().max(Long::compare).orElse(1L); |
|||
|
|||
int index = 0; |
|||
for (Map.Entry<String, Long> entry : data.entrySet()) { |
|||
int barHeight = (int) ((entry.getValue() * 1.0 / maxValue) * maxHeight); |
|||
|
|||
g2d.setColor(new Color(70, 130, 180)); |
|||
g2d.fillRect(startX + index * (barWidth + 50), startY - barHeight, barWidth, barHeight); |
|||
|
|||
g2d.setColor(Color.BLACK); |
|||
g2d.setFont(new Font("宋体", Font.PLAIN, 14)); |
|||
g2d.drawString(entry.getKey(), startX + index * (barWidth + 50) + 50, startY + 20); |
|||
g2d.drawString(String.valueOf(entry.getValue()), startX + index * (barWidth + 50) + 60, startY - barHeight - 5); |
|||
|
|||
index++; |
|||
} |
|||
|
|||
g2d.dispose(); |
|||
saveImage(image, "sentiment_distribution.png"); |
|||
} |
|||
|
|||
public static void generateEngagementChart(PostAnalyzer analyzer) { |
|||
Map<String, Double> data = analyzer.getEngagementData(); |
|||
|
|||
BufferedImage image = new BufferedImage(WIDTH, HEIGHT, BufferedImage.TYPE_INT_RGB); |
|||
Graphics2D g2d = image.createGraphics(); |
|||
|
|||
g2d.setColor(Color.WHITE); |
|||
g2d.fillRect(0, 0, WIDTH, HEIGHT); |
|||
|
|||
g2d.setColor(Color.BLACK); |
|||
g2d.setFont(new Font("宋体", Font.BOLD, 24)); |
|||
g2d.drawString("互动指标分析", 300, 40); |
|||
|
|||
int barWidth = 150; |
|||
int startX = 200; |
|||
int startY = 500; |
|||
int maxHeight = 400; |
|||
|
|||
double maxValue = data.values().stream().max(Double::compare).orElse(1.0); |
|||
|
|||
int index = 0; |
|||
for (Map.Entry<String, Double> entry : data.entrySet()) { |
|||
int barHeight = (int) ((entry.getValue() / maxValue) * maxHeight); |
|||
|
|||
g2d.setColor(new Color(60, 179, 113)); |
|||
g2d.fillRect(startX + index * (barWidth + 50), startY - barHeight, barWidth, barHeight); |
|||
|
|||
g2d.setColor(Color.BLACK); |
|||
g2d.setFont(new Font("宋体", Font.PLAIN, 14)); |
|||
g2d.drawString(entry.getKey(), startX + index * (barWidth + 50) + 60, startY + 20); |
|||
g2d.drawString(String.format("%.1f", entry.getValue()), startX + index * (barWidth + 50) + 50, startY - barHeight - 5); |
|||
|
|||
index++; |
|||
} |
|||
|
|||
g2d.dispose(); |
|||
saveImage(image, "engagement_metrics.png"); |
|||
} |
|||
|
|||
public static void generateAuthorChart(PostAnalyzer analyzer) { |
|||
Map<String, Integer> data = analyzer.getAuthorPostCount(); |
|||
|
|||
BufferedImage image = new BufferedImage(WIDTH, HEIGHT, BufferedImage.TYPE_INT_RGB); |
|||
Graphics2D g2d = image.createGraphics(); |
|||
|
|||
g2d.setColor(Color.WHITE); |
|||
g2d.fillRect(0, 0, WIDTH, HEIGHT); |
|||
|
|||
g2d.setColor(Color.BLACK); |
|||
g2d.setFont(new Font("宋体", Font.BOLD, 24)); |
|||
g2d.drawString("热门作者排行TOP10", 280, 40); |
|||
|
|||
int barHeight = 35; |
|||
int startY = 80; |
|||
int startX = 200; |
|||
int maxWidth = 500; |
|||
|
|||
int maxValue = data.values().stream().max(Integer::compare).orElse(1); |
|||
|
|||
int index = 0; |
|||
for (Map.Entry<String, Integer> entry : data.entrySet()) { |
|||
int barWidth = (int) ((entry.getValue() * 1.0 / maxValue) * maxWidth); |
|||
|
|||
g2d.setColor(new Color(255, 140, 0)); |
|||
g2d.fillRect(startX, startY + index * (barHeight + 10), barWidth, barHeight); |
|||
|
|||
g2d.setColor(Color.BLACK); |
|||
g2d.setFont(new Font("宋体", Font.PLAIN, 12)); |
|||
String author = entry.getKey(); |
|||
if (author.length() > 15) { |
|||
author = author.substring(0, 15) + "..."; |
|||
} |
|||
g2d.drawString(author, 50, startY + index * (barHeight + 10) + 23); |
|||
g2d.drawString(String.valueOf(entry.getValue()), startX + barWidth + 10, startY + index * (barHeight + 10) + 23); |
|||
|
|||
index++; |
|||
} |
|||
|
|||
g2d.dispose(); |
|||
saveImage(image, "author_ranking.png"); |
|||
} |
|||
|
|||
private static void saveImage(BufferedImage image, String filename) { |
|||
try { |
|||
File file = new File(OUTPUT_DIR, filename); |
|||
ImageIO.write(image, "PNG", file); |
|||
System.out.println("图表已保存: " + file.getAbsolutePath()); |
|||
} catch (IOException e) { |
|||
System.err.println("保存图表失败: " + e.getMessage()); |
|||
} |
|||
} |
|||
} |
|||
Loading…
Reference in new issue