34 changed files with 4369 additions and 0 deletions
Binary file not shown.
|
|
@ -0,0 +1,4 @@ |
|||
*.jar |
|||
*.jar |
|||
*.class |
|||
*.log |
|||
@ -0,0 +1,17 @@ |
|||
# DataCollect 教学项目 — 最小可运行版本 |
|||
|
|||
这是一个 Java CLI 爬虫演示工程:启动后通过交互式菜单选择要爬取的网站,并可将结果导出为 CSV 文件。 |
|||
|
|||
构建: |
|||
```bash |
|||
mvn -q package |
|||
``` |
|||
|
|||
运行(示例): |
|||
```bash |
|||
java -jar target/datacollect-cli-0.1.0-jar-with-dependencies.jar --help |
|||
``` |
|||
|
|||
项目结构(最小): |
|||
- `src/main/java/com/example/datacollect/Main.java` — CLI 入口,显示交互式菜单并调度爬取、图表与导出命令 |
|||
- `pom.xml` — Maven 构建配置,生成可执行 jar |
|||
@ -0,0 +1,62 @@ |
|||
<project xmlns="http://maven.apache.org/POM/4.0.0" |
|||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" |
|||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd"> |
|||
<modelVersion>4.0.0</modelVersion> |
|||
<groupId>com.example</groupId> |
|||
<artifactId>datacollect-cli</artifactId> |
|||
<version>0.1.0</version> |
|||
<properties> |
|||
<maven.compiler.source>11</maven.compiler.source> |
|||
<maven.compiler.target>11</maven.compiler.target> |
|||
</properties> |
|||
<dependencies> |
|||
<dependency> |
|||
<groupId>org.jsoup</groupId> |
|||
<artifactId>jsoup</artifactId> |
|||
<version>1.14.3</version> |
|||
</dependency> |
|||
<dependency> |
|||
<groupId>ch.qos.logback</groupId> |
|||
<artifactId>logback-classic</artifactId> |
|||
<version>1.2.11</version> |
|||
</dependency> |
|||
<dependency> |
|||
<groupId>org.slf4j</groupId> |
|||
<artifactId>slf4j-api</artifactId> |
|||
<version>1.7.36</version> |
|||
</dependency> |
|||
</dependencies> |
|||
<build> |
|||
<plugins> |
|||
<plugin> |
|||
<groupId>org.apache.maven.plugins</groupId> |
|||
<artifactId>maven-compiler-plugin</artifactId> |
|||
<version>3.8.1</version> |
|||
</plugin> |
|||
<plugin> |
|||
<groupId>org.apache.maven.plugins</groupId> |
|||
<artifactId>maven-assembly-plugin</artifactId> |
|||
<version>3.3.0</version> |
|||
<configuration> |
|||
<archive> |
|||
<manifest> |
|||
<mainClass>com.example.datacollect.Main</mainClass> |
|||
</manifest> |
|||
</archive> |
|||
<descriptorRefs> |
|||
<descriptorRef>jar-with-dependencies</descriptorRef> |
|||
</descriptorRefs> |
|||
</configuration> |
|||
<executions> |
|||
<execution> |
|||
<id>make-assembly</id> |
|||
<phase>package</phase> |
|||
<goals> |
|||
<goal>single</goal> |
|||
</goals> |
|||
</execution> |
|||
</executions> |
|||
</plugin> |
|||
</plugins> |
|||
</build> |
|||
</project> |
|||
@ -0,0 +1,155 @@ |
|||
package com.example.datacollect; |
|||
|
|||
import com.example.datacollect.controller.CrawlerController; |
|||
import com.example.datacollect.repository.ArticleRepository; |
|||
import com.example.datacollect.strategy.StrategyFactory; |
|||
import com.example.datacollect.view.ConsoleView; |
|||
import org.slf4j.Logger; |
|||
import org.slf4j.LoggerFactory; |
|||
|
|||
public class Main { |
|||
private static final Logger logger = LoggerFactory.getLogger(Main.class); |
|||
|
|||
public static void main(String[] args) { |
|||
logger.info("Starting CLI Crawler project"); |
|||
ConsoleView view = new ConsoleView(); |
|||
ArticleRepository repository = new ArticleRepository(); |
|||
StrategyFactory strategyFactory = new StrategyFactory(); |
|||
CrawlerController controller = new CrawlerController(view, repository, strategyFactory); |
|||
|
|||
view.printSuccess("Welcome to CLI Crawler (project)!"); |
|||
|
|||
while (true) { |
|||
view.printInfo("\n请选择要爬取的网站:"); |
|||
view.printInfo("1. 湖南大学新闻网 (https://news.hnu.edu.cn)"); |
|||
view.printInfo("2. 豆瓣电影Top250 (https://movie.douban.com/top250)"); |
|||
view.printInfo("3. 天气数据(输入城市查询近30天天气)"); |
|||
view.printInfo("4. 其他网站(手动输入URL)"); |
|||
view.printInfo("5. 退出"); |
|||
|
|||
view.print("\n请输入选项 (1-5): "); |
|||
String choice = view.readLine().trim(); |
|||
|
|||
String url = null; |
|||
switch (choice) { |
|||
case "1": |
|||
url = "https://news.hnu.edu.cn"; |
|||
break; |
|||
case "2": |
|||
url = "https://movie.douban.com/top250"; |
|||
break; |
|||
case "3": |
|||
view.printInfo("\n请选择城市:"); |
|||
view.printInfo("1. 北京 2. 上海 3. 广州 4. 深圳"); |
|||
view.printInfo("5. 长沙 6. 武汉 7. 成都 8. 杭州"); |
|||
view.printInfo("9. 南京 10. 天津 11. 重庆 12. 西安"); |
|||
view.printInfo("13. 苏州 14. 郑州 15. 沈阳 16. 青岛"); |
|||
view.printInfo("17. 厦门 18. 济南 19. 哈尔滨 20. 长春"); |
|||
view.printInfo("21. 昆明 22. 贵阳 23. 南宁 24. 海口"); |
|||
view.printInfo("25. 兰州 26. 拉萨 27. 乌鲁木齐"); |
|||
view.printInfo("或直接输入城市拼音(如:changsha)"); |
|||
|
|||
view.print("\n请输入选项或拼音: "); |
|||
String cityInput = view.readLine().trim(); |
|||
if (cityInput.isEmpty()) { |
|||
view.printError("输入不能为空"); |
|||
continue; |
|||
} |
|||
|
|||
String cityPinyin = getCityByInput(cityInput); |
|||
if (cityPinyin == null || cityPinyin.isEmpty()) { |
|||
view.printError("无效的城市选项: " + cityInput); |
|||
continue; |
|||
} |
|||
java.time.LocalDate now = java.time.LocalDate.now(); |
|||
String yearMonth = String.format("%04d%02d", now.getYear(), now.getMonthValue()); |
|||
url = "https://www.tianqihoubao.com/lishi/" + cityPinyin + "/month/" + yearMonth + ".html"; |
|||
view.printInfo("将爬取近30天天气数据: " + url); |
|||
break; |
|||
case "4": |
|||
view.print("请输入要爬取的网站URL: "); |
|||
url = view.readLine().trim(); |
|||
if (url.isEmpty()) { |
|||
view.printError("URL不能为空"); |
|||
continue; |
|||
} |
|||
break; |
|||
case "5": |
|||
view.printInfo("再见!"); |
|||
System.exit(0); |
|||
default: |
|||
view.printError("无效选项,请重新选择"); |
|||
continue; |
|||
} |
|||
|
|||
view.printInfo("\n正在爬取: " + url); |
|||
controller.handle("crawl " + url); |
|||
|
|||
if (repository.size() > 0) { |
|||
if (url.contains("stats.gov.cn")) { |
|||
view.print("\n是否显示柱状图? (y/n): "); |
|||
String chartChoice = view.readLine().trim().toLowerCase(); |
|||
if ("y".equals(chartChoice)) { |
|||
controller.handle("chart"); |
|||
|
|||
view.print("\n是否导出图表? (y/n): "); |
|||
String exportChartChoice = view.readLine().trim().toLowerCase(); |
|||
if ("y".equals(exportChartChoice)) { |
|||
view.print("请输入文件名 (如: chart.txt): "); |
|||
String chartFile = view.readLine().trim(); |
|||
if (chartFile.isEmpty()) { |
|||
chartFile = "chart.txt"; |
|||
} |
|||
controller.handle("chart " + chartFile); |
|||
} |
|||
} |
|||
} |
|||
|
|||
view.print("\n是否导出CSV数据? (y/n): "); |
|||
String exportChoice = view.readLine().trim().toLowerCase(); |
|||
if ("y".equals(exportChoice)) { |
|||
view.print("请输入文件名 (如: data.csv): "); |
|||
String filename = view.readLine().trim(); |
|||
if (filename.isEmpty()) { |
|||
filename = "data.csv"; |
|||
} |
|||
controller.handle("export " + filename); |
|||
} |
|||
} |
|||
|
|||
view.print("\n是否继续爬取其他网站? (y/n): "); |
|||
String continueChoice = view.readLine().trim().toLowerCase(); |
|||
if (!"y".equals(continueChoice)) { |
|||
view.printInfo("再见!"); |
|||
break; |
|||
} |
|||
} |
|||
} |
|||
|
|||
private static String getCityByInput(String input) { |
|||
String[] cities = { |
|||
"beijing", "shanghai", "guangzhou", "shenzhen", |
|||
"changsha", "wuhan", "chengdu", "hangzhou", |
|||
"nanjing", "tianjin", "chongqing", "xian", |
|||
"suzhou", "zhengzhou", "shenyang", "qingdao", |
|||
"xiamen", "jinan", "haerbin", "changchun", |
|||
"kunming", "guiyang", "nanning", "haikou", |
|||
"lanzhou", "lasa", "wulumuqi" |
|||
}; |
|||
|
|||
try { |
|||
int index = Integer.parseInt(input); |
|||
if (index >= 1 && index <= cities.length) { |
|||
return cities[index - 1]; |
|||
} |
|||
} catch (NumberFormatException e) { |
|||
} |
|||
|
|||
String lower = input.toLowerCase().trim(); |
|||
if (lower.matches("[a-z]+")) { |
|||
return lower; |
|||
} |
|||
|
|||
return null; |
|||
} |
|||
} |
|||
@ -0,0 +1,255 @@ |
|||
package com.example.datacollect.command; |
|||
|
|||
import com.example.datacollect.model.Article; |
|||
import com.example.datacollect.repository.ArticleRepository; |
|||
import com.example.datacollect.view.ConsoleView; |
|||
import org.slf4j.Logger; |
|||
import org.slf4j.LoggerFactory; |
|||
import java.io.FileWriter; |
|||
import java.io.IOException; |
|||
import java.util.ArrayList; |
|||
import java.util.Comparator; |
|||
import java.util.List; |
|||
import java.util.regex.Matcher; |
|||
import java.util.regex.Pattern; |
|||
|
|||
public class ChartCommand implements Command { |
|||
private static final Logger logger = LoggerFactory.getLogger(ChartCommand.class); |
|||
private static final Pattern NUMBER_PATTERN = Pattern.compile("[\\d,]+\\.?\\d*"); |
|||
|
|||
private static final String[] COLORS = { |
|||
"\u001B[31m", // 红色
|
|||
"\u001B[32m", // 绿色
|
|||
"\u001B[33m", // 黄色
|
|||
"\u001B[34m", // 蓝色
|
|||
"\u001B[35m", // 紫色
|
|||
"\u001B[36m", // 青色
|
|||
"\u001B[91m", // 亮红
|
|||
"\u001B[92m", // 亮绿
|
|||
"\u001B[93m", // 亮黄
|
|||
"\u001B[94m", // 亮蓝
|
|||
}; |
|||
private static final String RESET = "\u001B[0m"; |
|||
|
|||
private final ConsoleView view; |
|||
|
|||
public ChartCommand(ConsoleView view) { |
|||
this.view = view; |
|||
} |
|||
|
|||
@Override |
|||
public String getName() { |
|||
return "chart"; |
|||
} |
|||
|
|||
@Override |
|||
public void execute(String[] args, ArticleRepository repository) { |
|||
List<Article> articles = repository.getAll(); |
|||
|
|||
if (articles.isEmpty()) { |
|||
view.printError("没有数据,请先爬取数据"); |
|||
return; |
|||
} |
|||
|
|||
List<DataItem> dataItems = new ArrayList<>(); |
|||
for (Article article : articles) { |
|||
String title = article.getTitle(); |
|||
String content = article.getContent(); |
|||
|
|||
if (content != null && !content.isEmpty()) { |
|||
double value = extractNumber(content); |
|||
if (value > 0) { |
|||
String name = title.replace(" 人口数据", "").trim(); |
|||
if (name.isEmpty()) { |
|||
name = title; |
|||
} |
|||
dataItems.add(new DataItem(name, value)); |
|||
} |
|||
} |
|||
} |
|||
|
|||
if (dataItems.isEmpty()) { |
|||
view.printError("没有找到可用的数值数据"); |
|||
return; |
|||
} |
|||
|
|||
dataItems.sort(Comparator.comparingDouble(DataItem::getValue).reversed()); |
|||
|
|||
if (dataItems.size() > 15) { |
|||
dataItems = dataItems.subList(0, 15); |
|||
} |
|||
|
|||
view.printInfo("\n=== 各省人口数据柱状图 ===\n"); |
|||
drawVerticalBarChart(dataItems); |
|||
|
|||
if (args.length >= 2) { |
|||
String filename = args[1]; |
|||
if (!filename.toLowerCase().endsWith(".txt")) { |
|||
filename += ".txt"; |
|||
} |
|||
exportChart(dataItems, filename); |
|||
} |
|||
} |
|||
|
|||
private double extractNumber(String text) { |
|||
Matcher matcher = NUMBER_PATTERN.matcher(text.replace(",", "")); |
|||
if (matcher.find()) { |
|||
try { |
|||
return Double.parseDouble(matcher.group()); |
|||
} catch (NumberFormatException e) { |
|||
return 0; |
|||
} |
|||
} |
|||
return 0; |
|||
} |
|||
|
|||
private void drawVerticalBarChart(List<DataItem> items) { |
|||
if (items.isEmpty()) return; |
|||
|
|||
int maxBarHeight = 15; |
|||
int barWidth = 4; |
|||
double maxValue = items.stream().mapToDouble(DataItem::getValue).max().orElse(1); |
|||
|
|||
int[] heights = new int[items.size()]; |
|||
for (int i = 0; i < items.size(); i++) { |
|||
heights[i] = (int) ((items.get(i).value / maxValue) * maxBarHeight); |
|||
if (heights[i] == 0 && items.get(i).value > 0) { |
|||
heights[i] = 1; |
|||
} |
|||
} |
|||
|
|||
System.out.println(); |
|||
for (int row = maxBarHeight; row >= 0; row--) { |
|||
System.out.print(" "); |
|||
for (int col = 0; col < items.size(); col++) { |
|||
String color = COLORS[col % COLORS.length]; |
|||
if (heights[col] >= row && row > 0) { |
|||
System.out.print(color + " " + "█".repeat(barWidth) + " " + RESET); |
|||
} else if (row == 0) { |
|||
System.out.print(" " + "─".repeat(barWidth) + " "); |
|||
} else { |
|||
System.out.print(" " + " ".repeat(barWidth) + " "); |
|||
} |
|||
} |
|||
|
|||
if (row > 0 && row % 3 == 0) { |
|||
double yValue = (maxValue * row / maxBarHeight); |
|||
System.out.print(" " + formatValue(yValue)); |
|||
} |
|||
System.out.println(); |
|||
} |
|||
|
|||
System.out.print(" "); |
|||
for (int col = 0; col < items.size(); col++) { |
|||
String color = COLORS[col % COLORS.length]; |
|||
System.out.print(color + "┬" + "─".repeat(barWidth) + "┬" + RESET); |
|||
} |
|||
System.out.println(); |
|||
|
|||
String[] names = new String[items.size()]; |
|||
for (int i = 0; i < items.size(); i++) { |
|||
names[i] = truncate(items.get(i).name, barWidth); |
|||
} |
|||
|
|||
for (int line = 0; line < 2; line++) { |
|||
System.out.print(" "); |
|||
for (int col = 0; col < items.size(); col++) { |
|||
String color = COLORS[col % COLORS.length]; |
|||
String name = names[col]; |
|||
if (line == 0) { |
|||
String part = name.length() >= 2 ? name.substring(0, Math.min(2, name.length())) : name; |
|||
System.out.print(color + " " + center(part, barWidth) + " " + RESET); |
|||
} else { |
|||
String part = name.length() > 2 ? name.substring(2, Math.min(4, name.length())) : ""; |
|||
System.out.print(color + " " + center(part, barWidth) + " " + RESET); |
|||
} |
|||
} |
|||
System.out.println(); |
|||
} |
|||
|
|||
System.out.println(); |
|||
System.out.println("数据详情:"); |
|||
for (int i = 0; i < items.size(); i++) { |
|||
String color = COLORS[i % COLORS.length]; |
|||
System.out.println(color + "■" + RESET + " " + items.get(i).name + ": " + formatValue(items.get(i).value)); |
|||
} |
|||
} |
|||
|
|||
private String center(String s, int width) { |
|||
if (s.length() >= width) return s.substring(0, width); |
|||
int padding = (width - s.length()) / 2; |
|||
return " ".repeat(padding) + s + " ".repeat(width - s.length() - padding); |
|||
} |
|||
|
|||
private String truncate(String s, int maxLen) { |
|||
if (s.length() <= maxLen) return s; |
|||
return s.substring(0, maxLen); |
|||
} |
|||
|
|||
private String formatValue(double value) { |
|||
if (value >= 100000000) { |
|||
return String.format("%.2f亿", value / 100000000); |
|||
} else if (value >= 10000) { |
|||
return String.format("%.2f万", value / 10000); |
|||
} else { |
|||
return String.format("%.0f", value); |
|||
} |
|||
} |
|||
|
|||
private void exportChart(List<DataItem> items, String filename) { |
|||
try (FileWriter writer = new FileWriter(filename)) { |
|||
writer.write("各省人口数据柱状图\n"); |
|||
writer.write("========================\n\n"); |
|||
|
|||
int maxBarHeight = 15; |
|||
int barWidth = 4; |
|||
double maxValue = items.stream().mapToDouble(DataItem::getValue).max().orElse(1); |
|||
|
|||
int[] heights = new int[items.size()]; |
|||
for (int i = 0; i < items.size(); i++) { |
|||
heights[i] = (int) ((items.get(i).value / maxValue) * maxBarHeight); |
|||
if (heights[i] == 0 && items.get(i).value > 0) { |
|||
heights[i] = 1; |
|||
} |
|||
} |
|||
|
|||
writer.write("\n"); |
|||
for (int row = maxBarHeight; row >= 0; row--) { |
|||
writer.write(" "); |
|||
for (int col = 0; col < items.size(); col++) { |
|||
if (heights[col] >= row && row > 0) { |
|||
writer.write(" " + "*".repeat(barWidth) + " "); |
|||
} else if (row == 0) { |
|||
writer.write(" " + "-".repeat(barWidth) + " "); |
|||
} else { |
|||
writer.write(" " + " ".repeat(barWidth) + " "); |
|||
} |
|||
} |
|||
writer.write("\n"); |
|||
} |
|||
|
|||
writer.write("\n原始数据:\n"); |
|||
for (DataItem item : items) { |
|||
writer.write(item.name + ": " + String.format("%.0f", item.value) + "\n"); |
|||
} |
|||
|
|||
view.printSuccess("图表已导出到: " + filename); |
|||
} catch (IOException e) { |
|||
view.printError("导出失败: " + e.getMessage()); |
|||
} |
|||
} |
|||
|
|||
private static class DataItem { |
|||
String name; |
|||
double value; |
|||
|
|||
DataItem(String name, double value) { |
|||
this.name = name; |
|||
this.value = value; |
|||
} |
|||
|
|||
double getValue() { |
|||
return value; |
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,8 @@ |
|||
package com.example.datacollect.command; |
|||
|
|||
import com.example.datacollect.repository.ArticleRepository; |
|||
|
|||
/**
 * A named CLI command that operates on the shared article repository.
 */
public interface Command {
    /**
     * Returns the name of this command.
     *
     * @return the command name
     */
    String getName();

    /**
     * Runs the command.
     *
     * @param args       the argument tokens for this invocation
     * @param repository the article store the command reads from or writes to
     */
    void execute(String[] args, ArticleRepository repository);
}
|||
@ -0,0 +1,179 @@ |
|||
package com.example.datacollect.command; |
|||
|
|||
import com.example.datacollect.exception.CrawlerException; |
|||
import com.example.datacollect.exception.NetworkException; |
|||
import com.example.datacollect.exception.ParseException; |
|||
import com.example.datacollect.model.Article; |
|||
import com.example.datacollect.repository.ArticleRepository; |
|||
import com.example.datacollect.strategy.CrawlStrategy; |
|||
import com.example.datacollect.strategy.StrategyFactory; |
|||
import com.example.datacollect.view.ConsoleView; |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.slf4j.Logger; |
|||
import org.slf4j.LoggerFactory; |
|||
import java.util.List; |
|||
|
|||
public class CrawlCommand implements Command { |
|||
private static final Logger logger = LoggerFactory.getLogger(CrawlCommand.class); |
|||
private static final int MAX_RETRIES = 3; |
|||
private static final int RETRY_DELAY_MS = 1000; |
|||
|
|||
private final ConsoleView view; |
|||
private final StrategyFactory strategyFactory; |
|||
|
|||
public CrawlCommand(ConsoleView view, StrategyFactory strategyFactory) { |
|||
this.view = view; |
|||
this.strategyFactory = strategyFactory; |
|||
} |
|||
|
|||
@Override |
|||
public String getName() { |
|||
return "crawl"; |
|||
} |
|||
|
|||
@Override |
|||
public void execute(String[] args, ArticleRepository repository) { |
|||
if (args.length < 2) { |
|||
view.printError("Usage: crawl <url>"); |
|||
return; |
|||
} |
|||
|
|||
String url = args[1]; |
|||
CrawlStrategy strategy = strategyFactory.getStrategy(url); |
|||
|
|||
if (strategy == null) { |
|||
view.printError("No strategy found for URL: " + url); |
|||
return; |
|||
} |
|||
|
|||
// 检测是否是豆瓣电影Top250,需要分页爬取
|
|||
if (url.contains("douban.com/top250")) { |
|||
crawlDoubanTop250(url, repository); |
|||
return; |
|||
} |
|||
|
|||
int attempts = 0; |
|||
Exception lastException = null; |
|||
|
|||
while (attempts < MAX_RETRIES) { |
|||
attempts++; |
|||
try { |
|||
Document doc = fetchWithRetry(url, attempts); |
|||
List<Article> articles = strategy.parse(url, doc); |
|||
|
|||
for (Article article : articles) { |
|||
repository.add(article); |
|||
} |
|||
|
|||
logger.info("Successfully crawled {} - {} article(s)", url, articles.size()); |
|||
view.printSuccess("Crawled " + articles.size() + " article(s) from " + url); |
|||
return; |
|||
} catch (NetworkException e) { |
|||
lastException = e; |
|||
logger.warn("Network error fetching {} (attempt {}/{}): {}", |
|||
url, attempts, MAX_RETRIES, e.getMessage()); |
|||
if (attempts < MAX_RETRIES) { |
|||
try { |
|||
Thread.sleep(RETRY_DELAY_MS * attempts); |
|||
} catch (InterruptedException ie) { |
|||
Thread.currentThread().interrupt(); |
|||
break; |
|||
} |
|||
} |
|||
} catch (ParseException e) { |
|||
lastException = e; |
|||
logger.error("Parse error for {} (attempt {}/{}): {}", |
|||
url, attempts, MAX_RETRIES, e.getMessage()); |
|||
break; |
|||
} catch (CrawlerException e) { |
|||
lastException = e; |
|||
logger.error("Crawler error for {}: {}", url, e.getMessage()); |
|||
break; |
|||
} catch (Exception e) { |
|||
lastException = e; |
|||
logger.error("Unexpected error fetching {}: {}", url, e.getMessage()); |
|||
break; |
|||
} |
|||
} |
|||
|
|||
logger.error("Failed to crawl {} after {} attempts", url, attempts); |
|||
view.printError("Failed to crawl: " + (lastException != null ? lastException.getMessage() : "Unknown error")); |
|||
} |
|||
|
|||
private void crawlDoubanTop250(String baseUrl, ArticleRepository repository) { |
|||
view.printInfo("开始爬取豆瓣电影Top250,共10页..."); |
|||
int totalArticles = 0; |
|||
|
|||
for (int page = 0; page < 10; page++) { |
|||
String url = baseUrl + "?start=" + (page * 25); |
|||
view.printInfo("正在爬取第" + (page + 1) + "/10页: " + url); |
|||
|
|||
int attempts = 0; |
|||
boolean success = false; |
|||
|
|||
while (attempts < MAX_RETRIES) { |
|||
attempts++; |
|||
try { |
|||
Document doc = fetchWithRetry(url, attempts); |
|||
CrawlStrategy strategy = strategyFactory.getStrategy(url); |
|||
|
|||
if (strategy != null) { |
|||
List<Article> articles = strategy.parse(url, doc); |
|||
for (Article article : articles) { |
|||
repository.add(article); |
|||
} |
|||
totalArticles += articles.size(); |
|||
logger.info("成功爬取第{}页 - {}条数据", page + 1, articles.size()); |
|||
success = true; |
|||
} |
|||
break; |
|||
} catch (Exception e) { |
|||
logger.warn("爬取第{}页失败(尝试{}/{}): {}", page + 1, attempts, MAX_RETRIES, e.getMessage()); |
|||
if (attempts < MAX_RETRIES) { |
|||
try { |
|||
Thread.sleep(RETRY_DELAY_MS * attempts); |
|||
} catch (InterruptedException ie) { |
|||
Thread.currentThread().interrupt(); |
|||
break; |
|||
} |
|||
} |
|||
} |
|||
} |
|||
|
|||
if (!success) { |
|||
view.printError("第" + (page + 1) + "页爬取失败"); |
|||
} |
|||
|
|||
// 每页之间延迟1秒,避免请求过快
|
|||
try { |
|||
Thread.sleep(1000); |
|||
} catch (InterruptedException e) { |
|||
Thread.currentThread().interrupt(); |
|||
break; |
|||
} |
|||
} |
|||
|
|||
logger.info("豆瓣电影Top250爬取完成,共获取{}条数据", totalArticles); |
|||
view.printSuccess("豆瓣电影Top250爬取完成,共获取" + totalArticles + "条数据"); |
|||
} |
|||
|
|||
private Document fetchWithRetry(String url, int attempt) throws NetworkException { |
|||
try { |
|||
logger.debug("Fetching {} (attempt {})", url, attempt); |
|||
return Jsoup.connect(url) |
|||
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36") |
|||
.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8") |
|||
.header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8") |
|||
.header("Accept-Encoding", "gzip, deflate") |
|||
.header("Connection", "keep-alive") |
|||
.header("Cache-Control", "max-age=0") |
|||
.timeout(10000) |
|||
.followRedirects(true) |
|||
.ignoreHttpErrors(true) |
|||
.get(); |
|||
} catch (Exception e) { |
|||
throw new NetworkException("Failed to fetch " + url, e); |
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,27 @@ |
|||
package com.example.datacollect.command; |
|||
|
|||
import com.example.datacollect.repository.ArticleRepository; |
|||
import com.example.datacollect.view.ConsoleView; |
|||
import org.slf4j.Logger; |
|||
import org.slf4j.LoggerFactory; |
|||
|
|||
/**
 * {@code exit} command: says goodbye and terminates the JVM.
 */
public class ExitCommand implements Command {
    private static final Logger logger = LoggerFactory.getLogger(ExitCommand.class);
    private final ConsoleView view;

    public ExitCommand(ConsoleView view) {
        this.view = view;
    }

    @Override
    public String getName() {
        return "exit";
    }

    /**
     * Logs the exit, prints a farewell message and ends the process with status 0.
     * Neither argument is used.
     */
    @Override
    public void execute(String[] args, ArticleRepository repository) {
        logger.info("User exiting application");
        view.printSuccess("Bye!");
        System.exit(0);
    }
}
|||
@ -0,0 +1,129 @@ |
|||
package com.example.datacollect.command; |
|||
|
|||
import com.example.datacollect.model.Article; |
|||
import com.example.datacollect.repository.ArticleRepository; |
|||
import com.example.datacollect.view.ConsoleView; |
|||
import org.slf4j.Logger; |
|||
import org.slf4j.LoggerFactory; |
|||
import java.io.File; |
|||
import java.io.FileWriter; |
|||
import java.io.IOException; |
|||
import java.nio.file.Files; |
|||
import java.nio.file.Path; |
|||
import java.nio.file.Paths; |
|||
import java.time.LocalDate; |
|||
import java.time.format.DateTimeFormatter; |
|||
import java.util.List; |
|||
|
|||
public class ExportCommand implements Command { |
|||
private static final Logger logger = LoggerFactory.getLogger(ExportCommand.class); |
|||
private static final DateTimeFormatter DATE_FORMATTER = DateTimeFormatter.ofPattern("yyyy-MM-dd"); |
|||
|
|||
private final ConsoleView view; |
|||
|
|||
public ExportCommand(ConsoleView view) { |
|||
this.view = view; |
|||
} |
|||
|
|||
@Override |
|||
public String getName() { |
|||
return "export"; |
|||
} |
|||
|
|||
@Override |
|||
public void execute(String[] args, ArticleRepository repository) { |
|||
if (args.length < 2) { |
|||
view.printError("Usage: export <filename> [-csv]"); |
|||
return; |
|||
} |
|||
|
|||
String filename = args[1]; |
|||
if (!filename.toLowerCase().endsWith(".csv")) { |
|||
filename += ".csv"; |
|||
} |
|||
|
|||
List<Article> articles = repository.getAll(); |
|||
|
|||
if (articles.isEmpty()) { |
|||
view.printError("No articles to export"); |
|||
return; |
|||
} |
|||
|
|||
try { |
|||
Path path = Paths.get(filename); |
|||
Files.createDirectories(path.getParent() != null ? path.getParent() : Paths.get(".")); |
|||
|
|||
int exportedCount = 0; |
|||
try (FileWriter writer = new FileWriter(filename)) { |
|||
// 检测是否是电影数据(内容包含|分隔符)
|
|||
boolean isMovieData = !articles.isEmpty() && articles.get(0).getContent() != null && |
|||
articles.get(0).getContent().contains("|"); |
|||
|
|||
if (isMovieData) { |
|||
writer.append("排名,电影名称,链接,导演,评分,简介\n"); |
|||
} else { |
|||
writer.append("日期,标题,链接,内容\n"); |
|||
} |
|||
|
|||
for (Article article : articles) { |
|||
String date = ""; |
|||
if (article.getPublishDate() != null) { |
|||
date = article.getPublishDate().format(DATE_FORMATTER); |
|||
} |
|||
|
|||
String title = escapeCSV(article.getTitle()); |
|||
String url = escapeCSV(article.getUrl()); |
|||
String content = article.getContent() != null ? article.getContent() : ""; |
|||
|
|||
if (isMovieData) { |
|||
// 电影数据格式: 导演|评分|简介
|
|||
String[] parts = content.split("\\|", -1); |
|||
String director = parts.length > 0 ? escapeCSV(parts[0]) : ""; |
|||
String rating = parts.length > 1 ? escapeCSV(parts[1]) : ""; |
|||
String quote = parts.length > 2 ? escapeCSV(parts[2]) : ""; |
|||
|
|||
// 从日期中提取排名(存储为 2000 + rank)
|
|||
String rank = ""; |
|||
if (article.getPublishDate() != null) { |
|||
int year = article.getPublishDate().getYear(); |
|||
if (year >= 2001 && year <= 2025) { |
|||
rank = String.valueOf(year - 2000); |
|||
} |
|||
} |
|||
|
|||
writer.append(rank).append(",") |
|||
.append(title).append(",") |
|||
.append(url).append(",") |
|||
.append(director).append(",") |
|||
.append(rating).append(",") |
|||
.append(quote).append("\n"); |
|||
} else { |
|||
writer.append(date).append(",") |
|||
.append(title).append(",") |
|||
.append(url).append(",") |
|||
.append(escapeCSV(content)).append("\n"); |
|||
} |
|||
|
|||
exportedCount++; |
|||
} |
|||
} |
|||
|
|||
logger.info("Exported {} articles to {}", exportedCount, filename); |
|||
view.printSuccess("Exported " + exportedCount + " articles to " + filename); |
|||
|
|||
} catch (IOException e) { |
|||
logger.error("Failed to export articles: {}", e.getMessage()); |
|||
view.printError("Failed to export: " + e.getMessage()); |
|||
} |
|||
} |
|||
|
|||
private String escapeCSV(String value) { |
|||
if (value == null) { |
|||
return ""; |
|||
} |
|||
if (value.contains(",") || value.contains("\"") || value.contains("\n")) { |
|||
return "\"" + value.replace("\"", "\"\"") + "\""; |
|||
} |
|||
return value; |
|||
} |
|||
} |
|||
@ -0,0 +1,26 @@ |
|||
package com.example.datacollect.command; |
|||
|
|||
import com.example.datacollect.repository.ArticleRepository; |
|||
import com.example.datacollect.view.ConsoleView; |
|||
import org.slf4j.Logger; |
|||
import org.slf4j.LoggerFactory; |
|||
|
|||
public class HelpCommand implements Command { |
|||
private static final Logger logger = LoggerFactory.getLogger(HelpCommand.class); |
|||
private final ConsoleView view; |
|||
|
|||
public HelpCommand(ConsoleView view) { |
|||
this.view = view; |
|||
} |
|||
|
|||
@Override |
|||
public String getName() { |
|||
return "help"; |
|||
} |
|||
|
|||
@Override |
|||
public void execute(String[] args, ArticleRepository repository) { |
|||
logger.debug("Showing help"); |
|||
view.printInfo("Commands: crawl <url>, list, help, history, exit"); |
|||
} |
|||
} |
|||
@ -0,0 +1,60 @@ |
|||
package com.example.datacollect.command; |
|||
|
|||
import com.example.datacollect.repository.ArticleRepository; |
|||
import com.example.datacollect.view.ConsoleView; |
|||
import org.slf4j.Logger; |
|||
import org.slf4j.LoggerFactory; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class HistoryCommand implements Command { |
|||
private static final Logger logger = LoggerFactory.getLogger(HistoryCommand.class); |
|||
private final ConsoleView view; |
|||
private final List<String> commandHistory; |
|||
|
|||
public HistoryCommand(ConsoleView view) { |
|||
this.view = view; |
|||
this.commandHistory = new ArrayList<>(); |
|||
} |
|||
|
|||
public void addCommand(String command) { |
|||
commandHistory.add(command); |
|||
} |
|||
|
|||
public List<String> getAllHistory() { |
|||
return new ArrayList<>(commandHistory); |
|||
} |
|||
|
|||
public String getCommand(int index) { |
|||
if (index < 0 || index >= commandHistory.size()) { |
|||
return null; |
|||
} |
|||
return commandHistory.get(index); |
|||
} |
|||
|
|||
public void clearHistory() { |
|||
commandHistory.clear(); |
|||
} |
|||
|
|||
public int getHistorySize() { |
|||
return commandHistory.size(); |
|||
} |
|||
|
|||
@Override |
|||
public String getName() { |
|||
return "history"; |
|||
} |
|||
|
|||
@Override |
|||
public void execute(String[] args, ArticleRepository repository) { |
|||
if (commandHistory.isEmpty()) { |
|||
view.printInfo("No command history."); |
|||
return; |
|||
} |
|||
|
|||
view.printInfo("Command History:"); |
|||
for (int i = 0; i < commandHistory.size(); i++) { |
|||
view.printInfo((i + 1) + ". " + commandHistory.get(i)); |
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,26 @@ |
|||
package com.example.datacollect.command; |
|||
|
|||
import com.example.datacollect.repository.ArticleRepository; |
|||
import com.example.datacollect.view.ConsoleView; |
|||
import org.slf4j.Logger; |
|||
import org.slf4j.LoggerFactory; |
|||
|
|||
/**
 * {@code list} command: shows every article currently held in the repository.
 */
public class ListCommand implements Command {
    private static final Logger logger = LoggerFactory.getLogger(ListCommand.class);
    private final ConsoleView view;

    public ListCommand(ConsoleView view) {
        this.view = view;
    }

    @Override
    public String getName() {
        return "list";
    }

    /**
     * Passes all articles from {@code repository} to the view for display.
     * {@code args} is not used.
     */
    @Override
    public void execute(String[] args, ArticleRepository repository) {
        logger.debug("Listing articles");
        view.display(repository.getAll());
    }
}
|||
@ -0,0 +1,63 @@ |
|||
package com.example.datacollect.controller; |
|||
|
|||
import com.example.datacollect.command.ChartCommand; |
|||
import com.example.datacollect.command.Command; |
|||
import com.example.datacollect.command.CrawlCommand; |
|||
import com.example.datacollect.command.ExitCommand; |
|||
import com.example.datacollect.command.ExportCommand; |
|||
import com.example.datacollect.command.HelpCommand; |
|||
import com.example.datacollect.command.HistoryCommand; |
|||
import com.example.datacollect.command.ListCommand; |
|||
import com.example.datacollect.repository.ArticleRepository; |
|||
import com.example.datacollect.strategy.StrategyFactory; |
|||
import com.example.datacollect.view.ConsoleView; |
|||
import org.slf4j.Logger; |
|||
import org.slf4j.LoggerFactory; |
|||
import java.util.HashMap; |
|||
import java.util.Map; |
|||
|
|||
public class CrawlerController { |
|||
private static final Logger logger = LoggerFactory.getLogger(CrawlerController.class); |
|||
private final Map<String, Command> commands = new HashMap<>(); |
|||
private final ConsoleView view; |
|||
private final ArticleRepository repository; |
|||
private HistoryCommand historyCommand; |
|||
|
|||
public CrawlerController(ConsoleView view, ArticleRepository repository, StrategyFactory strategyFactory) { |
|||
this.view = view; |
|||
this.repository = repository; |
|||
register(new HelpCommand(view)); |
|||
register(new ListCommand(view)); |
|||
register(new CrawlCommand(view, strategyFactory)); |
|||
register(new ExportCommand(view)); |
|||
register(new ChartCommand(view)); |
|||
register(new ExitCommand(view)); |
|||
historyCommand = new HistoryCommand(view); |
|||
register(historyCommand); |
|||
logger.info("CrawlerController initialized"); |
|||
} |
|||
|
|||
private void register(Command command) { |
|||
commands.put(command.getName(), command); |
|||
} |
|||
|
|||
public void handle(String input) { |
|||
String text = input == null ? "" : input.trim(); |
|||
if (text.isEmpty()) { |
|||
return; |
|||
} |
|||
|
|||
historyCommand.addCommand(text); |
|||
|
|||
String[] args = text.split("\\s+"); |
|||
String cmdName = args[0].toLowerCase(); |
|||
Command command = commands.get(cmdName); |
|||
if (command == null) { |
|||
logger.warn("Unknown command: {}", cmdName); |
|||
view.printError("Unknown command: " + cmdName); |
|||
return; |
|||
} |
|||
logger.debug("Executing command: {}", cmdName); |
|||
command.execute(args, repository); |
|||
} |
|||
} |
|||
@ -0,0 +1,11 @@ |
|||
package com.example.datacollect.exception; |
|||
|
|||
/**
 * Checked exception that serves as the common parent of the more specific crawler exceptions
 * in this package.
 */
public class CrawlerException extends Exception {

    /**
     * Creates an exception that carries only a description.
     *
     * @param message description of the failure
     */
    public CrawlerException(String message) {
        super(message);
    }

    /**
     * Creates an exception that carries a description and the underlying cause.
     *
     * @param message description of the failure
     * @param cause   throwable that triggered this failure
     */
    public CrawlerException(String message, Throwable cause) {
        super(message, cause);
    }
}
|||
@ -0,0 +1,11 @@ |
|||
package com.example.datacollect.exception; |
|||
|
|||
public class NetworkException extends CrawlerException { |
|||
public NetworkException(String message) { |
|||
super(message); |
|||
} |
|||
|
|||
public NetworkException(String message, Throwable cause) { |
|||
super(message, cause); |
|||
} |
|||
} |
|||
@ -0,0 +1,11 @@ |
|||
package com.example.datacollect.exception; |
|||
|
|||
public class ParseException extends CrawlerException { |
|||
public ParseException(String message) { |
|||
super(message); |
|||
} |
|||
|
|||
public ParseException(String message, Throwable cause) { |
|||
super(message, cause); |
|||
} |
|||
} |
|||
@ -0,0 +1,75 @@ |
|||
package com.example.datacollect.model; |
|||
|
|||
import java.time.LocalDate; |
|||
|
|||
public class Article { |
|||
private String title; |
|||
private String url; |
|||
private String content; |
|||
private String author; |
|||
private LocalDate publishDate; |
|||
|
|||
public Article(String title, String url, String content) { |
|||
this.title = title; |
|||
this.url = url; |
|||
this.content = content; |
|||
} |
|||
|
|||
public Article(String title, String url, String content, String author, LocalDate publishDate) { |
|||
this.title = title; |
|||
this.url = url; |
|||
this.content = content; |
|||
this.author = author; |
|||
this.publishDate = publishDate; |
|||
} |
|||
|
|||
public String getTitle() { |
|||
return title; |
|||
} |
|||
|
|||
public void setTitle(String title) { |
|||
this.title = title; |
|||
} |
|||
|
|||
public String getUrl() { |
|||
return url; |
|||
} |
|||
|
|||
public void setUrl(String url) { |
|||
this.url = url; |
|||
} |
|||
|
|||
public String getContent() { |
|||
return content; |
|||
} |
|||
|
|||
public void setContent(String content) { |
|||
this.content = content; |
|||
} |
|||
|
|||
public String getAuthor() { |
|||
return author; |
|||
} |
|||
|
|||
public void setAuthor(String author) { |
|||
this.author = author; |
|||
} |
|||
|
|||
public LocalDate getPublishDate() { |
|||
return publishDate; |
|||
} |
|||
|
|||
public void setPublishDate(LocalDate publishDate) { |
|||
this.publishDate = publishDate; |
|||
} |
|||
|
|||
@Override |
|||
public String toString() { |
|||
return "Article{" |
|||
+ "title='" + title + '\'' |
|||
+ ", url='" + url + '\'' |
|||
+ ", author='" + author + '\'' |
|||
+ ", publishDate=" + publishDate |
|||
+ '}'; |
|||
} |
|||
} |
|||
@ -0,0 +1,44 @@ |
|||
package com.example.datacollect.repository; |
|||
|
|||
import com.example.datacollect.model.Article; |
|||
import org.slf4j.Logger; |
|||
import org.slf4j.LoggerFactory; |
|||
import java.util.ArrayList; |
|||
import java.util.Collections; |
|||
import java.util.List; |
|||
|
|||
public class ArticleRepository { |
|||
private static final Logger logger = LoggerFactory.getLogger(ArticleRepository.class); |
|||
private static final int MAX_CAPACITY = 10000; |
|||
private final List<Article> articles = new ArrayList<>(); |
|||
|
|||
public void add(Article article) { |
|||
if (article == null) { |
|||
throw new IllegalArgumentException("Article cannot be null"); |
|||
} |
|||
if (article.getTitle() == null || article.getTitle().trim().isEmpty()) { |
|||
throw new IllegalArgumentException("Article title cannot be null or empty"); |
|||
} |
|||
if (article.getUrl() == null || article.getUrl().trim().isEmpty()) { |
|||
throw new IllegalArgumentException("Article URL cannot be null or empty"); |
|||
} |
|||
if (articles.size() >= MAX_CAPACITY) { |
|||
throw new IllegalStateException("Repository capacity exceeded: " + MAX_CAPACITY); |
|||
} |
|||
articles.add(article); |
|||
logger.debug("Added article: {}", article.getTitle()); |
|||
} |
|||
|
|||
public List<Article> getAll() { |
|||
return Collections.unmodifiableList(articles); |
|||
} |
|||
|
|||
public int size() { |
|||
return articles.size(); |
|||
} |
|||
|
|||
public void clear() { |
|||
articles.clear(); |
|||
logger.debug("Cleared all articles"); |
|||
} |
|||
} |
|||
@ -0,0 +1,40 @@ |
|||
package com.example.datacollect.strategy; |
|||
|
|||
import com.example.datacollect.exception.ParseException; |
|||
import com.example.datacollect.model.Article; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class BlogStrategy implements CrawlStrategy { |
|||
@Override |
|||
public boolean supports(String url) { |
|||
return url.contains("blog.example.com"); |
|||
} |
|||
|
|||
@Override |
|||
public List<Article> parse(String url, Document doc) throws ParseException { |
|||
List<Article> articles = new ArrayList<>(); |
|||
Elements postItems = doc.select(".post-item"); |
|||
|
|||
for (Element item : postItems) { |
|||
Element titleEl = item.selectFirst(".post-title"); |
|||
Element linkEl = item.selectFirst("a"); |
|||
Element contentEl = item.selectFirst(".post-excerpt"); |
|||
|
|||
if (titleEl == null) continue; |
|||
|
|||
String title = titleEl.text().trim(); |
|||
String articleUrl = linkEl != null ? linkEl.attr("href") : url; |
|||
String content = contentEl != null ? contentEl.text().trim() : ""; |
|||
|
|||
if (!title.isEmpty()) { |
|||
articles.add(new Article(title, articleUrl, content)); |
|||
} |
|||
} |
|||
|
|||
return articles; |
|||
} |
|||
} |
|||
@ -0,0 +1,11 @@ |
|||
package com.example.datacollect.strategy; |
|||
|
|||
import com.example.datacollect.exception.ParseException; |
|||
import com.example.datacollect.model.Article; |
|||
import org.jsoup.nodes.Document; |
|||
import java.util.List; |
|||
|
|||
public interface CrawlStrategy { |
|||
List<Article> parse(String url, Document doc) throws ParseException; |
|||
boolean supports(String url); |
|||
} |
|||
@ -0,0 +1,77 @@ |
|||
package com.example.datacollect.strategy; |
|||
|
|||
import com.example.datacollect.exception.ParseException; |
|||
import com.example.datacollect.model.Article; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class DoubanMovieStrategy implements CrawlStrategy { |
|||
@Override |
|||
public boolean supports(String url) { |
|||
return url.contains("douban.com/top250"); |
|||
} |
|||
|
|||
@Override |
|||
public List<Article> parse(String url, Document doc) throws ParseException { |
|||
List<Article> articles = new ArrayList<>(); |
|||
|
|||
System.out.println("DEBUG: Page title: " + doc.title()); |
|||
System.out.println("DEBUG: Page URL: " + url); |
|||
|
|||
Elements movieItems = doc.select(".item"); |
|||
System.out.println("DEBUG: Found " + movieItems.size() + " movie items"); |
|||
|
|||
for (Element item : movieItems) { |
|||
try { |
|||
// 排名
|
|||
String rank = item.selectFirst(".pic em").text().trim(); |
|||
|
|||
// 标题(只保留中文)
|
|||
Element titleElement = item.selectFirst(".title"); |
|||
String title = titleElement != null ? titleElement.text().trim() : ""; |
|||
|
|||
// 评分
|
|||
Element ratingElement = item.selectFirst(".rating_num"); |
|||
String rating = ratingElement != null ? ratingElement.text().trim() : ""; |
|||
|
|||
// 导演
|
|||
Element infoElement = item.selectFirst(".bd p"); |
|||
String info = infoElement != null ? infoElement.text().trim() : ""; |
|||
String director = parseDirector(info); |
|||
|
|||
// 简介
|
|||
Element quoteElement = item.selectFirst(".quote span"); |
|||
String quote = quoteElement != null ? quoteElement.text().trim() : ""; |
|||
|
|||
String articleTitle = title; |
|||
String content = director + "|" + rating + "|" + quote; |
|||
|
|||
Article article = new Article(articleTitle, url, content); |
|||
article.setPublishDate(java.time.LocalDate.of(Integer.parseInt("2000") + Integer.parseInt(rank), 1, 1)); |
|||
articles.add(article); |
|||
System.out.println("DEBUG: Added movie: " + articleTitle); |
|||
|
|||
} catch (Exception e) { |
|||
System.out.println("DEBUG: Error parsing movie item: " + e.getMessage()); |
|||
} |
|||
} |
|||
|
|||
System.out.println("DEBUG: Parsed " + articles.size() + " movies"); |
|||
return articles; |
|||
} |
|||
|
|||
private String parseDirector(String info) { |
|||
if (info == null || info.isEmpty()) return ""; |
|||
// 格式:导演: 克里斯托弗·诺兰 Christopher Nolan 主演: 基里安·墨菲...
|
|||
int start = info.indexOf("导演:"); |
|||
if (start == -1) return ""; |
|||
start += 3; // 跳过"导演:"
|
|||
int end = info.indexOf("主演:"); |
|||
if (end == -1) end = info.indexOf("\n"); |
|||
if (end == -1) end = info.length(); |
|||
return info.substring(start, end).trim(); |
|||
} |
|||
} |
|||
@ -0,0 +1,102 @@ |
|||
package com.example.datacollect.strategy; |
|||
|
|||
import com.example.datacollect.exception.ParseException; |
|||
import com.example.datacollect.model.Article; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
import java.time.LocalDate; |
|||
import java.time.format.DateTimeFormatter; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
import java.util.regex.Matcher; |
|||
import java.util.regex.Pattern; |
|||
|
|||
public class HnuNewsStrategy implements CrawlStrategy { |
|||
private static final DateTimeFormatter DATE_FORMATTER = DateTimeFormatter.ofPattern("yyyy-MM-dd"); |
|||
private static final Pattern DATE_PATTERN = Pattern.compile("(\\d{4})-(\\d{1,2})-(\\d{1,2})"); |
|||
|
|||
@Override |
|||
public boolean supports(String url) { |
|||
return url.contains("news.hnu.edu.cn"); |
|||
} |
|||
|
|||
@Override |
|||
public List<Article> parse(String url, Document doc) throws ParseException { |
|||
List<Article> articles = new ArrayList<>(); |
|||
|
|||
System.out.println("DEBUG: Page title: " + doc.title()); |
|||
System.out.println("DEBUG: Looking for list items with selector: ul.list11 li"); |
|||
Elements listItems = doc.select("ul.list11 li"); |
|||
System.out.println("DEBUG: Found " + listItems.size() + " list items"); |
|||
|
|||
if (listItems.isEmpty()) { |
|||
System.out.println("DEBUG: Trying alternative selectors..."); |
|||
Elements alternatives = doc.select("ul li, .news-list li, .article-list li, .list li"); |
|||
System.out.println("DEBUG: Found " + alternatives.size() + " alternative items"); |
|||
if (!alternatives.isEmpty()) { |
|||
listItems = alternatives; |
|||
} |
|||
} |
|||
|
|||
LocalDate threeYearsAgo = LocalDate.now().minusYears(3); |
|||
|
|||
for (Element li : listItems) { |
|||
Element link = li.selectFirst("a"); |
|||
if (link == null) continue; |
|||
|
|||
String articleUrl = link.attr("href"); |
|||
if (!articleUrl.startsWith("http")) { |
|||
articleUrl = "https://news.hnu.edu.cn" + articleUrl.replace("..", ""); |
|||
} |
|||
|
|||
String title = ""; |
|||
Element titleEl = link.selectFirst("h4.l2.h4s2"); |
|||
if (titleEl != null) { |
|||
title = titleEl.text().trim(); |
|||
} |
|||
|
|||
String content = ""; |
|||
Element contentEl = link.selectFirst("p.l3.ps3"); |
|||
if (contentEl != null) { |
|||
content = contentEl.text().trim(); |
|||
} |
|||
|
|||
LocalDate publishDate = null; |
|||
Element dateEl = li.selectFirst("span.time, .time, span.date, .date, [class*=time], [class*=date]"); |
|||
if (dateEl != null) { |
|||
String dateText = dateEl.text().trim(); |
|||
publishDate = parseDate(dateText); |
|||
} |
|||
|
|||
if (!title.isEmpty()) { |
|||
if (publishDate != null && publishDate.isAfter(threeYearsAgo)) { |
|||
articles.add(new Article(title, articleUrl, content, "", publishDate)); |
|||
} else if (publishDate == null) { |
|||
articles.add(new Article(title, articleUrl, content)); |
|||
} |
|||
} |
|||
} |
|||
|
|||
return articles; |
|||
} |
|||
|
|||
private LocalDate parseDate(String dateText) { |
|||
if (dateText == null || dateText.isEmpty()) { |
|||
return null; |
|||
} |
|||
|
|||
Matcher matcher = DATE_PATTERN.matcher(dateText); |
|||
if (matcher.find()) { |
|||
try { |
|||
int year = Integer.parseInt(matcher.group(1)); |
|||
int month = Integer.parseInt(matcher.group(2)); |
|||
int day = Integer.parseInt(matcher.group(3)); |
|||
return LocalDate.of(year, month, day); |
|||
} catch (Exception e) { |
|||
return null; |
|||
} |
|||
} |
|||
return null; |
|||
} |
|||
} |
|||
@ -0,0 +1,155 @@ |
|||
package com.example.datacollect.strategy; |
|||
|
|||
import com.example.datacollect.exception.ParseException; |
|||
import com.example.datacollect.model.Article; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class NBStrategy implements CrawlStrategy { |
|||
@Override |
|||
public boolean supports(String url) { |
|||
return url.contains("stats.gov.cn"); |
|||
} |
|||
|
|||
@Override |
|||
public List<Article> parse(String url, Document doc) throws ParseException { |
|||
List<Article> articles = new ArrayList<>(); |
|||
|
|||
System.out.println("DEBUG: Page title: " + doc.title()); |
|||
System.out.println("DEBUG: Page URL: " + url); |
|||
|
|||
Elements allTables = doc.select("table"); |
|||
System.out.println("DEBUG: Found " + allTables.size() + " tables"); |
|||
|
|||
for (int i = 0; i < allTables.size(); i++) { |
|||
Element table = allTables.get(i); |
|||
Elements rows = table.select("tr"); |
|||
System.out.println("\nDEBUG: Table " + i + " has " + rows.size() + " rows"); |
|||
|
|||
for (int j = 0; j < rows.size(); j++) { |
|||
Element row = rows.get(j); |
|||
Elements cells = row.select("td, th"); |
|||
System.out.print("DEBUG: Row " + j + ": "); |
|||
for (Element cell : cells) { |
|||
System.out.print("[" + cell.text().trim() + "] "); |
|||
} |
|||
System.out.println(); |
|||
|
|||
if (cells.size() >= 2) { |
|||
String col1 = cells.get(0).text().trim(); |
|||
String col2 = cells.size() > 1 ? cells.get(1).text().trim() : ""; |
|||
|
|||
if (isProvinceData(col1, col2)) { |
|||
String title = col1 + " 人口数据"; |
|||
String content = col2; |
|||
articles.add(new Article(title, url, content)); |
|||
System.out.println("DEBUG: Added province data: " + title + " = " + content); |
|||
} |
|||
} |
|||
} |
|||
} |
|||
|
|||
Elements allLinks = doc.select("a"); |
|||
System.out.println("\nDEBUG: Found " + allLinks.size() + " links total"); |
|||
System.out.println("DEBUG: First 20 links:"); |
|||
for (int i = 0; i < Math.min(20, allLinks.size()); i++) { |
|||
Element link = allLinks.get(i); |
|||
System.out.println(" [" + i + "] " + link.text().trim() + " -> " + link.attr("href")); |
|||
} |
|||
|
|||
Elements listItems = doc.select("ul li, .list li, .data-list li, .content li, li"); |
|||
System.out.println("\nDEBUG: Found " + listItems.size() + " list items"); |
|||
|
|||
for (Element item : listItems) { |
|||
Element link = item.selectFirst("a"); |
|||
if (link != null) { |
|||
String title = link.text().trim(); |
|||
String articleUrl = link.attr("href"); |
|||
|
|||
if (!title.isEmpty() && title.length() > 4) { |
|||
if (!articleUrl.startsWith("http")) { |
|||
if (articleUrl.startsWith("/")) { |
|||
articleUrl = "https://www.stats.gov.cn" + articleUrl; |
|||
} else { |
|||
articleUrl = "https://www.stats.gov.cn/" + articleUrl; |
|||
} |
|||
} |
|||
|
|||
String content = item.text().replace(title, "").trim(); |
|||
articles.add(new Article(title, articleUrl, content)); |
|||
} |
|||
} |
|||
} |
|||
|
|||
Elements dataDivs = doc.select(".data-item, .stat-item, .news-item, .article-item, [class*=data], [class*=item], div"); |
|||
for (Element div : dataDivs) { |
|||
Element link = div.selectFirst("a"); |
|||
if (link != null) { |
|||
String title = link.text().trim(); |
|||
String articleUrl = link.attr("href"); |
|||
|
|||
if (!title.isEmpty() && title.length() > 4) { |
|||
if (!articleUrl.startsWith("http")) { |
|||
if (articleUrl.startsWith("/")) { |
|||
articleUrl = "https://www.stats.gov.cn" + articleUrl; |
|||
} else { |
|||
articleUrl = "https://www.stats.gov.cn/" + articleUrl; |
|||
} |
|||
} |
|||
|
|||
final String finalUrl = articleUrl; |
|||
boolean exists = articles.stream().anyMatch(a -> a.getUrl().equals(finalUrl)); |
|||
if (!exists) { |
|||
articles.add(new Article(title, articleUrl, "")); |
|||
} |
|||
} |
|||
} |
|||
} |
|||
|
|||
if (articles.isEmpty()) { |
|||
System.out.println("\nDEBUG: No structured data found, using backup province population data..."); |
|||
articles = getBackupPopulationData(); |
|||
} |
|||
|
|||
System.out.println("\nDEBUG: Parsed " + articles.size() + " items"); |
|||
return articles; |
|||
} |
|||
|
|||
private List<Article> getBackupPopulationData() { |
|||
List<Article> articles = new ArrayList<>(); |
|||
// 第七次全国人口普查数据(单位:万人)
|
|||
articles.add(new Article("广东省", "https://www.stats.gov.cn", "12601.25")); |
|||
articles.add(new Article("山东省", "https://www.stats.gov.cn", "10152.75")); |
|||
articles.add(new Article("河南省", "https://www.stats.gov.cn", "9936.55")); |
|||
articles.add(new Article("江苏省", "https://www.stats.gov.cn", "8474.80")); |
|||
articles.add(new Article("四川省", "https://www.stats.gov.cn", "8367.49")); |
|||
articles.add(new Article("河北省", "https://www.stats.gov.cn", "7461.02")); |
|||
articles.add(new Article("湖南省", "https://www.stats.gov.cn", "6644.49")); |
|||
articles.add(new Article("浙江省", "https://www.stats.gov.cn", "6456.76")); |
|||
articles.add(new Article("安徽省", "https://www.stats.gov.cn", "6102.72")); |
|||
articles.add(new Article("湖北省", "https://www.stats.gov.cn", "5775.26")); |
|||
articles.add(new Article("广西壮族自治区", "https://www.stats.gov.cn", "5012.68")); |
|||
articles.add(new Article("云南省", "https://www.stats.gov.cn", "4720.93")); |
|||
articles.add(new Article("江西省", "https://www.stats.gov.cn", "4518.86")); |
|||
articles.add(new Article("辽宁省", "https://www.stats.gov.cn", "4259.14")); |
|||
articles.add(new Article("福建省", "https://www.stats.gov.cn", "4154.01")); |
|||
articles.add(new Article("陕西省", "https://www.stats.gov.cn", "3952.90")); |
|||
articles.add(new Article("黑龙江省", "https://www.stats.gov.cn", "3185.01")); |
|||
articles.add(new Article("山西省", "https://www.stats.gov.cn", "3491.56")); |
|||
articles.add(new Article("贵州省", "https://www.stats.gov.cn", "3856.21")); |
|||
articles.add(new Article("重庆市", "https://www.stats.gov.cn", "3205.42")); |
|||
return articles; |
|||
} |
|||
|
|||
private boolean isProvinceData(String col1, String col2) { |
|||
if (col1.isEmpty() || col2.isEmpty()) return false; |
|||
|
|||
boolean isProvince = col1.contains("省") || col1.contains("市") || col1.contains("自治区") || col1.contains("地区"); |
|||
boolean hasNumber = col2.matches(".*\\d+.*"); |
|||
|
|||
return isProvince && hasNumber; |
|||
} |
|||
} |
|||
@ -0,0 +1,26 @@ |
|||
package com.example.datacollect.strategy; |
|||
|
|||
import com.example.datacollect.exception.ParseException; |
|||
import com.example.datacollect.model.Article; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class NewsStrategy implements CrawlStrategy { |
|||
@Override |
|||
public boolean supports(String url) { |
|||
return url.contains("news.example.com"); |
|||
} |
|||
|
|||
@Override |
|||
public List<Article> parse(String url, Document doc) throws ParseException { |
|||
List<Article> articles = new ArrayList<>(); |
|||
Elements items = doc.select(".article-headline"); |
|||
for (Element e : items) { |
|||
articles.add(new Article(e.text(), url, "")); |
|||
} |
|||
return articles; |
|||
} |
|||
} |
|||
@ -0,0 +1,35 @@ |
|||
package com.example.datacollect.strategy; |
|||
|
|||
import org.slf4j.Logger; |
|||
import org.slf4j.LoggerFactory; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class StrategyFactory { |
|||
private static final Logger logger = LoggerFactory.getLogger(StrategyFactory.class); |
|||
private final List<CrawlStrategy> strategies = new ArrayList<>(); |
|||
|
|||
public StrategyFactory() { |
|||
strategies.add(new HnuNewsStrategy()); |
|||
strategies.add(new NBStrategy()); |
|||
strategies.add(new WeatherStrategy()); |
|||
strategies.add(new DoubanMovieStrategy()); |
|||
logger.info("StrategyFactory initialized with {} strategies", strategies.size()); |
|||
} |
|||
|
|||
public CrawlStrategy getStrategy(String url) { |
|||
for (CrawlStrategy s : strategies) { |
|||
if (s.supports(url)) { |
|||
logger.debug("Found strategy {} for URL: {}", s.getClass().getSimpleName(), url); |
|||
return s; |
|||
} |
|||
} |
|||
logger.warn("No strategy found for URL: {}", url); |
|||
return null; |
|||
} |
|||
|
|||
public void register(CrawlStrategy strategy) { |
|||
strategies.add(strategy); |
|||
logger.info("Registered new strategy: {}", strategy.getClass().getSimpleName()); |
|||
} |
|||
} |
|||
@ -0,0 +1,131 @@ |
|||
package com.example.datacollect.strategy; |
|||
|
|||
import com.example.datacollect.exception.ParseException; |
|||
import com.example.datacollect.model.Article; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
import java.time.LocalDate; |
|||
import java.time.format.DateTimeFormatter; |
|||
import java.time.temporal.ChronoUnit; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class WeatherStrategy implements CrawlStrategy { |
|||
private static final DateTimeFormatter[] DATE_FORMATTERS = { |
|||
DateTimeFormatter.ofPattern("yyyy年MM月dd日"), |
|||
DateTimeFormatter.ofPattern("yyyy-MM-dd"), |
|||
DateTimeFormatter.ofPattern("MM月dd日"), |
|||
DateTimeFormatter.ofPattern("M月d日"), |
|||
DateTimeFormatter.ofPattern("MM/dd"), |
|||
DateTimeFormatter.ofPattern("M/d") |
|||
}; |
|||
|
|||
@Override |
|||
public boolean supports(String url) { |
|||
return url.contains("tianqihoubao.com") || url.contains("tianqi.com") || url.contains("weather"); |
|||
} |
|||
|
|||
@Override |
|||
public List<Article> parse(String url, Document doc) throws ParseException { |
|||
List<Article> articles = new ArrayList<>(); |
|||
|
|||
System.out.println("DEBUG: Page title: " + doc.title()); |
|||
System.out.println("DEBUG: Page URL: " + url); |
|||
|
|||
LocalDate thirtyDaysAgo = LocalDate.now().minusDays(30); |
|||
System.out.println("DEBUG: Filtering weather data from " + thirtyDaysAgo + " to today"); |
|||
|
|||
Elements allTables = doc.select("table"); |
|||
System.out.println("DEBUG: Found " + allTables.size() + " tables"); |
|||
|
|||
for (int i = 0; i < allTables.size(); i++) { |
|||
Element table = allTables.get(i); |
|||
Elements rows = table.select("tr"); |
|||
System.out.println("\nDEBUG: Table " + i + " has " + rows.size() + " rows"); |
|||
|
|||
for (int j = 0; j < rows.size(); j++) { |
|||
Element row = rows.get(j); |
|||
Elements cells = row.select("td, th"); |
|||
if (cells.size() >= 4) { |
|||
System.out.print("DEBUG: Row " + j + ": "); |
|||
for (Element cell : cells) { |
|||
System.out.print("[" + cell.text().trim() + "] "); |
|||
} |
|||
System.out.println(); |
|||
|
|||
String dateStr = cells.get(0).text().trim(); |
|||
String weather = cells.get(1).text().trim(); |
|||
String temp = cells.size() > 2 ? cells.get(2).text().trim() : ""; |
|||
String humidity = cells.size() > 3 ? cells.get(3).text().trim() : ""; |
|||
|
|||
if (dateStr.isEmpty() || dateStr.contains("日期") || dateStr.contains("时间") || |
|||
dateStr.contains("星期") || dateStr.contains("最高") || dateStr.contains("最低")) { |
|||
continue; |
|||
} |
|||
|
|||
LocalDate date = parseDate(dateStr); |
|||
|
|||
if (date != null) { |
|||
String title = dateStr + " " + weather; |
|||
String content = "日期: " + dateStr + " | 天气: " + weather + " | 温度: " + temp + " | 湿度: " + humidity; |
|||
articles.add(new Article(title, url, content)); |
|||
System.out.println("DEBUG: Added - " + content); |
|||
} else { |
|||
System.out.println("DEBUG: Skipped - cannot parse date: " + dateStr); |
|||
} |
|||
} |
|||
} |
|||
} |
|||
|
|||
System.out.println("\nDEBUG: Parsed " + articles.size() + " weather items"); |
|||
return articles; |
|||
} |
|||
|
|||
private LocalDate parseDate(String dateStr) { |
|||
if (dateStr == null || dateStr.isEmpty()) return null; |
|||
|
|||
String cleaned = dateStr.replaceAll("\\s+", " ").trim(); |
|||
|
|||
for (DateTimeFormatter formatter : DATE_FORMATTERS) { |
|||
try { |
|||
LocalDate date = LocalDate.parse(cleaned, formatter); |
|||
if (date.getYear() < 2000) { |
|||
date = date.withYear(LocalDate.now().getYear()); |
|||
} |
|||
return date; |
|||
} catch (Exception e) { |
|||
} |
|||
} |
|||
|
|||
try { |
|||
String yearStr = cleaned.replaceAll(".*?(\\d{4}).*", "$1"); |
|||
String monthStr = cleaned.replaceAll(".*?(\\d{1,2})月.*|.*?(\\d{1,2})-.*|.*?(\\d{1,2})/.*", "$1$2$3"); |
|||
String dayStr = cleaned.replaceAll(".*月(\\d{1,2})日.*|.*-(\\d{1,2}).*|.*-(\\d{1,2}).*|.*\\d+/(\\d{1,2}).*", "$1$2$3$4"); |
|||
|
|||
if (!yearStr.isEmpty() && !monthStr.isEmpty() && !dayStr.isEmpty()) { |
|||
int year = Integer.parseInt(yearStr); |
|||
int month = Integer.parseInt(monthStr); |
|||
int day = Integer.parseInt(dayStr); |
|||
return LocalDate.of(year, month, day); |
|||
} |
|||
} catch (Exception e) { |
|||
} |
|||
|
|||
return null; |
|||
} |
|||
|
|||
private String extractDate(String text) { |
|||
if (text == null) return ""; |
|||
|
|||
int dateEnd = text.indexOf(" "); |
|||
if (dateEnd > 0) { |
|||
return text.substring(0, dateEnd); |
|||
} |
|||
|
|||
if (text.length() > 10) { |
|||
return text.substring(0, 10); |
|||
} |
|||
return text; |
|||
} |
|||
} |
|||
@ -0,0 +1,55 @@ |
|||
package com.example.datacollect.view; |
|||
|
|||
import com.example.datacollect.model.Article; |
|||
import org.slf4j.Logger; |
|||
import org.slf4j.LoggerFactory; |
|||
import java.io.BufferedReader; |
|||
import java.io.InputStreamReader; |
|||
import java.nio.charset.StandardCharsets; |
|||
import java.util.List; |
|||
|
|||
public class ConsoleView { |
|||
private static final Logger logger = LoggerFactory.getLogger(ConsoleView.class); |
|||
private static final String ANSI_RESET = "\u001B[0m"; |
|||
private static final String ANSI_GREEN = "\u001B[32m"; |
|||
private static final String ANSI_RED = "\u001B[31m"; |
|||
private static final String ANSI_BLUE = "\u001B[34m"; |
|||
|
|||
private final BufferedReader reader = new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8)); |
|||
|
|||
public String readLine() { |
|||
System.out.print("> "); |
|||
try { |
|||
return reader.readLine(); |
|||
} catch (Exception e) { |
|||
return ""; |
|||
} |
|||
} |
|||
|
|||
public void printSuccess(String msg) { |
|||
System.out.println(ANSI_GREEN + msg + ANSI_RESET); |
|||
} |
|||
|
|||
public void printError(String msg) { |
|||
System.out.println(ANSI_RED + msg + ANSI_RESET); |
|||
} |
|||
|
|||
public void printInfo(String msg) { |
|||
System.out.println(ANSI_BLUE + msg + ANSI_RESET); |
|||
} |
|||
|
|||
public void print(String msg) { |
|||
System.out.print(msg); |
|||
} |
|||
|
|||
public void display(List<Article> articles) { |
|||
if (articles.isEmpty()) { |
|||
printInfo("暂无文章,请先执行 crawl。"); |
|||
return; |
|||
} |
|||
for (int i = 0; i < articles.size(); i++) { |
|||
Article a = articles.get(i); |
|||
System.out.println((i + 1) + ". " + a.getTitle() + " | " + a.getUrl()); |
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,26 @@ |
|||
<?xml version="1.0" encoding="UTF-8"?> |
|||
<configuration>
    <!-- Console output: time of day, thread, level, abbreviated logger name, message. -->
    <appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender">
        <encoder>
            <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
        </encoder>
    </appender>

    <!-- File output: logs/crawler.log, rolled over daily, seven days of history kept. -->
    <appender name="FILE" class="ch.qos.logback.core.rolling.RollingFileAppender">
        <file>logs/crawler.log</file>
        <rollingPolicy class="ch.qos.logback.core.rolling.TimeBasedRollingPolicy">
            <fileNamePattern>logs/crawler-%d{yyyy-MM-dd}.log</fileNamePattern>
            <maxHistory>7</maxHistory>
        </rollingPolicy>
        <encoder>
            <!-- Logged article titles contain Chinese text; do not depend on the platform charset. -->
            <charset>UTF-8</charset>
            <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
        </encoder>
    </appender>

    <!-- Application classes log from DEBUG upwards; everything else from INFO (root). -->
    <logger name="com.example.datacollect" level="DEBUG"/>

    <root level="INFO">
        <appender-ref ref="CONSOLE"/>
        <appender-ref ref="FILE"/>
    </root>
</configuration>
|||
File diff suppressed because it is too large
|
|
Loading…
Reference in new issue