6 changed files with 269 additions and 0 deletions
@ -0,0 +1,7 @@ |
|||||
|
<?xml version="1.0" encoding="UTF-8"?> |
||||
|
<classpath> |
||||
|
<classpathentry kind="src" path="src/main/java"/> |
||||
|
<classpathentry kind="src" path="src/main/resources"/> |
||||
|
<classpathentry kind="lib" path="lib/jsoup-1.17.2.jar"/> |
||||
|
<classpathentry kind="output" path="bin"/> |
||||
|
</classpath> |
||||
@ -0,0 +1,48 @@ |
|||||
|
package com.crawler.common; |
||||
|
|
||||
|
/**
 * Central place for the ANSI escape sequences used to color console output,
 * together with helpers that wrap a piece of text in a color and append
 * {@link #RESET} afterwards.
 *
 * <p>The class only exposes static members and cannot be instantiated.
 */
public final class AnsiColorUtil {
    private AnsiColorUtil() {}

    /** Sequence appended after colored text to restore the terminal's default attributes. */
    public static final String RESET = "\u001B[0m";

    // Regular-intensity foreground colors (SGR codes 30-37).
    public static final String BLACK = "\u001B[30m";
    public static final String RED = "\u001B[31m";
    public static final String GREEN = "\u001B[32m";
    public static final String YELLOW = "\u001B[33m";
    public static final String BLUE = "\u001B[34m";
    public static final String PURPLE = "\u001B[35m";
    public static final String CYAN = "\u001B[36m";
    public static final String WHITE = "\u001B[37m";

    // Same foreground colors combined with the bold attribute (SGR code 1).
    public static final String BLACK_BOLD = "\u001B[1;30m";
    public static final String RED_BOLD = "\u001B[1;31m";
    public static final String GREEN_BOLD = "\u001B[1;32m";
    public static final String YELLOW_BOLD = "\u001B[1;33m";
    public static final String BLUE_BOLD = "\u001B[1;34m";
    public static final String PURPLE_BOLD = "\u001B[1;35m";
    public static final String CYAN_BOLD = "\u001B[1;36m";
    public static final String WHITE_BOLD = "\u001B[1;37m";

    /**
     * Wraps {@code text} in the given color sequence followed by {@link #RESET}.
     *
     * @param text  the text to color; a {@code null} text is rendered as {@code "null"},
     *              exactly as string concatenation would
     * @param color one of the escape-sequence constants of this class; when {@code null},
     *              no color is applied and the text is returned without any escape codes
     * @return the text surrounded by {@code color} and {@link #RESET}, or the plain text
     *         when {@code color} is {@code null}
     */
    public static String colorize(String text, String color) {
        if (color == null) {
            // Without this guard the literal "null" would be prepended to the output.
            return String.valueOf(text);
        }
        return color + text + RESET;
    }

    /**
     * Formats {@code text} in bold green.
     *
     * @param text the text to color
     * @return {@code text} wrapped in {@link #GREEN_BOLD} and {@link #RESET}
     */
    public static String success(String text) {
        return colorize(text, GREEN_BOLD);
    }

    /**
     * Formats {@code text} in bold red.
     *
     * @param text the text to color
     * @return {@code text} wrapped in {@link #RED_BOLD} and {@link #RESET}
     */
    public static String error(String text) {
        return colorize(text, RED_BOLD);
    }

    /**
     * Formats {@code text} in bold yellow.
     *
     * @param text the text to color
     * @return {@code text} wrapped in {@link #YELLOW_BOLD} and {@link #RESET}
     */
    public static String warning(String text) {
        return colorize(text, YELLOW_BOLD);
    }

    /**
     * Formats {@code text} in bold blue.
     *
     * @param text the text to color
     * @return {@code text} wrapped in {@link #BLUE_BOLD} and {@link #RESET}
     */
    public static String info(String text) {
        return colorize(text, BLUE_BOLD);
    }

    /**
     * Formats {@code text} in bold cyan.
     *
     * @param text the text to color
     * @return {@code text} wrapped in {@link #CYAN_BOLD} and {@link #RESET}
     */
    public static String highlight(String text) {
        return colorize(text, CYAN_BOLD);
    }
}
||||
@ -0,0 +1,31 @@ |
|||||
|
package com.crawler.common; |
||||
|
|
||||
|
public abstract class BaseCommand<T> implements Command { |
||||
|
protected T model; |
||||
|
protected ConsoleView view; |
||||
|
|
||||
|
public BaseCommand(T model, ConsoleView view) { |
||||
|
this.model = model; |
||||
|
this.view = view; |
||||
|
} |
||||
|
|
||||
|
public BaseCommand(ConsoleView view) { |
||||
|
this.view = view; |
||||
|
} |
||||
|
|
||||
|
public T getModel() { |
||||
|
return model; |
||||
|
} |
||||
|
|
||||
|
public void setModel(T model) { |
||||
|
this.model = model; |
||||
|
} |
||||
|
|
||||
|
public ConsoleView getView() { |
||||
|
return view; |
||||
|
} |
||||
|
|
||||
|
public void setView(ConsoleView view) { |
||||
|
this.view = view; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,35 @@ |
|||||
|
package com.crawler.common; |
||||
|
|
||||
|
import java.io.IOException; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public abstract class BaseCrawler<T> { |
||||
|
|
||||
|
public abstract List<T> crawl(); |
||||
|
|
||||
|
public void saveToCSV(List<T> data, String filePath) throws IOException { |
||||
|
if (data == null || data.isEmpty()) { |
||||
|
LoggerUtil.warn("没有数据需要保存"); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
CsvUtil.ensureDirectoryExists(filePath); |
||||
|
List<String[]> csvData = convertToCsvFormat(data); |
||||
|
CsvUtil.write(filePath, csvData); |
||||
|
} |
||||
|
|
||||
|
public List<T> loadFromCSV(String filePath) throws IOException { |
||||
|
if (!CsvUtil.fileExists(filePath)) { |
||||
|
LoggerUtil.warn("文件不存在: {}", filePath); |
||||
|
return new ArrayList<>(); |
||||
|
} |
||||
|
|
||||
|
List<String[]> csvData = CsvUtil.read(filePath); |
||||
|
return convertFromCsvFormat(csvData); |
||||
|
} |
||||
|
|
||||
|
protected abstract List<String[]> convertToCsvFormat(List<T> data); |
||||
|
|
||||
|
protected abstract List<T> convertFromCsvFormat(List<String[]> csvData); |
||||
|
} |
||||
@ -0,0 +1,126 @@ |
|||||
|
# CrawlerProject - Java爬虫项目 |
||||
|
|
||||
|
一个基于Java的爬虫项目,包含豆瓣电影Top250、网易云音乐热搜、北京天气预报三个模块。 |
||||
|
|
||||
|
## 项目结构 |
||||
|
|
||||
|
``` |
||||
|
CrawlerProject/ |
||||
|
├── pom.xml # Maven配置文件 |
||||
|
├── README.md # 项目说明 |
||||
|
├── output/ # 输出目录(运行时自动创建) |
||||
|
│ ├── douban_top250.csv # 豆瓣电影数据 |
||||
|
│ ├── netease_top150.csv # 网易云音乐数据 |
||||
|
│ └── beijing_weather_30days.csv # 北京天气数据 |
||||
|
└── src/main/ |
||||
|
├── java/com/crawler/ |
||||
|
│ ├── common/ # 公共工具类 |
||||
|
│ │ ├── AnsiColorUtil.java # ANSI颜色码管理 |
||||
|
│ │ ├── BaseCommand.java # 命令抽象类 |
||||
|
│ │ ├── BaseCrawler.java # 爬虫抽象类 |
||||
|
│ │ ├── Command.java # 命令接口 |
||||
|
│ │ ├── ConsoleView.java # 统一输出类 |
||||
|
│ │ ├── CsvUtil.java # CSV读写工具 |
||||
|
│ │ ├── LoggerUtil.java # 日志工具类 |
||||
|
│ │ └── MainController.java # 主控制器 |
||||
|
│ ├── douban/ # 豆瓣模块 |
||||
|
│ ├── netease/ # 网易云模块 |
||||
|
│ └── weather/ # 天气模块 |
||||
|
└── resources/ |
||||
|
└── logback.xml # 日志配置 |
||||
|
``` |
||||
|
|
||||
|
## 技术栈 |
||||
|
|
||||
|
- Java 11 |
||||
|
- Maven 3.8+ |
||||
|
- Jsoup 1.17.2 - HTML解析 |
||||
|
- OpenCSV 5.9 - CSV处理 |
||||
|
- SLF4J + Logback - 日志框架 |
||||
|
|
||||
|
## 运行方式 |
||||
|
|
||||
|
### 方式一:使用Maven运行 |
||||
|
|
||||
|
```bash |
||||
|
cd CrawlerProject |
||||
|
mvn clean compile |
||||
|
mvn exec:java |
||||
|
``` |
||||
|
|
||||
|
### 方式二:打包后运行 |
||||
|
|
||||
|
```bash |
||||
|
cd CrawlerProject |
||||
|
mvn clean package |
||||
|
java -jar target/CrawlerProject-1.0.0.jar |
||||
|
``` |
||||
|
|
||||
|
## 使用说明 |
||||
|
|
||||
|
运行后会显示主菜单: |
||||
|
|
||||
|
``` |
||||
|
========== 爬虫系统 ========== |
||||
|
1. 豆瓣电影Top250 |
||||
|
2. 网易云音乐热搜 |
||||
|
3. 北京天气预报 |
||||
|
0. 退出 |
||||
|
请选择: |
||||
|
``` |
||||
|
|
||||
|
### 豆瓣电影Top250模块 |
||||
|
|
||||
|
进入模块后支持以下命令: |
||||
|
- `help` - 显示帮助信息 |
||||
|
- `list` - 列出已爬取的电影 |
||||
|
- `crawl` - 爬取豆瓣电影Top250数据 |
||||
|
- `exit` - 退出模块 |
||||
|
|
||||
|
### 网易云音乐热搜模块 |
||||
|
|
||||
|
进入模块后支持以下命令: |
||||
|
- `help` - 显示帮助信息 |
||||
|
- `list` - 列出已爬取的歌曲 |
||||
|
- `crawl` - 爬取网易云音乐热搜数据 |
||||
|
- `exit` - 退出模块 |
||||
|
|
||||
|
**注意**:网易云反爬严格,如无法获取真实数据将使用模拟数据。 |
||||
|
|
||||
|
### 北京天气预报模块 |
||||
|
|
||||
|
进入模块后支持以下命令: |
||||
|
- `help` - 显示帮助信息 |
||||
|
- `list` - 列出已爬取的天气数据 |
||||
|
- `crawl` - 爬取北京30天天气预报 |
||||
|
- `exit` - 退出模块 |
||||
|
|
||||
|
**API配置**:如需获取真实天气数据,请在 `WeatherCrawler.java` 中配置和风天气API Key: |
||||
|
```java |
||||
|
private static final String QWEATHER_API_KEY = "YOUR_API_KEY"; |
||||
|
``` |
||||
|
注册地址:https://dev.qweather.com/ (`devapi.qweather.com` 是 API 请求域名,而非注册页面) |
||||
|
|
||||
|
## 输出文件 |
||||
|
|
||||
|
- `output/douban_top250.csv` - 豆瓣电影Top250数据 |
||||
|
- `output/netease_top150.csv` - 网易云音乐热搜榜数据 |
||||
|
- `output/beijing_weather_30days.csv` - 北京30天天气预报数据 |
||||
|
|
||||
|
## 注意事项 |
||||
|
|
||||
|
1. **网络请求**:爬虫会访问外部网站,请确保网络连接正常 |
||||
|
2. **反爬机制**:部分网站有反爬机制,爬取时会有随机延迟 |
||||
|
3. **数据保存**:所有爬取数据会自动保存到CSV文件 |
||||
|
4. **模拟数据**:当无法获取真实数据时,会使用模拟数据展示 |
||||
|
|
||||
|
## 代码规范 |
||||
|
|
||||
|
- 所有输出均通过 `ConsoleView` 类,禁止直接调用 `System.out` |
||||
|
- ANSI颜色码统一在 `AnsiColorUtil` 类中定义 |
||||
|
- 每个模块包含完整的MVC结构 |
||||
|
- 使用命令模式实现模块功能 |
||||
|
|
||||
|
## 许可证 |
||||
|
|
||||
|
MIT License |
||||
@ -0,0 +1,22 @@ |
|||||
|
<?xml version="1.0" encoding="UTF-8"?> |
||||
|
<configuration> |
||||
|
<appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender"> |
||||
|
<encoder> |
||||
|
<pattern>%d{yyyy-MM-dd HH:mm:ss} [%thread] %-5level %logger{36} - %msg%n</pattern> |
||||
|
</encoder> |
||||
|
</appender> |
||||
|
|
||||
|
<appender name="FILE" class="ch.qos.logback.core.FileAppender"> |
||||
|
<file>logs/crawler.log</file> |
||||
|
<encoder> |
||||
|
<pattern>%d{yyyy-MM-dd HH:mm:ss} [%thread] %-5level %logger{36} - %msg%n</pattern> |
||||
|
</encoder> |
||||
|
</appender> |
||||
|
|
||||
|
<logger name="com.crawler" level="INFO"/> |
||||
|
|
||||
|
<root level="INFO"> |
||||
|
<appender-ref ref="CONSOLE"/> |
||||
|
<appender-ref ref="FILE"/> |
||||
|
</root> |
||||
|
</configuration> |
||||
Loading…
Reference in new issue