6 changed files with 269 additions and 0 deletions
@ -0,0 +1,7 @@ |
|||
<?xml version="1.0" encoding="UTF-8"?> |
|||
<!-- Classpath description; the element names match an Eclipse .classpath file (NOTE(review): confirm against the IDE setup). -->
<classpath> |
|||
<!-- Source entry pointing at src/main/java. -->
<classpathentry kind="src" path="src/main/java"/> |
|||
<!-- Source entry pointing at src/main/resources. -->
<classpathentry kind="src" path="src/main/resources"/> |
|||
<!-- Library entry for the jsoup 1.17.2 jar under lib/. NOTE(review): this is the only library listed here; confirm how any other dependencies are supplied. -->
<classpathentry kind="lib" path="lib/jsoup-1.17.2.jar"/> |
|||
<!-- Output entry pointing at bin. -->
<classpathentry kind="output" path="bin"/> |
|||
</classpath> |
|||
@ -0,0 +1,48 @@ |
|||
package com.crawler.common; |
|||
|
|||
/**
 * Central registry of ANSI SGR escape sequences plus helpers that wrap text in them.
 *
 * <p>Every helper appends {@link #RESET} after the text so that styling does not
 * leak into whatever is printed next. The class is a stateless utility and cannot
 * be instantiated.
 */
public final class AnsiColorUtil {

    /** Resets all previously applied attributes. */
    public static final String RESET = "\u001B[0m";

    // Regular foreground colors (SGR 30-37).
    public static final String BLACK = "\u001B[30m";
    public static final String RED = "\u001B[31m";
    public static final String GREEN = "\u001B[32m";
    public static final String YELLOW = "\u001B[33m";
    public static final String BLUE = "\u001B[34m";
    public static final String PURPLE = "\u001B[35m";
    public static final String CYAN = "\u001B[36m";
    public static final String WHITE = "\u001B[37m";

    // Bold variants: SGR attribute 1 combined with the foreground color.
    public static final String BLACK_BOLD = "\u001B[1;30m";
    public static final String RED_BOLD = "\u001B[1;31m";
    public static final String GREEN_BOLD = "\u001B[1;32m";
    public static final String YELLOW_BOLD = "\u001B[1;33m";
    public static final String BLUE_BOLD = "\u001B[1;34m";
    public static final String PURPLE_BOLD = "\u001B[1;35m";
    public static final String CYAN_BOLD = "\u001B[1;36m";
    public static final String WHITE_BOLD = "\u001B[1;37m";

    private AnsiColorUtil() {
        // Utility class: no instances.
    }

    /**
     * Wraps {@code text} in the given escape sequence followed by {@link #RESET}.
     *
     * @param text  the text to style; {@code null} is rendered as the string "null",
     *              as with ordinary string concatenation
     * @param color an escape sequence such as {@link #RED}; when {@code null} the text
     *              is returned unstyled instead of being prefixed with the literal
     *              "null"
     * @return the styled text, or the plain text when {@code color} is {@code null}
     */
    public static String colorize(String text, String color) {
        if (color == null) {
            // Without this guard, concatenation would print "null" in front of the text.
            return String.valueOf(text);
        }
        return color + text + RESET;
    }

    /**
     * Styles {@code text} as a success message (bold green).
     *
     * @param text the message to style
     * @return the message wrapped in {@link #GREEN_BOLD} and {@link #RESET}
     */
    public static String success(String text) {
        return colorize(text, GREEN_BOLD);
    }

    /**
     * Styles {@code text} as an error message (bold red).
     *
     * @param text the message to style
     * @return the message wrapped in {@link #RED_BOLD} and {@link #RESET}
     */
    public static String error(String text) {
        return colorize(text, RED_BOLD);
    }

    /**
     * Styles {@code text} as a warning message (bold yellow).
     *
     * @param text the message to style
     * @return the message wrapped in {@link #YELLOW_BOLD} and {@link #RESET}
     */
    public static String warning(String text) {
        return colorize(text, YELLOW_BOLD);
    }

    /**
     * Styles {@code text} as an informational message (bold blue).
     *
     * @param text the message to style
     * @return the message wrapped in {@link #BLUE_BOLD} and {@link #RESET}
     */
    public static String info(String text) {
        return colorize(text, BLUE_BOLD);
    }

    /**
     * Styles {@code text} as highlighted output (bold cyan).
     *
     * @param text the message to style
     * @return the message wrapped in {@link #CYAN_BOLD} and {@link #RESET}
     */
    public static String highlight(String text) {
        return colorize(text, CYAN_BOLD);
    }
}
|||
@ -0,0 +1,31 @@ |
|||
package com.crawler.common; |
|||
|
|||
public abstract class BaseCommand<T> implements Command { |
|||
protected T model; |
|||
protected ConsoleView view; |
|||
|
|||
public BaseCommand(T model, ConsoleView view) { |
|||
this.model = model; |
|||
this.view = view; |
|||
} |
|||
|
|||
public BaseCommand(ConsoleView view) { |
|||
this.view = view; |
|||
} |
|||
|
|||
public T getModel() { |
|||
return model; |
|||
} |
|||
|
|||
public void setModel(T model) { |
|||
this.model = model; |
|||
} |
|||
|
|||
public ConsoleView getView() { |
|||
return view; |
|||
} |
|||
|
|||
public void setView(ConsoleView view) { |
|||
this.view = view; |
|||
} |
|||
} |
|||
@ -0,0 +1,35 @@ |
|||
package com.crawler.common; |
|||
|
|||
import java.io.IOException; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public abstract class BaseCrawler<T> { |
|||
|
|||
public abstract List<T> crawl(); |
|||
|
|||
public void saveToCSV(List<T> data, String filePath) throws IOException { |
|||
if (data == null || data.isEmpty()) { |
|||
LoggerUtil.warn("没有数据需要保存"); |
|||
return; |
|||
} |
|||
|
|||
CsvUtil.ensureDirectoryExists(filePath); |
|||
List<String[]> csvData = convertToCsvFormat(data); |
|||
CsvUtil.write(filePath, csvData); |
|||
} |
|||
|
|||
public List<T> loadFromCSV(String filePath) throws IOException { |
|||
if (!CsvUtil.fileExists(filePath)) { |
|||
LoggerUtil.warn("文件不存在: {}", filePath); |
|||
return new ArrayList<>(); |
|||
} |
|||
|
|||
List<String[]> csvData = CsvUtil.read(filePath); |
|||
return convertFromCsvFormat(csvData); |
|||
} |
|||
|
|||
protected abstract List<String[]> convertToCsvFormat(List<T> data); |
|||
|
|||
protected abstract List<T> convertFromCsvFormat(List<String[]> csvData); |
|||
} |
|||
@ -0,0 +1,126 @@ |
|||
# CrawlerProject - Java爬虫项目 |
|||
|
|||
一个基于Java的爬虫项目,包含豆瓣电影Top250、网易云音乐热搜、北京天气预报三个模块。 |
|||
|
|||
## 项目结构 |
|||
|
|||
``` |
|||
CrawlerProject/ |
|||
├── pom.xml # Maven配置文件 |
|||
├── README.md # 项目说明 |
|||
├── output/ # 输出目录(运行时自动创建) |
|||
│ ├── douban_top250.csv # 豆瓣电影数据 |
|||
│ ├── netease_top150.csv # 网易云音乐数据 |
|||
│ └── beijing_weather_30days.csv # 北京天气数据 |
|||
└── src/main/ |
|||
├── java/com/crawler/ |
|||
│ ├── common/ # 公共工具类 |
|||
│ │ ├── AnsiColorUtil.java # ANSI颜色码管理 |
|||
│ │ ├── BaseCommand.java # 命令抽象类 |
|||
│ │ ├── BaseCrawler.java # 爬虫抽象类 |
|||
│ │ ├── Command.java # 命令接口 |
|||
│ │ ├── ConsoleView.java # 统一输出类 |
|||
│ │ ├── CsvUtil.java # CSV读写工具 |
|||
│ │ ├── LoggerUtil.java # 日志工具类 |
|||
│ │ └── MainController.java # 主控制器 |
|||
│ ├── douban/ # 豆瓣模块 |
|||
│ ├── netease/ # 网易云模块 |
|||
│ └── weather/ # 天气模块 |
|||
└── resources/ |
|||
└── logback.xml # 日志配置 |
|||
``` |
|||
|
|||
## 技术栈 |
|||
|
|||
- Java 11 |
|||
- Maven 3.8+ |
|||
- Jsoup 1.17.2 - HTML解析 |
|||
- OpenCSV 5.9 - CSV处理 |
|||
- SLF4J + Logback - 日志框架 |
|||
|
|||
## 运行方式 |
|||
|
|||
### 方式一:使用Maven运行 |
|||
|
|||
```bash |
|||
cd CrawlerProject |
|||
mvn clean compile |
|||
mvn exec:java |
|||
``` |
|||
|
|||
### 方式二:打包后运行 |
|||
|
|||
```bash |
|||
cd CrawlerProject |
|||
mvn clean package |
|||
java -jar target/CrawlerProject-1.0.0.jar |
|||
``` |
|||
|
|||
## 使用说明 |
|||
|
|||
运行后会显示主菜单: |
|||
|
|||
``` |
|||
========== 爬虫系统 ========== |
|||
1. 豆瓣电影Top250 |
|||
2. 网易云音乐热搜 |
|||
3. 北京天气预报 |
|||
0. 退出 |
|||
请选择: |
|||
``` |
|||
|
|||
### 豆瓣电影Top250模块 |
|||
|
|||
进入模块后支持以下命令: |
|||
- `help` - 显示帮助信息 |
|||
- `list` - 列出已爬取的电影 |
|||
- `crawl` - 爬取豆瓣电影Top250数据 |
|||
- `exit` - 退出模块 |
|||
|
|||
### 网易云音乐热搜模块 |
|||
|
|||
进入模块后支持以下命令: |
|||
- `help` - 显示帮助信息 |
|||
- `list` - 列出已爬取的歌曲 |
|||
- `crawl` - 爬取网易云音乐热搜数据 |
|||
- `exit` - 退出模块 |
|||
|
|||
**注意**:网易云反爬严格,如无法获取真实数据将使用模拟数据。 |
|||
|
|||
### 北京天气预报模块 |
|||
|
|||
进入模块后支持以下命令: |
|||
- `help` - 显示帮助信息 |
|||
- `list` - 列出已爬取的天气数据 |
|||
- `crawl` - 爬取北京30天天气预报 |
|||
- `exit` - 退出模块 |
|||
|
|||
**API配置**:如需获取真实天气数据,请在 `WeatherCrawler.java` 中配置和风天气API Key: |
|||
```java |
|||
private static final String QWEATHER_API_KEY = "YOUR_API_KEY"; |
|||
``` |
|||
注册地址:https://dev.qweather.com/(`devapi.qweather.com` 是 API 请求域名,并非注册页面)
|||
|
|||
## 输出文件 |
|||
|
|||
- `output/douban_top250.csv` - 豆瓣电影Top250数据 |
|||
- `output/netease_top150.csv` - 网易云音乐热搜榜数据 |
|||
- `output/beijing_weather_30days.csv` - 北京30天天气预报数据 |
|||
|
|||
## 注意事项 |
|||
|
|||
1. **网络请求**:爬虫会访问外部网站,请确保网络连接正常 |
|||
2. **反爬机制**:部分网站有反爬机制,因此爬取时会加入随机延迟,以降低被拦截的风险
|||
3. **数据保存**:所有爬取数据会自动保存到CSV文件 |
|||
4. **模拟数据**:当无法获取真实数据时,会使用模拟数据展示 |
|||
|
|||
## 代码规范 |
|||
|
|||
- 所有输出均通过 `ConsoleView` 类,禁止直接调用 `System.out` |
|||
- ANSI颜色码统一在 `AnsiColorUtil` 类中定义 |
|||
- 每个模块包含完整的MVC结构 |
|||
- 使用命令模式实现模块功能 |
|||
|
|||
## 许可证 |
|||
|
|||
MIT License |
|||
@ -0,0 +1,22 @@ |
|||
<?xml version="1.0" encoding="UTF-8"?> |
|||
<configuration>
    <!-- Console output; the encoder charset is left unset here, as in the original configuration. -->
    <appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender">
        <encoder>
            <pattern>%d{yyyy-MM-dd HH:mm:ss} [%thread] %-5level %logger{36} - %msg%n</pattern>
        </encoder>
    </appender>

    <!-- File output to logs/crawler.log. -->
    <appender name="FILE" class="ch.qos.logback.core.FileAppender">
        <file>logs/crawler.log</file>
        <encoder>
            <!-- Pin the file encoding: the application logs non-ASCII (Chinese) messages,
                 and without an explicit charset the file encoding follows the platform default. -->
            <charset>UTF-8</charset>
            <pattern>%d{yyyy-MM-dd HH:mm:ss} [%thread] %-5level %logger{36} - %msg%n</pattern>
        </encoder>
    </appender>

    <!-- Level for the application's own packages. -->
    <logger name="com.crawler" level="INFO"/>

    <!-- Everything at INFO and above goes to both appenders. -->
    <root level="INFO">
        <appender-ref ref="CONSOLE"/>
        <appender-ref ref="FILE"/>
    </root>
</configuration>
|||
Loading…
Reference in new issue