5 changed files with 146 additions and 0 deletions
Binary file not shown.
@@ -0,0 +1,80 @@
package com.rental.crawler;

import com.rental.crawler.model.Book;
import com.rental.crawler.util.HttpUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class DoubanCrawler {

    private static final String BASE_URL = "https://book.douban.com/tag/论文";

    /**
     * Crawls the given number of result pages (20 books per page) and returns the parsed books.
     */
    public List<Book> crawl(int pageCount) throws IOException {
        List<Book> books = new ArrayList<>();

        for (int page = 0; page < pageCount; page++) {
            // Douban paginates the tag listing with a "start" offset of 20 books per page.
            String url = BASE_URL + "?start=" + (page * 20);
            System.out.println("Crawling: " + url);

            Document doc = HttpUtil.getDocument(url);

            // Each book on the listing page is wrapped in a ".subject-item" element.
            Elements bookElements = doc.select(".subject-item");
            for (Element bookElement : bookElements) {
                Book book = parseBook(bookElement);
                if (book != null) {
                    books.add(book);
                }
            }

            // Pause between pages to avoid triggering Douban's anti-crawling measures.
            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        }

        return books;
    }

    private Book parseBook(Element bookElement) {
        Book book = new Book();

        Element titleElement = bookElement.selectFirst(".info h2 a");
        if (titleElement != null) {
            book.setTitle(titleElement.text().trim());
            book.setUrl(titleElement.attr("href"));
        }

        // The ".pub" text is typically "authors / publisher / publish date / price";
        // entries with extra segments (e.g. translators) may be assigned imprecisely.
        Element infoElement = bookElement.selectFirst(".info .pub");
        if (infoElement != null) {
            String info = infoElement.text().trim();
            String[] parts = info.split("/");
            if (parts.length >= 4) {
                book.setAuthors(parts[0].trim());
                book.setPublisher(parts[1].trim());
                book.setPublishDate(parts[2].trim());
                book.setPrice(parts[3].trim());
            }
        }

        Element ratingElement = bookElement.selectFirst(".info .rating_nums");
        if (ratingElement != null) {
            try {
                book.setRating(Double.parseDouble(ratingElement.text().trim()));
            } catch (NumberFormatException e) {
                book.setRating(0.0);
            }
        }

        Element summaryElement = bookElement.selectFirst(".info p");
        if (summaryElement != null) {
            book.setSummary(summaryElement.text().trim());
        }

        return book;
    }
}
Binary file not shown.
@@ -0,0 +1,66 @@
# Douban Books Crawler: Project Assistance Record

## Project Overview
- **Project name**: Douban Books crawler
- **Goal**: crawl book information under the 论文 (academic papers) tag on Douban Books
- **Tech stack**: Java + Jsoup

## Assistance Process

### 1. Environment Setup
- Checked the Java environment: confirmed OpenJDK 21.0.10 LTS is installed
- Added the Jsoup dependency, used for parsing web page HTML
- Created the Maven project structure

### 2. Project Structure
- Created the directory structure: `com/rental/crawler/model` and `com/rental/crawler/util`
- Configured `pom.xml`: added the Jsoup dependency

### 3. Core Files

#### HTTP utility class (`HttpUtil.java`)
- Purpose: sends HTTP requests and fetches page content
- Features: sends a browser-like User-Agent and sets a request timeout (a sketch follows below)
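`HttpUtil.java` itself is not included in this diff; below is a minimal sketch of what its `getDocument` helper might look like, assuming it simply wraps Jsoup's `connect` API with the User-Agent and timeout described above (the concrete User-Agent string and timeout value are assumptions).

```java
package com.rental.crawler.util;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.IOException;

public class HttpUtil {

    // A common desktop browser User-Agent; the real project may use a different one.
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            + "(KHTML, like Gecko) Chrome/120.0 Safari/537.36";

    /** Fetches and parses the page at the given URL. */
    public static Document getDocument(String url) throws IOException {
        return Jsoup.connect(url)
                .userAgent(USER_AGENT)
                .timeout(10_000)   // 10-second timeout; the actual value is not shown in the diff
                .get();
    }
}
```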
#### Book data model (`Book.java`)
- Defines the book attributes: title, authors, publisher, publish date, price, rating, summary, and URL (sketched below)
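`Book.java` is also not part of the diff; judging from the setters called in `DoubanCrawler`, it is presumably a plain data class along these lines (a sketch, with only two accessor pairs written out).

```java
package com.rental.crawler.model;

public class Book {
    private String title;
    private String authors;
    private String publisher;
    private String publishDate;
    private String price;
    private double rating;
    private String summary;
    private String url;

    public String getTitle() { return title; }
    public void setTitle(String title) { this.title = title; }

    public double getRating() { return rating; }
    public void setRating(double rating) { this.rating = rating; }

    // ...the remaining fields follow the same getter/setter pattern.
}
```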
#### Douban crawler implementation (`DoubanCrawler.java`)
- Core function: crawls book information under the 论文 tag on Douban Books
- Supports crawling multiple pages, 20 books per page
- Parses the HTML and extracts the book fields
- Adds a delay between requests to avoid anti-crawling measures

#### Main class (`Main.java`)
- Starts the crawler and prints the results
- Supports specifying the number of pages to crawl (see the sketch below)
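Like the other helper files, `Main.java` is not shown in this diff; based on the description above it might look roughly like this (the command-line page-count argument and the `getTitle`/`getRating` getters are assumptions).

```java
package com.rental.crawler;

import com.rental.crawler.model.Book;

import java.io.IOException;
import java.util.List;

public class Main {
    public static void main(String[] args) throws IOException {
        // Default to 2 pages (40 books); an optional first argument overrides the page count.
        int pages = args.length > 0 ? Integer.parseInt(args[0]) : 2;

        List<Book> books = new DoubanCrawler().crawl(pages);
        System.out.println("Crawled " + books.size() + " books");
        for (Book book : books) {
            System.out.println(book.getTitle() + " | rating: " + book.getRating());
        }
    }
}
```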
### 4. Compilation and Execution
- Compiled the code with the `javac` command
- Ran the crawler's main class with the `java` command
- Saved the results by redirecting the crawler's output to a file

### 5. Project Packaging
- Created a `w3` folder to hold the project files and results
- Copied all crawler-related files into the `w3` folder
- Saved the crawler output to the `w3` folder
- Saved this assistance record to the `w3` folder

## Results
- Successfully crawled information for 40 books
- Each entry includes the title, authors, publisher, publish date, price, rating, summary, and Douban URL
- Results are saved in `w3/crawler_result.txt`

## Technical Notes
- Using the Jsoup library to parse HTML
- Simulating browser HTTP requests by setting a User-Agent
- Anti-crawling countermeasure: adding a delay between requests
- Data extraction with CSS selectors
- Command-line workflow: compiling and running the Java program

## Suggested Extensions
- Store the crawled data in a database
- Add a GUI
- Crawl pages with multiple threads (see the sketch below)
- Crawl additional tags
- Add data visualization
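For the multi-threading suggestion, one possible direction (purely a sketch, not part of the current project) is to submit each page to a small thread pool; `crawlPage(int)` is a hypothetical method that would fetch and parse a single results page.

```java
package com.rental.crawler;

import com.rental.crawler.model.Book;

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

public class ParallelCrawler {

    /**
     * Hypothetical sketch: fetches pages concurrently, assuming DoubanCrawler
     * gains a crawlPage(int) method that handles one results page in isolation.
     */
    public List<Book> crawl(DoubanCrawler crawler, int pageCount, int threads)
            throws InterruptedException, ExecutionException {
        ExecutorService pool = Executors.newFixedThreadPool(threads);
        try {
            List<Future<List<Book>>> futures = new ArrayList<>();
            for (int page = 0; page < pageCount; page++) {
                final int p = page;
                // Keep the pool small so the site is not hit by too many concurrent requests.
                futures.add(pool.submit(() -> crawler.crawlPage(p)));
            }

            List<Book> books = new ArrayList<>();
            for (Future<List<Book>> future : futures) {
                books.addAll(future.get());
            }
            return books;
        } finally {
            pool.shutdown();
        }
    }
}
```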
Binary file not shown.