You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

500 lines
27 KiB

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.*;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
// ==================== 1. 异常体系 ====================
/**
 * Failure categories used by the crawler. Each constant carries a fixed,
 * human-readable description that is available through {@link #getMessage()}.
 */
enum ErrorCode {
    CONNECTION_ERROR("连接失败"),
    PARSE_ERROR("解析失败"),
    VALIDATION_ERROR("参数验证失败"),
    IO_ERROR("IO操作失败"),
    UNKNOWN_ERROR("未知错误"),
    NO_RESULTS_ERROR("未获取到数据");

    /** Description supplied when the constant was declared. */
    private final String message;

    ErrorCode(String description) {
        message = description;
    }

    /**
     * @return the description attached to this error code
     */
    public String getMessage() {
        return message;
    }
}
/**
 * Checked exception thrown by crawler components. In addition to the usual
 * message and optional cause it records an {@link ErrorCode} describing the
 * failure category.
 */
class CrawlerException extends Exception {
    /** Failure category supplied at construction time. */
    private final ErrorCode errorCode;

    /**
     * @param errorCode failure category to record
     * @param message   detail message passed to {@link Exception}
     */
    public CrawlerException(ErrorCode errorCode, String message) {
        super(message);
        this.errorCode = errorCode;
    }

    /**
     * @param errorCode failure category to record
     * @param message   detail message passed to {@link Exception}
     * @param cause     underlying exception passed to {@link Exception}
     */
    public CrawlerException(ErrorCode errorCode, String message, Throwable cause) {
        super(message, cause);
        this.errorCode = errorCode;
    }

    /**
     * @return the failure category given to the constructor
     */
    public ErrorCode getErrorCode() {
        return errorCode;
    }
}
// ==================== 2. 数据模型 (Model) ====================
/**
 * Mutable holder for one book entry. All string fields start as {@code null}
 * and the rating starts at {@code 0.0} until the corresponding setter is called.
 */
class Book {
    private String title;
    private String authors;
    private String publisher;
    private String publishDate;
    private String price;
    private double rating;
    private String summary;
    private String url;

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getAuthors() {
        return authors;
    }

    public void setAuthors(String authors) {
        this.authors = authors;
    }

    public String getPublisher() {
        return publisher;
    }

    public void setPublisher(String publisher) {
        this.publisher = publisher;
    }

    public String getPublishDate() {
        return publishDate;
    }

    public void setPublishDate(String publishDate) {
        this.publishDate = publishDate;
    }

    public String getPrice() {
        return price;
    }

    public void setPrice(String price) {
        this.price = price;
    }

    public double getRating() {
        return rating;
    }

    public void setRating(double rating) {
        this.rating = rating;
    }

    public String getSummary() {
        return summary;
    }

    public void setSummary(String summary) {
        this.summary = summary;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }
}
/**
 * Mutable holder for one movie entry. All string fields start as {@code null}
 * and the rating starts at {@code 0.0} until the corresponding setter is called.
 */
class Movie {
    private String title;
    private String director;
    private String actors;
    private String year;
    private String type;
    private double rating;
    private String url;

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getDirector() {
        return director;
    }

    public void setDirector(String director) {
        this.director = director;
    }

    public String getActors() {
        return actors;
    }

    public void setActors(String actors) {
        this.actors = actors;
    }

    public String getYear() {
        return year;
    }

    public void setYear(String year) {
        this.year = year;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public double getRating() {
        return rating;
    }

    public void setRating(double rating) {
        this.rating = rating;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }
}
/**
 * Mutable holder for one music entry. All string fields start as {@code null}
 * and the rating starts at {@code 0.0} until the corresponding setter is called.
 */
class Music {
    private String title;
    private String artist;
    private String album;
    private String releaseDate;
    private double rating;
    private String url;

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getArtist() {
        return artist;
    }

    public void setArtist(String artist) {
        this.artist = artist;
    }

    public String getAlbum() {
        return album;
    }

    public void setAlbum(String album) {
        this.album = album;
    }

    public String getReleaseDate() {
        return releaseDate;
    }

    public void setReleaseDate(String releaseDate) {
        this.releaseDate = releaseDate;
    }

    public double getRating() {
        return rating;
    }

    public void setRating(double rating) {
        this.rating = rating;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }
}
class Article {
private String title; private String author; private String publishDate;
private String views; private String url;
public String getTitle() { return title; } public void setTitle(String title) { this.title = title; }
public String getAuthor() { return author; } public void setAuthor(String author) { this.author = author; }
public String getPublishDate() { return publishDate; } public void setPublishDate(String publishDate) { this.publishDate = publishDate; }
public String getViews() { return views; } public void setViews(String views) { this.views = views; }
public String getUrl() { return url; } public void setUrl(String url) { this.url = url; }
}
/**
 * Outcome of one crawl run: the source name, the collected items, the elapsed
 * time, and the number of items present when the result was created.
 */
class CrawlResult {
    private final String source;
    private final List<Object> items;
    private final long crawlTime;
    private final int totalCount;

    /**
     * @param source    name of the crawled source
     * @param items     collected items; stored by reference, not copied
     * @param crawlTime elapsed time value reported by the caller
     */
    public CrawlResult(String source, List<Object> items, long crawlTime) {
        this.source = source;
        this.items = items;
        this.crawlTime = crawlTime;
        // Snapshot of the list size at construction time.
        this.totalCount = items.size();
    }

    public String getSource() {
        return source;
    }

    /** @return the same list instance that was passed to the constructor */
    public List<Object> getItems() {
        return items;
    }

    public long getCrawlTime() {
        return crawlTime;
    }

    /** @return the item count captured when this result was constructed */
    public int getTotalCount() {
        return totalCount;
    }
}
// ==================== 3. 策略模式 (Strategy) ====================
/**
 * Strategy for crawling one source. Implementations fetch up to a requested
 * number of pages and return the items they extracted.
 */
interface CrawlStrategy {

    /**
     * Crawls the source.
     *
     * @param pageCount number of pages the caller asks the strategy to process
     * @return the extracted items
     * @throws CrawlerException if the crawl fails
     */
    List<Object> crawl(int pageCount) throws CrawlerException;

    /**
     * @return the name of the source this strategy crawls
     */
    String getSourceName();

    /**
     * @return a label for the kind of item this strategy produces
     */
    String getItemType();
}
/**
 * Crawls the Douban Books listing for the "计算机" tag and converts each
 * {@code .subject-item} element into a {@link Book}.
 *
 * <p>At most {@value #MAX_ITEMS} books are collected per call. The crawl
 * pauses {@value #PAGE_DELAY_MS} ms after every page.
 */
class DoubanBookStrategy implements CrawlStrategy {
    /** Upper bound on the number of books returned by one crawl. */
    private static final int MAX_ITEMS = 35;
    /** Increment of the {@code start} query parameter between consecutive pages. */
    private static final int PAGE_STEP = 20;
    /** Pause after each page request, in milliseconds. */
    private static final long PAGE_DELAY_MS = 800;
    /** Number of trailing "/"-separated segments read as publisher, date and price. */
    private static final int TRAILING_FIELDS = 3;

    /**
     * Fetches listing pages until {@code pageCount} pages were read or
     * {@value #MAX_ITEMS} books were collected.
     *
     * @param pageCount number of listing pages to request
     * @return the collected {@link Book} objects, never empty
     * @throws CrawlerException with {@link ErrorCode#CONNECTION_ERROR} when a page cannot be
     *     fetched, {@link ErrorCode#UNKNOWN_ERROR} when the thread is interrupted while pausing,
     *     or {@link ErrorCode#NO_RESULTS_ERROR} when no book with a title was found
     */
    @Override
    public List<Object> crawl(int pageCount) throws CrawlerException {
        List<Object> books = new ArrayList<>();
        try {
            for (int page = 0; page < pageCount && books.size() < MAX_ITEMS; page++) {
                String url = "https://book.douban.com/tag/计算机?start=" + (page * PAGE_STEP);
                System.out.println("正在爬取豆瓣读书: " + url);
                Document doc = HttpUtil.getDocument(url);
                for (Element element : doc.select(".subject-item")) {
                    Book book = parseBook(element);
                    // Entries without a title are skipped.
                    if (book.getTitle() != null && !book.getTitle().isEmpty()) {
                        books.add(book);
                    }
                    if (books.size() >= MAX_ITEMS) {
                        break;
                    }
                }
                Thread.sleep(PAGE_DELAY_MS);
            }
        } catch (IOException e) {
            throw new CrawlerException(ErrorCode.CONNECTION_ERROR, "连接豆瓣失败", e);
        } catch (InterruptedException e) {
            // Restore the interrupt flag before translating the exception.
            Thread.currentThread().interrupt();
            throw new CrawlerException(ErrorCode.UNKNOWN_ERROR, "爬取被中断", e);
        }
        if (books.isEmpty()) {
            throw new CrawlerException(ErrorCode.NO_RESULTS_ERROR, "未获取到豆瓣图书数据");
        }
        return books;
    }

    /** Builds a {@link Book} from one listing element; fields without a matching node stay unset. */
    private static Book parseBook(Element element) {
        Book book = new Book();
        Element titleElement = element.selectFirst(".info h2 a");
        if (titleElement != null) {
            book.setTitle(titleElement.text().trim());
            book.setUrl(titleElement.attr("href"));
        }
        Element infoElement = element.selectFirst(".info .pub");
        if (infoElement != null) {
            applyPubInfo(book, infoElement.text());
        }
        Element ratingElement = element.selectFirst(".info .rating_nums");
        if (ratingElement != null) {
            try {
                book.setRating(Double.parseDouble(ratingElement.text().trim()));
            } catch (NumberFormatException ignored) {
                // A non-numeric rating text is treated as "no rating".
                book.setRating(0.0);
            }
        }
        return book;
    }

    /**
     * Splits the publication line on "/" and fills authors, publisher, date and price.
     *
     * <p>The last three segments are read as publisher / date / price and everything
     * before them is joined as the authors. With exactly four segments this matches
     * the previous positional parsing. With more segments the previous code stored
     * the second segment as publisher, shifting every field.
     * NOTE(review): this assumes the extra segments are always on the author side
     * (e.g. co-authors or translators) — confirm against the live page.
     * Lines with fewer than four segments are left unparsed, as before.
     */
    private static void applyPubInfo(Book book, String pubText) {
        String[] parts = pubText.trim().split("/");
        int count = parts.length;
        if (count < TRAILING_FIELDS + 1) {
            return;
        }
        StringBuilder authors = new StringBuilder();
        for (int i = 0; i < count - TRAILING_FIELDS; i++) {
            if (i > 0) {
                authors.append(" / ");
            }
            authors.append(parts[i].trim());
        }
        book.setAuthors(authors.toString());
        book.setPublisher(parts[count - 3].trim());
        book.setPublishDate(parts[count - 2].trim());
        book.setPrice(parts[count - 1].trim());
    }

    @Override
    public String getSourceName() {
        return "豆瓣读书";
    }

    @Override
    public String getItemType() {
        return "Book";
    }
}
/**
 * Crawls the Douban Movies top-250 listing and converts each {@code .item}
 * element into a {@link Movie}.
 *
 * <p>At most {@value #MAX_ITEMS} movies are collected per call. The crawl
 * pauses {@value #PAGE_DELAY_MS} ms after every page.
 */
class DoubanMovieStrategy implements CrawlStrategy {
    /** Upper bound on the number of movies returned by one crawl. */
    private static final int MAX_ITEMS = 35;
    /** Increment of the {@code start} query parameter between consecutive pages. */
    private static final int PAGE_STEP = 25;
    /** Pause after each page request, in milliseconds. */
    private static final long PAGE_DELAY_MS = 800;
    /** Text marker that precedes the release date. */
    private static final String RELEASE_MARKER = "上映日期:";
    /** Number of leading characters of the release date kept as the year. */
    private static final int YEAR_LENGTH = 4;

    /**
     * Fetches listing pages until {@code pageCount} pages were read or
     * {@value #MAX_ITEMS} movies were collected.
     *
     * @param pageCount number of listing pages to request
     * @return the collected {@link Movie} objects, never empty
     * @throws CrawlerException with {@link ErrorCode#CONNECTION_ERROR} when a page cannot be
     *     fetched, {@link ErrorCode#UNKNOWN_ERROR} when the thread is interrupted while pausing,
     *     or {@link ErrorCode#NO_RESULTS_ERROR} when no movie with a title was found
     */
    @Override
    public List<Object> crawl(int pageCount) throws CrawlerException {
        List<Object> movies = new ArrayList<>();
        try {
            for (int page = 0; page < pageCount && movies.size() < MAX_ITEMS; page++) {
                String url = "https://movie.douban.com/top250?start=" + (page * PAGE_STEP);
                System.out.println("正在爬取豆瓣电影: " + url);
                Document doc = HttpUtil.getDocument(url);
                for (Element element : doc.select(".item")) {
                    Movie movie = parseMovie(element);
                    // Entries without a title are skipped.
                    if (movie.getTitle() != null && !movie.getTitle().isEmpty()) {
                        movies.add(movie);
                    }
                    if (movies.size() >= MAX_ITEMS) {
                        break;
                    }
                }
                Thread.sleep(PAGE_DELAY_MS);
            }
        } catch (IOException e) {
            throw new CrawlerException(ErrorCode.CONNECTION_ERROR, "连接豆瓣电影失败", e);
        } catch (InterruptedException e) {
            // Restore the interrupt flag before translating the exception.
            Thread.currentThread().interrupt();
            throw new CrawlerException(ErrorCode.UNKNOWN_ERROR, "爬取被中断", e);
        }
        if (movies.isEmpty()) {
            throw new CrawlerException(ErrorCode.NO_RESULTS_ERROR, "未获取到豆瓣电影数据");
        }
        return movies;
    }

    /** Builds a {@link Movie} from one listing element; fields without a matching node stay unset. */
    private static Movie parseMovie(Element element) {
        Movie movie = new Movie();
        Element titleElement = element.selectFirst(".hd a");
        if (titleElement != null) {
            // Collapse runs of whitespace inside the title to single spaces.
            movie.setTitle(titleElement.text().replaceAll("\\s+", " ").trim());
            movie.setUrl(titleElement.attr("href"));
        }
        Element infoElement = element.selectFirst(".bd p:first-child");
        if (infoElement != null) {
            applyCredits(movie, infoElement.text());
        }
        Element ratingElement = element.selectFirst(".rating_num");
        if (ratingElement != null) {
            try {
                movie.setRating(Double.parseDouble(ratingElement.text().trim()));
            } catch (NumberFormatException ignored) {
                // A non-numeric rating text is treated as "no rating".
                movie.setRating(0.0);
            }
        }
        Element yearElement = element.selectFirst(".bd p:nth-child(2)");
        if (yearElement != null) {
            applyYear(movie, yearElement.text());
        }
        return movie;
    }

    /**
     * Extracts director and actors from the credit line when it mentions "导演".
     *
     * <p>The line is split on whitespace; the second token becomes the director and
     * tokens at indices 3 to 5 are joined as the actors. The director index is now
     * bounds-checked: previously a line with fewer than two tokens threw
     * {@link ArrayIndexOutOfBoundsException}, which is unchecked and escaped
     * {@code crawl}.
     * NOTE(review): the token positions are a heuristic kept from the original code;
     * verify that they line up with the credit text on the live page.
     */
    private static void applyCredits(Movie movie, String info) {
        if (!info.contains("导演")) {
            return;
        }
        String[] parts = info.split("\\s+");
        if (parts.length > 1) {
            movie.setDirector(parts[1]);
        }
        StringBuilder actors = new StringBuilder();
        for (int i = 3; i < parts.length && i < 6; i++) {
            actors.append(parts[i]).append(" ");
        }
        movie.setActors(actors.toString().trim());
    }

    /**
     * Sets the year from the first {@value #YEAR_LENGTH} characters after the release
     * marker. A marker at index 0 is now accepted ({@code indexOf} returns -1 only when
     * the marker is absent), and text shorter than {@value #YEAR_LENGTH} characters is
     * ignored instead of throwing {@link StringIndexOutOfBoundsException}.
     */
    private static void applyYear(Movie movie, String text) {
        int idx = text.indexOf(RELEASE_MARKER);
        if (idx < 0) {
            return;
        }
        String afterMarker = text.substring(idx + RELEASE_MARKER.length()).trim();
        if (afterMarker.length() >= YEAR_LENGTH) {
            movie.setYear(afterMarker.substring(0, YEAR_LENGTH));
        }
    }

    @Override
    public String getSourceName() {
        return "豆瓣电影";
    }

    @Override
    public String getItemType() {
        return "Movie";
    }
}
/**
 * Crawls the Douban Music chart page and converts each {@code .clearfix}
 * element that contains a song title into a {@link Music}.
 *
 * <p>At most {@value #MAX_ITEMS} entries are collected per call.
 */
class DoubanMusicStrategy implements CrawlStrategy {
    /** Upper bound on the number of entries returned by one crawl. */
    private static final int MAX_ITEMS = 35;
    /** Chart address; it carries no page parameter. */
    private static final String CHART_URL = "https://music.douban.com/chart";

    /**
     * Fetches the chart page once and extracts its entries.
     *
     * <p>The chart URL is the same for every page index, so the previous per-page
     * loop downloaded the identical document {@code pageCount} times and appended the
     * same songs again on each pass. The page is now requested a single time;
     * {@code pageCount} only decides whether a request is made at all. With no
     * repeated requests there is no pause between them any more.
     *
     * @param pageCount number of pages requested by the caller; any value above zero
     *     triggers the single chart request, zero or less requests nothing
     * @return the collected {@link Music} objects, never empty
     * @throws CrawlerException with {@link ErrorCode#CONNECTION_ERROR} when the page cannot be
     *     fetched, or {@link ErrorCode#NO_RESULTS_ERROR} when no entry with a title was found
     */
    @Override
    public List<Object> crawl(int pageCount) throws CrawlerException {
        List<Object> musics = new ArrayList<>();
        if (pageCount > 0) {
            try {
                System.out.println("正在爬取豆瓣音乐: " + CHART_URL);
                Document doc = HttpUtil.getDocument(CHART_URL);
                for (Element element : doc.select(".clearfix")) {
                    Music music = parseMusic(element);
                    // Elements without a song title are skipped.
                    if (music.getTitle() != null && !music.getTitle().isEmpty()) {
                        musics.add(music);
                    }
                    if (musics.size() >= MAX_ITEMS) {
                        break;
                    }
                }
            } catch (IOException e) {
                throw new CrawlerException(ErrorCode.CONNECTION_ERROR, "连接豆瓣音乐失败", e);
            }
        }
        if (musics.isEmpty()) {
            throw new CrawlerException(ErrorCode.NO_RESULTS_ERROR, "未获取到豆瓣音乐数据");
        }
        return musics;
    }

    /** Builds a {@link Music} from one chart element; fields without a matching node stay unset. */
    private static Music parseMusic(Element element) {
        Music music = new Music();
        Element titleElement = element.selectFirst(".song-name a");
        if (titleElement != null) {
            music.setTitle(titleElement.text().trim());
            music.setUrl(titleElement.attr("href"));
        }
        Element artistElement = element.selectFirst(".artist a");
        if (artistElement != null) {
            music.setArtist(artistElement.text().trim());
        }
        Element albumElement = element.selectFirst(".album a");
        if (albumElement != null) {
            music.setAlbum(albumElement.text().trim());
        }
        Element ratingElement = element.selectFirst(".rating_nums");
        if (ratingElement != null) {
            try {
                music.setRating(Double.parseDouble(ratingElement.text().trim()));
            } catch (NumberFormatException ignored) {
                // A non-numeric rating text is treated as "no rating".
                music.setRating(0.0);
            }
        }
        return music;
    }

    @Override
    public String getSourceName() {
        return "豆瓣音乐";
    }

    @Override
    public String getItemType() {
        return "Music";
    }
}
/**
 * Crawls the CSDN Java navigation listing and converts each
 * {@code .list-item} element into an {@link Article}. Collection stops once
 * 35 articles were gathered; the crawl pauses one second after every page.
 */
class CsdnBlogStrategy implements CrawlStrategy {

    /**
     * Requests pages 1..{@code pageCount} until the page budget is used up or
     * 35 articles were collected.
     *
     * @param pageCount number of listing pages to request
     * @return the collected {@link Article} objects, never empty
     * @throws CrawlerException with {@link ErrorCode#CONNECTION_ERROR} when a page cannot be
     *     fetched, {@link ErrorCode#UNKNOWN_ERROR} when the thread is interrupted while pausing,
     *     or {@link ErrorCode#NO_RESULTS_ERROR} when no article with a title was found
     */
    @Override
    public List<Object> crawl(int pageCount) throws CrawlerException {
        List<Object> collected = new ArrayList<>();
        try {
            int pageNo = 1;
            while (pageNo <= pageCount && collected.size() < 35) {
                String address = "https://www.csdn.net/nav/java?page=" + pageNo;
                System.out.println("正在爬取CSDN博客: " + address);
                Document listing = HttpUtil.getDocument(address);
                for (Element node : listing.select(".list-item")) {
                    Article parsed = toArticle(node);
                    String heading = parsed.getTitle();
                    if (heading == null || heading.isEmpty()) {
                        // Entries without a title are skipped.
                        continue;
                    }
                    collected.add(parsed);
                    if (collected.size() >= 35) {
                        break;
                    }
                }
                Thread.sleep(1000);
                pageNo++;
            }
        } catch (IOException e) {
            throw new CrawlerException(ErrorCode.CONNECTION_ERROR, "连接CSDN失败", e);
        } catch (InterruptedException e) {
            // Restore the interrupt flag before translating the exception.
            Thread.currentThread().interrupt();
            throw new CrawlerException(ErrorCode.UNKNOWN_ERROR, "爬取被中断", e);
        }
        if (collected.isEmpty()) {
            throw new CrawlerException(ErrorCode.NO_RESULTS_ERROR, "未获取到CSDN文章数据");
        }
        return collected;
    }

    /** Builds an {@link Article} from one listing element; fields without a matching node stay unset. */
    private static Article toArticle(Element node) {
        Article article = new Article();
        Element link = node.selectFirst(".title a");
        if (link != null) {
            article.setTitle(link.text().trim());
            article.setUrl(link.attr("href"));
        }
        Element writer = node.selectFirst(".name a");
        if (writer != null) {
            article.setAuthor(writer.text().trim());
        }
        Element published = node.selectFirst(".time");
        if (published != null) {
            article.setPublishDate(published.text().trim());
        }
        Element readCount = node.selectFirst(".view");
        if (readCount != null) {
            article.setViews(readCount.text().trim());
        }
        return article;
    }

    @Override
    public String getSourceName() {
        return "CSDN博客";
    }

    @Override
    public String getItemType() {
        return "Article";
    }
}
// ==================== 4. 工具类 ====================
/**
 * HTTP helper that downloads a page through jsoup using a randomly chosen
 * User-Agent header.
 */
class HttpUtil {
    /** User-Agent strings from which one is picked at random per request. */
    private static final String[] USER_AGENTS = {
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15"
    };

    /**
     * Issues a GET request for {@code url} and returns the parsed document.
     * The request uses a timeout setting of 15000.
     *
     * @param url address to fetch
     * @return the document returned by jsoup
     * @throws IOException if jsoup fails to fetch the page
     */
    public static Document getDocument(String url) throws IOException {
        int pick = (int) (Math.random() * USER_AGENTS.length);
        String agent = USER_AGENTS[pick];
        return Jsoup.connect(url)
                .userAgent(agent)
                .timeout(15000)
                .get();
    }
}
// ==================== 5. Command模式 ====================
/**
 * A unit of work that can be executed and that reports a name for itself.
 */
interface Command {

    /** Runs the command. */
    void execute();

    /**
     * @return a name identifying this command
     */
    String getCommandName();
}
/**
 * Command that runs one {@link CrawlStrategy}, appends the outcome to a shared
 * result list and reports progress through the view.
 */
class CrawlCommand implements Command {
    private CrawlStrategy strategy;
    private CLIView view;
    private List<CrawlResult> results;
    private int pageCount;

    /**
     * @param strategy  strategy to run
     * @param view      view used for progress and error messages
     * @param results   list that receives the result of a successful crawl
     * @param pageCount page count handed to the strategy
     */
    public CrawlCommand(CrawlStrategy strategy, CLIView view, List<CrawlResult> results, int pageCount) {
        this.strategy = strategy;
        this.view = view;
        this.results = results;
        this.pageCount = pageCount;
    }

    /**
     * Runs the strategy and measures the wall-clock time of the crawl. On
     * success a {@link CrawlResult} is appended to the result list; a
     * {@link CrawlerException} is reported through the view and nothing is
     * appended.
     */
    @Override
    public void execute() {
        view.showMessage("开始爬取 " + strategy.getSourceName() + "...");
        List<Object> fetched;
        long elapsedMs;
        try {
            long begin = System.currentTimeMillis();
            fetched = strategy.crawl(pageCount);
            elapsedMs = System.currentTimeMillis() - begin;
        } catch (CrawlerException e) {
            view.showError("爬取失败: " + e.getMessage());
            return;
        }
        results.add(new CrawlResult(strategy.getSourceName(), fetched, elapsedMs));
        view.showMessage("爬取完成!获取 " + fetched.size() + " 条数据,耗时 " + elapsedMs + "ms");
    }

    @Override
    public String getCommandName() {
        return "CrawlCommand(" + strategy.getSourceName() + ")";
    }
}
/**
 * Command that writes all collected crawl results to a timestamped text
 * report and announces the outcome through the view.
 *
 * <p>Changes from the earlier version: the file is written as UTF-8 instead of
 * the platform default charset; every line ends with the platform line
 * separator (the earlier code mixed {@code "\n"} and {@code newLine()}); one
 * timestamp is used for both the file name and the report header; the target
 * directory is created when missing; the success message is shown only after
 * the writer has been closed.
 */
class SaveCommand implements Command {
    /** Directory prefix of the report file. */
    private static final String OUTPUT_DIR = "e:\\w14\\";
    private static final DateTimeFormatter FILE_STAMP = DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss");
    private static final DateTimeFormatter REPORT_STAMP = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
    /** Width of the separator rules in the report. */
    private static final int RULE_WIDTH = 60;

    private final List<CrawlResult> results;
    private final CLIView view;

    /**
     * @param results results to write; read when {@link #execute()} runs
     * @param view    view used for the success or error message
     */
    public SaveCommand(List<CrawlResult> results, CLIView view) {
        this.results = results;
        this.view = view;
    }

    /**
     * Writes the report file. Shows an error and writes nothing when there are
     * no results; shows an error when the file cannot be written.
     */
    @Override
    public void execute() {
        if (results.isEmpty()) {
            view.showError("没有可保存的数据");
            return;
        }
        LocalDateTime now = LocalDateTime.now();
        String fileName = OUTPUT_DIR + "crawl_results_" + now.format(FILE_STAMP) + ".txt";
        File parent = new File(fileName).getParentFile();
        if (parent != null) {
            // Result intentionally unchecked: if the directory cannot be created,
            // opening the file below fails and is reported as an error.
            parent.mkdirs();
        }
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(fileName, StandardCharsets.UTF_8))) {
            writeReport(writer, now);
        } catch (IOException e) {
            view.showError("保存文件失败: " + e.getMessage());
            return;
        }
        // Reached only after the writer was flushed and closed without error.
        view.showMessage("数据已保存到文件: " + fileName);
    }

    /** Writes the header followed by one section per crawl result. */
    private void writeReport(BufferedWriter writer, LocalDateTime generatedAt) throws IOException {
        writeLine(writer, "爬虫结果报告");
        writeLine(writer, "生成时间: " + generatedAt.format(REPORT_STAMP));
        writeLine(writer, "=".repeat(RULE_WIDTH));
        for (CrawlResult result : results) {
            writer.newLine();
            writeLine(writer, "【来源】: " + result.getSource());
            writeLine(writer, "【耗时】: " + result.getCrawlTime() + "ms");
            writeLine(writer, "【数量】: " + result.getTotalCount() + " 条");
            writeLine(writer, "-".repeat(RULE_WIDTH));
            int index = 1;
            for (Object item : result.getItems()) {
                writer.newLine();
                writer.write(index + ". ");
                writeItem(writer, item);
                index++;
            }
            writeLine(writer, "=".repeat(RULE_WIDTH));
        }
    }

    /** Writes the title line and the detail lines for one item, chosen by its runtime type. */
    private static void writeItem(BufferedWriter writer, Object item) throws IOException {
        if (item instanceof Book) {
            Book b = (Book) item;
            writeLine(writer, Objects.toString(b.getTitle(), ""));
            writeField(writer, "作者", b.getAuthors());
            writeField(writer, "出版社", b.getPublisher());
            writeField(writer, "出版时间", b.getPublishDate());
            writeRating(writer, b.getRating());
        } else if (item instanceof Movie) {
            Movie m = (Movie) item;
            writeLine(writer, Objects.toString(m.getTitle(), ""));
            writeField(writer, "导演", m.getDirector());
            writeField(writer, "主演", m.getActors());
            writeField(writer, "年份", m.getYear());
            writeRating(writer, m.getRating());
        } else if (item instanceof Music) {
            Music m = (Music) item;
            writeLine(writer, Objects.toString(m.getTitle(), ""));
            writeField(writer, "歌手", m.getArtist());
            writeField(writer, "专辑", m.getAlbum());
            writeRating(writer, m.getRating());
        } else if (item instanceof Article) {
            Article a = (Article) item;
            writeLine(writer, Objects.toString(a.getTitle(), ""));
            writeField(writer, "作者", a.getAuthor());
            writeField(writer, "发布时间", a.getPublishDate());
            writeField(writer, "阅读量", a.getViews());
        } else {
            // Unrecognised item types previously left the numbered line unterminated.
            writeLine(writer, String.valueOf(item));
        }
    }

    /** Writes {@code " label: value"} on its own line; skipped when the value is {@code null}. */
    private static void writeField(BufferedWriter writer, String label, String value) throws IOException {
        if (value != null) {
            writeLine(writer, " " + label + ": " + value);
        }
    }

    /** Writes the rating line; skipped when the rating is not positive. */
    private static void writeRating(BufferedWriter writer, double rating) throws IOException {
        if (rating > 0) {
            writeLine(writer, " 评分: " + rating);
        }
    }

    /** Writes the text followed by the platform line separator. */
    private static void writeLine(BufferedWriter writer, String text) throws IOException {
        writer.write(text);
        writer.newLine();
    }

    @Override
    public String getCommandName() {
        return "SaveCommand";
    }
}
// ==================== 6. MVC视图 (View) ====================
/**
 * Console view: prints the menu, informational and error messages, and a
 * shortened listing of crawl results.
 */
class CLIView {
    /** Maximum number of items printed per crawl result. */
    private static final int MAX_DISPLAYED = 10;

    private final Scanner scanner;

    /** Creates a view that reads user input from {@code System.in}. */
    public CLIView() {
        this.scanner = new Scanner(System.in);
    }

    /**
     * Prints the menu and reads one line of input.
     *
     * @return the number entered; {@code -1} when the line is not an integer;
     *     {@code 0} (the menu's exit option) when no further input line is
     *     available, which previously surfaced as {@link NoSuchElementException}
     */
    public int showMenu() {
        System.out.println("\n========== 爬虫系统 v1.0 ==========");
        System.out.println("1. 爬取豆瓣读书");
        System.out.println("2. 爬取豆瓣电影");
        System.out.println("3. 爬取豆瓣音乐");
        System.out.println("4. 爬取CSDN博客");
        System.out.println("5. 爬取所有网站");
        System.out.println("6. 查看爬取结果");
        System.out.println("7. 保存结果到文件");
        System.out.println("0. 退出");
        System.out.print("请输入选择: ");
        try {
            return Integer.parseInt(scanner.nextLine());
        } catch (NumberFormatException e) {
            return -1;
        } catch (NoSuchElementException e) {
            // End of input: behave as if the user chose to exit.
            return 0;
        }
    }

    /** Prints an informational message to standard output. */
    public void showMessage(String message) {
        System.out.println("[信息] " + message);
    }

    /** Prints an error message to standard error. */
    public void showError(String error) {
        System.err.println("[错误] " + error);
    }

    /**
     * Prints a summary for every result followed by its first
     * {@value #MAX_DISPLAYED} items. The truncation notice is printed only when
     * a result holds more items than were shown; the earlier code also printed
     * it for a result with exactly {@value #MAX_DISPLAYED} items.
     *
     * @param results results to display; an empty list prints a placeholder line
     */
    public void displayResults(List<CrawlResult> results) {
        if (results.isEmpty()) {
            System.out.println("暂无爬取结果");
            return;
        }
        System.out.println("\n========== 爬取结果 ==========");
        for (CrawlResult result : results) {
            System.out.println("\n【来源】: " + result.getSource());
            System.out.println("【耗时】: " + result.getCrawlTime() + "ms | 【数量】: " + result.getTotalCount() + " 条");
            System.out.println("-".repeat(50));
            int index = 1;
            for (Object item : result.getItems()) {
                if (index > MAX_DISPLAYED) {
                    // Reached only when at least one more item exists beyond the limit.
                    System.out.println(" ... (仅显示前10条)");
                    break;
                }
                printItem(index, item);
                index++;
            }
        }
    }

    /** Prints the numbered title and selected details of one item, chosen by its runtime type. */
    private static void printItem(int index, Object item) {
        if (item instanceof Book) {
            Book b = (Book) item;
            System.out.println(index + ". " + b.getTitle());
            if (b.getAuthors() != null) {
                System.out.println(" 作者: " + b.getAuthors());
            }
            if (b.getRating() > 0) {
                System.out.println(" 评分: " + b.getRating());
            }
        } else if (item instanceof Movie) {
            Movie m = (Movie) item;
            System.out.println(index + ". " + m.getTitle());
            if (m.getDirector() != null) {
                System.out.println(" 导演: " + m.getDirector());
            }
            if (m.getRating() > 0) {
                System.out.println(" 评分: " + m.getRating());
            }
        } else if (item instanceof Music) {
            Music m = (Music) item;
            System.out.println(index + ". " + m.getTitle());
            if (m.getArtist() != null) {
                System.out.println(" 歌手: " + m.getArtist());
            }
            if (m.getRating() > 0) {
                System.out.println(" 评分: " + m.getRating());
            }
        } else if (item instanceof Article) {
            Article a = (Article) item;
            System.out.println(index + ". " + a.getTitle());
            if (a.getAuthor() != null) {
                System.out.println(" 作者: " + a.getAuthor());
            }
            if (a.getViews() != null) {
                System.out.println(" 阅读量: " + a.getViews());
            }
        }
    }
}
// ==================== 7. MVC控制器 (Controller) ====================
/**
 * Controller that turns a source key into crawl commands, runs them, and
 * keeps the accumulated results.
 */
class CrawlerController {
    /** Page count handed to every strategy; the original note says it is meant to yield at least 30 items. */
    private static final int PAGE_COUNT = 3;

    private final CLIView view;
    private final List<CrawlResult> results;

    /**
     * @param view view used by the commands for their messages
     */
    public CrawlerController(CLIView view) {
        this.view = view;
        this.results = new ArrayList<>();
    }

    /**
     * Runs the crawl command(s) selected by {@code source}, in order.
     *
     * <p>The key is matched case-insensitively using {@link Locale#ROOT}, so the
     * match no longer depends on the default locale (under a Turkish locale an
     * upper-case {@code I} lower-cases to a dotless {@code ı}, which would fail to
     * match keys such as "doubanmovie"). A {@code null} key is reported as an
     * unknown source instead of throwing {@link NullPointerException}.
     *
     * @param source one of "doubanbook", "doubanmovie", "doubanmusic", "csdn" or "all"
     */
    public void executeCrawlCommand(String source) {
        if (source == null) {
            view.showError("未知的爬取源: " + source);
            return;
        }
        List<Command> commands = new ArrayList<>();
        switch (source.toLowerCase(Locale.ROOT)) {
            case "doubanbook":
                commands.add(commandFor(new DoubanBookStrategy()));
                break;
            case "doubanmovie":
                commands.add(commandFor(new DoubanMovieStrategy()));
                break;
            case "doubanmusic":
                commands.add(commandFor(new DoubanMusicStrategy()));
                break;
            case "csdn":
                commands.add(commandFor(new CsdnBlogStrategy()));
                break;
            case "all":
                commands.add(commandFor(new DoubanBookStrategy()));
                commands.add(commandFor(new DoubanMovieStrategy()));
                commands.add(commandFor(new DoubanMusicStrategy()));
                commands.add(commandFor(new CsdnBlogStrategy()));
                break;
            default:
                view.showError("未知的爬取源: " + source);
                return;
        }
        for (Command command : commands) {
            command.execute();
        }
    }

    /** Wraps a strategy in a crawl command bound to this controller's view and result list. */
    private Command commandFor(CrawlStrategy strategy) {
        return new CrawlCommand(strategy, view, results, PAGE_COUNT);
    }

    /** Saves the accumulated results by executing a {@link SaveCommand}. */
    public void saveResults() {
        new SaveCommand(results, view).execute();
    }

    /**
     * @return a new list holding the results accumulated so far
     */
    public List<CrawlResult> getResults() {
        return new ArrayList<>(results);
    }
}
// ==================== 8. 主程序入口 ====================
/**
 * Program entry point. Runs every crawler once, prints the collected results
 * and saves them to a file, with a banner line before each stage.
 */
public class CrawlerApp {

    /**
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        final CLIView console = new CLIView();
        final CrawlerController controller = new CrawlerController(console);
        final PrintStream out = System.out;

        // Stage 1: crawl every configured source.
        out.println("========== 爬虫系统 v1.0 自动运行模式 ==========");
        out.println("开始爬取所有4个网站...\n");
        controller.executeCrawlCommand("all");

        // Stage 2: show what was collected.
        out.println("\n========== 爬取结果汇总 ==========");
        List<CrawlResult> collected = controller.getResults();
        console.displayResults(collected);

        // Stage 3: persist the results.
        out.println("\n========== 保存结果到文件 ==========");
        controller.saveResults();

        out.println("\n========== 程序结束 ==========");
    }
}