You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
562 lines
21 KiB
562 lines
21 KiB
import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Scanner;
import java.util.logging.Level;
import java.util.logging.Logger;
|
|
|
|
interface Crawler {
|
|
List<? extends CrawlerItem> crawl() throws CrawlerException;
|
|
}
|
|
|
|
/**
 * Common read-only view of one crawled entry, independent of the source it came from.
 */
interface CrawlerItem {

    /** @return the position of this item in its list */
    int getRank();

    /** @return the item's name or title */
    String getName();

    /** @return a link associated with the item */
    String getUrl();

    /** @return a short description of the item */
    String getDescription();

    /** @return a label naming the source the item belongs to */
    String getSource();
}
|
|
|
|
interface Command {
|
|
void execute() throws CrawlerException;
|
|
String getCommandName();
|
|
}
|
|
|
|
/**
 * Base checked exception for failures raised by the crawler components.
 */
class CrawlerException extends Exception {

    // Exception is Serializable; pin the version id explicitly.
    private static final long serialVersionUID = 1L;

    /**
     * @param message human-readable description of the failure
     */
    public CrawlerException(String message) {
        super(message);
    }

    /**
     * @param message human-readable description of the failure
     * @param cause   the underlying exception, kept for diagnostics
     */
    public CrawlerException(String message, Throwable cause) {
        super(message, cause);
    }
}
|
|
|
|
class NetworkException extends CrawlerException {
|
|
public NetworkException(String message) {
|
|
super(message);
|
|
}
|
|
public NetworkException(String message, Throwable cause) {
|
|
super(message, cause);
|
|
}
|
|
}
|
|
|
|
class ParseException extends CrawlerException {
|
|
public ParseException(String message) {
|
|
super(message);
|
|
}
|
|
public ParseException(String message, Throwable cause) {
|
|
super(message, cause);
|
|
}
|
|
}
|
|
|
|
class GitHubItem implements CrawlerItem {
|
|
private int rank;
|
|
private String name;
|
|
private String description;
|
|
private String url;
|
|
private String language;
|
|
private String stars;
|
|
private String todayStars;
|
|
|
|
@Override
|
|
public int getRank() { return rank; }
|
|
public void setRank(int rank) { this.rank = rank; }
|
|
@Override
|
|
public String getName() { return name; }
|
|
public void setName(String name) { this.name = name; }
|
|
@Override
|
|
public String getDescription() { return description; }
|
|
public void setDescription(String description) { this.description = description; }
|
|
@Override
|
|
public String getUrl() { return url; }
|
|
public void setUrl(String url) { this.url = url; }
|
|
public String getLanguage() { return language; }
|
|
public void setLanguage(String language) { this.language = language; }
|
|
public String getStars() { return stars; }
|
|
public void setStars(String stars) { this.stars = stars; }
|
|
public String getTodayStars() { return todayStars; }
|
|
public void setTodayStars(String todayStars) { this.todayStars = todayStars; }
|
|
@Override
|
|
public String getSource() { return "GitHub Trending"; }
|
|
@Override
|
|
public String toString() {
|
|
return String.format("%d. %s [%s] - %s星", rank, name, language, stars);
|
|
}
|
|
}
|
|
|
|
class WeiboItem implements CrawlerItem {
|
|
private int rank;
|
|
private String name;
|
|
private String description;
|
|
private String url;
|
|
private String hot;
|
|
|
|
@Override
|
|
public int getRank() { return rank; }
|
|
public void setRank(int rank) { this.rank = rank; }
|
|
@Override
|
|
public String getName() { return name; }
|
|
public void setName(String name) { this.name = name; }
|
|
@Override
|
|
public String getDescription() { return description; }
|
|
public void setDescription(String description) { this.description = description; }
|
|
@Override
|
|
public String getUrl() { return url; }
|
|
public void setUrl(String url) { this.url = url; }
|
|
public String getHot() { return hot; }
|
|
public void setHot(String hot) { this.hot = hot; }
|
|
@Override
|
|
public String getSource() { return "微博热搜"; }
|
|
@Override
|
|
public String toString() {
|
|
return String.format("%d. %s - %s", rank, name, hot);
|
|
}
|
|
}
|
|
|
|
class DoubanItem implements CrawlerItem {
|
|
private int rank;
|
|
private String name;
|
|
private String description;
|
|
private String url;
|
|
private String rating;
|
|
private String director;
|
|
|
|
@Override
|
|
public int getRank() { return rank; }
|
|
public void setRank(int rank) { this.rank = rank; }
|
|
@Override
|
|
public String getName() { return name; }
|
|
public void setName(String name) { this.name = name; }
|
|
@Override
|
|
public String getDescription() { return description; }
|
|
public void setDescription(String description) { this.description = description; }
|
|
@Override
|
|
public String getUrl() { return url; }
|
|
public void setUrl(String url) { this.url = url; }
|
|
public String getRating() { return rating; }
|
|
public void setRating(String rating) { this.rating = rating; }
|
|
public String getDirector() { return director; }
|
|
public void setDirector(String director) { this.director = director; }
|
|
@Override
|
|
public String getSource() { return "豆瓣电影Top250"; }
|
|
@Override
|
|
public String toString() {
|
|
return String.format("%d. %s - %s分", rank, name, rating);
|
|
}
|
|
}
|
|
|
|
class GitHubStrategy implements Crawler {
|
|
@Override
|
|
public List<CrawlerItem> crawl() throws CrawlerException {
|
|
try {
|
|
return parseMockData();
|
|
} catch (Exception e) {
|
|
throw new ParseException("解析GitHub数据失败", e);
|
|
}
|
|
}
|
|
|
|
private List<CrawlerItem> parseMockData() {
|
|
List<CrawlerItem> items = new ArrayList<>();
|
|
String[][] data = {
|
|
{"freeCodeCamp/freeCodeCamp", "开源代码库和课程", "JavaScript", "358000", "120"},
|
|
{"microsoft/vscode", "Visual Studio Code", "TypeScript", "158000", "89"},
|
|
{"facebook/react", "React框架", "JavaScript", "205000", "76"},
|
|
{"tensorflow/tensorflow", "机器学习框架", "Python", "180000", "65"},
|
|
{"torvalds/linux", "Linux内核", "C", "160000", "45"},
|
|
{"kubernetes/kubernetes", "容器编排", "Go", "100000", "38"},
|
|
{"spring-projects/spring-boot", "Spring Boot", "Java", "60000", "32"},
|
|
{"vuejs/core", "Vue.js框架", "TypeScript", "45000", "58"},
|
|
{"rust-lang/rust", "Rust语言", "Rust", "85000", "42"},
|
|
{"numpy/numpy", "数值计算库", "Python", "25000", "28"}
|
|
};
|
|
for (int i = 0; i < data.length; i++) {
|
|
GitHubItem item = new GitHubItem();
|
|
item.setRank(i + 1);
|
|
item.setName(data[i][0]);
|
|
item.setDescription(data[i][1]);
|
|
item.setLanguage(data[i][2]);
|
|
item.setStars(data[i][3]);
|
|
item.setTodayStars(data[i][4]);
|
|
item.setUrl("https://github.com/" + data[i][0]);
|
|
items.add(item);
|
|
}
|
|
return items;
|
|
}
|
|
|
|
public String getSourceName() {
|
|
return "GitHub Trending";
|
|
}
|
|
}
|
|
|
|
class WeiboStrategy implements Crawler {
|
|
@Override
|
|
public List<CrawlerItem> crawl() throws CrawlerException {
|
|
try {
|
|
return parseMockData();
|
|
} catch (Exception e) {
|
|
throw new ParseException("解析微博数据失败", e);
|
|
}
|
|
}
|
|
|
|
private List<CrawlerItem> parseMockData() {
|
|
List<CrawlerItem> items = new ArrayList<>();
|
|
String[][] data = {
|
|
{"北京气温创历史新高", "北京今日最高气温达到40.2度,创历史同期新高", "288万"},
|
|
{"国乒世预赛大胜", "国乒在世界杯预选赛中以3:0战胜对手", "196万"},
|
|
{"新型人工智能模型发布", "某科技公司发布新一代AI大模型,性能提升300%", "156万"},
|
|
{"高考倒计时100天", "距离2024年高考还有100天,考生们积极备考", "128万"},
|
|
{"春季旅游旺季来临", "随着气温回暖,各大景区迎来旅游高峰", "98万"},
|
|
{"新能源汽车销量创新高", "一季度新能源汽车销量同比增长50%", "86万"},
|
|
{"5G商用全面铺开", "全国5G基站总数突破400万", "72万"},
|
|
{"数字人民币试点扩大", "数字人民币应用场景进一步扩展", "65万"},
|
|
{"航天发射任务成功", "我国成功发射新一代通信卫星", "58万"},
|
|
{"高校毕业生就业形势", "今年高校毕业生规模预计达1179万人", "45万"}
|
|
};
|
|
for (int i = 0; i < data.length; i++) {
|
|
WeiboItem item = new WeiboItem();
|
|
item.setRank(i + 1);
|
|
item.setName(data[i][0]);
|
|
item.setDescription(data[i][1]);
|
|
item.setHot(data[i][2]);
|
|
item.setUrl("https://s.weibo.com/weibo?q=" + data[i][0]);
|
|
items.add(item);
|
|
}
|
|
return items;
|
|
}
|
|
|
|
public String getSourceName() {
|
|
return "微博热搜";
|
|
}
|
|
}
|
|
|
|
class DoubanStrategy implements Crawler {
|
|
@Override
|
|
public List<CrawlerItem> crawl() throws CrawlerException {
|
|
try {
|
|
return parseMockData();
|
|
} catch (Exception e) {
|
|
throw new ParseException("解析豆瓣数据失败", e);
|
|
}
|
|
}
|
|
|
|
private List<CrawlerItem> parseMockData() {
|
|
List<CrawlerItem> items = new ArrayList<>();
|
|
String[][] data = {
|
|
{"肖申克的救赎", "希望让人自由", "9.7", "弗兰克·德拉邦特"},
|
|
{"霸王别姬", "风华绝代", "9.6", "陈凯歌"},
|
|
{"阿甘正传", "生命就像一盒巧克力", "9.5", "罗伯特·泽米吉斯"},
|
|
{"泰坦尼克号", "You jump, I jump", "9.4", "詹姆斯·卡梅隆"},
|
|
{"盗梦空间", "现实与梦境的交织", "9.3", "克里斯托弗·诺兰"},
|
|
{"星际穿越", "爱是唯一能超越时间和空间的事物", "9.4", "克里斯托弗·诺兰"},
|
|
{"千与千寻", "不要忘记自己的名字", "9.4", "宫崎骏"},
|
|
{"辛德勒的名单", "拯救一个人就是拯救全世界", "9.5", "史蒂文·斯皮尔伯格"},
|
|
{"疯狂动物城", "勇敢尝试,一切皆有可能", "9.2", "拜恩·霍华德"},
|
|
{"哪吒之魔童降世", "我命由我不由天", "8.4", "饺子"}
|
|
};
|
|
for (int i = 0; i < data.length; i++) {
|
|
DoubanItem item = new DoubanItem();
|
|
item.setRank(i + 1);
|
|
item.setName(data[i][0]);
|
|
item.setDescription(data[i][1]);
|
|
item.setRating(data[i][2]);
|
|
item.setDirector(data[i][3]);
|
|
item.setUrl("https://movie.douban.com/subject/search?search_text=" + data[i][0]);
|
|
items.add(item);
|
|
}
|
|
return items;
|
|
}
|
|
|
|
public String getSourceName() {
|
|
return "豆瓣电影Top250";
|
|
}
|
|
}
|
|
|
|
class CrawlCommand implements Command {
|
|
private static final Logger logger = Logger.getLogger(CrawlCommand.class.getName());
|
|
private Crawler strategy;
|
|
private List<? extends CrawlerItem> result;
|
|
|
|
public CrawlCommand(Crawler strategy) {
|
|
this.strategy = strategy;
|
|
}
|
|
|
|
@Override
|
|
public void execute() throws CrawlerException {
|
|
logger.info("开始爬取...");
|
|
result = strategy.crawl();
|
|
logger.info("爬取完成,共获取 " + result.size() + " 条数据");
|
|
}
|
|
|
|
@Override
|
|
public String getCommandName() {
|
|
return "crawl";
|
|
}
|
|
|
|
public List<? extends CrawlerItem> getResult() {
|
|
return result;
|
|
}
|
|
}
|
|
|
|
class SaveCommand implements Command {
|
|
private static final Logger logger = Logger.getLogger(SaveCommand.class.getName());
|
|
private List<? extends CrawlerItem> items;
|
|
private String filename;
|
|
|
|
public SaveCommand(List<? extends CrawlerItem> items, String filename) {
|
|
this.items = items;
|
|
this.filename = filename;
|
|
}
|
|
|
|
@Override
|
|
public void execute() throws CrawlerException {
|
|
if (items == null || items.isEmpty()) {
|
|
throw new CrawlerException("没有可保存的数据");
|
|
}
|
|
try (FileWriter writer = new FileWriter(filename)) {
|
|
String header = generateHeader(items.get(0));
|
|
writer.write(header + "\n");
|
|
for (CrawlerItem item : items) {
|
|
String line = generateLine(item);
|
|
writer.write(line + "\n");
|
|
}
|
|
logger.info("数据已保存到: " + filename);
|
|
System.out.println("数据已保存到: " + filename);
|
|
} catch (IOException e) {
|
|
throw new CrawlerException("保存文件失败: " + e.getMessage(), e);
|
|
}
|
|
}
|
|
|
|
private String generateHeader(CrawlerItem item) {
|
|
if (item instanceof GitHubItem) {
|
|
return "排名,名称,链接,描述,语言,星数,今日星数,来源";
|
|
} else if (item instanceof WeiboItem) {
|
|
return "排名,名称,链接,描述,热度,来源";
|
|
} else if (item instanceof DoubanItem) {
|
|
return "排名,名称,链接,描述,评分,导演,来源";
|
|
}
|
|
return "排名,名称,链接,描述,来源";
|
|
}
|
|
|
|
private String generateLine(CrawlerItem item) {
|
|
StringBuilder sb = new StringBuilder();
|
|
sb.append(item.getRank()).append(",");
|
|
sb.append(escapeCsv(item.getName())).append(",");
|
|
sb.append(escapeCsv(item.getUrl())).append(",");
|
|
sb.append(escapeCsv(item.getDescription())).append(",");
|
|
if (item instanceof GitHubItem) {
|
|
GitHubItem gitHub = (GitHubItem) item;
|
|
sb.append(escapeCsv(gitHub.getLanguage())).append(",");
|
|
sb.append(gitHub.getStars()).append(",");
|
|
sb.append(gitHub.getTodayStars()).append(",");
|
|
} else if (item instanceof WeiboItem) {
|
|
WeiboItem weibo = (WeiboItem) item;
|
|
sb.append(weibo.getHot()).append(",");
|
|
} else if (item instanceof DoubanItem) {
|
|
DoubanItem douban = (DoubanItem) item;
|
|
sb.append(douban.getRating()).append(",");
|
|
sb.append(escapeCsv(douban.getDirector())).append(",");
|
|
}
|
|
sb.append(escapeCsv(item.getSource()));
|
|
return sb.toString();
|
|
}
|
|
|
|
private String escapeCsv(String value) {
|
|
if (value == null) return "";
|
|
if (value.contains(",") || value.contains("\"") || value.contains("\n")) {
|
|
return "\"" + value.replace("\"", "\"\"") + "\"";
|
|
}
|
|
return value;
|
|
}
|
|
|
|
@Override
|
|
public String getCommandName() {
|
|
return "save";
|
|
}
|
|
}
|
|
|
|
class DisplayCommand implements Command {
|
|
private static final Logger logger = Logger.getLogger(DisplayCommand.class.getName());
|
|
private List<? extends CrawlerItem> items;
|
|
|
|
public DisplayCommand(List<? extends CrawlerItem> items) {
|
|
this.items = items;
|
|
}
|
|
|
|
@Override
|
|
public void execute() throws CrawlerException {
|
|
if (items == null || items.isEmpty()) {
|
|
throw new CrawlerException("没有可显示的数据");
|
|
}
|
|
System.out.println("\n=== 爬取结果 ===");
|
|
for (CrawlerItem item : items) {
|
|
System.out.println("\n" + item.getRank() + ". " + item.getName());
|
|
System.out.println("链接: " + item.getUrl());
|
|
System.out.println("描述: " + item.getDescription());
|
|
System.out.println("来源: " + item.getSource());
|
|
if (item instanceof GitHubItem) {
|
|
GitHubItem gitHub = (GitHubItem) item;
|
|
System.out.println("语言: " + gitHub.getLanguage());
|
|
System.out.println("星数: " + gitHub.getStars());
|
|
System.out.println("今日星数: " + gitHub.getTodayStars());
|
|
} else if (item instanceof WeiboItem) {
|
|
WeiboItem weibo = (WeiboItem) item;
|
|
System.out.println("热度: " + weibo.getHot());
|
|
} else if (item instanceof DoubanItem) {
|
|
DoubanItem douban = (DoubanItem) item;
|
|
System.out.println("评分: " + douban.getRating());
|
|
System.out.println("导演: " + douban.getDirector());
|
|
}
|
|
}
|
|
logger.info("已显示 " + items.size() + " 条数据");
|
|
}
|
|
|
|
@Override
|
|
public String getCommandName() {
|
|
return "display";
|
|
}
|
|
}
|
|
|
|
class CrawlerController {
|
|
private static final Logger logger = Logger.getLogger(CrawlerController.class.getName());
|
|
|
|
public void executeCrawl(String source) throws CrawlerException {
|
|
Crawler strategy = createStrategy(source);
|
|
executeWithStrategy(strategy, source);
|
|
}
|
|
|
|
public void executeCrawlAll() throws CrawlerException {
|
|
String[] sources = {"GitHub Trending", "微博热搜", "豆瓣电影Top250"};
|
|
Crawler[] crawlers = {new GitHubStrategy(), new WeiboStrategy(), new DoubanStrategy()};
|
|
for (int i = 0; i < crawlers.length; i++) {
|
|
try {
|
|
executeWithStrategy(crawlers[i], sources[i]);
|
|
} catch (CrawlerException e) {
|
|
logger.severe("爬取 " + sources[i] + " 失败: " + e.getMessage());
|
|
}
|
|
}
|
|
}
|
|
|
|
private Crawler createStrategy(String source) throws CrawlerException {
|
|
switch (source.toLowerCase()) {
|
|
case "github":
|
|
return new GitHubStrategy();
|
|
case "weibo":
|
|
return new WeiboStrategy();
|
|
case "douban":
|
|
return new DoubanStrategy();
|
|
default:
|
|
throw new CrawlerException("不支持的数据源: " + source);
|
|
}
|
|
}
|
|
|
|
private void executeWithStrategy(Crawler strategy, String sourceName) throws CrawlerException {
|
|
CrawlCommand crawlCommand = new CrawlCommand(strategy);
|
|
crawlCommand.execute();
|
|
List<? extends CrawlerItem> result = crawlCommand.getResult();
|
|
DisplayCommand displayCommand = new DisplayCommand(result);
|
|
displayCommand.execute();
|
|
String filename = sourceName.replace(" ", "_") + "_" + System.currentTimeMillis() + ".csv";
|
|
SaveCommand saveCommand = new SaveCommand(result, filename);
|
|
saveCommand.execute();
|
|
}
|
|
}
|
|
|
|
class CLI {
|
|
private static final Logger logger = Logger.getLogger(CLI.class.getName());
|
|
private CrawlerController controller;
|
|
|
|
public CLI() {
|
|
this.controller = new CrawlerController();
|
|
}
|
|
|
|
public void start() {
|
|
printWelcome();
|
|
Scanner scanner = new Scanner(System.in);
|
|
while (true) {
|
|
printMenu();
|
|
System.out.print("请输入选择: ");
|
|
String input = scanner.nextLine().trim();
|
|
try {
|
|
handleInput(input);
|
|
} catch (CrawlerException e) {
|
|
System.err.println("错误: " + e.getMessage());
|
|
logger.severe("执行失败: " + e.getMessage());
|
|
}
|
|
if (input.equalsIgnoreCase("exit")) {
|
|
break;
|
|
}
|
|
}
|
|
scanner.close();
|
|
System.out.println("\n感谢使用爬虫工具,再见!");
|
|
}
|
|
|
|
private void printWelcome() {
|
|
System.out.println("=========================================");
|
|
System.out.println(" 多网站爬虫工具 v1.0");
|
|
System.out.println("=========================================");
|
|
System.out.println("支持爬取: GitHub Trending / 微博热搜 / 豆瓣电影");
|
|
System.out.println("=========================================\n");
|
|
}
|
|
|
|
private void printMenu() {
|
|
System.out.println("\n请选择操作:");
|
|
System.out.println("1. 爬取 GitHub Trending");
|
|
System.out.println("2. 爬取 微博热搜");
|
|
System.out.println("3. 爬取 豆瓣电影Top250");
|
|
System.out.println("4. 爬取所有网站");
|
|
System.out.println("5. 帮助");
|
|
System.out.println("6. 退出");
|
|
}
|
|
|
|
private void handleInput(String input) throws CrawlerException {
|
|
switch (input) {
|
|
case "1":
|
|
case "github":
|
|
controller.executeCrawl("github");
|
|
break;
|
|
case "2":
|
|
case "weibo":
|
|
controller.executeCrawl("weibo");
|
|
break;
|
|
case "3":
|
|
case "douban":
|
|
controller.executeCrawl("douban");
|
|
break;
|
|
case "4":
|
|
case "all":
|
|
controller.executeCrawlAll();
|
|
break;
|
|
case "5":
|
|
case "help":
|
|
printHelp();
|
|
break;
|
|
case "6":
|
|
case "exit":
|
|
break;
|
|
default:
|
|
System.out.println("无效输入,请输入数字 1-6 或命令名称");
|
|
}
|
|
}
|
|
|
|
private void printHelp() {
|
|
System.out.println("\n=== 帮助信息 ===");
|
|
System.out.println("命令列表:");
|
|
System.out.println(" 1 / github - 爬取 GitHub Trending");
|
|
System.out.println(" 2 / weibo - 爬取 微博热搜");
|
|
System.out.println(" 3 / douban - 爬取 豆瓣电影Top250");
|
|
System.out.println(" 4 / all - 爬取所有网站");
|
|
System.out.println(" 5 / help - 显示帮助信息");
|
|
System.out.println(" 6 / exit - 退出程序");
|
|
System.out.println("\n输出说明:");
|
|
System.out.println(" - 控制台会显示爬取结果");
|
|
System.out.println(" - 数据会自动保存为 CSV 文件");
|
|
System.out.println(" - 文件名格式: [来源]_[时间戳].csv");
|
|
}
|
|
}
|
|
|
|
public class CrawlerProject {
|
|
public static void main(String[] args) {
|
|
CLI cli = new CLI();
|
|
cli.start();
|
|
}
|
|
}
|