import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Scanner;
import java.util.logging.Logger;

/** A data source that can produce a ranked list of items. */
interface Crawler {
    /**
     * Fetches the items of this source.
     *
     * @return the items in rank order
     * @throws CrawlerException if the data cannot be obtained or parsed
     */
    List<CrawlerItem> crawl() throws CrawlerException;
}

/** Common read-only view shared by every crawled item. */
interface CrawlerItem {
    int getRank();

    String getName();

    String getUrl();

    String getDescription();

    String getSource();
}

/** A named unit of work executed by the controller. */
interface Command {
    void execute() throws CrawlerException;

    String getCommandName();
}

/** Base checked exception for every failure raised by the crawler tool. */
class CrawlerException extends Exception {
    public CrawlerException(String message) {
        super(message);
    }

    public CrawlerException(String message, Throwable cause) {
        super(message, cause);
    }
}

/** Failure category for network problems. */
class NetworkException extends CrawlerException {
    public NetworkException(String message) {
        super(message);
    }

    public NetworkException(String message, Throwable cause) {
        super(message, cause);
    }
}

/** Failure category for problems while turning source data into items. */
class ParseException extends CrawlerException {
    public ParseException(String message) {
        super(message);
    }

    public ParseException(String message, Throwable cause) {
        super(message, cause);
    }
}

/** One repository entry of the GitHub trending list. */
class GitHubItem implements CrawlerItem {
    private int rank;
    private String name;
    private String description;
    private String url;
    private String language;
    private String stars;
    private String todayStars;

    @Override
    public int getRank() {
        return rank;
    }

    public void setRank(int rank) {
        this.rank = rank;
    }

    @Override
    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    @Override
    public String getDescription() {
        return description;
    }

    public void setDescription(String description) {
        this.description = description;
    }

    @Override
    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getLanguage() {
        return language;
    }

    public void setLanguage(String language) {
        this.language = language;
    }

    public String getStars() {
        return stars;
    }

    public void setStars(String stars) {
        this.stars = stars;
    }

    public String getTodayStars() {
        return todayStars;
    }

    public void setTodayStars(String todayStars) {
        this.todayStars = todayStars;
    }

    @Override
    public String getSource() {
        return "GitHub Trending";
    }

    @Override
    public String toString() {
        return String.format("%d. %s [%s] - %s星", rank, name, language, stars);
    }
}

/** One topic of the Weibo hot-search list. */
class WeiboItem implements CrawlerItem {
    private int rank;
    private String name;
    private String description;
    private String url;
    private String hot;

    @Override
    public int getRank() {
        return rank;
    }

    public void setRank(int rank) {
        this.rank = rank;
    }

    @Override
    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    @Override
    public String getDescription() {
        return description;
    }

    public void setDescription(String description) {
        this.description = description;
    }

    @Override
    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getHot() {
        return hot;
    }

    public void setHot(String hot) {
        this.hot = hot;
    }

    @Override
    public String getSource() {
        return "微博热搜";
    }

    @Override
    public String toString() {
        return String.format("%d. %s - %s", rank, name, hot);
    }
}

/** One movie of the Douban Top 250 list. */
class DoubanItem implements CrawlerItem {
    private int rank;
    private String name;
    private String description;
    private String url;
    private String rating;
    private String director;

    @Override
    public int getRank() {
        return rank;
    }

    public void setRank(int rank) {
        this.rank = rank;
    }

    @Override
    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    @Override
    public String getDescription() {
        return description;
    }

    public void setDescription(String description) {
        this.description = description;
    }

    @Override
    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getRating() {
        return rating;
    }

    public void setRating(String rating) {
        this.rating = rating;
    }

    public String getDirector() {
        return director;
    }

    public void setDirector(String director) {
        this.director = director;
    }

    @Override
    public String getSource() {
        return "豆瓣电影Top250";
    }

    @Override
    public String toString() {
        return String.format("%d. %s - %s分", rank, name, rating);
    }
}

/** Produces GitHub trending items from a built-in mock data table. */
class GitHubStrategy implements Crawler {
    @Override
    public List<CrawlerItem> crawl() throws CrawlerException {
        try {
            return parseMockData();
        } catch (Exception e) {
            throw new ParseException("解析GitHub数据失败", e);
        }
    }

    /** Converts the mock table (name, description, language, stars, today's stars) to items. */
    private List<CrawlerItem> parseMockData() {
        String[][] data = {
            {"freeCodeCamp/freeCodeCamp", "开源代码库和课程", "JavaScript", "358000", "120"},
            {"microsoft/vscode", "Visual Studio Code", "TypeScript", "158000", "89"},
            {"facebook/react", "React框架", "JavaScript", "205000", "76"},
            {"tensorflow/tensorflow", "机器学习框架", "Python", "180000", "65"},
            {"torvalds/linux", "Linux内核", "C", "160000", "45"},
            {"kubernetes/kubernetes", "容器编排", "Go", "100000", "38"},
            {"spring-projects/spring-boot", "Spring Boot", "Java", "60000", "32"},
            {"vuejs/core", "Vue.js框架", "TypeScript", "45000", "58"},
            {"rust-lang/rust", "Rust语言", "Rust", "85000", "42"},
            {"numpy/numpy", "数值计算库", "Python", "25000", "28"}
        };
        List<CrawlerItem> items = new ArrayList<>(data.length);
        for (int i = 0; i < data.length; i++) {
            GitHubItem item = new GitHubItem();
            item.setRank(i + 1); // ranks are 1-based, in table order
            item.setName(data[i][0]);
            item.setDescription(data[i][1]);
            item.setLanguage(data[i][2]);
            item.setStars(data[i][3]);
            item.setTodayStars(data[i][4]);
            item.setUrl("https://github.com/" + data[i][0]);
            items.add(item);
        }
        return items;
    }

    public String getSourceName() {
        return "GitHub Trending";
    }
}

/** Produces Weibo hot-search items from a built-in mock data table. */
class WeiboStrategy implements Crawler {
    @Override
    public List<CrawlerItem> crawl() throws CrawlerException {
        try {
            return parseMockData();
        } catch (Exception e) {
            throw new ParseException("解析微博数据失败", e);
        }
    }

    /** Converts the mock table (topic, description, heat) to items. */
    private List<CrawlerItem> parseMockData() {
        String[][] data = {
            {"北京气温创历史新高", "北京今日最高气温达到40.2度,创历史同期新高", "288万"},
            {"国乒世预赛大胜", "国乒在世界杯预选赛中以3:0战胜对手", "196万"},
            {"新型人工智能模型发布", "某科技公司发布新一代AI大模型,性能提升300%", "156万"},
            {"高考倒计时100天", "距离2024年高考还有100天,考生们积极备考", "128万"},
            {"春季旅游旺季来临", "随着气温回暖,各大景区迎来旅游高峰", "98万"},
            {"新能源汽车销量创新高", "一季度新能源汽车销量同比增长50%", "86万"},
            {"5G商用全面铺开", "全国5G基站总数突破400万", "72万"},
            {"数字人民币试点扩大", "数字人民币应用场景进一步扩展", "65万"},
            {"航天发射任务成功", "我国成功发射新一代通信卫星", "58万"},
            {"高校毕业生就业形势", "今年高校毕业生规模预计达1179万人", "45万"}
        };
        List<CrawlerItem> items = new ArrayList<>(data.length);
        for (int i = 0; i < data.length; i++) {
            WeiboItem item = new WeiboItem();
            item.setRank(i + 1); // ranks are 1-based, in table order
            item.setName(data[i][0]);
            item.setDescription(data[i][1]);
            item.setHot(data[i][2]);
            item.setUrl("https://s.weibo.com/weibo?q=" + data[i][0]);
            items.add(item);
        }
        return items;
    }

    public String getSourceName() {
        return "微博热搜";
    }
}

/** Produces Douban movie items from a built-in mock data table. */
class DoubanStrategy implements Crawler {
    @Override
    public List<CrawlerItem> crawl() throws CrawlerException {
        try {
            return parseMockData();
        } catch (Exception e) {
            throw new ParseException("解析豆瓣数据失败", e);
        }
    }

    /** Converts the mock table (title, tagline, rating, director) to items. */
    private List<CrawlerItem> parseMockData() {
        String[][] data = {
            {"肖申克的救赎", "希望让人自由", "9.7", "弗兰克·德拉邦特"},
            {"霸王别姬", "风华绝代", "9.6", "陈凯歌"},
            {"阿甘正传", "生命就像一盒巧克力", "9.5", "罗伯特·泽米吉斯"},
            {"泰坦尼克号", "You jump, I jump", "9.4", "詹姆斯·卡梅隆"},
            {"盗梦空间", "现实与梦境的交织", "9.3", "克里斯托弗·诺兰"},
            {"星际穿越", "爱是唯一能超越时间和空间的事物", "9.4", "克里斯托弗·诺兰"},
            {"千与千寻", "不要忘记自己的名字", "9.4", "宫崎骏"},
            {"辛德勒的名单", "拯救一个人就是拯救全世界", "9.5", "史蒂文·斯皮尔伯格"},
            {"疯狂动物城", "勇敢尝试,一切皆有可能", "9.2", "拜恩·霍华德"},
            {"哪吒之魔童降世", "我命由我不由天", "8.4", "饺子"}
        };
        List<CrawlerItem> items = new ArrayList<>(data.length);
        for (int i = 0; i < data.length; i++) {
            DoubanItem item = new DoubanItem();
            item.setRank(i + 1); // ranks are 1-based, in table order
            item.setName(data[i][0]);
            item.setDescription(data[i][1]);
            item.setRating(data[i][2]);
            item.setDirector(data[i][3]);
            item.setUrl("https://movie.douban.com/subject/search?search_text=" + data[i][0]);
            items.add(item);
        }
        return items;
    }

    public String getSourceName() {
        return "豆瓣电影Top250";
    }
}

/** Runs a {@link Crawler} and keeps its result for later commands. */
class CrawlCommand implements Command {
    private static final Logger logger = Logger.getLogger(CrawlCommand.class.getName());
    private final Crawler strategy;
    private List<CrawlerItem> result;

    public CrawlCommand(Crawler strategy) {
        this.strategy = strategy;
    }

    @Override
    public void execute() throws CrawlerException {
        logger.info("开始爬取...");
        result = strategy.crawl();
        logger.info("爬取完成,共获取 " + result.size() + " 条数据");
    }

    @Override
    public String getCommandName() {
        return "crawl";
    }

    /**
     * Returns the items produced by the last successful {@link #execute()}.
     *
     * @return the crawled items, or {@code null} if {@code execute()} has not completed yet
     */
    public List<CrawlerItem> getResult() {
        return result;
    }
}

/** Writes a list of items to a CSV file. */
class SaveCommand implements Command {
    private static final Logger logger = Logger.getLogger(SaveCommand.class.getName());
    private final List<CrawlerItem> items;
    private final String filename;

    public SaveCommand(List<CrawlerItem> items, String filename) {
        this.items = items;
        this.filename = filename;
    }

    /**
     * Writes a header row followed by one row per item.
     *
     * @throws CrawlerException if there is nothing to save or the file cannot be written
     */
    @Override
    public void execute() throws CrawlerException {
        if (items == null || items.isEmpty()) {
            throw new CrawlerException("没有可保存的数据");
        }
        // Encode explicitly as UTF-8: the platform default charset differs between machines
        // and JDK versions, which would make the non-ASCII content of the file unpredictable.
        try (Writer writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(filename), StandardCharsets.UTF_8))) {
            // The column layout is decided by the type of the first item.
            writer.write(generateHeader(items.get(0)) + "\n");
            for (CrawlerItem item : items) {
                writer.write(generateLine(item) + "\n");
            }
            logger.info("数据已保存到: " + filename);
            System.out.println("数据已保存到: " + filename);
        } catch (IOException e) {
            throw new CrawlerException("保存文件失败: " + e.getMessage(), e);
        }
    }

    /** Returns the header row matching the concrete type of {@code item}. */
    private String generateHeader(CrawlerItem item) {
        if (item instanceof GitHubItem) {
            return "排名,名称,链接,描述,语言,星数,今日星数,来源";
        } else if (item instanceof WeiboItem) {
            return "排名,名称,链接,描述,热度,来源";
        } else if (item instanceof DoubanItem) {
            return "排名,名称,链接,描述,评分,导演,来源";
        }
        return "排名,名称,链接,描述,来源";
    }

    /** Builds one CSV row: common columns, then type-specific columns, then the source. */
    private String generateLine(CrawlerItem item) {
        StringBuilder sb = new StringBuilder();
        sb.append(item.getRank()).append(",");
        sb.append(escapeCsv(item.getName())).append(",");
        sb.append(escapeCsv(item.getUrl())).append(",");
        sb.append(escapeCsv(item.getDescription())).append(",");
        // Every text column goes through escapeCsv so that a delimiter or a missing value
        // in any field cannot shift the columns of the row.
        if (item instanceof GitHubItem) {
            GitHubItem gitHub = (GitHubItem) item;
            sb.append(escapeCsv(gitHub.getLanguage())).append(",");
            sb.append(escapeCsv(gitHub.getStars())).append(",");
            sb.append(escapeCsv(gitHub.getTodayStars())).append(",");
        } else if (item instanceof WeiboItem) {
            WeiboItem weibo = (WeiboItem) item;
            sb.append(escapeCsv(weibo.getHot())).append(",");
        } else if (item instanceof DoubanItem) {
            DoubanItem douban = (DoubanItem) item;
            sb.append(escapeCsv(douban.getRating())).append(",");
            sb.append(escapeCsv(douban.getDirector())).append(",");
        }
        sb.append(escapeCsv(item.getSource()));
        return sb.toString();
    }

    /**
     * Quotes a value when it contains a comma, a double quote or a line break, doubling any
     * embedded double quotes.
     *
     * @param value the raw field value, may be {@code null}
     * @return the CSV-safe field text; the empty string for {@code null}
     */
    private String escapeCsv(String value) {
        if (value == null) {
            return "";
        }
        boolean needsQuoting = value.contains(",") || value.contains("\"")
                || value.contains("\n") || value.contains("\r");
        if (needsQuoting) {
            return "\"" + value.replace("\"", "\"\"") + "\"";
        }
        return value;
    }

    @Override
    public String getCommandName() {
        return "save";
    }
}

/** Prints a list of items to standard output. */
class DisplayCommand implements Command {
    private static final Logger logger = Logger.getLogger(DisplayCommand.class.getName());
    private final List<CrawlerItem> items;

    public DisplayCommand(List<CrawlerItem> items) {
        this.items = items;
    }

    /**
     * Prints the common fields of each item followed by its type-specific fields.
     *
     * @throws CrawlerException if there is nothing to display
     */
    @Override
    public void execute() throws CrawlerException {
        if (items == null || items.isEmpty()) {
            throw new CrawlerException("没有可显示的数据");
        }
        System.out.println("\n=== 爬取结果 ===");
        for (CrawlerItem item : items) {
            System.out.println("\n" + item.getRank() + ". " + item.getName());
            System.out.println("链接: " + item.getUrl());
            System.out.println("描述: " + item.getDescription());
            System.out.println("来源: " + item.getSource());
            if (item instanceof GitHubItem) {
                GitHubItem gitHub = (GitHubItem) item;
                System.out.println("语言: " + gitHub.getLanguage());
                System.out.println("星数: " + gitHub.getStars());
                System.out.println("今日星数: " + gitHub.getTodayStars());
            } else if (item instanceof WeiboItem) {
                WeiboItem weibo = (WeiboItem) item;
                System.out.println("热度: " + weibo.getHot());
            } else if (item instanceof DoubanItem) {
                DoubanItem douban = (DoubanItem) item;
                System.out.println("评分: " + douban.getRating());
                System.out.println("导演: " + douban.getDirector());
            }
        }
        logger.info("已显示 " + items.size() + " 条数据");
    }

    @Override
    public String getCommandName() {
        return "display";
    }
}

/** Chooses a crawler for a source and runs the crawl, display and save commands in order. */
class CrawlerController {
    private static final Logger logger = Logger.getLogger(CrawlerController.class.getName());

    /**
     * Crawls a single source, prints the result and saves it to a CSV file.
     *
     * @param source one of {@code github}, {@code weibo}, {@code douban} (case-insensitive)
     * @throws CrawlerException if the source is unknown or any step fails
     */
    public void executeCrawl(String source) throws CrawlerException {
        Crawler strategy = createStrategy(source);
        executeWithStrategy(strategy, source);
    }

    /**
     * Crawls every supported source. A failing source is logged and does not stop the others.
     */
    public void executeCrawlAll() throws CrawlerException {
        String[] sources = {"GitHub Trending", "微博热搜", "豆瓣电影Top250"};
        Crawler[] crawlers = {new GitHubStrategy(), new WeiboStrategy(), new DoubanStrategy()};
        for (int i = 0; i < crawlers.length; i++) {
            try {
                executeWithStrategy(crawlers[i], sources[i]);
            } catch (CrawlerException e) {
                logger.severe("爬取 " + sources[i] + " 失败: " + e.getMessage());
            }
        }
    }

    /** Maps a source key to its crawler; unknown or missing keys raise a CrawlerException. */
    private Crawler createStrategy(String source) throws CrawlerException {
        if (source == null) {
            // Report a missing key like any other unsupported source instead of a
            // NullPointerException from the switch below.
            throw new CrawlerException("不支持的数据源: " + source);
        }
        // Locale.ROOT keeps the lookup independent of the default locale's casing rules.
        switch (source.toLowerCase(Locale.ROOT)) {
            case "github":
                return new GitHubStrategy();
            case "weibo":
                return new WeiboStrategy();
            case "douban":
                return new DoubanStrategy();
            default:
                throw new CrawlerException("不支持的数据源: " + source);
        }
    }

    /** Runs crawl, display and save; the file is named "[source]_[timestamp].csv". */
    private void executeWithStrategy(Crawler strategy, String sourceName) throws CrawlerException {
        CrawlCommand crawlCommand = new CrawlCommand(strategy);
        crawlCommand.execute();
        List<CrawlerItem> result = crawlCommand.getResult();
        DisplayCommand displayCommand = new DisplayCommand(result);
        displayCommand.execute();
        String filename = sourceName.replace(" ", "_") + "_" + System.currentTimeMillis() + ".csv";
        SaveCommand saveCommand = new SaveCommand(result, filename);
        saveCommand.execute();
    }
}

/** Interactive console menu in front of {@link CrawlerController}. */
class CLI {
    private static final Logger logger = Logger.getLogger(CLI.class.getName());
    private final CrawlerController controller;

    public CLI() {
        this.controller = new CrawlerController();
    }

    /**
     * Runs the menu loop until the user chooses to exit (option {@code 6} or {@code exit})
     * or standard input reaches its end.
     */
    public void start() {
        printWelcome();
        Scanner scanner = new Scanner(System.in);
        boolean running = true;
        while (running) {
            printMenu();
            System.out.print("请输入选择: ");
            if (!scanner.hasNextLine()) {
                // End of input (piped stdin, Ctrl-D): leave the loop instead of letting
                // nextLine() throw NoSuchElementException.
                break;
            }
            String input = scanner.nextLine().trim();
            try {
                running = handleInput(input);
            } catch (CrawlerException e) {
                // A failed command is reported and the menu is shown again.
                System.err.println("错误: " + e.getMessage());
                logger.severe("执行失败: " + e.getMessage());
            }
        }
        scanner.close();
        System.out.println("\n感谢使用爬虫工具,再见!");
    }

    private void printWelcome() {
        System.out.println("=========================================");
        System.out.println(" 多网站爬虫工具 v1.0");
        System.out.println("=========================================");
        System.out.println("支持爬取: GitHub Trending / 微博热搜 / 豆瓣电影");
        System.out.println("=========================================\n");
    }

    private void printMenu() {
        System.out.println("\n请选择操作:");
        System.out.println("1. 爬取 GitHub Trending");
        System.out.println("2. 爬取 微博热搜");
        System.out.println("3. 爬取 豆瓣电影Top250");
        System.out.println("4. 爬取所有网站");
        System.out.println("5. 帮助");
        System.out.println("6. 退出");
    }

    /**
     * Dispatches one line of user input.
     *
     * @param input the trimmed line typed by the user; a menu number or a command name in
     *     any letter case
     * @return {@code false} when the user asked to exit, {@code true} otherwise
     * @throws CrawlerException if the selected crawl fails
     */
    private boolean handleInput(String input) throws CrawlerException {
        switch (input.toLowerCase(Locale.ROOT)) {
            case "1":
            case "github":
                controller.executeCrawl("github");
                return true;
            case "2":
            case "weibo":
                controller.executeCrawl("weibo");
                return true;
            case "3":
            case "douban":
                controller.executeCrawl("douban");
                return true;
            case "4":
            case "all":
                controller.executeCrawlAll();
                return true;
            case "5":
            case "help":
                printHelp();
                return true;
            case "6":
            case "exit":
                // Both the menu number and the command name end the session.
                return false;
            default:
                System.out.println("无效输入,请输入数字 1-6 或命令名称");
                return true;
        }
    }

    private void printHelp() {
        System.out.println("\n=== 帮助信息 ===");
        System.out.println("命令列表:");
        System.out.println(" 1 / github - 爬取 GitHub Trending");
        System.out.println(" 2 / weibo - 爬取 微博热搜");
        System.out.println(" 3 / douban - 爬取 豆瓣电影Top250");
        System.out.println(" 4 / all - 爬取所有网站");
        System.out.println(" 5 / help - 显示帮助信息");
        System.out.println(" 6 / exit - 退出程序");
        System.out.println("\n输出说明:");
        System.out.println(" - 控制台会显示爬取结果");
        System.out.println(" - 数据会自动保存为 CSV 文件");
        System.out.println(" - 文件名格式: [来源]_[时间戳].csv");
    }
}

/** Program entry point: starts the interactive console. */
public class CrawlerProject {
    public static void main(String[] args) {
        CLI cli = new CLI();
        cli.start();
    }
}