diff --git a/w10/CrawlerController.java b/w10/CrawlerController.java new file mode 100644 index 0000000..8b3ead7 --- /dev/null +++ b/w10/CrawlerController.java @@ -0,0 +1,56 @@ +package controller; + +import command.Command; +import command.CrawlCommand; +import command.ListCommand; +import command.HelpCommand; +import command.ExitCommand; +import command.PlatformCommand; +import view.ConsoleView; +import repository.PaperRepository; +import strategy.StrategyFactory; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Map; + +public class CrawlerController { + private final ConsoleView view; + private final PaperRepository repository; + private final Map commands = new HashMap<>(); + + public CrawlerController(ConsoleView view, PaperRepository repository, StrategyFactory strategyFactory) { + this.view = view; + this.repository = repository; + + register(new CrawlCommand(view, strategyFactory)); + register(new ListCommand(view)); + register(new PlatformCommand(view, strategyFactory)); + register(new ExitCommand(view)); + register(new HelpCommand(view, new ArrayList<>(commands.values()))); + } + + private void register(Command command) { + commands.put(command.getName(), command); + } + + public void run() { + view.displayWelcome(); + + while (true) { + String input = view.getInput(); + if (input.isEmpty()) continue; + + String[] parts = input.split("\\s+"); + String commandName = parts[0].toLowerCase(); + + if (!commands.containsKey(commandName)) { + view.showError("未知命令,请输入 help 查看可用命令"); + continue; + } + + Command command = commands.get(commandName); + command.execute(parts, repository); + } + } +} diff --git a/w10/Paper.java b/w10/Paper.java new file mode 100644 index 0000000..cbcbf38 --- /dev/null +++ b/w10/Paper.java @@ -0,0 +1,45 @@ +package model; + +public class Paper { + private String title; + private String authors; + private String abstractText; + private String url; + private String platform; + + public Paper() { + } + + public Paper(String title, String authors, String abstractText, String url, String platform) { + this.title = title; + this.authors = authors; + this.abstractText = abstractText; + this.url = url; + this.platform = platform; + } + + public String getTitle() { return title; } + public void setTitle(String title) { this.title = title; } + + public String getAuthors() { return authors; } + public void setAuthors(String authors) { this.authors = authors; } + + public String getAbstractText() { return abstractText; } + public void setAbstractText(String abstractText) { this.abstractText = abstractText; } + + public String getUrl() { return url; } + public void setUrl(String url) { this.url = url; } + + public String getPlatform() { return platform; } + public void setPlatform(String platform) { this.platform = platform; } + + @Override + public String toString() { + return "Paper{" + + "title='" + title + '\'' + + ", authors='" + authors + '\'' + + ", url='" + url + '\'' + + ", platform='" + platform + '\'' + + '}'; + } +} \ No newline at end of file diff --git a/w10/PaperRepository.java b/w10/PaperRepository.java new file mode 100644 index 0000000..d525d8c --- /dev/null +++ b/w10/PaperRepository.java @@ -0,0 +1,145 @@ +package repository; + +import model.Paper; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.SerializationFeature; +import utils.Utils; +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +public class PaperRepository { + private String baseDir = "论文爬取"; + private String subDir; + private ObjectMapper objectMapper; + + public PaperRepository() { + objectMapper = new ObjectMapper(); + objectMapper.enable(SerializationFeature.INDENT_OUTPUT); + } + + public void init(String platformName) { + this.subDir = baseDir + File.separator + Utils.cleanFileName(platformName); + + File dir = new File(subDir); + if (!dir.exists()) { + dir.mkdirs(); + } + } + + public List removeDuplicates(List papers) { + Set existingTitles = new HashSet<>(); + List uniquePapers = new ArrayList<>(); + + File[] files = new File(subDir).listFiles(); + if (files != null) { + for (File file : files) { + if (file.isFile() && file.getName().endsWith(".json")) { + try { + Paper[] existingPapers = objectMapper.readValue(file, Paper[].class); + for (Paper paper : existingPapers) { + existingTitles.add(paper.getTitle()); + } + } catch (IOException e) { + e.printStackTrace(); + } + } + } + } + + for (Paper paper : papers) { + if (!existingTitles.contains(paper.getTitle())) { + uniquePapers.add(paper); + existingTitles.add(paper.getTitle()); + } + } + + return uniquePapers; + } + + public void savePapers(List papers) throws Exception { + if (papers.isEmpty()) { + System.out.println("没有论文需要保存"); + return; + } + + int savedCount = 0; + for (Paper paper : papers) { + String title = paper.getTitle(); + String fileName = Utils.cleanTitleForFileName(title) + ".json"; + String filePath = subDir + File.separator + fileName; + + List singlePaperList = new ArrayList<>(); + singlePaperList.add(paper); + + objectMapper.writeValue(new File(filePath), singlePaperList); + savedCount++; + System.out.println("论文已保存: " + filePath); + } + System.out.println("共保存 " + savedCount + " 篇论文到: " + subDir); + } + + public List loadPapers() throws IOException { + List allPapers = new ArrayList<>(); + + File[] files = new File(subDir).listFiles(); + if (files != null) { + for (File file : files) { + if (file.isFile() && file.getName().endsWith(".json")) { + Paper[] papers = objectMapper.readValue(file, Paper[].class); + for (Paper paper : papers) { + allPapers.add(paper); + } + } + } + } + + return allPapers; + } + + public Map> loadAllPapersGroupedByPlatform() throws IOException { + Map> papersByPlatform = new HashMap<>(); + + File baseDirFile = new File(baseDir); + if (!baseDirFile.exists()) { + return papersByPlatform; + } + + File[] platformDirs = baseDirFile.listFiles(); + if (platformDirs != null) { + for (File platformDir : platformDirs) { + if (platformDir.isDirectory()) { + String platformName = platformDir.getName(); + List platformPapers = new ArrayList<>(); + + File[] files = platformDir.listFiles(); + if (files != null) { + for (File file : files) { + if (file.isFile() && file.getName().endsWith(".json")) { + try { + Paper[] papers = objectMapper.readValue(file, Paper[].class); + for (Paper paper : papers) { + platformPapers.add(paper); + } + } catch (IOException e) { + System.out.println("读取文件失败: " + file.getName()); + } + } + } + } + + if (!platformPapers.isEmpty()) { + papersByPlatform.put(platformName, platformPapers); + } + } + } + } + + return papersByPlatform; + } +} \ No newline at end of file diff --git a/w10/PlatformCommand.java b/w10/PlatformCommand.java new file mode 100644 index 0000000..913b2a5 --- /dev/null +++ b/w10/PlatformCommand.java @@ -0,0 +1,49 @@ +package command; + +import strategy.CrawlerStrategy; +import strategy.StrategyFactory; +import view.ConsoleView; +import java.util.List; +import repository.PaperRepository; + +public class PlatformCommand implements Command { + private StrategyFactory strategyFactory; + private ConsoleView view; + + public PlatformCommand(ConsoleView view, StrategyFactory strategyFactory) { + this.view = view; + this.strategyFactory = strategyFactory; + } + + @Override + public void execute(String[] args, PaperRepository repository) { + List strategies = strategyFactory.getAllStrategies(); + + if (strategies.isEmpty()) { + view.showInfo("暂不支持任何论文平台"); + } else { + view.showInfo("当前支持 " + strategies.size() + " 个论文平台:"); + System.out.println(); + + int index = 1; + for (CrawlerStrategy strategy : strategies) { + System.out.println(index + ". " + strategy.getPlatformName()); + index++; + } + + System.out.println(); + view.showInfo("使用示例: crawl <平台URL>"); + view.showInfo("例如: crawl https://arxiv.org/search/?query=machine+learning"); + } + } + + @Override + public String getDescription() { + return "显示支持的论文平台列表"; + } + + @Override + public String getName() { + return "platforms"; + } +} diff --git a/w10/Utils.java b/w10/Utils.java new file mode 100644 index 0000000..7d1e56b --- /dev/null +++ b/w10/Utils.java @@ -0,0 +1,252 @@ +package utils; + +import org.apache.hc.client5.http.classic.methods.HttpGet; +import org.apache.hc.client5.http.impl.classic.CloseableHttpClient; +import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse; +import org.apache.hc.client5.http.impl.classic.HttpClients; +import org.apache.hc.core5.http.protocol.BasicHttpContext; + +import org.apache.hc.core5.http.io.entity.EntityUtils; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import java.net.URLEncoder; +import java.util.ArrayList; +import java.util.List; +import java.util.Random; + + +public class Utils { + // 随机User-Agent列表 + private static final List USER_AGENTS = new ArrayList<>(); + static { + USER_AGENTS.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"); + USER_AGENTS.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"); + USER_AGENTS.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/124.0"); + USER_AGENTS.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/123.0.0.0"); + USER_AGENTS.add("Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4 Safari/605.1.15"); + USER_AGENTS.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"); + USER_AGENTS.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/123.0"); + USER_AGENTS.add("Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3 Safari/605.1.15"); + USER_AGENTS.add("Mozilla/5.0 (Linux; Android 10; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Mobile Safari/537.36"); + USER_AGENTS.add("Mozilla/5.0 (iPhone; CPU iPhone OS 17_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4 Mobile/15E148 Safari/604.1"); + } + + // 随机Referer列表 + private static final List REFERERS = new ArrayList<>(); + static { + REFERERS.add("https://www.google.com/"); + REFERERS.add("https://www.bing.com/"); + REFERERS.add("https://www.baidu.com/"); + REFERERS.add("https://scholar.google.com/"); + REFERERS.add("https://www.sciencedirect.com/"); + REFERERS.add("https://link.springer.com/"); + REFERERS.add("https://ieeexplore.ieee.org/"); + REFERERS.add("https://dl.acm.org/"); + REFERERS.add("https://kns.cnki.net/"); + REFERERS.add("https://www.google.com/search"); + } + + private static final Random RANDOM = new Random(); + + // 发送 HTTP GET 请求 + public static String sendGetRequest(String urlString) throws Exception { + System.out.println("正在发送HTTP请求: " + urlString); + + // 尝试多次普通HTTP请求,使用不同的User-Agent和Referer + for (int i = 0; i < 2; i++) { // 减少重试次数,避免卡住 + String html = sendHttpGetRequest(urlString); + if (!html.isEmpty()) { + return html; + } + // 每次失败后添加更长的延迟 + int delay = 2000 + i * 1000; + System.out.println("第 " + (i + 1) + " 次请求失败,添加延迟: " + delay + "ms"); + Thread.sleep(delay); + } + + // 暂时禁用Selenium,因为初始化可能会卡住 + System.out.println("所有HTTP请求都失败,暂时跳过Selenium..."); + return ""; + } + + // 使用普通HTTP请求 + private static String sendHttpGetRequest(String urlString) throws Exception { + long startTime = System.currentTimeMillis(); + + // 设置请求超时时间 + final int TIMEOUT = 15000; // 15秒 + + try { + // 使用默认的HttpClient + CloseableHttpClient httpClient = HttpClients.createDefault(); + + HttpGet httpGet = new HttpGet(urlString); + + // 随机选择User-Agent + String userAgent = USER_AGENTS.get(RANDOM.nextInt(USER_AGENTS.size())); + // 随机选择Referer + String referer = REFERERS.get(RANDOM.nextInt(REFERERS.size())); + + // 添加更完整的HTTP头信息,模拟真实浏览器 + httpGet.setHeader("User-Agent", userAgent); + httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8"); + httpGet.setHeader("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3"); + httpGet.setHeader("Accept-Encoding", "gzip, deflate, br"); + httpGet.setHeader("Connection", "keep-alive"); + httpGet.setHeader("Referer", referer); + httpGet.setHeader("Upgrade-Insecure-Requests", "1"); + httpGet.setHeader("Sec-Fetch-Dest", "document"); + httpGet.setHeader("Sec-Fetch-Mode", "navigate"); + httpGet.setHeader("Sec-Fetch-Site", "cross-site"); + httpGet.setHeader("Sec-Fetch-User", "?1"); + httpGet.setHeader("Cache-Control", "max-age=0"); + httpGet.setHeader("DNT", "1"); + httpGet.setHeader("TE", "trailers"); + + // 执行请求 + System.out.println("开始执行HTTP请求..."); + System.out.println("请求超时设置: " + TIMEOUT + "ms"); + + CloseableHttpResponse response = null; + try { + // 使用Future来处理超时 + java.util.concurrent.Future future = java.util.concurrent.Executors.newSingleThreadExecutor().submit(new java.util.concurrent.Callable() { + @Override + public CloseableHttpResponse call() throws Exception { + try { + return (CloseableHttpResponse) httpClient.executeOpen(null, httpGet, new BasicHttpContext()); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + }); + + try { + response = future.get(TIMEOUT, java.util.concurrent.TimeUnit.MILLISECONDS); + } catch (java.util.concurrent.TimeoutException e) { + System.out.println("HTTP请求超时: " + e.getMessage()); + future.cancel(true); + return ""; + } + + // 获取响应状态码 + int statusCode = response.getCode(); + System.out.println("HTTP响应状态码: " + statusCode); + System.out.println("使用的User-Agent: " + userAgent); + System.out.println("使用的Referer: " + referer); + + if (statusCode != 200) { + System.out.println("HTTP请求失败,状态码: " + statusCode); + return ""; + } + + // 读取响应内容 + System.out.println("正在读取响应内容..."); + // 限制读取的内容长度,避免程序卡住 + String html = EntityUtils.toString(response.getEntity(), "UTF-8"); + // 如果内容长度超过100000字符,只保留前100000字符 + if (html.length() > 100000) { + html = html.substring(0, 100000); + System.out.println("响应内容过长,已截断为100000字符"); + } + + long endTime = System.currentTimeMillis(); + System.out.println("HTTP请求完成,耗时: " + (endTime - startTime) + "ms"); + System.out.println("响应内容长度: " + html.length() + " 字符"); + + // 检查响应内容是否为空或包含反爬信息 + if (html == null || html.isEmpty()) { + System.out.println("响应内容为空"); + return ""; + } + + // 检查是否是反爬页面 + boolean isAntiCrawl = false; + String[] antiCrawlKeywords = {"captcha", "verify", "robot", "Robot", "reCAPTCHA", "blocked", "Blocked"}; + for (String keyword : antiCrawlKeywords) { + if (html.contains(keyword)) { + isAntiCrawl = true; + break; + } + } + + // 特殊处理arXiv,因为它的页面可能包含一些被误判为反爬的关键词 + if (urlString.contains("arxiv.org")) { + isAntiCrawl = false; // 对于arXiv,我们信任它返回的内容 + } + + if (isAntiCrawl) { + System.out.println("检测到反爬页面"); + return ""; + } + + // 智能延迟,模拟真实用户行为,使用更随机的延迟时间 + int delay = RANDOM.nextInt(1500) + 800; // 800-2300ms + System.out.println("添加随机延迟: " + delay + "ms"); + Thread.sleep(delay); + + return html; + } finally { + // 确保响应和客户端被关闭 + if (response != null) { + try { + response.close(); + } catch (Exception e) { + System.out.println("关闭响应时出错: " + e.getMessage()); + } + } + try { + httpClient.close(); + } catch (Exception e) { + System.out.println("关闭HTTP客户端时出错: " + e.getMessage()); + } + } + } catch (java.net.SocketTimeoutException e) { + System.out.println("HTTP请求超时: " + e.getMessage()); + return ""; + } catch (java.io.IOException e) { + System.out.println("HTTP请求IO错误: " + e.getMessage()); + return ""; + } catch (Exception e) { + System.out.println("发送HTTP请求时出错: " + e.getMessage()); + e.printStackTrace(); + return ""; + } + } + + // 解析 HTML + public static Document parseHtml(String html) { + return Jsoup.parse(html); + } + + // URL 编码 + public static String urlEncode(String value) throws Exception { + return URLEncoder.encode(value, "UTF-8"); + } + + // 生成唯一文件名 + public static String generateFileName(String keyword) { + return keyword + "_" + System.currentTimeMillis() + ".json"; + } + + // 清理文件名中的非法字符 + public static String cleanFileName(String fileName) { + return fileName.replaceAll("[\\/:*?\"<>|]", "_"); + } + + // 清理论文标题用于文件名 + public static String cleanTitleForFileName(String title) { + if (title == null || title.isEmpty()) { + return "untitled"; + } + String cleaned = title.trim() + .replaceAll("[\\\\/:*?\"<>|]", "_") + .replaceAll("\\s+", "_") + .replaceAll("_+", "_"); + if (cleaned.length() > 100) { + cleaned = cleaned.substring(0, 100); + } + return cleaned; + } +}