上传文件至 'w10'

1 month ago · 58c2936cbb
5 changed files with 547 additions and 0 deletions
--- a/w10/CrawlerController.java
+++ b/w10/CrawlerController.java
@ -0,0 +1,56 @@
+package controller;
+
+import command.Command;
+import command.CrawlCommand;
+import command.ListCommand;
+import command.HelpCommand;
+import command.ExitCommand;
+import command.PlatformCommand;
+import view.ConsoleView;
+import repository.PaperRepository;
+import strategy.StrategyFactory;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Read-eval loop of the console application: reads a line from the view,
+ * looks up the command named by its first token and executes it.
+ */
+public class CrawlerController {
+    private final ConsoleView view;
+    private final PaperRepository repository;
+    /** Registered commands keyed by {@link Command#getName()}. */
+    private final Map<String, Command> commands = new HashMap<>();
+
+    /**
+     * Creates the controller and registers the built-in commands.
+     *
+     * @param view            console used for all input and output
+     * @param repository      repository handed to every executed command
+     * @param strategyFactory factory passed to the crawl and platform commands
+     */
+    public CrawlerController(ConsoleView view, PaperRepository repository, StrategyFactory strategyFactory) {
+        this.view = view;
+        this.repository = repository;
+
+        register(new CrawlCommand(view, strategyFactory));
+        register(new ListCommand(view));
+        register(new PlatformCommand(view, strategyFactory));
+        register(new ExitCommand(view));
+        // The help command gets a snapshot of the commands registered so far,
+        // i.e. the snapshot is taken before the help command itself is added.
+        register(new HelpCommand(view, new ArrayList<>(commands.values())));
+    }
+
+    /** Stores the command under the name it reports. */
+    private void register(Command command) {
+        commands.put(command.getName(), command);
+    }
+
+    /**
+     * Shows the welcome text and then processes input lines until the input
+     * ends or a command terminates the program.
+     */
+    public void run() {
+        view.displayWelcome();
+
+        while (true) {
+            String input = view.getInput();
+            if (input == null) {
+                // NOTE(review): treats a null line as end of input — confirm
+                // against ConsoleView.getInput().
+                return;
+            }
+
+            // Trim first: a whitespace-only line would otherwise split into a
+            // zero-length array, and leading blanks would make parts[0] empty.
+            String line = input.trim();
+            if (line.isEmpty()) {
+                continue;
+            }
+
+            String[] parts = line.split("\\s+");
+            // Locale.ROOT keeps the lookup key independent of the default locale.
+            String commandName = parts[0].toLowerCase(java.util.Locale.ROOT);
+
+            Command command = commands.get(commandName);
+            if (command == null) {
+                view.showError("未知命令，请输入 help 查看可用命令");
+                continue;
+            }
+
+            command.execute(parts, repository);
+        }
+    }
+}
--- a/w10/Paper.java
+++ b/w10/Paper.java
@ -0,0 +1,45 @@
+package model;
+
+/**
+ * Mutable bean describing one crawled paper: title, authors, abstract,
+ * source URL and the platform it came from.
+ */
+public class Paper {
+    private String title;
+    private String authors;
+    private String abstractText;
+    private String url;
+    private String platform;
+
+    /** Creates a paper with every field left {@code null}. */
+    public Paper() {
+    }
+
+    /**
+     * Creates a fully populated paper.
+     *
+     * @param title        paper title
+     * @param authors      author list as a single string
+     * @param abstractText abstract of the paper
+     * @param url          link to the paper
+     * @param platform     name of the platform the paper was found on
+     */
+    public Paper(String title, String authors, String abstractText, String url, String platform) {
+        this.title = title;
+        this.authors = authors;
+        this.abstractText = abstractText;
+        this.url = url;
+        this.platform = platform;
+    }
+
+    /** @return the paper title */
+    public String getTitle() {
+        return title;
+    }
+
+    /** @param title the paper title */
+    public void setTitle(String title) {
+        this.title = title;
+    }
+
+    /** @return the authors */
+    public String getAuthors() {
+        return authors;
+    }
+
+    /** @param authors the authors */
+    public void setAuthors(String authors) {
+        this.authors = authors;
+    }
+
+    /** @return the abstract */
+    public String getAbstractText() {
+        return abstractText;
+    }
+
+    /** @param abstractText the abstract */
+    public void setAbstractText(String abstractText) {
+        this.abstractText = abstractText;
+    }
+
+    /** @return the link to the paper */
+    public String getUrl() {
+        return url;
+    }
+
+    /** @param url the link to the paper */
+    public void setUrl(String url) {
+        this.url = url;
+    }
+
+    /** @return the platform name */
+    public String getPlatform() {
+        return platform;
+    }
+
+    /** @param platform the platform name */
+    public void setPlatform(String platform) {
+        this.platform = platform;
+    }
+
+    /** Renders title, authors, URL and platform; the abstract is left out. */
+    @Override
+    public String toString() {
+        return String.format(
+                "Paper{title='%s', authors='%s', url='%s', platform='%s'}",
+                title, authors, url, platform);
+    }
+}
--- a/w10/PaperRepository.java
+++ b/w10/PaperRepository.java
@ -0,0 +1,145 @@
+package repository;
+
+import model.Paper;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.SerializationFeature;
+import utils.Utils;
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+public class PaperRepository {
+    private String baseDir = "论文爬取";
+    private String subDir;
+    private ObjectMapper objectMapper;
+    
+    public PaperRepository() {
+        objectMapper = new ObjectMapper();
+        objectMapper.enable(SerializationFeature.INDENT_OUTPUT);
+    }
+    
+    public void init(String platformName) {
+        this.subDir = baseDir + File.separator + Utils.cleanFileName(platformName);
+        
+        File dir = new File(subDir);
+        if (!dir.exists()) {
+            dir.mkdirs();
+        }
+    }
+    
+    public List<Paper> removeDuplicates(List<Paper> papers) {
+        Set<String> existingTitles = new HashSet<>();
+        List<Paper> uniquePapers = new ArrayList<>();
+        
+        File[] files = new File(subDir).listFiles();
+        if (files != null) {
+            for (File file : files) {
+                if (file.isFile() && file.getName().endsWith(".json")) {
+                    try {
+                        Paper[] existingPapers = objectMapper.readValue(file, Paper[].class);
+                        for (Paper paper : existingPapers) {
+                            existingTitles.add(paper.getTitle());
+                        }
+                    } catch (IOException e) {
+                        e.printStackTrace();
+                    }
+                }
+            }
+        }
+        
+        for (Paper paper : papers) {
+            if (!existingTitles.contains(paper.getTitle())) {
+                uniquePapers.add(paper);
+                existingTitles.add(paper.getTitle());
+            }
+        }
+        
+        return uniquePapers;
+    }
+    
+    public void savePapers(List<Paper> papers) throws Exception {
+        if (papers.isEmpty()) {
+            System.out.println("没有论文需要保存");
+            return;
+        }
+        
+        int savedCount = 0;
+        for (Paper paper : papers) {
+            String title = paper.getTitle();
+            String fileName = Utils.cleanTitleForFileName(title) + ".json";
+            String filePath = subDir + File.separator + fileName;
+            
+            List<Paper> singlePaperList = new ArrayList<>();
+            singlePaperList.add(paper);
+            
+            objectMapper.writeValue(new File(filePath), singlePaperList);
+            savedCount++;
+            System.out.println("论文已保存: " + filePath);
+        }
+        System.out.println("共保存 " + savedCount + " 篇论文到: " + subDir);
+    }
+    
+    public List<Paper> loadPapers() throws IOException {
+        List<Paper> allPapers = new ArrayList<>();
+        
+        File[] files = new File(subDir).listFiles();
+        if (files != null) {
+            for (File file : files) {
+                if (file.isFile() && file.getName().endsWith(".json")) {
+                    Paper[] papers = objectMapper.readValue(file, Paper[].class);
+                    for (Paper paper : papers) {
+                        allPapers.add(paper);
+                    }
+                }
+            }
+        }
+        
+        return allPapers;
+    }
+    
+    public Map<String, List<Paper>> loadAllPapersGroupedByPlatform() throws IOException {
+        Map<String, List<Paper>> papersByPlatform = new HashMap<>();
+        
+        File baseDirFile = new File(baseDir);
+        if (!baseDirFile.exists()) {
+            return papersByPlatform;
+        }
+        
+        File[] platformDirs = baseDirFile.listFiles();
+        if (platformDirs != null) {
+            for (File platformDir : platformDirs) {
+                if (platformDir.isDirectory()) {
+                    String platformName = platformDir.getName();
+                    List<Paper> platformPapers = new ArrayList<>();
+                    
+                    File[] files = platformDir.listFiles();
+                    if (files != null) {
+                        for (File file : files) {
+                            if (file.isFile() && file.getName().endsWith(".json")) {
+                                try {
+                                    Paper[] papers = objectMapper.readValue(file, Paper[].class);
+                                    for (Paper paper : papers) {
+                                        platformPapers.add(paper);
+                                    }
+                                } catch (IOException e) {
+                                    System.out.println("读取文件失败: " + file.getName());
+                                }
+                            }
+                        }
+                    }
+                    
+                    if (!platformPapers.isEmpty()) {
+                        papersByPlatform.put(platformName, platformPapers);
+                    }
+                }
+            }
+        }
+        
+        return papersByPlatform;
+    }
+}
--- a/w10/PlatformCommand.java
+++ b/w10/PlatformCommand.java
@ -0,0 +1,49 @@
+package command;
+
+import strategy.CrawlerStrategy;
+import strategy.StrategyFactory;
+import view.ConsoleView;
+import java.util.List;
+import repository.PaperRepository;
+
+public class PlatformCommand implements Command {
+    private StrategyFactory strategyFactory;
+    private ConsoleView view;
+    
+    public PlatformCommand(ConsoleView view, StrategyFactory strategyFactory) {
+        this.view = view;
+        this.strategyFactory = strategyFactory;
+    }
+    
+    @Override
+    public void execute(String[] args, PaperRepository repository) {
+        List<CrawlerStrategy> strategies = strategyFactory.getAllStrategies();
+        
+        if (strategies.isEmpty()) {
+            view.showInfo("暂不支持任何论文平台");
+        } else {
+            view.showInfo("当前支持 " + strategies.size() + " 个论文平台:");
+            System.out.println();
+            
+            int index = 1;
+            for (CrawlerStrategy strategy : strategies) {
+                System.out.println(index + ". " + strategy.getPlatformName());
+                index++;
+            }
+            
+            System.out.println();
+            view.showInfo("使用示例: crawl <平台URL>");
+            view.showInfo("例如: crawl https://arxiv.org/search/?query=machine+learning");
+        }
+    }
+    
+    @Override
+    public String getDescription() {
+        return "显示支持的论文平台列表";
+    }
+    
+    @Override
+    public String getName() {
+        return "platforms";
+    }
+}
--- a/w10/Utils.java
+++ b/w10/Utils.java
@ -0,0 +1,252 @@
+package utils;
+
+import org.apache.hc.client5.http.classic.methods.HttpGet;
+import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
+import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
+import org.apache.hc.client5.http.impl.classic.HttpClients;
+import org.apache.hc.core5.http.protocol.BasicHttpContext;
+
+import org.apache.hc.core5.http.io.entity.EntityUtils;
+
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import java.net.URLEncoder;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+
+/**
+ * Static helpers for the crawler: HTTP GET with browser-like headers, HTML
+ * parsing, URL encoding and file-name sanitising.
+ */
+public class Utils {
+    // Pool of User-Agent strings; one is picked at random per request.
+    private static final List<String> USER_AGENTS = new ArrayList<>();
+    static {
+        USER_AGENTS.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36");
+        USER_AGENTS.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36");
+        USER_AGENTS.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/124.0");
+        USER_AGENTS.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/123.0.0.0");
+        USER_AGENTS.add("Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4 Safari/605.1.15");
+        USER_AGENTS.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36");
+        USER_AGENTS.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/123.0");
+        USER_AGENTS.add("Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3 Safari/605.1.15");
+        USER_AGENTS.add("Mozilla/5.0 (Linux; Android 10; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Mobile Safari/537.36");
+        USER_AGENTS.add("Mozilla/5.0 (iPhone; CPU iPhone OS 17_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4 Mobile/15E148 Safari/604.1");
+    }
+
+    // Pool of Referer values; one is picked at random per request.
+    private static final List<String> REFERERS = new ArrayList<>();
+    static {
+        REFERERS.add("https://www.google.com/");
+        REFERERS.add("https://www.bing.com/");
+        REFERERS.add("https://www.baidu.com/");
+        REFERERS.add("https://scholar.google.com/");
+        REFERERS.add("https://www.sciencedirect.com/");
+        REFERERS.add("https://link.springer.com/");
+        REFERERS.add("https://ieeexplore.ieee.org/");
+        REFERERS.add("https://dl.acm.org/");
+        REFERERS.add("https://kns.cnki.net/");
+        REFERERS.add("https://www.google.com/search");
+    }
+
+    private static final Random RANDOM = new Random();
+
+    /** Number of plain HTTP attempts made by {@link #sendGetRequest(String)}. */
+    private static final int MAX_ATTEMPTS = 2;
+    /** Upper bound, in milliseconds, for waiting on the response head. */
+    private static final int TIMEOUT_MS = 15000;
+    /** Responses longer than this many characters are truncated. */
+    private static final int MAX_HTML_CHARS = 100000;
+    /** Maximum length of a file name derived from a paper title. */
+    private static final int MAX_TITLE_FILE_NAME_CHARS = 100;
+
+    /** Substrings whose presence marks a response as an anti-crawl page. */
+    private static final String[] ANTI_CRAWL_KEYWORDS =
+            {"captcha", "verify", "robot", "Robot", "reCAPTCHA", "blocked", "Blocked"};
+
+    /**
+     * Fetches a page with up to {@value #MAX_ATTEMPTS} attempts, each using a
+     * freshly chosen User-Agent and Referer, pausing between attempts.
+     *
+     * @param urlString URL to fetch
+     * @return the page HTML, or an empty string when every attempt failed
+     * @throws Exception if the thread is interrupted while waiting between attempts
+     */
+    public static String sendGetRequest(String urlString) throws Exception {
+        System.out.println("正在发送HTTP请求: " + urlString);
+
+        for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
+            String html = sendHttpGetRequest(urlString);
+            if (!html.isEmpty()) {
+                return html;
+            }
+            if (attempt < MAX_ATTEMPTS) {
+                // Back off a little longer after each failure.
+                int delay = 2000 + (attempt - 1) * 1000;
+                System.out.println("第 " + attempt + " 次请求失败，添加延迟: " + delay + "ms");
+                Thread.sleep(delay);
+            } else {
+                // No further attempt follows, so waiting here would only stall the caller.
+                System.out.println("第 " + attempt + " 次请求失败");
+            }
+        }
+
+        // Selenium fallback is disabled for now because its start-up may hang.
+        System.out.println("所有HTTP请求都失败，暂时跳过Selenium...");
+        return "";
+    }
+
+    /**
+     * Performs one GET request and returns the body, or an empty string on a
+     * timeout, a non-200 status, an empty body, a detected anti-crawl page or
+     * any error.
+     */
+    private static String sendHttpGetRequest(String urlString) throws Exception {
+        long startTime = System.currentTimeMillis();
+
+        try {
+            final CloseableHttpClient httpClient = HttpClients.createDefault();
+            // Dedicated worker so the wait for the response can be bounded; it is
+            // shut down in the finally block so no thread outlives the request.
+            final java.util.concurrent.ExecutorService executor =
+                    java.util.concurrent.Executors.newSingleThreadExecutor();
+            CloseableHttpResponse response = null;
+            try {
+                final HttpGet httpGet = new HttpGet(urlString);
+
+                String userAgent = USER_AGENTS.get(RANDOM.nextInt(USER_AGENTS.size()));
+                String referer = REFERERS.get(RANDOM.nextInt(REFERERS.size()));
+                applyBrowserHeaders(httpGet, userAgent, referer);
+
+                System.out.println("开始执行HTTP请求...");
+                System.out.println("请求超时设置: " + TIMEOUT_MS + "ms");
+
+                java.util.concurrent.Future<CloseableHttpResponse> future =
+                        executor.submit(new java.util.concurrent.Callable<CloseableHttpResponse>() {
+                            @Override
+                            public CloseableHttpResponse call() throws Exception {
+                                return (CloseableHttpResponse) httpClient.executeOpen(null, httpGet, new BasicHttpContext());
+                            }
+                        });
+
+                try {
+                    response = future.get(TIMEOUT_MS, java.util.concurrent.TimeUnit.MILLISECONDS);
+                } catch (java.util.concurrent.TimeoutException e) {
+                    System.out.println("HTTP请求超时: " + e.getMessage());
+                    future.cancel(true);
+                    return "";
+                }
+
+                int statusCode = response.getCode();
+                System.out.println("HTTP响应状态码: " + statusCode);
+                System.out.println("使用的User-Agent: " + userAgent);
+                System.out.println("使用的Referer: " + referer);
+
+                if (statusCode != 200) {
+                    System.out.println("HTTP请求失败，状态码: " + statusCode);
+                    return "";
+                }
+
+                System.out.println("正在读取响应内容...");
+                String html = response.getEntity() == null
+                        ? null
+                        : EntityUtils.toString(response.getEntity(), "UTF-8");
+                if (html == null) {
+                    // Normalise a missing body before any length is taken.
+                    html = "";
+                }
+                if (html.length() > MAX_HTML_CHARS) {
+                    html = html.substring(0, MAX_HTML_CHARS);
+                    System.out.println("响应内容过长，已截断为100000字符");
+                }
+
+                long endTime = System.currentTimeMillis();
+                System.out.println("HTTP请求完成，耗时: " + (endTime - startTime) + "ms");
+                System.out.println("响应内容长度: " + html.length() + " 字符");
+
+                if (html.isEmpty()) {
+                    System.out.println("响应内容为空");
+                    return "";
+                }
+
+                if (isAntiCrawlPage(html, urlString)) {
+                    System.out.println("检测到反爬页面");
+                    return "";
+                }
+
+                // Random pause of 800-2299 ms after a successful fetch.
+                int delay = RANDOM.nextInt(1500) + 800;
+                System.out.println("添加随机延迟: " + delay + "ms");
+                Thread.sleep(delay);
+
+                return html;
+            } finally {
+                executor.shutdownNow();
+                if (response != null) {
+                    closeQuietly(response, "关闭响应时出错: ");
+                }
+                closeQuietly(httpClient, "关闭HTTP客户端时出错: ");
+            }
+        } catch (InterruptedException e) {
+            // Keep the interrupt visible to the caller instead of swallowing it.
+            Thread.currentThread().interrupt();
+            System.out.println("HTTP请求被中断");
+            return "";
+        } catch (java.net.SocketTimeoutException e) {
+            System.out.println("HTTP请求超时: " + e.getMessage());
+            return "";
+        } catch (java.io.IOException e) {
+            System.out.println("HTTP请求IO错误: " + e.getMessage());
+            return "";
+        } catch (Exception e) {
+            System.out.println("发送HTTP请求时出错: " + e.getMessage());
+            e.printStackTrace();
+            return "";
+        }
+    }
+
+    /** Sets the header set that makes the request resemble a browser navigation. */
+    private static void applyBrowserHeaders(HttpGet httpGet, String userAgent, String referer) {
+        httpGet.setHeader("User-Agent", userAgent);
+        httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8");
+        httpGet.setHeader("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3");
+        // NOTE(review): advertising "br" presumably needs a Brotli decoder on the
+        // classpath for such responses to be readable — confirm.
+        httpGet.setHeader("Accept-Encoding", "gzip, deflate, br");
+        httpGet.setHeader("Connection", "keep-alive");
+        httpGet.setHeader("Referer", referer);
+        httpGet.setHeader("Upgrade-Insecure-Requests", "1");
+        httpGet.setHeader("Sec-Fetch-Dest", "document");
+        httpGet.setHeader("Sec-Fetch-Mode", "navigate");
+        httpGet.setHeader("Sec-Fetch-Site", "cross-site");
+        httpGet.setHeader("Sec-Fetch-User", "?1");
+        httpGet.setHeader("Cache-Control", "max-age=0");
+        httpGet.setHeader("DNT", "1");
+        httpGet.setHeader("TE", "trailers");
+    }
+
+    /**
+     * Tells whether the body contains an anti-crawl keyword. URLs containing
+     * "arxiv.org" are never flagged, because such pages may contain the
+     * keywords legitimately.
+     */
+    private static boolean isAntiCrawlPage(String html, String urlString) {
+        if (urlString.contains("arxiv.org")) {
+            return false;
+        }
+        for (String keyword : ANTI_CRAWL_KEYWORDS) {
+            if (html.contains(keyword)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /** Closes the resource, printing the prefixed error message instead of throwing. */
+    private static void closeQuietly(java.io.Closeable resource, String errorPrefix) {
+        try {
+            resource.close();
+        } catch (Exception e) {
+            System.out.println(errorPrefix + e.getMessage());
+        }
+    }
+
+    /**
+     * Parses an HTML string.
+     *
+     * @param html markup to parse
+     * @return the parsed document
+     */
+    public static Document parseHtml(String html) {
+        return Jsoup.parse(html);
+    }
+
+    /**
+     * URL-encodes a value using UTF-8.
+     *
+     * @param value text to encode
+     * @return the encoded text
+     * @throws Exception if the encoding is not supported
+     */
+    public static String urlEncode(String value) throws Exception {
+        return URLEncoder.encode(value, "UTF-8");
+    }
+
+    /**
+     * Builds a JSON file name from the keyword and the current time in milliseconds.
+     *
+     * @param keyword prefix of the file name
+     * @return {@code keyword_<millis>.json}
+     */
+    public static String generateFileName(String keyword) {
+        return keyword + "_" + System.currentTimeMillis() + ".json";
+    }
+
+    /**
+     * Replaces every character of {@code \ / : * ? " < > |} with an underscore.
+     *
+     * @param fileName raw name
+     * @return the name with those characters replaced
+     */
+    public static String cleanFileName(String fileName) {
+        // Four backslashes in the literal are one literal backslash in the regex
+        // class, the same set cleanTitleForFileName uses.
+        return fileName.replaceAll("[\\\\/:*?\"<>|]", "_");
+    }
+
+    /**
+     * Turns a paper title into a file-name stem: trims it, replaces illegal
+     * characters and whitespace runs with a single underscore and limits the
+     * result to {@value #MAX_TITLE_FILE_NAME_CHARS} characters.
+     *
+     * @param title paper title, may be {@code null}
+     * @return the cleaned stem, or {@code "untitled"} when nothing usable remains
+     */
+    public static String cleanTitleForFileName(String title) {
+        if (title == null) {
+            return "untitled";
+        }
+        String cleaned = title.trim()
+            .replaceAll("[\\\\/:*?\"<>|]", "_")
+            .replaceAll("\\s+", "_")
+            .replaceAll("_+", "_");
+        if (cleaned.isEmpty()) {
+            // Covers empty and whitespace-only titles, which would otherwise
+            // produce a file called just ".json".
+            return "untitled";
+        }
+        if (cleaned.length() > MAX_TITLE_FILE_NAME_CHARS) {
+            int end = MAX_TITLE_FILE_NAME_CHARS;
+            // Do not cut between the two halves of a surrogate pair.
+            if (Character.isHighSurrogate(cleaned.charAt(end - 1))) {
+                end--;
+            }
+            cleaned = cleaned.substring(0, end);
+        }
+        return cleaned;
+    }
+}