上传文件至 'w10'

1 month ago · 58c2936cbb
5 changed files with 547 additions and 0 deletions
--- a/w10/CrawlerController.java
+++ b/w10/CrawlerController.java
@ -0,0 +1,56 @@
 package controller;
 import command.Command;
 import command.CrawlCommand;
 import command.ListCommand;
 import command.HelpCommand;
 import command.ExitCommand;
 import command.PlatformCommand;
 import view.ConsoleView;
 import repository.PaperRepository;
 import strategy.StrategyFactory;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.Map;
 /**
  * Drives the interactive console session: reads a line from the view, looks up
  * the command named by its first token and executes it against the repository.
  */
 public class CrawlerController {
    private final ConsoleView view;
    private final PaperRepository repository;
    /** Registered commands, keyed by the name each command reports via {@code getName()}. */
    private final Map<String, Command> commands = new HashMap<>();
    /**
     * Creates the controller and registers the built-in commands.
     *
     * @param view            console used for all input and output
     * @param repository      repository handed to every executed command
     * @param strategyFactory factory passed to the crawl and platform commands
     */
    public CrawlerController(ConsoleView view, PaperRepository repository, StrategyFactory strategyFactory) {
        this.view = view;
        this.repository = repository;
        register(new CrawlCommand(view, strategyFactory));
        register(new ListCommand(view));
        register(new PlatformCommand(view, strategyFactory));
        register(new ExitCommand(view));
        // The help command receives a snapshot taken before it is registered itself,
        // so the list it is given contains the four commands above but not "help".
        register(new HelpCommand(view, new ArrayList<>(commands.values())));
    }
    /** Adds a command to the lookup table under the name it reports. */
    private void register(Command command) {
        commands.put(command.getName(), command);
    }
    /**
     * Runs the read-dispatch loop.
     *
     * <p>Blank lines are ignored and unknown command names produce an error message.
     * The loop returns when the view yields {@code null} (treated as end of input);
     * otherwise it keeps running until a command stops the program.
     */
    public void run() {
        view.displayWelcome();
        while (true) {
            String input = view.getInput();
            if (input == null) {
                // No more input available: stop instead of failing with a NullPointerException.
                return;
            }
            // Trim first: splitting a whitespace-only line yields an empty array, and
            // leading whitespace would otherwise make the first token an empty string.
            String trimmed = input.trim();
            if (trimmed.isEmpty()) {
                continue;
            }
            String[] parts = trimmed.split("\\s+");
            // Locale.ROOT keeps the lookup independent of the default locale
            // (e.g. the Turkish dotless i when lower-casing "LIST").
            String commandName = parts[0].toLowerCase(java.util.Locale.ROOT);
            Command command = commands.get(commandName);
            if (command == null) {
                view.showError("未知命令，请输入 help 查看可用命令");
                continue;
            }
            command.execute(parts, repository);
        }
    }
 }
--- a/w10/Paper.java
+++ b/w10/Paper.java
@ -0,0 +1,45 @@
 package model;
 /**
  * Mutable value holder for one paper: title, authors, abstract, URL and the
  * platform it belongs to. All fields are plain strings and may be {@code null}.
  */
 public class Paper {
    private String title;
    private String authors;
    private String abstractText;
    private String url;
    private String platform;
    /** Creates a paper with every field unset ({@code null}). */
    public Paper() {
    }
    /**
     * Creates a fully populated paper.
     *
     * @param title        paper title
     * @param authors      author list as a single string
     * @param abstractText abstract of the paper
     * @param url          link to the paper
     * @param platform     name of the platform the paper belongs to
     */
    public Paper(String title, String authors, String abstractText, String url, String platform) {
        setTitle(title);
        setAuthors(authors);
        setAbstractText(abstractText);
        setUrl(url);
        setPlatform(platform);
    }
    /** @return the paper title */
    public String getTitle() {
        return this.title;
    }
    /** @param title the paper title */
    public void setTitle(String title) {
        this.title = title;
    }
    /** @return the authors string */
    public String getAuthors() {
        return this.authors;
    }
    /** @param authors the authors string */
    public void setAuthors(String authors) {
        this.authors = authors;
    }
    /** @return the abstract text */
    public String getAbstractText() {
        return this.abstractText;
    }
    /** @param abstractText the abstract text */
    public void setAbstractText(String abstractText) {
        this.abstractText = abstractText;
    }
    /** @return the paper URL */
    public String getUrl() {
        return this.url;
    }
    /** @param url the paper URL */
    public void setUrl(String url) {
        this.url = url;
    }
    /** @return the platform name */
    public String getPlatform() {
        return this.platform;
    }
    /** @param platform the platform name */
    public void setPlatform(String platform) {
        this.platform = platform;
    }
    /**
     * Renders title, authors, URL and platform; the abstract is not included.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Paper{");
        text.append("title='").append(title).append('\'');
        text.append(", authors='").append(authors).append('\'');
        text.append(", url='").append(url).append('\'');
        text.append(", platform='").append(platform).append('\'');
        text.append('}');
        return text.toString();
    }
 }
--- a/w10/PaperRepository.java
+++ b/w10/PaperRepository.java
@ -0,0 +1,145 @@
 package repository;
 import model.Paper;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.fasterxml.jackson.databind.SerializationFeature;
 import utils.Utils;
 import java.io.File;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
 /**
  * Stores papers as JSON files on disk. Each platform gets its own sub-directory
  * below the base directory, and every paper is written to its own file as a
  * one-element JSON array.
  *
  * <p>{@link #init(String)} must be called before {@link #removeDuplicates(List)},
  * {@link #savePapers(List)} or {@link #loadPapers()}, because those methods work on
  * the platform directory selected by {@code init}.
  */
 public class PaperRepository {
    private final String baseDir = "论文爬取";
    /** Directory of the currently selected platform; {@code null} until {@code init} is called. */
    private String subDir;
    private final ObjectMapper objectMapper;
    /** Creates a repository whose JSON output is indented. */
    public PaperRepository() {
        objectMapper = new ObjectMapper();
        objectMapper.enable(SerializationFeature.INDENT_OUTPUT);
    }
    /**
     * Selects (and creates if necessary) the directory for the given platform.
     *
     * @param platformName platform name; it is sanitised before being used as a directory name
     */
    public void init(String platformName) {
        this.subDir = baseDir + File.separator + Utils.cleanFileName(platformName);
        File dir = new File(subDir);
        // mkdirs() returns false both on failure and when another process created the
        // directory in the meantime, hence the final isDirectory() check.
        if (!dir.exists() && !dir.mkdirs() && !dir.isDirectory()) {
            System.out.println("无法创建目录: " + subDir);
        }
    }
    /**
     * Filters out papers whose title is already stored in the current platform
     * directory or occurs earlier in the given list.
     *
     * @param papers candidate papers; {@code null} is treated as an empty list
     * @return the papers with previously unseen titles, in their original order
     * @throws IllegalStateException if {@code init} has not been called
     */
    public List<Paper> removeDuplicates(List<Paper> papers) {
        Set<String> existingTitles = new HashSet<>();
        for (File file : listJsonFiles(requireSubDir())) {
            try {
                for (Paper stored : readPapers(file)) {
                    existingTitles.add(stored.getTitle());
                }
            } catch (IOException e) {
                // An unreadable file must not abort de-duplication; report it and move on.
                reportUnreadable(file, e);
            }
        }
        List<Paper> uniquePapers = new ArrayList<>();
        if (papers == null) {
            return uniquePapers;
        }
        for (Paper paper : papers) {
            // Set.add returns false when the title was already present.
            if (paper != null && existingTitles.add(paper.getTitle())) {
                uniquePapers.add(paper);
            }
        }
        return uniquePapers;
    }
    /**
     * Writes every paper to its own JSON file in the current platform directory.
     * The file name is derived from the paper title, so a paper whose sanitised
     * title equals that of an existing file overwrites that file.
     *
     * @param papers papers to store; {@code null} or empty only prints a notice
     * @throws IllegalStateException if {@code init} has not been called
     * @throws Exception             if a file cannot be written
     */
    public void savePapers(List<Paper> papers) throws Exception {
        if (papers == null || papers.isEmpty()) {
            System.out.println("没有论文需要保存");
            return;
        }
        requireSubDir();
        int savedCount = 0;
        for (Paper paper : papers) {
            String fileName = Utils.cleanTitleForFileName(paper.getTitle()) + ".json";
            String filePath = subDir + File.separator + fileName;
            // Files hold an array so that they can be read back as Paper[].
            List<Paper> singlePaperList = new ArrayList<>();
            singlePaperList.add(paper);
            objectMapper.writeValue(new File(filePath), singlePaperList);
            savedCount++;
            System.out.println("论文已保存: " + filePath);
        }
        System.out.println("共保存 " + savedCount + " 篇论文到: " + subDir);
    }
    /**
     * Loads every paper stored in the current platform directory.
     *
     * @return all stored papers; empty if the directory holds no JSON files
     * @throws IllegalStateException if {@code init} has not been called
     * @throws IOException           if any JSON file cannot be read or parsed
     */
    public List<Paper> loadPapers() throws IOException {
        List<Paper> allPapers = new ArrayList<>();
        for (File file : listJsonFiles(requireSubDir())) {
            allPapers.addAll(readPapers(file));
        }
        return allPapers;
    }
    /**
     * Loads the papers of every platform directory below the base directory.
     * Unreadable files are reported and skipped; platforms without any readable
     * paper are left out of the result.
     *
     * @return papers keyed by platform directory name; empty if the base directory is missing
     * @throws IOException declared for callers; read failures of single files are handled internally
     */
    public Map<String, List<Paper>> loadAllPapersGroupedByPlatform() throws IOException {
        Map<String, List<Paper>> papersByPlatform = new HashMap<>();
        // listFiles() yields null when the base directory does not exist.
        File[] platformDirs = new File(baseDir).listFiles();
        if (platformDirs == null) {
            return papersByPlatform;
        }
        for (File platformDir : platformDirs) {
            if (!platformDir.isDirectory()) {
                continue;
            }
            List<Paper> platformPapers = new ArrayList<>();
            for (File file : listJsonFiles(platformDir)) {
                try {
                    platformPapers.addAll(readPapers(file));
                } catch (IOException e) {
                    reportUnreadable(file, e);
                }
            }
            if (!platformPapers.isEmpty()) {
                papersByPlatform.put(platformDir.getName(), platformPapers);
            }
        }
        return papersByPlatform;
    }
    /** Returns the current platform directory, failing clearly when {@code init} was never called. */
    private File requireSubDir() {
        if (subDir == null) {
            throw new IllegalStateException("尚未初始化存储目录，请先调用 init(platformName)");
        }
        return new File(subDir);
    }
    /** Lists the regular files ending in ".json" directly inside the given directory. */
    private List<File> listJsonFiles(File dir) {
        List<File> jsonFiles = new ArrayList<>();
        File[] entries = dir.listFiles();
        if (entries != null) {
            for (File entry : entries) {
                if (entry.isFile() && entry.getName().endsWith(".json")) {
                    jsonFiles.add(entry);
                }
            }
        }
        return jsonFiles;
    }
    /** Parses one JSON file as an array of papers, dropping a null array and null elements. */
    private List<Paper> readPapers(File file) throws IOException {
        List<Paper> result = new ArrayList<>();
        Paper[] parsed = objectMapper.readValue(file, Paper[].class);
        if (parsed != null) {
            for (Paper paper : parsed) {
                if (paper != null) {
                    result.add(paper);
                }
            }
        }
        return result;
    }
    /** Prints a one-line notice for a file that could not be read. */
    private void reportUnreadable(File file, IOException e) {
        System.out.println("读取文件失败: " + file.getName() + " (" + e.getMessage() + ")");
    }
 }
--- a/w10/PlatformCommand.java
+++ b/w10/PlatformCommand.java
@ -0,0 +1,49 @@
 package command;
 import strategy.CrawlerStrategy;
 import strategy.StrategyFactory;
 import view.ConsoleView;
 import java.util.List;
 import repository.PaperRepository;
 /**
  * Command named "platforms": prints the platform name of every strategy the
  * factory provides, followed by a usage hint for the crawl command.
  */
 public class PlatformCommand implements Command {
    private StrategyFactory strategyFactory;
    private ConsoleView view;
    /**
     * @param view            view used for the informational messages
     * @param strategyFactory source of the available crawler strategies
     */
    public PlatformCommand(ConsoleView view, StrategyFactory strategyFactory) {
        this.strategyFactory = strategyFactory;
        this.view = view;
    }
    /** @return the name this command is registered under */
    @Override
    public String getName() {
        return "platforms";
    }
    /** @return the one-line description of this command */
    @Override
    public String getDescription() {
        return "显示支持的论文平台列表";
    }
    /**
     * Lists the supported platforms as a numbered list starting at 1, or reports
     * that none are available. Neither argument is read.
     *
     * @param args       tokens of the command line (unused)
     * @param repository paper repository (unused)
     */
    @Override
    public void execute(String[] args, PaperRepository repository) {
        List<CrawlerStrategy> supported = strategyFactory.getAllStrategies();
        if (supported.isEmpty()) {
            view.showInfo("暂不支持任何论文平台");
            return;
        }
        view.showInfo("当前支持 " + supported.size() + " 个论文平台:");
        System.out.println();
        int position = 0;
        for (CrawlerStrategy current : supported) {
            position += 1;
            System.out.println(position + ". " + current.getPlatformName());
        }
        System.out.println();
        view.showInfo("使用示例: crawl <平台URL>");
        view.showInfo("例如: crawl https://arxiv.org/search/?query=machine+learning");
    }
 }
--- a/w10/Utils.java
+++ b/w10/Utils.java
@ -0,0 +1,252 @@
 package utils;
 import org.apache.hc.client5.http.classic.methods.HttpGet;
 import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
 import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
 import org.apache.hc.client5.http.impl.classic.HttpClients;
 import org.apache.hc.core5.http.protocol.BasicHttpContext;
 import org.apache.hc.core5.http.io.entity.EntityUtils;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import java.net.URLEncoder;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Random;
 /**
  * Static helpers used by the crawler: HTTP GET with randomised browser-like
  * headers, HTML parsing, URL encoding and file-name sanitising.
  */
 public class Utils {
    // Pool of User-Agent strings; one is chosen at random for every request.
    private static final List<String> USER_AGENTS = new ArrayList<>();
    static {
        USER_AGENTS.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36");
        USER_AGENTS.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36");
        USER_AGENTS.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/124.0");
        USER_AGENTS.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/123.0.0.0");
        USER_AGENTS.add("Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4 Safari/605.1.15");
        USER_AGENTS.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36");
        USER_AGENTS.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/123.0");
        USER_AGENTS.add("Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3 Safari/605.1.15");
        USER_AGENTS.add("Mozilla/5.0 (Linux; Android 10; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Mobile Safari/537.36");
        USER_AGENTS.add("Mozilla/5.0 (iPhone; CPU iPhone OS 17_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4 Mobile/15E148 Safari/604.1");
    }
    // Pool of Referer values; one is chosen at random for every request.
    private static final List<String> REFERERS = new ArrayList<>();
    static {
        REFERERS.add("https://www.google.com/");
        REFERERS.add("https://www.bing.com/");
        REFERERS.add("https://www.baidu.com/");
        REFERERS.add("https://scholar.google.com/");
        REFERERS.add("https://www.sciencedirect.com/");
        REFERERS.add("https://link.springer.com/");
        REFERERS.add("https://ieeexplore.ieee.org/");
        REFERERS.add("https://dl.acm.org/");
        REFERERS.add("https://kns.cnki.net/");
        REFERERS.add("https://www.google.com/search");
    }
    private static final Random RANDOM = new Random();
    // Number of HTTP attempts made by sendGetRequest before giving up.
    private static final int MAX_ATTEMPTS = 2;
    // Longest time, in milliseconds, to wait for the response to be opened.
    private static final int REQUEST_TIMEOUT_MS = 15000;
    // Responses longer than this many characters are cut to this length.
    private static final int MAX_HTML_LENGTH = 100000;
    // Substrings whose presence marks a response as an anti-crawling page.
    private static final String[] ANTI_CRAWL_KEYWORDS = {"captcha", "verify", "robot", "Robot", "reCAPTCHA", "blocked", "Blocked"};
    // Regex character class of the characters replaced by '_' in file names
    // (backslash, slash, colon, asterisk, question mark, double quote, angle brackets, pipe).
    private static final String ILLEGAL_FILE_NAME_CHARS = "[\\\\/:*?\"<>|]";
    /**
     * Fetches a URL with HTTP GET, retrying with a fresh random User-Agent and
     * Referer when an attempt yields nothing.
     *
     * @param urlString URL to fetch
     * @return the response body, or an empty string when every attempt failed
     * @throws Exception if the waiting between attempts is interrupted
     */
    public static String sendGetRequest(String urlString) throws Exception {
        System.out.println("正在发送HTTP请求: " + urlString);
        for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
            String html = sendHttpGetRequest(urlString);
            if (!html.isEmpty()) {
                return html;
            }
            if (attempt == MAX_ATTEMPTS) {
                // Nothing follows the last attempt, so waiting here would only delay the caller.
                System.out.println("第 " + attempt + " 次请求失败");
                break;
            }
            // Back off a little longer after every failure: 2000ms, 3000ms, ...
            int delay = 2000 + (attempt - 1) * 1000;
            System.out.println("第 " + attempt + " 次请求失败，添加延迟: " + delay + "ms");
            Thread.sleep(delay);
        }
        // Selenium fallback is disabled for now because its start-up may hang.
        System.out.println("所有HTTP请求都失败，暂时跳过Selenium...");
        return "";
    }
    /**
     * Performs a single GET request and returns the body as text.
     *
     * <p>An empty string is returned when the request times out, fails, answers
     * with a status other than 200, has no body, or looks like an anti-crawling
     * page. After a successful read the method sleeps 800-2299ms before returning.
     *
     * <p>NOTE(review): the timeout only bounds the wait for the response to be
     * opened; reading the body afterwards is not covered by it.
     *
     * @param urlString URL to fetch
     * @return the body, cut to 100000 characters, or an empty string on any failure
     * @throws Exception declared for callers; failures are reported and mapped to ""
     */
    private static String sendHttpGetRequest(String urlString) throws Exception {
        long startTime = System.currentTimeMillis();
        // The worker thread lets the caller give up after REQUEST_TIMEOUT_MS; the executor
        // is shut down in the outer finally so that no thread outlives this call.
        java.util.concurrent.ExecutorService executor = java.util.concurrent.Executors.newSingleThreadExecutor();
        try {
            HttpGet httpGet = new HttpGet(urlString);
            String userAgent = USER_AGENTS.get(RANDOM.nextInt(USER_AGENTS.size()));
            String referer = REFERERS.get(RANDOM.nextInt(REFERERS.size()));
            applyBrowserHeaders(httpGet, userAgent, referer);
            System.out.println("开始执行HTTP请求...");
            System.out.println("请求超时设置: " + REQUEST_TIMEOUT_MS + "ms");
            CloseableHttpClient httpClient = HttpClients.createDefault();
            CloseableHttpResponse response = null;
            try {
                java.util.concurrent.Future<CloseableHttpResponse> future = executor.submit(
                        () -> (CloseableHttpResponse) httpClient.executeOpen(null, httpGet, new BasicHttpContext()));
                try {
                    response = future.get(REQUEST_TIMEOUT_MS, java.util.concurrent.TimeUnit.MILLISECONDS);
                } catch (java.util.concurrent.TimeoutException e) {
                    System.out.println("HTTP请求超时: " + e.getMessage());
                    future.cancel(true);
                    return "";
                }
                int statusCode = response.getCode();
                System.out.println("HTTP响应状态码: " + statusCode);
                System.out.println("使用的User-Agent: " + userAgent);
                System.out.println("使用的Referer: " + referer);
                if (statusCode != 200) {
                    System.out.println("HTTP请求失败，状态码: " + statusCode);
                    return "";
                }
                System.out.println("正在读取响应内容...");
                // A response may carry no entity at all; check before converting it.
                String html = response.getEntity() == null
                        ? null
                        : EntityUtils.toString(response.getEntity(), "UTF-8");
                if (html == null || html.isEmpty()) {
                    System.out.println("响应内容为空");
                    return "";
                }
                if (html.length() > MAX_HTML_LENGTH) {
                    html = html.substring(0, MAX_HTML_LENGTH);
                    System.out.println("响应内容过长，已截断为100000字符");
                }
                long endTime = System.currentTimeMillis();
                System.out.println("HTTP请求完成，耗时: " + (endTime - startTime) + "ms");
                System.out.println("响应内容长度: " + html.length() + " 字符");
                // arXiv pages are trusted as returned: they may contain words that the
                // keyword check would otherwise mistake for an anti-crawling page.
                if (!urlString.contains("arxiv.org") && looksLikeAntiCrawlPage(html)) {
                    System.out.println("检测到反爬页面");
                    return "";
                }
                // Random pause to look less like an automated client.
                int delay = RANDOM.nextInt(1500) + 800;
                System.out.println("添加随机延迟: " + delay + "ms");
                Thread.sleep(delay);
                return html;
            } finally {
                // Close failures are only reported so that they cannot discard a body
                // that was already read successfully.
                if (response != null) {
                    try {
                        response.close();
                    } catch (Exception e) {
                        System.out.println("关闭响应时出错: " + e.getMessage());
                    }
                }
                try {
                    httpClient.close();
                } catch (Exception e) {
                    System.out.println("关闭HTTP客户端时出错: " + e.getMessage());
                }
            }
        } catch (InterruptedException e) {
            // Restore the flag so that callers can still observe the interruption.
            Thread.currentThread().interrupt();
            System.out.println("HTTP请求被中断: " + e.getMessage());
            return "";
        } catch (java.util.concurrent.ExecutionException e) {
            // The request ran on the worker thread; report the failure that happened there.
            Throwable cause = e.getCause() != null ? e.getCause() : e;
            if (cause instanceof java.io.IOException) {
                System.out.println("HTTP请求IO错误: " + cause.getMessage());
            } else {
                System.out.println("发送HTTP请求时出错: " + cause.getMessage());
                cause.printStackTrace();
            }
            return "";
        } catch (java.net.SocketTimeoutException e) {
            System.out.println("HTTP请求超时: " + e.getMessage());
            return "";
        } catch (java.io.IOException e) {
            System.out.println("HTTP请求IO错误: " + e.getMessage());
            return "";
        } catch (Exception e) {
            System.out.println("发送HTTP请求时出错: " + e.getMessage());
            e.printStackTrace();
            return "";
        } finally {
            executor.shutdownNow();
        }
    }
    /**
     * Sets the headers that make the request resemble a browser navigation.
     *
     * <p>Accept-Encoding is deliberately not set here: advertising "br" by hand can
     * produce a body the client is unable to decode, so the choice of encodings is
     * left to the HTTP client.
     */
    private static void applyBrowserHeaders(HttpGet httpGet, String userAgent, String referer) {
        httpGet.setHeader("User-Agent", userAgent);
        httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8");
        httpGet.setHeader("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3");
        httpGet.setHeader("Connection", "keep-alive");
        httpGet.setHeader("Referer", referer);
        httpGet.setHeader("Upgrade-Insecure-Requests", "1");
        httpGet.setHeader("Sec-Fetch-Dest", "document");
        httpGet.setHeader("Sec-Fetch-Mode", "navigate");
        httpGet.setHeader("Sec-Fetch-Site", "cross-site");
        httpGet.setHeader("Sec-Fetch-User", "?1");
        httpGet.setHeader("Cache-Control", "max-age=0");
        httpGet.setHeader("DNT", "1");
        httpGet.setHeader("TE", "trailers");
    }
    /** Tells whether the page text contains any of the anti-crawling keywords. */
    private static boolean looksLikeAntiCrawlPage(String html) {
        for (String keyword : ANTI_CRAWL_KEYWORDS) {
            if (html.contains(keyword)) {
                return true;
            }
        }
        return false;
    }
    /**
     * Parses HTML text into a jsoup document.
     *
     * @param html HTML source
     * @return the parsed document
     */
    public static Document parseHtml(String html) {
        return Jsoup.parse(html);
    }
    /**
     * URL-encodes a value using UTF-8.
     *
     * @param value text to encode
     * @return the encoded text
     * @throws Exception if the encoding is not supported
     */
    public static String urlEncode(String value) throws Exception {
        return URLEncoder.encode(value, "UTF-8");
    }
    /**
     * Builds a JSON file name from a keyword and the current time in milliseconds.
     * The keyword is used as given; it is not sanitised here.
     *
     * @param keyword leading part of the file name
     * @return {@code keyword_<millis>.json}
     */
    public static String generateFileName(String keyword) {
        return keyword + "_" + System.currentTimeMillis() + ".json";
    }
    /**
     * Replaces every character that is illegal in file names with an underscore.
     * The backslash is included, matching {@link #cleanTitleForFileName(String)}.
     *
     * @param fileName raw name
     * @return the name with illegal characters replaced
     */
    public static String cleanFileName(String fileName) {
        return fileName.replaceAll(ILLEGAL_FILE_NAME_CHARS, "_");
    }
    /**
     * Turns a paper title into a file-name stem: illegal characters and whitespace
     * become single underscores and the result is limited to 100 characters.
     *
     * @param title paper title, may be {@code null}
     * @return the sanitised stem, or "untitled" when nothing usable remains
     */
    public static String cleanTitleForFileName(String title) {
        if (title == null) {
            return "untitled";
        }
        String cleaned = title.trim()
            .replaceAll(ILLEGAL_FILE_NAME_CHARS, "_")
            .replaceAll("\\s+", "_")
            .replaceAll("_+", "_");
        if (cleaned.length() > 100) {
            // Do not cut between the two halves of a surrogate pair.
            int end = Character.isHighSurrogate(cleaned.charAt(99)) ? 99 : 100;
            cleaned = cleaned.substring(0, end);
        }
        // Covers empty and whitespace-only titles, which would otherwise yield ".json".
        return cleaned.isEmpty() ? "untitled" : cleaned;
    }
 }