package strategy; import model.Paper; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import utils.Utils; import java.util.ArrayList; import java.util.List; public class ArXivStrategy extends AbstractCrawlerStrategy { @Override public String getPlatformName() { return "arXiv"; } @Override public boolean supportsUrl(String url) { return url != null && url.contains("arxiv.org"); } private String cleanPrefix(String text, String[] prefixes) { if (text == null) { return ""; } String cleaned = text.trim(); for (String prefix : prefixes) { if (cleaned.toLowerCase().startsWith(prefix.toLowerCase())) { cleaned = cleaned.substring(prefix.length()).trim(); } } return cleaned; } @Override protected List fetchPapers(String url, int count) throws Exception { List papers = new ArrayList<>(); System.out.println("=== 开始使用arXiv获取论文 ==="); String html = Utils.sendGetRequest(url); if (html.isEmpty()) return papers; Document doc = Jsoup.parse(html); Elements paperElements = doc.select(".arxiv-result"); if (!paperElements.isEmpty()) { System.out.println("检测到搜索结果页面,找到 " + paperElements.size() + " 篇论文"); for (Element element : paperElements) { if (papers.size() >= count) break; try { Element titleElement = element.selectFirst(".title"); String title = titleElement != null ? titleElement.text().trim() : ""; title = cleanPrefix(title, new String[]{"Title:", "Title"}); Element linkElement = element.selectFirst(".list-title a"); String link = linkElement != null ? linkElement.attr("href") : ""; if (!link.isEmpty() && !link.startsWith("http")) { link = "https://arxiv.org" + link; } Element abstractElement = element.selectFirst(".abstract"); String abstractText = abstractElement != null ? abstractElement.text().trim() : ""; abstractText = cleanPrefix(abstractText, new String[]{"Abstract:", "Abstract"}); Element authorsElement = element.selectFirst(".authors"); String authors = authorsElement != null ? authorsElement.text().trim() : ""; authors = cleanPrefix(authors, new String[]{"Authors:", "Author:", "Authors", "Author"}); if (title.length() < 5 || link.isEmpty()) continue; papers.add(new Paper(title, authors, abstractText, link, getPlatformName())); } catch (Exception e) { continue; } } } else { Element titleElement = doc.selectFirst("h1.title.mathjax"); if (titleElement != null) { System.out.println("检测到单个论文页面"); String title = titleElement.text().trim(); title = cleanPrefix(title, new String[]{"Title:", "Title"}); Element authorsElement = doc.selectFirst(".authors"); String authors = authorsElement != null ? authorsElement.text().trim() : ""; authors = cleanPrefix(authors, new String[]{"Authors:", "Author:", "Authors", "Author"}); Element abstractElement = doc.selectFirst(".abstract.mathjax"); String abstractText = abstractElement != null ? abstractElement.text().trim() : ""; abstractText = cleanPrefix(abstractText, new String[]{"Abstract:", "Abstract"}); String link = url; if (title.length() >= 5 && !link.isEmpty()) { papers.add(new Paper(title, authors, abstractText, link, getPlatformName())); } } else { System.out.println("未找到论文元素,尝试其他选择器..."); String[] selectors = {"article", ".paper", ".document", "div[role='main']"}; for (String selector : selectors) { Elements articles = doc.select(selector); if (!articles.isEmpty()) { for (Element article : articles) { if (papers.size() >= count) break; String title = article.selectFirst("h1, h2, .title") != null ? article.selectFirst("h1, h2, .title").text().trim() : ""; title = cleanPrefix(title, new String[]{"Title:", "Title"}); String link = article.selectFirst("a[href*='/abs/']") != null ? "https://arxiv.org" + article.selectFirst("a[href*='/abs/']").attr("href") : url; if (title.length() >= 5 && !link.isEmpty()) { papers.add(new Paper(title, "", "", link, getPlatformName())); } } break; } } } } return papers; } }