You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

128 lines
5.6 KiB

package strategy;
import model.Paper;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import utils.Utils;
import java.util.ArrayList;
import java.util.List;
/**
 * Crawler strategy that extracts paper metadata from arXiv HTML pages.
 *
 * <p>Three page layouts are attempted, in order:
 * <ol>
 *   <li>a listing whose entries match {@code .arxiv-result};</li>
 *   <li>a single-paper page identified by {@code h1.title.mathjax};</li>
 *   <li>a list of generic fallback selectors ({@code article}, {@code .paper}, ...).</li>
 * </ol>
 */
public class ArXivStrategy extends AbstractCrawlerStrategy {

    /** Origin prepended to relative links found in the page. */
    private static final String BASE_URL = "https://arxiv.org";

    /** Titles shorter than this are treated as extraction noise and skipped. */
    private static final int MIN_TITLE_LENGTH = 5;

    // For each field the colon-terminated label comes first so that it wins over the bare word.
    private static final String[] TITLE_LABELS = {"Title:", "Title"};
    private static final String[] AUTHOR_LABELS = {"Authors:", "Author:", "Authors", "Author"};
    private static final String[] ABSTRACT_LABELS = {"Abstract:", "Abstract"};

    /** Selectors tried, in order, when neither known page layout is detected. */
    private static final String[] FALLBACK_SELECTORS = {"article", ".paper", ".document", "div[role='main']"};

    @Override
    public String getPlatformName() {
        return "arXiv";
    }

    /**
     * Reports whether this strategy can handle the given URL.
     *
     * @param url candidate URL, may be {@code null}
     * @return {@code true} when {@code url} is non-null and contains {@code "arxiv.org"}
     */
    @Override
    public boolean supportsUrl(String url) {
        return url != null && url.contains("arxiv.org");
    }

    /**
     * Removes at most one leading label (for example {@code "Title:"}) from {@code text}.
     *
     * <p>Matching is case-insensitive and stops at the first label that applies, so a value
     * that itself begins with a label word (e.g. {@code "Abstract: Abstract interpretation"})
     * keeps its content. A label that ends in a letter or digit is only removed when it is not
     * immediately followed by another letter or digit, so {@code "Titled ..."} is left intact.
     *
     * @param text     raw element text, may be {@code null}
     * @param prefixes labels to try, in priority order
     * @return the trimmed text without its leading label; {@code ""} when {@code text} is null
     */
    private String cleanPrefix(String text, String[] prefixes) {
        if (text == null) {
            return "";
        }
        String cleaned = text.trim();
        if (prefixes == null) {
            return cleaned;
        }
        for (String prefix : prefixes) {
            if (prefix == null || prefix.isEmpty()) {
                continue;
            }
            int end = prefix.length();
            // regionMatches(ignoreCase=true) compares char by char, so it does not depend on the
            // default locale and 'end' is always a valid index into 'cleaned' when it matches.
            if (!cleaned.regionMatches(true, 0, prefix, 0, end)) {
                continue;
            }
            boolean labelEndsInWordChar = Character.isLetterOrDigit(prefix.charAt(end - 1));
            boolean textContinuesWord = end < cleaned.length() && Character.isLetterOrDigit(cleaned.charAt(end));
            if (labelEndsInWordChar && textContinuesWord) {
                // The match is only the beginning of a longer word, not a label.
                continue;
            }
            return cleaned.substring(end).trim();
        }
        return cleaned;
    }

    /** Returns the trimmed text of {@code element}, or {@code ""} when the element is absent. */
    private static String textOf(Element element) {
        return element != null ? element.text().trim() : "";
    }

    /**
     * Turns an {@code href} value into an absolute URL rooted at {@link #BASE_URL}.
     * Values already starting with {@code "http"} and empty values are returned unchanged.
     */
    private static String toAbsoluteUrl(String href) {
        if (href == null) {
            return "";
        }
        String link = href.trim();
        if (link.isEmpty() || link.startsWith("http")) {
            return link;
        }
        if (link.startsWith("//")) {
            return "https:" + link;
        }
        return link.startsWith("/") ? BASE_URL + link : BASE_URL + "/" + link;
    }

    /**
     * Downloads {@code url} and extracts up to {@code count} papers from it.
     *
     * @param url   page to fetch
     * @param count maximum number of papers to return; values {@code <= 0} yield an empty list
     * @return the extracted papers, possibly empty, never {@code null}
     * @throws Exception whatever {@code Utils.sendGetRequest} throws
     */
    @Override
    protected List<Paper> fetchPapers(String url, int count) throws Exception {
        List<Paper> papers = new ArrayList<>();
        System.out.println("=== 开始使用arXiv获取论文 ===");
        if (count <= 0) {
            // Every branch below honours the limit; answering here keeps the single-paper
            // branch consistent with the listing branches and avoids a pointless request.
            return papers;
        }

        String html = Utils.sendGetRequest(url);
        if (html == null || html.isEmpty()) {
            return papers;
        }
        Document doc = Jsoup.parse(html);

        Elements paperElements = doc.select(".arxiv-result");
        if (!paperElements.isEmpty()) {
            System.out.println("检测到搜索结果页面,找到 " + paperElements.size() + " 篇论文");
            collectSearchResults(paperElements, count, papers);
            return papers;
        }

        Element titleElement = doc.selectFirst("h1.title.mathjax");
        if (titleElement != null) {
            System.out.println("检测到单个论文页面");
            collectSinglePaper(doc, titleElement, url, papers);
            return papers;
        }

        System.out.println("未找到论文元素,尝试其他选择器...");
        collectFromFallbackSelectors(doc, url, count, papers);
        return papers;
    }

    /** Adds one paper per {@code .arxiv-result} entry until {@code count} papers are collected. */
    private void collectSearchResults(Elements results, int count, List<Paper> papers) {
        for (Element result : results) {
            if (papers.size() >= count) {
                break;
            }
            try {
                String title = cleanPrefix(textOf(result.selectFirst(".title")), TITLE_LABELS);
                Element linkElement = result.selectFirst(".list-title a");
                String link = linkElement != null ? toAbsoluteUrl(linkElement.attr("href")) : "";
                String abstractText = cleanPrefix(textOf(result.selectFirst(".abstract")), ABSTRACT_LABELS);
                String authors = cleanPrefix(textOf(result.selectFirst(".authors")), AUTHOR_LABELS);

                if (title.length() < MIN_TITLE_LENGTH || link.isEmpty()) {
                    continue;
                }
                papers.add(new Paper(title, authors, abstractText, link, getPlatformName()));
            } catch (Exception e) {
                // Best effort: one malformed entry must not abort the listing, but say so
                // instead of dropping it silently.
                System.err.println("arXiv: skipping a search result that could not be parsed: " + e);
            }
        }
    }

    /** Adds the paper described by a single-paper page; the page URL is used as its link. */
    private void collectSinglePaper(Document doc, Element titleElement, String url, List<Paper> papers) {
        String title = cleanPrefix(titleElement.text(), TITLE_LABELS);
        String authors = cleanPrefix(textOf(doc.selectFirst(".authors")), AUTHOR_LABELS);
        String abstractText = cleanPrefix(textOf(doc.selectFirst(".abstract.mathjax")), ABSTRACT_LABELS);

        if (title.length() >= MIN_TITLE_LENGTH && url != null && !url.isEmpty()) {
            papers.add(new Paper(title, authors, abstractText, url, getPlatformName()));
        }
    }

    /**
     * Tries each fallback selector in turn and stops at the first one that yields a paper.
     * Only title and link are extracted here; authors and abstract are left empty.
     */
    private void collectFromFallbackSelectors(Document doc, String url, int count, List<Paper> papers) {
        for (String selector : FALLBACK_SELECTORS) {
            for (Element article : doc.select(selector)) {
                if (papers.size() >= count) {
                    break;
                }
                String title = cleanPrefix(textOf(article.selectFirst("h1, h2, .title")), TITLE_LABELS);
                Element absLink = article.selectFirst("a[href*='/abs/']");
                // Without an /abs/ link inside the element, fall back to the page URL itself.
                String link = absLink != null ? toAbsoluteUrl(absLink.attr("href")) : url;

                if (title.length() >= MIN_TITLE_LENGTH && link != null && !link.isEmpty()) {
                    papers.add(new Paper(title, "", "", link, getPlatformName()));
                }
            }
            if (!papers.isEmpty()) {
                // A selector that matched elements but produced nothing usable should not
                // prevent the remaining selectors from being tried.
                break;
            }
        }
    }
}