You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
128 lines
5.6 KiB
128 lines
5.6 KiB
package strategy;
|
|
|
|
import model.Paper;
|
|
import org.jsoup.Jsoup;
|
|
import org.jsoup.nodes.Document;
|
|
import org.jsoup.nodes.Element;
|
|
import org.jsoup.select.Elements;
|
|
import utils.Utils;
|
|
import java.util.ArrayList;
|
|
import java.util.List;
|
|
|
|
public class ArXivStrategy extends AbstractCrawlerStrategy {
|
|
@Override
|
|
public String getPlatformName() {
|
|
return "arXiv";
|
|
}
|
|
|
|
@Override
|
|
public boolean supportsUrl(String url) {
|
|
return url != null && url.contains("arxiv.org");
|
|
}
|
|
|
|
private String cleanPrefix(String text, String[] prefixes) {
|
|
if (text == null) {
|
|
return "";
|
|
}
|
|
String cleaned = text.trim();
|
|
for (String prefix : prefixes) {
|
|
if (cleaned.toLowerCase().startsWith(prefix.toLowerCase())) {
|
|
cleaned = cleaned.substring(prefix.length()).trim();
|
|
}
|
|
}
|
|
return cleaned;
|
|
}
|
|
|
|
@Override
|
|
protected List<Paper> fetchPapers(String url, int count) throws Exception {
|
|
List<Paper> papers = new ArrayList<>();
|
|
System.out.println("=== 开始使用arXiv获取论文 ===");
|
|
|
|
String html = Utils.sendGetRequest(url);
|
|
if (html.isEmpty()) return papers;
|
|
|
|
Document doc = Jsoup.parse(html);
|
|
|
|
Elements paperElements = doc.select(".arxiv-result");
|
|
|
|
if (!paperElements.isEmpty()) {
|
|
System.out.println("检测到搜索结果页面,找到 " + paperElements.size() + " 篇论文");
|
|
for (Element element : paperElements) {
|
|
if (papers.size() >= count) break;
|
|
|
|
try {
|
|
Element titleElement = element.selectFirst(".title");
|
|
String title = titleElement != null ? titleElement.text().trim() : "";
|
|
title = cleanPrefix(title, new String[]{"Title:", "Title"});
|
|
|
|
Element linkElement = element.selectFirst(".list-title a");
|
|
String link = linkElement != null ? linkElement.attr("href") : "";
|
|
if (!link.isEmpty() && !link.startsWith("http")) {
|
|
link = "https://arxiv.org" + link;
|
|
}
|
|
|
|
Element abstractElement = element.selectFirst(".abstract");
|
|
String abstractText = abstractElement != null ? abstractElement.text().trim() : "";
|
|
abstractText = cleanPrefix(abstractText, new String[]{"Abstract:", "Abstract"});
|
|
|
|
Element authorsElement = element.selectFirst(".authors");
|
|
String authors = authorsElement != null ? authorsElement.text().trim() : "";
|
|
authors = cleanPrefix(authors, new String[]{"Authors:", "Author:", "Authors", "Author"});
|
|
|
|
if (title.length() < 5 || link.isEmpty()) continue;
|
|
|
|
papers.add(new Paper(title, authors, abstractText, link, getPlatformName()));
|
|
} catch (Exception e) {
|
|
continue;
|
|
}
|
|
}
|
|
} else {
|
|
Element titleElement = doc.selectFirst("h1.title.mathjax");
|
|
if (titleElement != null) {
|
|
System.out.println("检测到单个论文页面");
|
|
String title = titleElement.text().trim();
|
|
title = cleanPrefix(title, new String[]{"Title:", "Title"});
|
|
|
|
Element authorsElement = doc.selectFirst(".authors");
|
|
String authors = authorsElement != null ? authorsElement.text().trim() : "";
|
|
authors = cleanPrefix(authors, new String[]{"Authors:", "Author:", "Authors", "Author"});
|
|
|
|
Element abstractElement = doc.selectFirst(".abstract.mathjax");
|
|
String abstractText = abstractElement != null ? abstractElement.text().trim() : "";
|
|
abstractText = cleanPrefix(abstractText, new String[]{"Abstract:", "Abstract"});
|
|
|
|
String link = url;
|
|
|
|
if (title.length() >= 5 && !link.isEmpty()) {
|
|
papers.add(new Paper(title, authors, abstractText, link, getPlatformName()));
|
|
}
|
|
} else {
|
|
System.out.println("未找到论文元素,尝试其他选择器...");
|
|
|
|
String[] selectors = {"article", ".paper", ".document", "div[role='main']"};
|
|
for (String selector : selectors) {
|
|
Elements articles = doc.select(selector);
|
|
if (!articles.isEmpty()) {
|
|
for (Element article : articles) {
|
|
if (papers.size() >= count) break;
|
|
|
|
String title = article.selectFirst("h1, h2, .title") != null ?
|
|
article.selectFirst("h1, h2, .title").text().trim() : "";
|
|
title = cleanPrefix(title, new String[]{"Title:", "Title"});
|
|
|
|
String link = article.selectFirst("a[href*='/abs/']") != null ?
|
|
"https://arxiv.org" + article.selectFirst("a[href*='/abs/']").attr("href") : url;
|
|
|
|
if (title.length() >= 5 && !link.isEmpty()) {
|
|
papers.add(new Paper(title, "", "", link, getPlatformName()));
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return papers;
|
|
}
|
|
}
|
|
|