5 changed files with 286 additions and 0 deletions
@@ -0,0 +1,64 @@
|||
package strategy; |
|||
|
|||
import model.Paper; |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
import utils.Utils; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class ACMDigitalLibraryStrategy extends AbstractCrawlerStrategy { |
|||
@Override |
|||
public String getPlatformName() { |
|||
return "ACM Digital Library"; |
|||
} |
|||
|
|||
@Override |
|||
public boolean supportsUrl(String url) { |
|||
return url != null && url.contains("dl.acm.org"); |
|||
} |
|||
|
|||
@Override |
|||
protected List<Paper> fetchPapers(String url, int count) throws Exception { |
|||
List<Paper> papers = new ArrayList<>(); |
|||
System.out.println("=== 开始使用ACM Digital Library获取论文 ==="); |
|||
|
|||
addDelay(2000, 3000); |
|||
|
|||
String html = Utils.sendGetRequest(url); |
|||
if (html.isEmpty()) return papers; |
|||
|
|||
Document doc = Jsoup.parse(html); |
|||
|
|||
Elements paperElements = doc.select(".search__item"); |
|||
|
|||
int collected = 0; |
|||
for (Element element : paperElements) { |
|||
if (collected >= count) break; |
|||
|
|||
try { |
|||
Element titleElement = element.selectFirst("h5 a"); |
|||
String title = titleElement != null ? titleElement.text() : ""; |
|||
|
|||
String paperUrl = titleElement != null ? titleElement.attr("href") : ""; |
|||
if (!paperUrl.startsWith("http")) { |
|||
paperUrl = "https://dl.acm.org" + paperUrl; |
|||
} |
|||
|
|||
Element authorsElement = element.selectFirst(".search__authors"); |
|||
String authors = authorsElement != null ? authorsElement.text() : ""; |
|||
|
|||
if (title.length() < 5 || paperUrl.isEmpty()) continue; |
|||
|
|||
papers.add(new Paper(title, authors, "", paperUrl, getPlatformName())); |
|||
collected++; |
|||
} catch (Exception e) { |
|||
continue; |
|||
} |
|||
} |
|||
|
|||
return papers; |
|||
} |
|||
} |
|||
@@ -0,0 +1,28 @@
|||
package strategy; |
|||
|
|||
import model.Paper; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
import java.util.Random; |
|||
|
|||
public abstract class AbstractCrawlerStrategy implements CrawlerStrategy { |
|||
protected Random random = new Random(); |
|||
|
|||
@Override |
|||
public List<Paper> crawl(String url, int count) throws Exception { |
|||
List<Paper> papers = new ArrayList<>(); |
|||
papers.addAll(fetchPapers(url, count)); |
|||
return papers; |
|||
} |
|||
|
|||
protected abstract List<Paper> fetchPapers(String url, int count) throws Exception; |
|||
|
|||
protected void addDelay(int minMs, int maxMs) { |
|||
try { |
|||
int delay = minMs + random.nextInt(maxMs - minMs + 1); |
|||
Thread.sleep(delay); |
|||
} catch (InterruptedException e) { |
|||
Thread.currentThread().interrupt(); |
|||
} |
|||
} |
|||
} |
|||
@@ -0,0 +1,128 @@
|||
package strategy; |
|||
|
|||
import model.Paper; |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
import utils.Utils; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class ArXivStrategy extends AbstractCrawlerStrategy { |
|||
@Override |
|||
public String getPlatformName() { |
|||
return "arXiv"; |
|||
} |
|||
|
|||
@Override |
|||
public boolean supportsUrl(String url) { |
|||
return url != null && url.contains("arxiv.org"); |
|||
} |
|||
|
|||
private String cleanPrefix(String text, String[] prefixes) { |
|||
if (text == null) { |
|||
return ""; |
|||
} |
|||
String cleaned = text.trim(); |
|||
for (String prefix : prefixes) { |
|||
if (cleaned.toLowerCase().startsWith(prefix.toLowerCase())) { |
|||
cleaned = cleaned.substring(prefix.length()).trim(); |
|||
} |
|||
} |
|||
return cleaned; |
|||
} |
|||
|
|||
@Override |
|||
protected List<Paper> fetchPapers(String url, int count) throws Exception { |
|||
List<Paper> papers = new ArrayList<>(); |
|||
System.out.println("=== 开始使用arXiv获取论文 ==="); |
|||
|
|||
String html = Utils.sendGetRequest(url); |
|||
if (html.isEmpty()) return papers; |
|||
|
|||
Document doc = Jsoup.parse(html); |
|||
|
|||
Elements paperElements = doc.select(".arxiv-result"); |
|||
|
|||
if (!paperElements.isEmpty()) { |
|||
System.out.println("检测到搜索结果页面,找到 " + paperElements.size() + " 篇论文"); |
|||
for (Element element : paperElements) { |
|||
if (papers.size() >= count) break; |
|||
|
|||
try { |
|||
Element titleElement = element.selectFirst(".title"); |
|||
String title = titleElement != null ? titleElement.text().trim() : ""; |
|||
title = cleanPrefix(title, new String[]{"Title:", "Title"}); |
|||
|
|||
Element linkElement = element.selectFirst(".list-title a"); |
|||
String link = linkElement != null ? linkElement.attr("href") : ""; |
|||
if (!link.isEmpty() && !link.startsWith("http")) { |
|||
link = "https://arxiv.org" + link; |
|||
} |
|||
|
|||
Element abstractElement = element.selectFirst(".abstract"); |
|||
String abstractText = abstractElement != null ? abstractElement.text().trim() : ""; |
|||
abstractText = cleanPrefix(abstractText, new String[]{"Abstract:", "Abstract"}); |
|||
|
|||
Element authorsElement = element.selectFirst(".authors"); |
|||
String authors = authorsElement != null ? authorsElement.text().trim() : ""; |
|||
authors = cleanPrefix(authors, new String[]{"Authors:", "Author:", "Authors", "Author"}); |
|||
|
|||
if (title.length() < 5 || link.isEmpty()) continue; |
|||
|
|||
papers.add(new Paper(title, authors, abstractText, link, getPlatformName())); |
|||
} catch (Exception e) { |
|||
continue; |
|||
} |
|||
} |
|||
} else { |
|||
Element titleElement = doc.selectFirst("h1.title.mathjax"); |
|||
if (titleElement != null) { |
|||
System.out.println("检测到单个论文页面"); |
|||
String title = titleElement.text().trim(); |
|||
title = cleanPrefix(title, new String[]{"Title:", "Title"}); |
|||
|
|||
Element authorsElement = doc.selectFirst(".authors"); |
|||
String authors = authorsElement != null ? authorsElement.text().trim() : ""; |
|||
authors = cleanPrefix(authors, new String[]{"Authors:", "Author:", "Authors", "Author"}); |
|||
|
|||
Element abstractElement = doc.selectFirst(".abstract.mathjax"); |
|||
String abstractText = abstractElement != null ? abstractElement.text().trim() : ""; |
|||
abstractText = cleanPrefix(abstractText, new String[]{"Abstract:", "Abstract"}); |
|||
|
|||
String link = url; |
|||
|
|||
if (title.length() >= 5 && !link.isEmpty()) { |
|||
papers.add(new Paper(title, authors, abstractText, link, getPlatformName())); |
|||
} |
|||
} else { |
|||
System.out.println("未找到论文元素,尝试其他选择器..."); |
|||
|
|||
String[] selectors = {"article", ".paper", ".document", "div[role='main']"}; |
|||
for (String selector : selectors) { |
|||
Elements articles = doc.select(selector); |
|||
if (!articles.isEmpty()) { |
|||
for (Element article : articles) { |
|||
if (papers.size() >= count) break; |
|||
|
|||
String title = article.selectFirst("h1, h2, .title") != null ? |
|||
article.selectFirst("h1, h2, .title").text().trim() : ""; |
|||
title = cleanPrefix(title, new String[]{"Title:", "Title"}); |
|||
|
|||
String link = article.selectFirst("a[href*='/abs/']") != null ? |
|||
"https://arxiv.org" + article.selectFirst("a[href*='/abs/']").attr("href") : url; |
|||
|
|||
if (title.length() >= 5 && !link.isEmpty()) { |
|||
papers.add(new Paper(title, "", "", link, getPlatformName())); |
|||
} |
|||
} |
|||
break; |
|||
} |
|||
} |
|||
} |
|||
} |
|||
|
|||
return papers; |
|||
} |
|||
} |
|||
@@ -0,0 +1,52 @@
|||
package view; |
|||
|
|||
import model.Paper; |
|||
import command.Command; |
|||
import java.util.Scanner; |
|||
|
|||
public class ConsoleView { |
|||
private Scanner scanner; |
|||
|
|||
public ConsoleView() { |
|||
this.scanner = new Scanner(System.in); |
|||
} |
|||
|
|||
public void displayWelcome() { |
|||
System.out.println("===== 学术论文爬虫程序 ====="); |
|||
System.out.println("输入 help 查看可用命令"); |
|||
System.out.println(); |
|||
} |
|||
|
|||
public String getInput() { |
|||
System.out.print("> "); |
|||
return scanner.nextLine().trim(); |
|||
} |
|||
|
|||
public void showInfo(String message) { |
|||
System.out.println("[INFO] " + message); |
|||
} |
|||
|
|||
public void showSuccess(String message) { |
|||
System.out.println("[SUCCESS] " + message); |
|||
} |
|||
|
|||
public void showError(String message) { |
|||
System.out.println("[ERROR] " + message); |
|||
} |
|||
|
|||
public void showPaperInfo(int index, Paper paper) { |
|||
System.out.println(index + ". " + paper.getTitle()); |
|||
System.out.println(" 作者: " + (paper.getAuthors() != null ? paper.getAuthors() : "未知")); |
|||
System.out.println(" 来源: " + (paper.getPlatform() != null ? paper.getPlatform() : "未知")); |
|||
System.out.println(" URL: " + (paper.getUrl() != null ? paper.getUrl() : "未知")); |
|||
System.out.println(); |
|||
} |
|||
|
|||
public void showCommandInfo(Command command) { |
|||
System.out.println(" " + command.getName() + " - " + command.getDescription()); |
|||
} |
|||
|
|||
public void close() { |
|||
scanner.close(); |
|||
} |
|||
} |
|||
@@ -0,0 +1,14 @@
|||
import controller.CrawlerController; |
|||
import view.ConsoleView; |
|||
import repository.PaperRepository; |
|||
import strategy.StrategyFactory; |
|||
|
|||
public class Main { |
|||
public static void main(String[] args) { |
|||
ConsoleView view = new ConsoleView(); |
|||
PaperRepository repository = new PaperRepository(); |
|||
StrategyFactory strategyFactory = new StrategyFactory(); |
|||
CrawlerController controller = new CrawlerController(view, repository, strategyFactory); |
|||
controller.run(); |
|||
} |
|||
} |
|||
Loading…
Reference in new issue