5 changed files with 286 additions and 0 deletions
@ -0,0 +1,64 @@ |
|||||
|
package strategy; |
||||
|
|
||||
|
import model.Paper; |
||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
import utils.Utils; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class ACMDigitalLibraryStrategy extends AbstractCrawlerStrategy { |
||||
|
@Override |
||||
|
public String getPlatformName() { |
||||
|
return "ACM Digital Library"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public boolean supportsUrl(String url) { |
||||
|
return url != null && url.contains("dl.acm.org"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected List<Paper> fetchPapers(String url, int count) throws Exception { |
||||
|
List<Paper> papers = new ArrayList<>(); |
||||
|
System.out.println("=== 开始使用ACM Digital Library获取论文 ==="); |
||||
|
|
||||
|
addDelay(2000, 3000); |
||||
|
|
||||
|
String html = Utils.sendGetRequest(url); |
||||
|
if (html.isEmpty()) return papers; |
||||
|
|
||||
|
Document doc = Jsoup.parse(html); |
||||
|
|
||||
|
Elements paperElements = doc.select(".search__item"); |
||||
|
|
||||
|
int collected = 0; |
||||
|
for (Element element : paperElements) { |
||||
|
if (collected >= count) break; |
||||
|
|
||||
|
try { |
||||
|
Element titleElement = element.selectFirst("h5 a"); |
||||
|
String title = titleElement != null ? titleElement.text() : ""; |
||||
|
|
||||
|
String paperUrl = titleElement != null ? titleElement.attr("href") : ""; |
||||
|
if (!paperUrl.startsWith("http")) { |
||||
|
paperUrl = "https://dl.acm.org" + paperUrl; |
||||
|
} |
||||
|
|
||||
|
Element authorsElement = element.selectFirst(".search__authors"); |
||||
|
String authors = authorsElement != null ? authorsElement.text() : ""; |
||||
|
|
||||
|
if (title.length() < 5 || paperUrl.isEmpty()) continue; |
||||
|
|
||||
|
papers.add(new Paper(title, authors, "", paperUrl, getPlatformName())); |
||||
|
collected++; |
||||
|
} catch (Exception e) { |
||||
|
continue; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return papers; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,28 @@ |
|||||
|
package strategy; |
||||
|
|
||||
|
import model.Paper; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
import java.util.Random; |
||||
|
|
||||
|
public abstract class AbstractCrawlerStrategy implements CrawlerStrategy { |
||||
|
protected Random random = new Random(); |
||||
|
|
||||
|
@Override |
||||
|
public List<Paper> crawl(String url, int count) throws Exception { |
||||
|
List<Paper> papers = new ArrayList<>(); |
||||
|
papers.addAll(fetchPapers(url, count)); |
||||
|
return papers; |
||||
|
} |
||||
|
|
||||
|
protected abstract List<Paper> fetchPapers(String url, int count) throws Exception; |
||||
|
|
||||
|
protected void addDelay(int minMs, int maxMs) { |
||||
|
try { |
||||
|
int delay = minMs + random.nextInt(maxMs - minMs + 1); |
||||
|
Thread.sleep(delay); |
||||
|
} catch (InterruptedException e) { |
||||
|
Thread.currentThread().interrupt(); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,128 @@ |
|||||
|
package strategy; |
||||
|
|
||||
|
import model.Paper; |
||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
import utils.Utils; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class ArXivStrategy extends AbstractCrawlerStrategy { |
||||
|
@Override |
||||
|
public String getPlatformName() { |
||||
|
return "arXiv"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public boolean supportsUrl(String url) { |
||||
|
return url != null && url.contains("arxiv.org"); |
||||
|
} |
||||
|
|
||||
|
private String cleanPrefix(String text, String[] prefixes) { |
||||
|
if (text == null) { |
||||
|
return ""; |
||||
|
} |
||||
|
String cleaned = text.trim(); |
||||
|
for (String prefix : prefixes) { |
||||
|
if (cleaned.toLowerCase().startsWith(prefix.toLowerCase())) { |
||||
|
cleaned = cleaned.substring(prefix.length()).trim(); |
||||
|
} |
||||
|
} |
||||
|
return cleaned; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected List<Paper> fetchPapers(String url, int count) throws Exception { |
||||
|
List<Paper> papers = new ArrayList<>(); |
||||
|
System.out.println("=== 开始使用arXiv获取论文 ==="); |
||||
|
|
||||
|
String html = Utils.sendGetRequest(url); |
||||
|
if (html.isEmpty()) return papers; |
||||
|
|
||||
|
Document doc = Jsoup.parse(html); |
||||
|
|
||||
|
Elements paperElements = doc.select(".arxiv-result"); |
||||
|
|
||||
|
if (!paperElements.isEmpty()) { |
||||
|
System.out.println("检测到搜索结果页面,找到 " + paperElements.size() + " 篇论文"); |
||||
|
for (Element element : paperElements) { |
||||
|
if (papers.size() >= count) break; |
||||
|
|
||||
|
try { |
||||
|
Element titleElement = element.selectFirst(".title"); |
||||
|
String title = titleElement != null ? titleElement.text().trim() : ""; |
||||
|
title = cleanPrefix(title, new String[]{"Title:", "Title"}); |
||||
|
|
||||
|
Element linkElement = element.selectFirst(".list-title a"); |
||||
|
String link = linkElement != null ? linkElement.attr("href") : ""; |
||||
|
if (!link.isEmpty() && !link.startsWith("http")) { |
||||
|
link = "https://arxiv.org" + link; |
||||
|
} |
||||
|
|
||||
|
Element abstractElement = element.selectFirst(".abstract"); |
||||
|
String abstractText = abstractElement != null ? abstractElement.text().trim() : ""; |
||||
|
abstractText = cleanPrefix(abstractText, new String[]{"Abstract:", "Abstract"}); |
||||
|
|
||||
|
Element authorsElement = element.selectFirst(".authors"); |
||||
|
String authors = authorsElement != null ? authorsElement.text().trim() : ""; |
||||
|
authors = cleanPrefix(authors, new String[]{"Authors:", "Author:", "Authors", "Author"}); |
||||
|
|
||||
|
if (title.length() < 5 || link.isEmpty()) continue; |
||||
|
|
||||
|
papers.add(new Paper(title, authors, abstractText, link, getPlatformName())); |
||||
|
} catch (Exception e) { |
||||
|
continue; |
||||
|
} |
||||
|
} |
||||
|
} else { |
||||
|
Element titleElement = doc.selectFirst("h1.title.mathjax"); |
||||
|
if (titleElement != null) { |
||||
|
System.out.println("检测到单个论文页面"); |
||||
|
String title = titleElement.text().trim(); |
||||
|
title = cleanPrefix(title, new String[]{"Title:", "Title"}); |
||||
|
|
||||
|
Element authorsElement = doc.selectFirst(".authors"); |
||||
|
String authors = authorsElement != null ? authorsElement.text().trim() : ""; |
||||
|
authors = cleanPrefix(authors, new String[]{"Authors:", "Author:", "Authors", "Author"}); |
||||
|
|
||||
|
Element abstractElement = doc.selectFirst(".abstract.mathjax"); |
||||
|
String abstractText = abstractElement != null ? abstractElement.text().trim() : ""; |
||||
|
abstractText = cleanPrefix(abstractText, new String[]{"Abstract:", "Abstract"}); |
||||
|
|
||||
|
String link = url; |
||||
|
|
||||
|
if (title.length() >= 5 && !link.isEmpty()) { |
||||
|
papers.add(new Paper(title, authors, abstractText, link, getPlatformName())); |
||||
|
} |
||||
|
} else { |
||||
|
System.out.println("未找到论文元素,尝试其他选择器..."); |
||||
|
|
||||
|
String[] selectors = {"article", ".paper", ".document", "div[role='main']"}; |
||||
|
for (String selector : selectors) { |
||||
|
Elements articles = doc.select(selector); |
||||
|
if (!articles.isEmpty()) { |
||||
|
for (Element article : articles) { |
||||
|
if (papers.size() >= count) break; |
||||
|
|
||||
|
String title = article.selectFirst("h1, h2, .title") != null ? |
||||
|
article.selectFirst("h1, h2, .title").text().trim() : ""; |
||||
|
title = cleanPrefix(title, new String[]{"Title:", "Title"}); |
||||
|
|
||||
|
String link = article.selectFirst("a[href*='/abs/']") != null ? |
||||
|
"https://arxiv.org" + article.selectFirst("a[href*='/abs/']").attr("href") : url; |
||||
|
|
||||
|
if (title.length() >= 5 && !link.isEmpty()) { |
||||
|
papers.add(new Paper(title, "", "", link, getPlatformName())); |
||||
|
} |
||||
|
} |
||||
|
break; |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return papers; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,52 @@ |
|||||
|
package view; |
||||
|
|
||||
|
import model.Paper; |
||||
|
import command.Command; |
||||
|
import java.util.Scanner; |
||||
|
|
||||
|
public class ConsoleView { |
||||
|
private Scanner scanner; |
||||
|
|
||||
|
public ConsoleView() { |
||||
|
this.scanner = new Scanner(System.in); |
||||
|
} |
||||
|
|
||||
|
public void displayWelcome() { |
||||
|
System.out.println("===== 学术论文爬虫程序 ====="); |
||||
|
System.out.println("输入 help 查看可用命令"); |
||||
|
System.out.println(); |
||||
|
} |
||||
|
|
||||
|
public String getInput() { |
||||
|
System.out.print("> "); |
||||
|
return scanner.nextLine().trim(); |
||||
|
} |
||||
|
|
||||
|
public void showInfo(String message) { |
||||
|
System.out.println("[INFO] " + message); |
||||
|
} |
||||
|
|
||||
|
public void showSuccess(String message) { |
||||
|
System.out.println("[SUCCESS] " + message); |
||||
|
} |
||||
|
|
||||
|
public void showError(String message) { |
||||
|
System.out.println("[ERROR] " + message); |
||||
|
} |
||||
|
|
||||
|
public void showPaperInfo(int index, Paper paper) { |
||||
|
System.out.println(index + ". " + paper.getTitle()); |
||||
|
System.out.println(" 作者: " + (paper.getAuthors() != null ? paper.getAuthors() : "未知")); |
||||
|
System.out.println(" 来源: " + (paper.getPlatform() != null ? paper.getPlatform() : "未知")); |
||||
|
System.out.println(" URL: " + (paper.getUrl() != null ? paper.getUrl() : "未知")); |
||||
|
System.out.println(); |
||||
|
} |
||||
|
|
||||
|
public void showCommandInfo(Command command) { |
||||
|
System.out.println(" " + command.getName() + " - " + command.getDescription()); |
||||
|
} |
||||
|
|
||||
|
public void close() { |
||||
|
scanner.close(); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,14 @@ |
|||||
|
import controller.CrawlerController; |
||||
|
import view.ConsoleView; |
||||
|
import repository.PaperRepository; |
||||
|
import strategy.StrategyFactory; |
||||
|
|
||||
|
public class Main { |
||||
|
public static void main(String[] args) { |
||||
|
ConsoleView view = new ConsoleView(); |
||||
|
PaperRepository repository = new PaperRepository(); |
||||
|
StrategyFactory strategyFactory = new StrategyFactory(); |
||||
|
CrawlerController controller = new CrawlerController(view, repository, strategyFactory); |
||||
|
controller.run(); |
||||
|
} |
||||
|
} |
||||
Loading…
Reference in new issue