Browse Source

上传文件至 'w10'

main
Zhengjie 1 month ago
parent
commit
8b9c45e51c
  1. 64
      w10/ACMDigitalLibraryStrategy.java
  2. 28
      w10/AbstractCrawlerStrategy.java
  3. 128
      w10/ArXivStrategy.java
  4. 52
      w10/ConsoleView.java
  5. 14
      w10/Main.java

64
w10/ACMDigitalLibraryStrategy.java

@@ -0,0 +1,64 @@
package strategy;
import model.Paper;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import utils.Utils;
import java.util.ArrayList;
import java.util.List;
/**
 * Crawler strategy for ACM Digital Library result pages.
 *
 * <p>The page is downloaded with {@code Utils.sendGetRequest}, parsed with jsoup, and every
 * {@code .search__item} element is converted into a {@link Paper} carrying title, authors and
 * link. The abstract is always stored as an empty string because it is not read from the page.
 */
public class ACMDigitalLibraryStrategy extends AbstractCrawlerStrategy {

    /** Origin prepended to relative links found in the result list. */
    private static final String BASE_URL = "https://dl.acm.org";

    /** Titles shorter than this are treated as parsing noise and skipped. */
    private static final int MIN_TITLE_LENGTH = 5;

    @Override
    public String getPlatformName() {
        return "ACM Digital Library";
    }

    /**
     * @param url candidate URL, may be {@code null}
     * @return {@code true} when the URL contains {@code dl.acm.org}
     */
    @Override
    public boolean supportsUrl(String url) {
        return url != null && url.contains("dl.acm.org");
    }

    /**
     * Downloads {@code url} and extracts up to {@code count} papers from it.
     *
     * @param url   page to download
     * @param count maximum number of papers to return
     * @return the papers found, possibly empty, never {@code null}
     * @throws Exception whatever {@code Utils.sendGetRequest} propagates
     */
    @Override
    protected List<Paper> fetchPapers(String url, int count) throws Exception {
        List<Paper> papers = new ArrayList<>();
        System.out.println("=== 开始使用ACM Digital Library获取论文 ===");
        // Randomised pause before the request.
        addDelay(2000, 3000);
        String html = Utils.sendGetRequest(url);
        if (html == null || html.isEmpty()) {
            return papers;
        }
        Document doc = Jsoup.parse(html);
        Elements paperElements = doc.select(".search__item");
        for (Element element : paperElements) {
            if (papers.size() >= count) {
                break;
            }
            try {
                Element titleElement = element.selectFirst("h5 a");
                String title = titleElement != null ? titleElement.text() : "";
                String paperUrl = toAbsoluteUrl(titleElement != null ? titleElement.attr("href") : "");
                Element authorsElement = element.selectFirst(".search__authors");
                String authors = authorsElement != null ? authorsElement.text() : "";
                if (title.length() < MIN_TITLE_LENGTH || paperUrl.isEmpty()) {
                    continue;
                }
                papers.add(new Paper(title, authors, "", paperUrl, getPlatformName()));
            } catch (Exception ignored) {
                // Best effort: one malformed entry must not abort the rest of the page.
            }
        }
        return papers;
    }

    /**
     * Resolves an {@code href} against the ACM origin.
     *
     * <p>An empty href stays empty so the caller can reject the entry; previously it was turned
     * into the bare site origin, which defeated the emptiness check.
     */
    private static String toAbsoluteUrl(String href) {
        if (href == null || href.isEmpty()) {
            return "";
        }
        if (href.startsWith("http")) {
            return href;
        }
        return href.startsWith("/") ? BASE_URL + href : BASE_URL + "/" + href;
    }
}

28
w10/AbstractCrawlerStrategy.java

@@ -0,0 +1,28 @@
package strategy;
import model.Paper;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
/**
 * Base class for crawler strategies.
 *
 * <p>{@link #crawl(String, int)} delegates to {@link #fetchPapers(String, int)} and returns the
 * result in a fresh list. {@link #addDelay(int, int)} gives subclasses a randomised pause.
 */
public abstract class AbstractCrawlerStrategy implements CrawlerStrategy {

    /** Random source used by {@link #addDelay(int, int)}; available to subclasses. */
    protected Random random = new Random();

    /**
     * Fetches papers for {@code url}.
     *
     * @param url   page to crawl
     * @param count maximum number of papers requested
     * @return a new mutable list with the fetched papers; empty when the subclass returns
     *         {@code null}
     * @throws Exception whatever {@link #fetchPapers(String, int)} throws
     */
    @Override
    public List<Paper> crawl(String url, int count) throws Exception {
        List<Paper> fetched = fetchPapers(url, count);
        // Copy so callers never share the subclass's list; tolerate a null result.
        return fetched == null ? new ArrayList<>() : new ArrayList<>(fetched);
    }

    /**
     * Downloads and parses the page at {@code url}.
     *
     * @param url   page to crawl
     * @param count maximum number of papers to return
     * @return the papers found
     * @throws Exception on any failure the implementation chooses to propagate
     */
    protected abstract List<Paper> fetchPapers(String url, int count) throws Exception;

    /**
     * Sleeps for a random duration between the two bounds (inclusive).
     *
     * <p>The bounds are normalised first: they are swapped if given in the wrong order and
     * negative values are clamped to zero, so the call never throws
     * {@link IllegalArgumentException}. If the thread is interrupted while sleeping, the
     * interrupt flag is restored and the method returns early.
     *
     * @param minMs lower bound in milliseconds
     * @param maxMs upper bound in milliseconds
     */
    protected void addDelay(int minMs, int maxMs) {
        int lower = Math.max(0, Math.min(minMs, maxMs));
        int upper = Math.max(lower, Math.max(minMs, maxMs));
        int span = upper - lower;
        // span + 1 would overflow only for the full non-negative int range.
        int offset = span == Integer.MAX_VALUE
                ? random.nextInt(Integer.MAX_VALUE)
                : random.nextInt(span + 1);
        try {
            Thread.sleep((long) lower + offset);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }
}

128
w10/ArXivStrategy.java

@@ -0,0 +1,128 @@
package strategy;
import model.Paper;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import utils.Utils;
import java.util.ArrayList;
import java.util.List;
/**
 * Crawler strategy for arXiv pages.
 *
 * <p>Three page shapes are tried in order: a search-result list ({@code .arxiv-result}
 * elements), a single-paper page ({@code h1.title.mathjax}), and finally a set of generic
 * container selectors as a fallback.
 */
public class ArXivStrategy extends AbstractCrawlerStrategy {

    /** Origin prepended to relative links found on the page. */
    private static final String BASE_URL = "https://arxiv.org";

    /** Titles shorter than this are treated as parsing noise and skipped. */
    private static final int MIN_TITLE_LENGTH = 5;

    private static final String[] TITLE_PREFIXES = {"Title:", "Title"};
    private static final String[] AUTHOR_PREFIXES = {"Authors:", "Author:", "Authors", "Author"};
    private static final String[] ABSTRACT_PREFIXES = {"Abstract:", "Abstract"};

    /** Generic containers probed when neither known page shape matches. */
    private static final String[] FALLBACK_SELECTORS =
            {"article", ".paper", ".document", "div[role='main']"};

    @Override
    public String getPlatformName() {
        return "arXiv";
    }

    /**
     * @param url candidate URL, may be {@code null}
     * @return {@code true} when the URL contains {@code arxiv.org}
     */
    @Override
    public boolean supportsUrl(String url) {
        return url != null && url.contains("arxiv.org");
    }

    /**
     * Removes one leading label such as {@code "Title:"} from {@code text}.
     *
     * <p>Matching is case-insensitive and only the first matching prefix is removed, so real
     * content that happens to start with a later prefix (for example "Title: Title IX ...")
     * is left intact.
     *
     * @param text     raw element text, may be {@code null}
     * @param prefixes candidate labels, checked in order
     * @return the trimmed text without the label; {@code ""} for {@code null} input
     */
    private String cleanPrefix(String text, String[] prefixes) {
        if (text == null) {
            return "";
        }
        String cleaned = text.trim();
        for (String prefix : prefixes) {
            if (prefix.isEmpty()) {
                continue;
            }
            if (cleaned.regionMatches(true, 0, prefix, 0, prefix.length())) {
                return cleaned.substring(prefix.length()).trim();
            }
        }
        return cleaned;
    }

    /**
     * Downloads {@code url} and extracts up to {@code count} papers from it.
     *
     * @param url   page to download
     * @param count maximum number of papers for list pages; a single-paper page always yields
     *              at most one paper
     * @return the papers found, possibly empty, never {@code null}
     * @throws Exception whatever {@code Utils.sendGetRequest} propagates
     */
    @Override
    protected List<Paper> fetchPapers(String url, int count) throws Exception {
        List<Paper> papers = new ArrayList<>();
        System.out.println("=== 开始使用arXiv获取论文 ===");
        String html = Utils.sendGetRequest(url);
        if (html == null || html.isEmpty()) {
            return papers;
        }
        Document doc = Jsoup.parse(html);

        Elements results = doc.select(".arxiv-result");
        if (!results.isEmpty()) {
            System.out.println("检测到搜索结果页面,找到 " + results.size() + " 篇论文");
            collectSearchResults(results, count, papers);
            return papers;
        }

        Element pageTitle = doc.selectFirst("h1.title.mathjax");
        if (pageTitle != null) {
            System.out.println("检测到单个论文页面");
            collectSinglePaper(doc, pageTitle, url, papers);
            return papers;
        }

        System.out.println("未找到论文元素,尝试其他选择器...");
        collectFromGenericContainers(doc, url, count, papers);
        return papers;
    }

    /** Converts each {@code .arxiv-result} element into a paper until {@code count} is reached. */
    private void collectSearchResults(Elements results, int count, List<Paper> papers) {
        for (Element result : results) {
            if (papers.size() >= count) {
                break;
            }
            try {
                String title = cleanPrefix(textOf(result.selectFirst(".title")), TITLE_PREFIXES);
                Element linkElement = result.selectFirst(".list-title a");
                String link = toAbsoluteUrl(linkElement != null ? linkElement.attr("href") : "");
                String abstractText =
                        cleanPrefix(textOf(result.selectFirst(".abstract")), ABSTRACT_PREFIXES);
                String authors =
                        cleanPrefix(textOf(result.selectFirst(".authors")), AUTHOR_PREFIXES);
                if (title.length() < MIN_TITLE_LENGTH || link.isEmpty()) {
                    continue;
                }
                papers.add(new Paper(title, authors, abstractText, link, getPlatformName()));
            } catch (Exception ignored) {
                // Best effort: one malformed entry must not abort the rest of the page.
            }
        }
    }

    /** Reads title, authors and abstract from a single-paper page; the link is the page URL. */
    private void collectSinglePaper(Document doc, Element pageTitle, String url, List<Paper> papers) {
        String title = cleanPrefix(textOf(pageTitle), TITLE_PREFIXES);
        String authors = cleanPrefix(textOf(doc.selectFirst(".authors")), AUTHOR_PREFIXES);
        String abstractText =
                cleanPrefix(textOf(doc.selectFirst(".abstract.mathjax")), ABSTRACT_PREFIXES);
        if (title.length() >= MIN_TITLE_LENGTH && url != null && !url.isEmpty()) {
            papers.add(new Paper(title, authors, abstractText, url, getPlatformName()));
        }
    }

    /** Uses the first generic container selector that matches anything; title and link only. */
    private void collectFromGenericContainers(Document doc, String url, int count, List<Paper> papers) {
        for (String selector : FALLBACK_SELECTORS) {
            Elements articles = doc.select(selector);
            if (articles.isEmpty()) {
                continue;
            }
            for (Element article : articles) {
                if (papers.size() >= count) {
                    break;
                }
                String title = cleanPrefix(textOf(article.selectFirst("h1, h2, .title")), TITLE_PREFIXES);
                Element absLink = article.selectFirst("a[href*='/abs/']");
                // Resolve the href instead of blindly prefixing the origin: absolute links
                // would otherwise become "https://arxiv.orghttps://...".
                String link = absLink != null ? toAbsoluteUrl(absLink.attr("href")) : url;
                if (title.length() >= MIN_TITLE_LENGTH && link != null && !link.isEmpty()) {
                    papers.add(new Paper(title, "", "", link, getPlatformName()));
                }
            }
            // Only the first selector that matched is used.
            break;
        }
    }

    /** Returns the trimmed text of {@code element}, or {@code ""} when it is {@code null}. */
    private static String textOf(Element element) {
        return element != null ? element.text().trim() : "";
    }

    /** Resolves an {@code href} against the arXiv origin; an empty href stays empty. */
    private static String toAbsoluteUrl(String href) {
        if (href == null || href.isEmpty()) {
            return "";
        }
        if (href.startsWith("http")) {
            return href;
        }
        return href.startsWith("/") ? BASE_URL + href : BASE_URL + "/" + href;
    }
}

52
w10/ConsoleView.java

@@ -0,0 +1,52 @@
package view;
import model.Paper;
import command.Command;
import java.util.Scanner;
/**
 * Console front end: reads commands from standard input and prints messages, paper summaries
 * and command descriptions to standard output.
 */
public class ConsoleView {

    /** Placeholder printed for paper fields that are missing or blank. */
    private static final String UNKNOWN = "未知";

    private final Scanner scanner;

    /** Creates a view that reads from {@code System.in}. */
    public ConsoleView() {
        this.scanner = new Scanner(System.in);
    }

    /** Prints the start-up banner. */
    public void displayWelcome() {
        System.out.println("===== 学术论文爬虫程序 =====");
        System.out.println("输入 help 查看可用命令");
        System.out.println();
    }

    /**
     * Prints the prompt and reads one line.
     *
     * @return the line read, trimmed
     * @throws java.util.NoSuchElementException if standard input has no further line
     */
    public String getInput() {
        System.out.print("> ");
        return scanner.nextLine().trim();
    }

    /** Prints {@code message} with an {@code [INFO]} tag. */
    public void showInfo(String message) {
        System.out.println("[INFO] " + message);
    }

    /** Prints {@code message} with a {@code [SUCCESS]} tag. */
    public void showSuccess(String message) {
        System.out.println("[SUCCESS] " + message);
    }

    /** Prints {@code message} with an {@code [ERROR]} tag (to standard output). */
    public void showError(String message) {
        System.out.println("[ERROR] " + message);
    }

    /**
     * Prints a numbered summary of {@code paper}: title, authors, platform and URL.
     *
     * <p>Authors, platform and URL fall back to a placeholder when they are {@code null} or
     * blank, since an empty string carries no more information than {@code null}.
     *
     * @param index number shown in front of the title
     * @param paper paper to display
     */
    public void showPaperInfo(int index, Paper paper) {
        System.out.println(index + ". " + paper.getTitle());
        System.out.println(" 作者: " + orUnknown(paper.getAuthors()));
        System.out.println(" 来源: " + orUnknown(paper.getPlatform()));
        System.out.println(" URL: " + orUnknown(paper.getUrl()));
        System.out.println();
    }

    /** Prints one line with the command's name and description. */
    public void showCommandInfo(Command command) {
        System.out.println(" " + command.getName() + " - " + command.getDescription());
    }

    /** Closes the underlying scanner, which also closes {@code System.in}. */
    public void close() {
        scanner.close();
    }

    /** Returns the value's text, or the placeholder when it is {@code null} or blank. */
    private static String orUnknown(Object value) {
        String text = value == null ? "" : value.toString();
        return text.trim().isEmpty() ? UNKNOWN : text;
    }
}

14
w10/Main.java

@@ -0,0 +1,14 @@
import controller.CrawlerController;
import view.ConsoleView;
import repository.PaperRepository;
import strategy.StrategyFactory;
/**
 * Application entry point: builds the view, the repository and the strategy factory, hands
 * them to a {@code CrawlerController} and starts it.
 */
public class Main {

    /**
     * Wires the collaborators together and runs the controller.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        final ConsoleView console = new ConsoleView();
        final PaperRepository paperStore = new PaperRepository();
        final StrategyFactory strategies = new StrategyFactory();

        new CrawlerController(console, paperStore, strategies).run();
    }
}
Loading…
Cancel
Save