diff --git a/w10/ACMDigitalLibraryStrategy.java b/w10/ACMDigitalLibraryStrategy.java
new file mode 100644
index 0000000..7414915
--- /dev/null
+++ b/w10/ACMDigitalLibraryStrategy.java
@@ -0,0 +1,88 @@
+package strategy;
+
+import model.Paper;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import utils.Utils;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Crawler strategy that extracts paper entries from ACM Digital Library result pages.
+ */
+public class ACMDigitalLibraryStrategy extends AbstractCrawlerStrategy {
+    /** Prepended to relative links found in result entries. */
+    private static final String BASE_URL = "https://dl.acm.org";
+
+    @Override
+    public String getPlatformName() {
+        return "ACM Digital Library";
+    }
+
+    @Override
+    public boolean supportsUrl(String url) {
+        return url != null && url.contains("dl.acm.org");
+    }
+
+    /**
+     * Fetches {@code url} and collects up to {@code count} papers from its
+     * {@code .search__item} elements.
+     *
+     * @param url   page to download
+     * @param count maximum number of papers to collect
+     * @return the collected papers, possibly empty; never {@code null}
+     * @throws Exception propagated from the page download
+     */
+    @Override
+    protected List<Paper> fetchPapers(String url, int count) throws Exception {
+        List<Paper> papers = new ArrayList<>();
+        System.out.println("=== 开始使用ACM Digital Library获取论文 ===");
+
+        // Wait a random 2-3 seconds before sending the request.
+        addDelay(2000, 3000);
+
+        String html = Utils.sendGetRequest(url);
+        if (html == null || html.isEmpty()) {
+            return papers;
+        }
+
+        Document doc = Jsoup.parse(html);
+        Elements paperElements = doc.select(".search__item");
+
+        for (Element element : paperElements) {
+            if (papers.size() >= count) {
+                break;
+            }
+
+            try {
+                Element titleElement = element.selectFirst("h5 a");
+                if (titleElement == null) {
+                    continue;
+                }
+
+                String title = titleElement.text();
+                String paperUrl = titleElement.attr("href");
+                // Reject entries before the base URL is prepended; afterwards the
+                // link is never empty, so a missing href would go unnoticed.
+                if (title.length() < 5 || paperUrl.isEmpty()) {
+                    continue;
+                }
+                if (!paperUrl.startsWith("http")) {
+                    paperUrl = BASE_URL + paperUrl;
+                }
+
+                Element authorsElement = element.selectFirst(".search__authors");
+                String authors = authorsElement != null ? authorsElement.text() : "";
+
+                papers.add(new Paper(title, authors, "", paperUrl, getPlatformName()));
+            } catch (Exception e) {
+                // Best effort: one malformed entry must not abort the whole crawl.
+                System.err.println("[WARN] skipped ACM entry: " + e);
+            }
+        }
+
+        return papers;
+    }
+}
\ No newline at end of file
diff --git a/w10/AbstractCrawlerStrategy.java b/w10/AbstractCrawlerStrategy.java
new file mode 100644
index 0000000..17dbb80
--- /dev/null
+++ b/w10/AbstractCrawlerStrategy.java
@@ -0,0 +1,56 @@
+package strategy;
+
+import model.Paper;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+/**
+ * Base class for crawler strategies: {@link #crawl} delegates to the
+ * subclass-provided {@link #fetchPapers} and copies the result into a new list.
+ */
+public abstract class AbstractCrawlerStrategy implements CrawlerStrategy {
+    protected Random random = new Random();
+
+    @Override
+    public List<Paper> crawl(String url, int count) throws Exception {
+        List<Paper> papers = new ArrayList<>();
+        List<Paper> fetched = fetchPapers(url, count);
+        // Tolerate a subclass returning null instead of an empty list.
+        if (fetched != null) {
+            papers.addAll(fetched);
+        }
+        return papers;
+    }
+
+    /**
+     * Retrieves at most {@code count} papers from {@code url}.
+     *
+     * @param url   page to crawl
+     * @param count maximum number of papers to return
+     * @return the papers found
+     * @throws Exception if the page cannot be fetched or processed
+     */
+    protected abstract List<Paper> fetchPapers(String url, int count) throws Exception;
+
+    /**
+     * Sleeps for a random duration between the two bounds, inclusive.
+     * The bounds may be given in either order; negative values are treated as 0.
+     * If the thread is interrupted, its interrupt flag is restored.
+     *
+     * @param minMs lower bound in milliseconds
+     * @param maxMs upper bound in milliseconds
+     */
+    protected void addDelay(int minMs, int maxMs) {
+        int low = Math.max(0, Math.min(minMs, maxMs));
+        int high = Math.max(0, Math.max(minMs, maxMs));
+        // Widen to long so that high - low + 1 cannot overflow int.
+        long span = (long) high - low + 1;
+        long delay = low + (long) (random.nextDouble() * span);
+        try {
+            Thread.sleep(delay);
+        } catch (InterruptedException e) {
+            Thread.currentThread().interrupt();
+        }
+    }
+}
\ No newline at end of file
diff --git a/w10/ArXivStrategy.java b/w10/ArXivStrategy.java
new file mode 100644
index 0000000..2acacbc
--- /dev/null
+++ b/w10/ArXivStrategy.java
@@ -0,0 +1,179 @@
+package strategy;
+
+import model.Paper;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import utils.Utils;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Crawler strategy for arXiv pages. Recognises pages made of
+ * {@code .arxiv-result} entries, pages with a single {@code h1.title.mathjax}
+ * heading, and otherwise tries a short list of generic container selectors.
+ */
+public class ArXivStrategy extends AbstractCrawlerStrategy {
+    private static final String BASE_URL = "https://arxiv.org";
+    private static final String[] TITLE_PREFIXES = {"Title:", "Title"};
+    private static final String[] AUTHOR_PREFIXES = {"Authors:", "Author:", "Authors", "Author"};
+    private static final String[] ABSTRACT_PREFIXES = {"Abstract:", "Abstract"};
+    private static final String[] FALLBACK_SELECTORS = {
+        "article", ".paper", ".document", "div[role='main']"
+    };
+
+    @Override
+    public String getPlatformName() {
+        return "arXiv";
+    }
+
+    @Override
+    public boolean supportsUrl(String url) {
+        return url != null && url.contains("arxiv.org");
+    }
+
+    /**
+     * Trims {@code text} and removes the first entry of {@code prefixes} that it
+     * starts with, ignoring case. At most one prefix is removed.
+     *
+     * @param text     raw text, may be {@code null}
+     * @param prefixes label prefixes, checked in order
+     * @return the cleaned text, or {@code ""} when {@code text} is {@code null}
+     */
+    private String cleanPrefix(String text, String[] prefixes) {
+        if (text == null) {
+            return "";
+        }
+        String cleaned = text.trim();
+        for (String prefix : prefixes) {
+            if (cleaned.regionMatches(true, 0, prefix, 0, prefix.length())) {
+                // Stop at the first hit: stripping repeatedly would eat real
+                // content such as the second word of "Title: Title IX at 50".
+                return cleaned.substring(prefix.length()).trim();
+            }
+        }
+        return cleaned;
+    }
+
+    /** Returns the element's text, or {@code ""} when the element is {@code null}. */
+    private static String textOf(Element element) {
+        return element != null ? element.text() : "";
+    }
+
+    /** Prepends the arXiv base URL to a non-empty link that is not already absolute. */
+    private static String toAbsoluteUrl(String href) {
+        if (href.isEmpty() || href.startsWith("http")) {
+            return href;
+        }
+        return BASE_URL + href;
+    }
+
+    /**
+     * Fetches {@code url} and extracts up to {@code count} papers from it.
+     *
+     * @param url   page to download
+     * @param count maximum number of papers to collect
+     * @return the collected papers, possibly empty; never {@code null}
+     * @throws Exception propagated from the page download
+     */
+    @Override
+    protected List<Paper> fetchPapers(String url, int count) throws Exception {
+        List<Paper> papers = new ArrayList<>();
+        System.out.println("=== 开始使用arXiv获取论文 ===");
+
+        String html = Utils.sendGetRequest(url);
+        if (html == null || html.isEmpty()) {
+            return papers;
+        }
+
+        Document doc = Jsoup.parse(html);
+        Elements paperElements = doc.select(".arxiv-result");
+
+        if (!paperElements.isEmpty()) {
+            System.out.println("检测到搜索结果页面,找到 " + paperElements.size() + " 篇论文");
+            collectSearchResults(paperElements, count, papers);
+        } else if (doc.selectFirst("h1.title.mathjax") != null) {
+            System.out.println("检测到单个论文页面");
+            collectSinglePaper(doc, url, count, papers);
+        } else {
+            System.out.println("未找到论文元素,尝试其他选择器...");
+            collectFallback(doc, url, count, papers);
+        }
+
+        return papers;
+    }
+
+    /** Adds up to {@code count} papers read from {@code .arxiv-result} entries. */
+    private void collectSearchResults(Elements entries, int count, List<Paper> papers) {
+        for (Element entry : entries) {
+            if (papers.size() >= count) {
+                break;
+            }
+
+            try {
+                String title = cleanPrefix(textOf(entry.selectFirst(".title")), TITLE_PREFIXES);
+                Element linkElement = entry.selectFirst(".list-title a");
+                String link = linkElement != null ? toAbsoluteUrl(linkElement.attr("href")) : "";
+                if (title.length() < 5 || link.isEmpty()) {
+                    continue;
+                }
+
+                String abstractText =
+                    cleanPrefix(textOf(entry.selectFirst(".abstract")), ABSTRACT_PREFIXES);
+                String authors = cleanPrefix(textOf(entry.selectFirst(".authors")), AUTHOR_PREFIXES);
+
+                papers.add(new Paper(title, authors, abstractText, link, getPlatformName()));
+            } catch (Exception e) {
+                // Best effort: one malformed entry must not abort the whole crawl.
+                System.err.println("[WARN] skipped arXiv entry: " + e);
+            }
+        }
+    }
+
+    /** Adds the paper described by a single-paper page, using {@code url} as its link. */
+    private void collectSinglePaper(Document doc, String url, int count, List<Paper> papers) {
+        // Respect the caller's limit here too, and never store a paper without a link.
+        if (count <= 0 || url == null || url.isEmpty()) {
+            return;
+        }
+
+        String title = cleanPrefix(textOf(doc.selectFirst("h1.title.mathjax")), TITLE_PREFIXES);
+        if (title.length() < 5) {
+            return;
+        }
+
+        String authors = cleanPrefix(textOf(doc.selectFirst(".authors")), AUTHOR_PREFIXES);
+        String abstractText =
+            cleanPrefix(textOf(doc.selectFirst(".abstract.mathjax")), ABSTRACT_PREFIXES);
+
+        papers.add(new Paper(title, authors, abstractText, url, getPlatformName()));
+    }
+
+    /** Adds papers from the first fallback selector that matches anything in the page. */
+    private void collectFallback(Document doc, String url, int count, List<Paper> papers) {
+        for (String selector : FALLBACK_SELECTORS) {
+            Elements containers = doc.select(selector);
+            if (containers.isEmpty()) {
+                continue;
+            }
+
+            for (Element container : containers) {
+                if (papers.size() >= count) {
+                    break;
+                }
+
+                String title =
+                    cleanPrefix(textOf(container.selectFirst("h1, h2, .title")), TITLE_PREFIXES);
+                Element anchor = container.selectFirst("a[href*='/abs/']");
+                // Only relative hrefs get the base URL; absolute ones are kept as-is.
+                String link = anchor != null ? toAbsoluteUrl(anchor.attr("href")) : url;
+
+                if (title.length() >= 5 && link != null && !link.isEmpty()) {
+                    papers.add(new Paper(title, "", "", link, getPlatformName()));
+                }
+            }
+            return;
+        }
+    }
+}
diff --git a/w10/ConsoleView.java b/w10/ConsoleView.java
new file mode 100644
index 0000000..1d44d66
--- /dev/null
+++ b/w10/ConsoleView.java
@@ -0,0 +1,76 @@
+package view;
+
+import model.Paper;
+import command.Command;
+import java.util.Scanner;
+
+/**
+ * Console front end: reads commands from standard input and prints messages,
+ * paper summaries and command descriptions to standard output.
+ */
+public class ConsoleView {
+    private static final String UNKNOWN = "未知";
+
+    private final Scanner scanner;
+
+    public ConsoleView() {
+        this.scanner = new Scanner(System.in);
+    }
+
+    public void displayWelcome() {
+        System.out.println("===== 学术论文爬虫程序 =====");
+        System.out.println("输入 help 查看可用命令");
+        System.out.println();
+    }
+
+    /**
+     * Prints the prompt and reads one line from standard input.
+     *
+     * @return the line with surrounding whitespace removed
+     * @throws java.util.NoSuchElementException if standard input has no more lines
+     */
+    public String getInput() {
+        System.out.print("> ");
+        return scanner.nextLine().trim();
+    }
+
+    public void showInfo(String message) {
+        System.out.println("[INFO] " + message);
+    }
+
+    public void showSuccess(String message) {
+        System.out.println("[SUCCESS] " + message);
+    }
+
+    public void showError(String message) {
+        System.out.println("[ERROR] " + message);
+    }
+
+    /**
+     * Prints a numbered summary of {@code paper}; missing fields are shown as "未知".
+     *
+     * @param index number printed in front of the title
+     * @param paper paper to display
+     */
+    public void showPaperInfo(int index, Paper paper) {
+        System.out.println(index + ". " + paper.getTitle());
+        System.out.println(" 作者: " + orUnknown(paper.getAuthors()));
+        System.out.println(" 来源: " + orUnknown(paper.getPlatform()));
+        System.out.println(" URL: " + orUnknown(paper.getUrl()));
+        System.out.println();
+    }
+
+    public void showCommandInfo(Command command) {
+        System.out.println(" " + command.getName() + " - " + command.getDescription());
+    }
+
+    /** Closes the scanner, which also closes standard input. */
+    public void close() {
+        scanner.close();
+    }
+
+    /** Returns {@code value}, or "未知" when it is {@code null} or blank. */
+    private static String orUnknown(String value) {
+        return value == null || value.trim().isEmpty() ? UNKNOWN : value;
+    }
+}
\ No newline at end of file
diff --git a/w10/Main.java b/w10/Main.java
new file mode 100644
index 0000000..39c86ac
--- /dev/null
+++ b/w10/Main.java
@@ -0,0 +1,15 @@
+import controller.CrawlerController;
+import view.ConsoleView;
+import repository.PaperRepository;
+import strategy.StrategyFactory;
+
+/** Entry point: wires the view, repository and strategy factory into the controller. */
+public class Main {
+    public static void main(String[] args) {
+        ConsoleView view = new ConsoleView();
+        PaperRepository repository = new PaperRepository();
+        StrategyFactory strategyFactory = new StrategyFactory();
+        CrawlerController controller = new CrawlerController(view, repository, strategyFactory);
+        controller.run();
+    }
+}
\ No newline at end of file