From 765f8861e9e52d5c42617f94ca313d25b014cdcf Mon Sep 17 00:00:00 2001
From: Zhengjie <2044415419@qq.com>
Date: Thu, 14 May 2026 14:39:27 +0800
Subject: [PATCH] =?UTF-8?q?=E4=B8=8A=E4=BC=A0=E6=96=87=E4=BB=B6=E8=87=B3?=
 =?UTF-8?q?=20'w10'?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 w10/CNKIStrategy.java            | 112 ++++++++++++++++++++++++
 w10/CrawlerStrategy.java         |  25 ++++++
 w10/IEEEStrategy.java            |  96 ++++++++++++++++++++
 w10/ScienceDirectStrategy.java   |  96 ++++++++++++++++++++
 w10/SemanticScholarStrategy.java | 146 +++++++++++++++++++++++++++++++
 5 files changed, 475 insertions(+)
 create mode 100644 w10/CNKIStrategy.java
 create mode 100644 w10/CrawlerStrategy.java
 create mode 100644 w10/IEEEStrategy.java
 create mode 100644 w10/ScienceDirectStrategy.java
 create mode 100644 w10/SemanticScholarStrategy.java

diff --git a/w10/CNKIStrategy.java b/w10/CNKIStrategy.java
new file mode 100644
index 0000000..95df621
--- /dev/null
+++ b/w10/CNKIStrategy.java
@@ -0,0 +1,112 @@
+package strategy;
+
+import model.Paper;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import utils.Utils;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Crawler strategy that scrapes paper entries from a CNKI page.
+ */
+public class CNKIStrategy extends AbstractCrawlerStrategy {
+    /** Origin prepended to links that are not already absolute. */
+    private static final String BASE_URL = "https://kns.cnki.net";
+
+    /** Candidate selectors for one result entry, tried in order until one matches. */
+    private static final String[] ITEM_SELECTORS = {
+        ".list-item", ".article-item", ".result-item",
+        "tr[class*='item']", "div[class*='result']", "li[class*='result']"
+    };
+
+    /** Entries whose link text is shorter than this are skipped. */
+    private static final int MIN_TITLE_LENGTH = 10;
+
+    /** Returns the platform name passed to every paper this strategy creates. */
+    @Override
+    public String getPlatformName() {
+        return "中国知网 (CNKI)";
+    }
+
+    /** Returns {@code true} when {@code url} is non-null and mentions a CNKI domain. */
+    @Override
+    public boolean supportsUrl(String url) {
+        return url != null && (url.contains("cnki.net") || url.contains("cnki.cn"));
+    }
+
+    /**
+     * Fetches {@code url} and extracts at most {@code count} papers from it.
+     *
+     * @param url   address of the page to download
+     * @param count maximum number of papers to collect
+     * @return the collected papers; empty when the response is empty or nothing matches
+     * @throws Exception if the download or parse helper throws
+     */
+    @Override
+    protected List<Paper> fetchPapers(String url, int count) throws Exception {
+        List<Paper> papers = new ArrayList<>();
+        System.out.println("=== 开始使用中国知网获取论文 ===");
+
+        String html = Utils.sendGetRequest(url);
+        if (html == null || html.isEmpty()) {
+            return papers;
+        }
+
+        Document doc = Utils.parseHtml(html);
+        for (Element element : selectPaperElements(doc)) {
+            if (papers.size() >= count) {
+                break;
+            }
+            try {
+                // The first link inside an entry supplies both the title and the target.
+                Element titleElement = element.selectFirst("a");
+                if (titleElement == null) {
+                    continue;
+                }
+
+                String title = titleElement.text();
+                String href = titleElement.attr("href");
+                if (title.length() < MIN_TITLE_LENGTH || href.isEmpty()) {
+                    continue;
+                }
+
+                Element authorElement = element.selectFirst(".author");
+                String authors = authorElement != null ? authorElement.text() : "";
+
+                Element abstractElement = element.selectFirst(".abstract");
+                String abstractText = abstractElement != null ? abstractElement.text() : "";
+
+                papers.add(new Paper(title, authors, abstractText,
+                        toAbsoluteUrl(href), getPlatformName()));
+            } catch (Exception ignored) {
+                // Best-effort scraping: a malformed entry is skipped, not fatal.
+            }
+        }
+
+        return papers;
+    }
+
+    /** Returns the matches of the first selector that finds anything, else an empty set. */
+    private static Elements selectPaperElements(Document doc) {
+        for (String selector : ITEM_SELECTORS) {
+            Elements matches = doc.select(selector);
+            if (!matches.isEmpty()) {
+                return matches;
+            }
+        }
+        return new Elements();
+    }
+
+    /** Turns a protocol-relative or site-relative link into an absolute URL. */
+    private static String toAbsoluteUrl(String href) {
+        if (href.startsWith("http")) {
+            return href;
+        }
+        if (href.startsWith("//")) {
+            return "https:" + href;
+        }
+        return href.startsWith("/") ? BASE_URL + href : BASE_URL + "/" + href;
+    }
+}
\ No newline at end of file
diff --git a/w10/CrawlerStrategy.java b/w10/CrawlerStrategy.java
new file mode 100644
index 0000000..0e226ba
--- /dev/null
+++ b/w10/CrawlerStrategy.java
@@ -0,0 +1,25 @@
+package strategy;
+
+import model.Paper;
+import java.util.List;
+
+/**
+ * Contract for a platform-specific paper crawler.
+ */
+public interface CrawlerStrategy {
+    /** Returns the name of the platform this strategy handles. */
+    String getPlatformName();
+
+    /**
+     * Collects papers from the given address.
+     *
+     * @param url   address to crawl
+     * @param count number of papers requested
+     * @return the papers that were collected
+     * @throws Exception if crawling fails
+     */
+    List<Paper> crawl(String url, int count) throws Exception;
+
+    /** Reports whether this strategy can handle {@code url}. */
+    boolean supportsUrl(String url);
+}
\ No newline at end of file
diff --git a/w10/IEEEStrategy.java b/w10/IEEEStrategy.java
new file mode 100644
index 0000000..d4817b1
--- /dev/null
+++ b/w10/IEEEStrategy.java
@@ -0,0 +1,96 @@
+package strategy;
+
+import model.Paper;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import utils.Utils;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Crawler strategy that scrapes paper entries from an IEEE Xplore page.
+ */
+public class IEEEStrategy extends AbstractCrawlerStrategy {
+    /** Origin prepended to links that are not already absolute. */
+    private static final String BASE_URL = "https://ieeexplore.ieee.org";
+
+    /** Entries whose title is shorter than this are skipped. */
+    private static final int MIN_TITLE_LENGTH = 5;
+
+    /** Returns the platform name passed to every paper this strategy creates. */
+    @Override
+    public String getPlatformName() {
+        return "IEEE Xplore";
+    }
+
+    /** Returns {@code true} when {@code url} is non-null and mentions the IEEE Xplore host. */
+    @Override
+    public boolean supportsUrl(String url) {
+        return url != null && url.contains("ieeexplore.ieee.org");
+    }
+
+    /**
+     * Fetches {@code url} and extracts at most {@code count} papers from it.
+     *
+     * @param url   address of the page to download
+     * @param count maximum number of papers to collect
+     * @return the collected papers; empty when the response is empty or nothing matches
+     * @throws Exception if the delay, download, or parse step throws
+     */
+    @Override
+    protected List<Paper> fetchPapers(String url, int count) throws Exception {
+        List<Paper> papers = new ArrayList<>();
+        System.out.println("=== 开始使用IEEE Xplore获取论文 ===");
+
+        addDelay(2000, 3000);
+
+        String html = Utils.sendGetRequest(url);
+        if (html == null || html.isEmpty()) {
+            return papers;
+        }
+
+        Document doc = Jsoup.parse(html);
+        Elements paperElements = doc.select(".List-results-items li");
+
+        for (Element element : paperElements) {
+            if (papers.size() >= count) {
+                break;
+            }
+            try {
+                Element titleElement = element.selectFirst("h2 a");
+                if (titleElement == null) {
+                    continue;
+                }
+
+                String title = titleElement.text();
+                String href = titleElement.attr("href");
+                // Validate the raw link: once the origin is prepended it is never empty.
+                if (title.length() < MIN_TITLE_LENGTH || href.isEmpty()) {
+                    continue;
+                }
+
+                Element authorsElement = element.selectFirst(".authors");
+                String authors = authorsElement != null ? authorsElement.text() : "";
+
+                papers.add(new Paper(title, authors, "", toAbsoluteUrl(href), getPlatformName()));
+            } catch (Exception ignored) {
+                // Best-effort scraping: a malformed entry is skipped, not fatal.
+            }
+        }
+
+        return papers;
+    }
+
+    /** Turns a protocol-relative or site-relative link into an absolute URL. */
+    private static String toAbsoluteUrl(String href) {
+        if (href.startsWith("http")) {
+            return href;
+        }
+        if (href.startsWith("//")) {
+            return "https:" + href;
+        }
+        return href.startsWith("/") ? BASE_URL + href : BASE_URL + "/" + href;
+    }
+}
\ No newline at end of file
diff --git a/w10/ScienceDirectStrategy.java b/w10/ScienceDirectStrategy.java
new file mode 100644
index 0000000..a9e2bc7
--- /dev/null
+++ b/w10/ScienceDirectStrategy.java
@@ -0,0 +1,96 @@
+package strategy;
+
+import model.Paper;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import utils.Utils;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Crawler strategy that scrapes paper entries from a ScienceDirect page.
+ */
+public class ScienceDirectStrategy extends AbstractCrawlerStrategy {
+    /** Origin prepended to links that are not already absolute. */
+    private static final String BASE_URL = "https://www.sciencedirect.com";
+
+    /** Entries whose title is shorter than this are skipped. */
+    private static final int MIN_TITLE_LENGTH = 5;
+
+    /** Returns the platform name passed to every paper this strategy creates. */
+    @Override
+    public String getPlatformName() {
+        return "ScienceDirect";
+    }
+
+    /** Returns {@code true} when {@code url} is non-null and mentions the ScienceDirect host. */
+    @Override
+    public boolean supportsUrl(String url) {
+        return url != null && url.contains("sciencedirect.com");
+    }
+
+    /**
+     * Fetches {@code url} and extracts at most {@code count} papers from it.
+     *
+     * @param url   address of the page to download
+     * @param count maximum number of papers to collect
+     * @return the collected papers; empty when the response is empty or nothing matches
+     * @throws Exception if the delay, download, or parse step throws
+     */
+    @Override
+    protected List<Paper> fetchPapers(String url, int count) throws Exception {
+        List<Paper> papers = new ArrayList<>();
+        System.out.println("=== 开始使用ScienceDirect获取论文 ===");
+
+        addDelay(2000, 3000);
+
+        String html = Utils.sendGetRequest(url);
+        if (html == null || html.isEmpty()) {
+            return papers;
+        }
+
+        Document doc = Jsoup.parse(html);
+        Elements paperElements = doc.select(".result-item-content");
+
+        for (Element element : paperElements) {
+            if (papers.size() >= count) {
+                break;
+            }
+            try {
+                Element titleElement = element.selectFirst("h2 a");
+                if (titleElement == null) {
+                    continue;
+                }
+
+                String title = titleElement.text();
+                String href = titleElement.attr("href");
+                // Validate the raw link: once the origin is prepended it is never empty.
+                if (title.length() < MIN_TITLE_LENGTH || href.isEmpty()) {
+                    continue;
+                }
+
+                Element authorsElement = element.selectFirst(".author-group");
+                String authors = authorsElement != null ? authorsElement.text() : "";
+
+                papers.add(new Paper(title, authors, "", toAbsoluteUrl(href), getPlatformName()));
+            } catch (Exception ignored) {
+                // Best-effort scraping: a malformed entry is skipped, not fatal.
+            }
+        }
+
+        return papers;
+    }
+
+    /** Turns a protocol-relative or site-relative link into an absolute URL. */
+    private static String toAbsoluteUrl(String href) {
+        if (href.startsWith("http")) {
+            return href;
+        }
+        if (href.startsWith("//")) {
+            return "https:" + href;
+        }
+        return href.startsWith("/") ? BASE_URL + href : BASE_URL + "/" + href;
+    }
+}
\ No newline at end of file
diff --git a/w10/SemanticScholarStrategy.java b/w10/SemanticScholarStrategy.java
new file mode 100644
index 0000000..41f5744
--- /dev/null
+++ b/w10/SemanticScholarStrategy.java
@@ -0,0 +1,146 @@
+package strategy;
+
+import model.Paper;
+import com.fasterxml.jackson.databind.DeserializationFeature;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import utils.Utils;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Crawler strategy that reads papers from a Semantic Scholar JSON response.
+ */
+public class SemanticScholarStrategy extends AbstractCrawlerStrategy {
+    /**
+     * Shared JSON mapper, built once instead of per request. Unknown properties are
+     * ignored because the DTOs below model only the fields this class reads.
+     */
+    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
+            .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
+
+    /** Papers whose title is shorter than this are skipped. */
+    private static final int MIN_TITLE_LENGTH = 5;
+
+    /** Returns the platform name passed to every paper this strategy creates. */
+    @Override
+    public String getPlatformName() {
+        return "Semantic Scholar";
+    }
+
+    /** Returns {@code true} when {@code url} is non-null and mentions the Semantic Scholar host. */
+    @Override
+    public boolean supportsUrl(String url) {
+        return url != null && url.contains("semanticscholar.org");
+    }
+
+    /**
+     * Fetches {@code url}, parses the JSON body, and converts at most {@code count} papers.
+     *
+     * <p>Failures while downloading or parsing are reported on standard output and
+     * yield whatever was collected so far instead of propagating.
+     *
+     * @param url   address of the JSON endpoint to query
+     * @param count maximum number of papers to collect
+     * @return the collected papers; possibly empty
+     * @throws Exception if the delay step throws
+     */
+    @Override
+    protected List<Paper> fetchPapers(String url, int count) throws Exception {
+        List<Paper> papers = new ArrayList<>();
+        System.out.println("=== 开始使用Semantic Scholar获取论文 ===");
+
+        addDelay(1000, 1500);
+
+        try {
+            String response = Utils.sendGetRequest(url);
+            if (response == null || response.isEmpty()) {
+                return papers;
+            }
+
+            // NOTE(review): the DTO property names (items, abstractText) must match the
+            // JSON keys the endpoint returns - confirm against a real response.
+            SemanticScholarResponse apiResponse =
+                    OBJECT_MAPPER.readValue(response, SemanticScholarResponse.class);
+            if (apiResponse == null || apiResponse.getItems() == null) {
+                return papers;
+            }
+
+            for (SemanticScholarPaper apiPaper : apiResponse.getItems()) {
+                if (papers.size() >= count) {
+                    break;
+                }
+                if (apiPaper == null) {
+                    continue;
+                }
+
+                String title = apiPaper.getTitle();
+                String paperUrl = apiPaper.getUrl();
+                if (title == null || title.length() < MIN_TITLE_LENGTH
+                        || paperUrl == null || paperUrl.isEmpty()) {
+                    continue;
+                }
+
+                String abstractText = apiPaper.getAbstractText();
+                papers.add(new Paper(title, joinAuthorNames(apiPaper.getAuthors()),
+                        abstractText != null ? abstractText : "", paperUrl, getPlatformName()));
+            }
+        } catch (Exception e) {
+            System.out.println("Semantic Scholar解析失败: " + e.getMessage());
+        }
+
+        return papers;
+    }
+
+    /** Joins the non-null author names with {@code ", "}; returns "" for a null list. */
+    private static String joinAuthorNames(List<SemanticScholarAuthor> authors) {
+        StringBuilder joined = new StringBuilder();
+        if (authors != null) {
+            for (SemanticScholarAuthor author : authors) {
+                if (author != null && author.getName() != null) {
+                    if (joined.length() > 0) {
+                        joined.append(", ");
+                    }
+                    joined.append(author.getName());
+                }
+            }
+        }
+        return joined.toString();
+    }
+
+    /** Top-level JSON payload: a list of papers under {@code items}. */
+    private static class SemanticScholarResponse {
+        private List<SemanticScholarPaper> items;
+        public List<SemanticScholarPaper> getItems() { return items; }
+        @SuppressWarnings("unused")
+        public void setItems(List<SemanticScholarPaper> items) { this.items = items; }
+    }
+
+    /** One paper entry of the payload. */
+    private static class SemanticScholarPaper {
+        private String title;
+        private String url;
+        private List<SemanticScholarAuthor> authors;
+        private String abstractText;
+
+        public String getTitle() { return title; }
+        @SuppressWarnings("unused")
+        public void setTitle(String title) { this.title = title; }
+        public String getUrl() { return url; }
+        @SuppressWarnings("unused")
+        public void setUrl(String url) { this.url = url; }
+        public List<SemanticScholarAuthor> getAuthors() { return authors; }
+        @SuppressWarnings("unused")
+        public void setAuthors(List<SemanticScholarAuthor> authors) { this.authors = authors; }
+        public String getAbstractText() { return abstractText; }
+        @SuppressWarnings("unused")
+        public void setAbstractText(String abstractText) { this.abstractText = abstractText; }
+    }
+
+    /** One author entry of a paper. */
+    private static class SemanticScholarAuthor {
+        private String name;
+        public String getName() { return name; }
+        @SuppressWarnings("unused")
+        public void setName(String name) { this.name = name; }
+    }
+}
\ No newline at end of file