5 changed files with 322 additions and 0 deletions
@ -0,0 +1,82 @@ |
|||||
|
package strategy; |
||||
|
|
||||
|
import model.Paper; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
import utils.Utils; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class CNKIStrategy extends AbstractCrawlerStrategy { |
||||
|
@Override |
||||
|
public String getPlatformName() { |
||||
|
return "中国知网 (CNKI)"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public boolean supportsUrl(String url) { |
||||
|
return url != null && (url.contains("cnki.net") || url.contains("cnki.cn")); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected List<Paper> fetchPapers(String url, int count) throws Exception { |
||||
|
List<Paper> papers = new ArrayList<>(); |
||||
|
System.out.println("=== 开始使用中国知网获取论文 ==="); |
||||
|
|
||||
|
String html = Utils.sendGetRequest(url); |
||||
|
if (html.isEmpty()) return papers; |
||||
|
|
||||
|
Document doc = Utils.parseHtml(html); |
||||
|
|
||||
|
String[] selectors = {".list-item", ".article-item", ".result-item", "tr[class*='item']", "div[class*='result']", "li[class*='result']"}; |
||||
|
|
||||
|
Elements paperElements = null; |
||||
|
for (String selector : selectors) { |
||||
|
paperElements = doc.select(selector); |
||||
|
if (paperElements.size() > 0) { |
||||
|
break; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
if (paperElements != null && paperElements.size() > 0) { |
||||
|
int collected = 0; |
||||
|
for (Element element : paperElements) { |
||||
|
if (collected >= count) break; |
||||
|
|
||||
|
try { |
||||
|
Element titleElement = element.select("a").first(); |
||||
|
if (titleElement == null) continue; |
||||
|
|
||||
|
String title = titleElement.text(); |
||||
|
String paperUrl = titleElement.attr("href"); |
||||
|
|
||||
|
if (title.length() < 10 || paperUrl.isEmpty()) continue; |
||||
|
|
||||
|
if (!paperUrl.startsWith("http")) { |
||||
|
paperUrl = "https://kns.cnki.net" + paperUrl; |
||||
|
} |
||||
|
|
||||
|
String authors = ""; |
||||
|
Elements authorElements = element.select(".author"); |
||||
|
if (!authorElements.isEmpty()) { |
||||
|
authors = authorElements.first().text(); |
||||
|
} |
||||
|
|
||||
|
String abstractText = ""; |
||||
|
Elements abstractElements = element.select(".abstract"); |
||||
|
if (!abstractElements.isEmpty()) { |
||||
|
abstractText = abstractElements.first().text(); |
||||
|
} |
||||
|
|
||||
|
papers.add(new Paper(title, authors, abstractText, paperUrl, getPlatformName())); |
||||
|
collected++; |
||||
|
} catch (Exception e) { |
||||
|
continue; |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return papers; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,10 @@ |
|||||
|
package strategy; |
||||
|
|
||||
|
import model.Paper; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public interface CrawlerStrategy { |
||||
|
String getPlatformName(); |
||||
|
List<Paper> crawl(String url, int count) throws Exception; |
||||
|
boolean supportsUrl(String url); |
||||
|
} |
||||
@ -0,0 +1,64 @@ |
|||||
|
package strategy; |
||||
|
|
||||
|
import model.Paper; |
||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
import utils.Utils; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class IEEEStrategy extends AbstractCrawlerStrategy { |
||||
|
@Override |
||||
|
public String getPlatformName() { |
||||
|
return "IEEE Xplore"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public boolean supportsUrl(String url) { |
||||
|
return url != null && url.contains("ieeexplore.ieee.org"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected List<Paper> fetchPapers(String url, int count) throws Exception { |
||||
|
List<Paper> papers = new ArrayList<>(); |
||||
|
System.out.println("=== 开始使用IEEE Xplore获取论文 ==="); |
||||
|
|
||||
|
addDelay(2000, 3000); |
||||
|
|
||||
|
String html = Utils.sendGetRequest(url); |
||||
|
if (html.isEmpty()) return papers; |
||||
|
|
||||
|
Document doc = Jsoup.parse(html); |
||||
|
|
||||
|
Elements paperElements = doc.select(".List-results-items li"); |
||||
|
|
||||
|
int collected = 0; |
||||
|
for (Element element : paperElements) { |
||||
|
if (collected >= count) break; |
||||
|
|
||||
|
try { |
||||
|
Element titleElement = element.selectFirst("h2 a"); |
||||
|
String title = titleElement != null ? titleElement.text() : ""; |
||||
|
|
||||
|
String paperUrl = titleElement != null ? titleElement.attr("href") : ""; |
||||
|
if (!paperUrl.startsWith("http")) { |
||||
|
paperUrl = "https://ieeexplore.ieee.org" + paperUrl; |
||||
|
} |
||||
|
|
||||
|
Element authorsElement = element.selectFirst(".authors"); |
||||
|
String authors = authorsElement != null ? authorsElement.text() : ""; |
||||
|
|
||||
|
if (title.length() < 5 || paperUrl.isEmpty()) continue; |
||||
|
|
||||
|
papers.add(new Paper(title, authors, "", paperUrl, getPlatformName())); |
||||
|
collected++; |
||||
|
} catch (Exception e) { |
||||
|
continue; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return papers; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,64 @@ |
|||||
|
package strategy; |
||||
|
|
||||
|
import model.Paper; |
||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
import utils.Utils; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class ScienceDirectStrategy extends AbstractCrawlerStrategy { |
||||
|
@Override |
||||
|
public String getPlatformName() { |
||||
|
return "ScienceDirect"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public boolean supportsUrl(String url) { |
||||
|
return url != null && url.contains("sciencedirect.com"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected List<Paper> fetchPapers(String url, int count) throws Exception { |
||||
|
List<Paper> papers = new ArrayList<>(); |
||||
|
System.out.println("=== 开始使用ScienceDirect获取论文 ==="); |
||||
|
|
||||
|
addDelay(2000, 3000); |
||||
|
|
||||
|
String html = Utils.sendGetRequest(url); |
||||
|
if (html.isEmpty()) return papers; |
||||
|
|
||||
|
Document doc = Jsoup.parse(html); |
||||
|
|
||||
|
Elements paperElements = doc.select(".result-item-content"); |
||||
|
|
||||
|
int collected = 0; |
||||
|
for (Element element : paperElements) { |
||||
|
if (collected >= count) break; |
||||
|
|
||||
|
try { |
||||
|
Element titleElement = element.selectFirst("h2 a"); |
||||
|
String title = titleElement != null ? titleElement.text() : ""; |
||||
|
|
||||
|
String paperUrl = titleElement != null ? titleElement.attr("href") : ""; |
||||
|
if (!paperUrl.startsWith("http")) { |
||||
|
paperUrl = "https://www.sciencedirect.com" + paperUrl; |
||||
|
} |
||||
|
|
||||
|
Element authorsElement = element.selectFirst(".author-group"); |
||||
|
String authors = authorsElement != null ? authorsElement.text() : ""; |
||||
|
|
||||
|
if (title.length() < 5 || paperUrl.isEmpty()) continue; |
||||
|
|
||||
|
papers.add(new Paper(title, authors, "", paperUrl, getPlatformName())); |
||||
|
collected++; |
||||
|
} catch (Exception e) { |
||||
|
continue; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return papers; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,102 @@ |
|||||
|
package strategy; |
||||
|
|
||||
|
import model.Paper; |
||||
|
import com.fasterxml.jackson.databind.ObjectMapper; |
||||
|
import utils.Utils; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class SemanticScholarStrategy extends AbstractCrawlerStrategy { |
||||
|
@Override |
||||
|
public String getPlatformName() { |
||||
|
return "Semantic Scholar"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public boolean supportsUrl(String url) { |
||||
|
return url != null && url.contains("semanticscholar.org"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected List<Paper> fetchPapers(String url, int count) throws Exception { |
||||
|
List<Paper> papers = new ArrayList<>(); |
||||
|
System.out.println("=== 开始使用Semantic Scholar获取论文 ==="); |
||||
|
|
||||
|
addDelay(1000, 1500); |
||||
|
|
||||
|
try { |
||||
|
String response = Utils.sendGetRequest(url); |
||||
|
if (response.isEmpty()) return papers; |
||||
|
|
||||
|
ObjectMapper objectMapper = new ObjectMapper(); |
||||
|
SemanticScholarResponse apiResponse = objectMapper.readValue(response, SemanticScholarResponse.class); |
||||
|
|
||||
|
if (apiResponse != null && apiResponse.getItems() != null) { |
||||
|
int collected = 0; |
||||
|
for (SemanticScholarPaper apiPaper : apiResponse.getItems()) { |
||||
|
if (collected >= count) break; |
||||
|
|
||||
|
String title = apiPaper.getTitle(); |
||||
|
String paperUrl = apiPaper.getUrl(); |
||||
|
String abstractText = apiPaper.getAbstractText(); |
||||
|
|
||||
|
StringBuilder authorsBuilder = new StringBuilder(); |
||||
|
if (apiPaper.getAuthors() != null) { |
||||
|
for (SemanticScholarAuthor author : apiPaper.getAuthors()) { |
||||
|
if (author != null && author.getName() != null) { |
||||
|
if (authorsBuilder.length() > 0) { |
||||
|
authorsBuilder.append(", "); |
||||
|
} |
||||
|
authorsBuilder.append(author.getName()); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
String authors = authorsBuilder.toString(); |
||||
|
|
||||
|
if (title == null || title.length() < 5 || paperUrl == null || paperUrl.isEmpty()) continue; |
||||
|
|
||||
|
papers.add(new Paper(title, authors, abstractText != null ? abstractText : "", paperUrl, getPlatformName())); |
||||
|
collected++; |
||||
|
} |
||||
|
} |
||||
|
} catch (Exception e) { |
||||
|
System.out.println("Semantic Scholar解析失败: " + e.getMessage()); |
||||
|
} |
||||
|
|
||||
|
return papers; |
||||
|
} |
||||
|
|
||||
|
private static class SemanticScholarResponse { |
||||
|
private List<SemanticScholarPaper> items; |
||||
|
public List<SemanticScholarPaper> getItems() { return items; } |
||||
|
@SuppressWarnings("unused") |
||||
|
public void setItems(List<SemanticScholarPaper> items) { this.items = items; } |
||||
|
} |
||||
|
|
||||
|
private static class SemanticScholarPaper { |
||||
|
private String title; |
||||
|
private String url; |
||||
|
private List<SemanticScholarAuthor> authors; |
||||
|
private String abstractText; |
||||
|
|
||||
|
public String getTitle() { return title; } |
||||
|
@SuppressWarnings("unused") |
||||
|
public void setTitle(String title) { this.title = title; } |
||||
|
public String getUrl() { return url; } |
||||
|
@SuppressWarnings("unused") |
||||
|
public void setUrl(String url) { this.url = url; } |
||||
|
public List<SemanticScholarAuthor> getAuthors() { return authors; } |
||||
|
@SuppressWarnings("unused") |
||||
|
public void setAuthors(List<SemanticScholarAuthor> authors) { this.authors = authors; } |
||||
|
public String getAbstractText() { return abstractText; } |
||||
|
@SuppressWarnings("unused") |
||||
|
public void setAbstractText(String abstractText) { this.abstractText = abstractText; } |
||||
|
} |
||||
|
|
||||
|
private static class SemanticScholarAuthor { |
||||
|
private String name; |
||||
|
public String getName() { return name; } |
||||
|
@SuppressWarnings("unused") |
||||
|
public void setName(String name) { this.name = name; } |
||||
|
} |
||||
|
} |
||||
Loading…
Reference in new issue