5 changed files with 322 additions and 0 deletions
@ -0,0 +1,82 @@ |
|||
package strategy; |
|||
|
|||
import model.Paper; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
import utils.Utils; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class CNKIStrategy extends AbstractCrawlerStrategy { |
|||
@Override |
|||
public String getPlatformName() { |
|||
return "中国知网 (CNKI)"; |
|||
} |
|||
|
|||
@Override |
|||
public boolean supportsUrl(String url) { |
|||
return url != null && (url.contains("cnki.net") || url.contains("cnki.cn")); |
|||
} |
|||
|
|||
@Override |
|||
protected List<Paper> fetchPapers(String url, int count) throws Exception { |
|||
List<Paper> papers = new ArrayList<>(); |
|||
System.out.println("=== 开始使用中国知网获取论文 ==="); |
|||
|
|||
String html = Utils.sendGetRequest(url); |
|||
if (html.isEmpty()) return papers; |
|||
|
|||
Document doc = Utils.parseHtml(html); |
|||
|
|||
String[] selectors = {".list-item", ".article-item", ".result-item", "tr[class*='item']", "div[class*='result']", "li[class*='result']"}; |
|||
|
|||
Elements paperElements = null; |
|||
for (String selector : selectors) { |
|||
paperElements = doc.select(selector); |
|||
if (paperElements.size() > 0) { |
|||
break; |
|||
} |
|||
} |
|||
|
|||
if (paperElements != null && paperElements.size() > 0) { |
|||
int collected = 0; |
|||
for (Element element : paperElements) { |
|||
if (collected >= count) break; |
|||
|
|||
try { |
|||
Element titleElement = element.select("a").first(); |
|||
if (titleElement == null) continue; |
|||
|
|||
String title = titleElement.text(); |
|||
String paperUrl = titleElement.attr("href"); |
|||
|
|||
if (title.length() < 10 || paperUrl.isEmpty()) continue; |
|||
|
|||
if (!paperUrl.startsWith("http")) { |
|||
paperUrl = "https://kns.cnki.net" + paperUrl; |
|||
} |
|||
|
|||
String authors = ""; |
|||
Elements authorElements = element.select(".author"); |
|||
if (!authorElements.isEmpty()) { |
|||
authors = authorElements.first().text(); |
|||
} |
|||
|
|||
String abstractText = ""; |
|||
Elements abstractElements = element.select(".abstract"); |
|||
if (!abstractElements.isEmpty()) { |
|||
abstractText = abstractElements.first().text(); |
|||
} |
|||
|
|||
papers.add(new Paper(title, authors, abstractText, paperUrl, getPlatformName())); |
|||
collected++; |
|||
} catch (Exception e) { |
|||
continue; |
|||
} |
|||
} |
|||
} |
|||
|
|||
return papers; |
|||
} |
|||
} |
|||
@ -0,0 +1,10 @@ |
|||
package strategy; |
|||
|
|||
import model.Paper; |
|||
import java.util.List; |
|||
|
|||
public interface CrawlerStrategy { |
|||
String getPlatformName(); |
|||
List<Paper> crawl(String url, int count) throws Exception; |
|||
boolean supportsUrl(String url); |
|||
} |
|||
@ -0,0 +1,64 @@ |
|||
package strategy; |
|||
|
|||
import model.Paper; |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
import utils.Utils; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class IEEEStrategy extends AbstractCrawlerStrategy { |
|||
@Override |
|||
public String getPlatformName() { |
|||
return "IEEE Xplore"; |
|||
} |
|||
|
|||
@Override |
|||
public boolean supportsUrl(String url) { |
|||
return url != null && url.contains("ieeexplore.ieee.org"); |
|||
} |
|||
|
|||
@Override |
|||
protected List<Paper> fetchPapers(String url, int count) throws Exception { |
|||
List<Paper> papers = new ArrayList<>(); |
|||
System.out.println("=== 开始使用IEEE Xplore获取论文 ==="); |
|||
|
|||
addDelay(2000, 3000); |
|||
|
|||
String html = Utils.sendGetRequest(url); |
|||
if (html.isEmpty()) return papers; |
|||
|
|||
Document doc = Jsoup.parse(html); |
|||
|
|||
Elements paperElements = doc.select(".List-results-items li"); |
|||
|
|||
int collected = 0; |
|||
for (Element element : paperElements) { |
|||
if (collected >= count) break; |
|||
|
|||
try { |
|||
Element titleElement = element.selectFirst("h2 a"); |
|||
String title = titleElement != null ? titleElement.text() : ""; |
|||
|
|||
String paperUrl = titleElement != null ? titleElement.attr("href") : ""; |
|||
if (!paperUrl.startsWith("http")) { |
|||
paperUrl = "https://ieeexplore.ieee.org" + paperUrl; |
|||
} |
|||
|
|||
Element authorsElement = element.selectFirst(".authors"); |
|||
String authors = authorsElement != null ? authorsElement.text() : ""; |
|||
|
|||
if (title.length() < 5 || paperUrl.isEmpty()) continue; |
|||
|
|||
papers.add(new Paper(title, authors, "", paperUrl, getPlatformName())); |
|||
collected++; |
|||
} catch (Exception e) { |
|||
continue; |
|||
} |
|||
} |
|||
|
|||
return papers; |
|||
} |
|||
} |
|||
@ -0,0 +1,64 @@ |
|||
package strategy; |
|||
|
|||
import model.Paper; |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
import utils.Utils; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class ScienceDirectStrategy extends AbstractCrawlerStrategy { |
|||
@Override |
|||
public String getPlatformName() { |
|||
return "ScienceDirect"; |
|||
} |
|||
|
|||
@Override |
|||
public boolean supportsUrl(String url) { |
|||
return url != null && url.contains("sciencedirect.com"); |
|||
} |
|||
|
|||
@Override |
|||
protected List<Paper> fetchPapers(String url, int count) throws Exception { |
|||
List<Paper> papers = new ArrayList<>(); |
|||
System.out.println("=== 开始使用ScienceDirect获取论文 ==="); |
|||
|
|||
addDelay(2000, 3000); |
|||
|
|||
String html = Utils.sendGetRequest(url); |
|||
if (html.isEmpty()) return papers; |
|||
|
|||
Document doc = Jsoup.parse(html); |
|||
|
|||
Elements paperElements = doc.select(".result-item-content"); |
|||
|
|||
int collected = 0; |
|||
for (Element element : paperElements) { |
|||
if (collected >= count) break; |
|||
|
|||
try { |
|||
Element titleElement = element.selectFirst("h2 a"); |
|||
String title = titleElement != null ? titleElement.text() : ""; |
|||
|
|||
String paperUrl = titleElement != null ? titleElement.attr("href") : ""; |
|||
if (!paperUrl.startsWith("http")) { |
|||
paperUrl = "https://www.sciencedirect.com" + paperUrl; |
|||
} |
|||
|
|||
Element authorsElement = element.selectFirst(".author-group"); |
|||
String authors = authorsElement != null ? authorsElement.text() : ""; |
|||
|
|||
if (title.length() < 5 || paperUrl.isEmpty()) continue; |
|||
|
|||
papers.add(new Paper(title, authors, "", paperUrl, getPlatformName())); |
|||
collected++; |
|||
} catch (Exception e) { |
|||
continue; |
|||
} |
|||
} |
|||
|
|||
return papers; |
|||
} |
|||
} |
|||
@ -0,0 +1,102 @@ |
|||
package strategy; |
|||
|
|||
import model.Paper; |
|||
import com.fasterxml.jackson.databind.ObjectMapper; |
|||
import utils.Utils; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class SemanticScholarStrategy extends AbstractCrawlerStrategy { |
|||
@Override |
|||
public String getPlatformName() { |
|||
return "Semantic Scholar"; |
|||
} |
|||
|
|||
@Override |
|||
public boolean supportsUrl(String url) { |
|||
return url != null && url.contains("semanticscholar.org"); |
|||
} |
|||
|
|||
@Override |
|||
protected List<Paper> fetchPapers(String url, int count) throws Exception { |
|||
List<Paper> papers = new ArrayList<>(); |
|||
System.out.println("=== 开始使用Semantic Scholar获取论文 ==="); |
|||
|
|||
addDelay(1000, 1500); |
|||
|
|||
try { |
|||
String response = Utils.sendGetRequest(url); |
|||
if (response.isEmpty()) return papers; |
|||
|
|||
ObjectMapper objectMapper = new ObjectMapper(); |
|||
SemanticScholarResponse apiResponse = objectMapper.readValue(response, SemanticScholarResponse.class); |
|||
|
|||
if (apiResponse != null && apiResponse.getItems() != null) { |
|||
int collected = 0; |
|||
for (SemanticScholarPaper apiPaper : apiResponse.getItems()) { |
|||
if (collected >= count) break; |
|||
|
|||
String title = apiPaper.getTitle(); |
|||
String paperUrl = apiPaper.getUrl(); |
|||
String abstractText = apiPaper.getAbstractText(); |
|||
|
|||
StringBuilder authorsBuilder = new StringBuilder(); |
|||
if (apiPaper.getAuthors() != null) { |
|||
for (SemanticScholarAuthor author : apiPaper.getAuthors()) { |
|||
if (author != null && author.getName() != null) { |
|||
if (authorsBuilder.length() > 0) { |
|||
authorsBuilder.append(", "); |
|||
} |
|||
authorsBuilder.append(author.getName()); |
|||
} |
|||
} |
|||
} |
|||
String authors = authorsBuilder.toString(); |
|||
|
|||
if (title == null || title.length() < 5 || paperUrl == null || paperUrl.isEmpty()) continue; |
|||
|
|||
papers.add(new Paper(title, authors, abstractText != null ? abstractText : "", paperUrl, getPlatformName())); |
|||
collected++; |
|||
} |
|||
} |
|||
} catch (Exception e) { |
|||
System.out.println("Semantic Scholar解析失败: " + e.getMessage()); |
|||
} |
|||
|
|||
return papers; |
|||
} |
|||
|
|||
private static class SemanticScholarResponse { |
|||
private List<SemanticScholarPaper> items; |
|||
public List<SemanticScholarPaper> getItems() { return items; } |
|||
@SuppressWarnings("unused") |
|||
public void setItems(List<SemanticScholarPaper> items) { this.items = items; } |
|||
} |
|||
|
|||
private static class SemanticScholarPaper { |
|||
private String title; |
|||
private String url; |
|||
private List<SemanticScholarAuthor> authors; |
|||
private String abstractText; |
|||
|
|||
public String getTitle() { return title; } |
|||
@SuppressWarnings("unused") |
|||
public void setTitle(String title) { this.title = title; } |
|||
public String getUrl() { return url; } |
|||
@SuppressWarnings("unused") |
|||
public void setUrl(String url) { this.url = url; } |
|||
public List<SemanticScholarAuthor> getAuthors() { return authors; } |
|||
@SuppressWarnings("unused") |
|||
public void setAuthors(List<SemanticScholarAuthor> authors) { this.authors = authors; } |
|||
public String getAbstractText() { return abstractText; } |
|||
@SuppressWarnings("unused") |
|||
public void setAbstractText(String abstractText) { this.abstractText = abstractText; } |
|||
} |
|||
|
|||
private static class SemanticScholarAuthor { |
|||
private String name; |
|||
public String getName() { return name; } |
|||
@SuppressWarnings("unused") |
|||
public void setName(String name) { this.name = name; } |
|||
} |
|||
} |
|||
Loading…
Reference in new issue