You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

82 lines
2.9 KiB

package strategy;
import model.Paper;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import utils.Utils;
import java.util.ArrayList;
import java.util.List;
/**
 * Crawler strategy for CNKI search/result pages.
 *
 * <p>Downloads the page at the given URL, probes a fixed list of CSS selectors to locate
 * the result items, and converts each usable item into a {@link Paper}.
 */
public class CNKIStrategy extends AbstractCrawlerStrategy {

    /** Host prefixed to relative links found in result items. */
    private static final String CNKI_BASE_URL = "https://kns.cnki.net";

    /** Link texts shorter than this are skipped rather than treated as paper titles. */
    private static final int MIN_TITLE_LENGTH = 10;

    /** Candidate selectors for result items, tried in order; the first one that matches wins. */
    private static final String[] RESULT_SELECTORS = {
        ".list-item",
        ".article-item",
        ".result-item",
        "tr[class*='item']",
        "div[class*='result']",
        "li[class*='result']"
    };

    /**
     * Returns the display name of this platform.
     *
     * @return the platform name
     */
    @Override
    public String getPlatformName() {
        return "中国知网 (CNKI)";
    }

    /**
     * Tells whether this strategy handles the given URL.
     *
     * @param url the URL to test; may be {@code null}
     * @return {@code true} if {@code url} is non-null and contains {@code cnki.net} or
     *     {@code cnki.cn}
     */
    @Override
    public boolean supportsUrl(String url) {
        return url != null && (url.contains("cnki.net") || url.contains("cnki.cn"));
    }

    /**
     * Fetches the page at {@code url} and extracts up to {@code count} papers from it.
     *
     * <p>An item is skipped when it has no link, when the link text is shorter than
     * {@value #MIN_TITLE_LENGTH} characters, or when the link has no {@code href}. An item
     * whose extraction throws is reported on stderr and skipped, so one bad item does not
     * abort the whole page.
     *
     * @param url the page to fetch
     * @param count the maximum number of papers to return
     * @return the extracted papers; empty if the response is null/empty, cannot be parsed,
     *     or no selector matches
     * @throws Exception if the request or the HTML parsing helper throws
     */
    @Override
    protected List<Paper> fetchPapers(String url, int count) throws Exception {
        List<Paper> papers = new ArrayList<>();
        System.out.println("=== 开始使用中国知网获取论文 ===");

        String html = Utils.sendGetRequest(url);
        // Guard against a null response as well as an empty one.
        if (html == null || html.isEmpty()) {
            return papers;
        }

        Document doc = Utils.parseHtml(html);
        if (doc == null) {
            return papers;
        }

        for (Element element : locateCnkiResultElements(doc)) {
            if (papers.size() >= count) {
                break;
            }
            try {
                Element titleElement = element.select("a").first();
                if (titleElement == null) {
                    continue;
                }

                String title = titleElement.text();
                String href = titleElement.attr("href").trim();
                if (title.length() < MIN_TITLE_LENGTH || href.isEmpty()) {
                    continue;
                }

                String authors = firstMatchText(element, ".author");
                String abstractText = firstMatchText(element, ".abstract");
                papers.add(new Paper(
                        title, authors, abstractText, toAbsoluteCnkiUrl(href), getPlatformName()));
            } catch (Exception e) {
                // Best-effort: skip the broken item, but leave a trace instead of hiding it.
                System.err.println("CNKI: skipping result item that could not be parsed: " + e);
            }
        }
        return papers;
    }

    /** Returns the matches of the first selector in RESULT_SELECTORS that matches anything. */
    private static Elements locateCnkiResultElements(Document doc) {
        Elements matches = new Elements();
        for (String selector : RESULT_SELECTORS) {
            matches = doc.select(selector);
            if (!matches.isEmpty()) {
                break;
            }
        }
        return matches;
    }

    /** Returns the text of the first descendant matching cssQuery, or "" if there is none. */
    private static String firstMatchText(Element parent, String cssQuery) {
        Element match = parent.select(cssQuery).first();
        return match == null ? "" : match.text();
    }

    /** Turns an href into an absolute URL, handling protocol-relative and slash-less paths. */
    private static String toAbsoluteCnkiUrl(String href) {
        if (href.startsWith("http")) {
            return href;
        }
        if (href.startsWith("//")) {
            // Protocol-relative link: it already carries its own host.
            return "https:" + href;
        }
        if (href.startsWith("/")) {
            return CNKI_BASE_URL + href;
        }
        // Relative path without a leading slash: insert the separator.
        return CNKI_BASE_URL + "/" + href;
    }
}