2 changed files with 104 additions and 0 deletions
@ -0,0 +1,64 @@ |
|||||
|
package strategy; |
||||
|
|
||||
|
import model.Paper; |
||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
import utils.Utils; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class SpringerLinkStrategy extends AbstractCrawlerStrategy { |
||||
|
@Override |
||||
|
public String getPlatformName() { |
||||
|
return "Springer Link"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public boolean supportsUrl(String url) { |
||||
|
return url != null && url.contains("link.springer.com"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected List<Paper> fetchPapers(String url, int count) throws Exception { |
||||
|
List<Paper> papers = new ArrayList<>(); |
||||
|
System.out.println("=== 开始使用Springer Link获取论文 ==="); |
||||
|
|
||||
|
addDelay(2000, 3000); |
||||
|
|
||||
|
String html = Utils.sendGetRequest(url); |
||||
|
if (html.isEmpty()) return papers; |
||||
|
|
||||
|
Document doc = Jsoup.parse(html); |
||||
|
|
||||
|
Elements paperElements = doc.select(".result-item"); |
||||
|
|
||||
|
int collected = 0; |
||||
|
for (Element element : paperElements) { |
||||
|
if (collected >= count) break; |
||||
|
|
||||
|
try { |
||||
|
Element titleElement = element.selectFirst("h2 a"); |
||||
|
String title = titleElement != null ? titleElement.text() : ""; |
||||
|
|
||||
|
String paperUrl = titleElement != null ? titleElement.attr("href") : ""; |
||||
|
if (!paperUrl.startsWith("http")) { |
||||
|
paperUrl = "https://link.springer.com" + paperUrl; |
||||
|
} |
||||
|
|
||||
|
Element authorsElement = element.selectFirst(".authors"); |
||||
|
String authors = authorsElement != null ? authorsElement.text() : ""; |
||||
|
|
||||
|
if (title.length() < 5 || paperUrl.isEmpty()) continue; |
||||
|
|
||||
|
papers.add(new Paper(title, authors, "", paperUrl, getPlatformName())); |
||||
|
collected++; |
||||
|
} catch (Exception e) { |
||||
|
continue; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return papers; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,40 @@ |
|||||
|
package strategy; |
||||
|
|
||||
|
import java.util.List; |
||||
|
import java.util.ArrayList; |
||||
|
|
||||
|
public class StrategyFactory { |
||||
|
private final List<CrawlerStrategy> STRATEGIES = new ArrayList<>(); |
||||
|
|
||||
|
public StrategyFactory() { |
||||
|
STRATEGIES.add(new CNKIStrategy()); |
||||
|
STRATEGIES.add(new IEEEStrategy()); |
||||
|
STRATEGIES.add(new ACMDigitalLibraryStrategy()); |
||||
|
STRATEGIES.add(new SpringerLinkStrategy()); |
||||
|
STRATEGIES.add(new ScienceDirectStrategy()); |
||||
|
STRATEGIES.add(new ArXivStrategy()); |
||||
|
STRATEGIES.add(new SemanticScholarStrategy()); |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 根据URL创建对应的爬虫策略 |
||||
|
* @param url 要创建策略的URL |
||||
|
* @return 对应的爬虫策略,如果URL不支持任何策略则返回null |
||||
|
*/ |
||||
|
public CrawlerStrategy createCrawlerByUrl(String url) { |
||||
|
for (CrawlerStrategy strategy : STRATEGIES) { |
||||
|
if (strategy.supportsUrl(url)) { |
||||
|
return strategy; |
||||
|
} |
||||
|
} |
||||
|
return null; |
||||
|
} |
||||
|
|
||||
|
public int getPlatformCount() { |
||||
|
return STRATEGIES.size(); |
||||
|
} |
||||
|
|
||||
|
public List<CrawlerStrategy> getAllStrategies() { |
||||
|
return STRATEGIES; |
||||
|
} |
||||
|
} |
||||
Loading…
Reference in new issue