1 changed files with 64 additions and 0 deletions
@ -0,0 +1,64 @@ |
|||
package strategy; |
|||
|
|||
import model.Paper; |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
import utils.Utils; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class ACMDigitalLibraryStrategy extends AbstractCrawlerStrategy { |
|||
@Override |
|||
public String getPlatformName() { |
|||
return "ACM Digital Library"; |
|||
} |
|||
|
|||
@Override |
|||
public boolean supportsUrl(String url) { |
|||
return url != null && url.contains("dl.acm.org"); |
|||
} |
|||
|
|||
@Override |
|||
protected List<Paper> fetchPapers(String url, int count) throws Exception { |
|||
List<Paper> papers = new ArrayList<>(); |
|||
System.out.println("=== 开始使用ACM Digital Library获取论文 ==="); |
|||
|
|||
addDelay(2000, 3000); |
|||
|
|||
String html = Utils.sendGetRequest(url); |
|||
if (html.isEmpty()) return papers; |
|||
|
|||
Document doc = Jsoup.parse(html); |
|||
|
|||
Elements paperElements = doc.select(".search__item"); |
|||
|
|||
int collected = 0; |
|||
for (Element element : paperElements) { |
|||
if (collected >= count) break; |
|||
|
|||
try { |
|||
Element titleElement = element.selectFirst("h5 a"); |
|||
String title = titleElement != null ? titleElement.text() : ""; |
|||
|
|||
String paperUrl = titleElement != null ? titleElement.attr("href") : ""; |
|||
if (!paperUrl.startsWith("http")) { |
|||
paperUrl = "https://dl.acm.org" + paperUrl; |
|||
} |
|||
|
|||
Element authorsElement = element.selectFirst(".search__authors"); |
|||
String authors = authorsElement != null ? authorsElement.text() : ""; |
|||
|
|||
if (title.length() < 5 || paperUrl.isEmpty()) continue; |
|||
|
|||
papers.add(new Paper(title, authors, "", paperUrl, getPlatformName())); |
|||
collected++; |
|||
} catch (Exception e) { |
|||
continue; |
|||
} |
|||
} |
|||
|
|||
return papers; |
|||
} |
|||
} |
|||
Loading…
Reference in new issue