5 changed files with 547 additions and 0 deletions
@ -0,0 +1,56 @@ |
|||
package controller; |
|||
|
|||
import command.Command; |
|||
import command.CrawlCommand; |
|||
import command.ListCommand; |
|||
import command.HelpCommand; |
|||
import command.ExitCommand; |
|||
import command.PlatformCommand; |
|||
import view.ConsoleView; |
|||
import repository.PaperRepository; |
|||
import strategy.StrategyFactory; |
|||
|
|||
import java.util.ArrayList; |
|||
import java.util.HashMap; |
|||
import java.util.Map; |
|||
|
|||
public class CrawlerController { |
|||
private final ConsoleView view; |
|||
private final PaperRepository repository; |
|||
private final Map<String, Command> commands = new HashMap<>(); |
|||
|
|||
public CrawlerController(ConsoleView view, PaperRepository repository, StrategyFactory strategyFactory) { |
|||
this.view = view; |
|||
this.repository = repository; |
|||
|
|||
register(new CrawlCommand(view, strategyFactory)); |
|||
register(new ListCommand(view)); |
|||
register(new PlatformCommand(view, strategyFactory)); |
|||
register(new ExitCommand(view)); |
|||
register(new HelpCommand(view, new ArrayList<>(commands.values()))); |
|||
} |
|||
|
|||
private void register(Command command) { |
|||
commands.put(command.getName(), command); |
|||
} |
|||
|
|||
public void run() { |
|||
view.displayWelcome(); |
|||
|
|||
while (true) { |
|||
String input = view.getInput(); |
|||
if (input.isEmpty()) continue; |
|||
|
|||
String[] parts = input.split("\\s+"); |
|||
String commandName = parts[0].toLowerCase(); |
|||
|
|||
if (!commands.containsKey(commandName)) { |
|||
view.showError("未知命令,请输入 help 查看可用命令"); |
|||
continue; |
|||
} |
|||
|
|||
Command command = commands.get(commandName); |
|||
command.execute(parts, repository); |
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,45 @@ |
|||
package model; |
|||
|
|||
/**
 * Mutable bean describing one crawled paper.
 *
 * <p>It offers a no-argument constructor plus a getter/setter pair for every field.
 */
public class Paper {

    private String title;
    private String authors;
    private String abstractText;
    private String url;
    private String platform;

    /** Creates a paper with every field left {@code null}. */
    public Paper() {
    }

    /**
     * Creates a fully populated paper.
     *
     * @param title        paper title
     * @param authors      author list as a single string
     * @param abstractText abstract of the paper
     * @param url          link to the paper
     * @param platform     name of the platform the paper came from
     */
    public Paper(String title, String authors, String abstractText, String url, String platform) {
        this.title = title;
        this.authors = authors;
        this.abstractText = abstractText;
        this.url = url;
        this.platform = platform;
    }

    /** @return the paper title */
    public String getTitle() {
        return title;
    }

    /** @param title the paper title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the authors string */
    public String getAuthors() {
        return authors;
    }

    /** @param authors the authors string */
    public void setAuthors(String authors) {
        this.authors = authors;
    }

    /** @return the abstract text */
    public String getAbstractText() {
        return abstractText;
    }

    /** @param abstractText the abstract text */
    public void setAbstractText(String abstractText) {
        this.abstractText = abstractText;
    }

    /** @return the paper URL */
    public String getUrl() {
        return url;
    }

    /** @param url the paper URL */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the platform name */
    public String getPlatform() {
        return platform;
    }

    /** @param platform the platform name */
    public void setPlatform(String platform) {
        this.platform = platform;
    }

    /**
     * Returns a one-line summary containing title, authors, url and platform;
     * the abstract is not part of the output.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Paper{");
        text.append("title='").append(title).append('\'');
        text.append(", authors='").append(authors).append('\'');
        text.append(", url='").append(url).append('\'');
        text.append(", platform='").append(platform).append('\'');
        text.append('}');
        return text.toString();
    }
}
|||
@ -0,0 +1,145 @@ |
|||
package repository; |
|||
|
|||
import model.Paper; |
|||
import com.fasterxml.jackson.databind.ObjectMapper; |
|||
import com.fasterxml.jackson.databind.SerializationFeature; |
|||
import utils.Utils; |
|||
import java.io.File; |
|||
import java.io.IOException; |
|||
import java.util.ArrayList; |
|||
import java.util.HashMap; |
|||
import java.util.HashSet; |
|||
import java.util.List; |
|||
import java.util.Map; |
|||
import java.util.Set; |
|||
|
|||
public class PaperRepository { |
|||
private String baseDir = "论文爬取"; |
|||
private String subDir; |
|||
private ObjectMapper objectMapper; |
|||
|
|||
public PaperRepository() { |
|||
objectMapper = new ObjectMapper(); |
|||
objectMapper.enable(SerializationFeature.INDENT_OUTPUT); |
|||
} |
|||
|
|||
public void init(String platformName) { |
|||
this.subDir = baseDir + File.separator + Utils.cleanFileName(platformName); |
|||
|
|||
File dir = new File(subDir); |
|||
if (!dir.exists()) { |
|||
dir.mkdirs(); |
|||
} |
|||
} |
|||
|
|||
public List<Paper> removeDuplicates(List<Paper> papers) { |
|||
Set<String> existingTitles = new HashSet<>(); |
|||
List<Paper> uniquePapers = new ArrayList<>(); |
|||
|
|||
File[] files = new File(subDir).listFiles(); |
|||
if (files != null) { |
|||
for (File file : files) { |
|||
if (file.isFile() && file.getName().endsWith(".json")) { |
|||
try { |
|||
Paper[] existingPapers = objectMapper.readValue(file, Paper[].class); |
|||
for (Paper paper : existingPapers) { |
|||
existingTitles.add(paper.getTitle()); |
|||
} |
|||
} catch (IOException e) { |
|||
e.printStackTrace(); |
|||
} |
|||
} |
|||
} |
|||
} |
|||
|
|||
for (Paper paper : papers) { |
|||
if (!existingTitles.contains(paper.getTitle())) { |
|||
uniquePapers.add(paper); |
|||
existingTitles.add(paper.getTitle()); |
|||
} |
|||
} |
|||
|
|||
return uniquePapers; |
|||
} |
|||
|
|||
public void savePapers(List<Paper> papers) throws Exception { |
|||
if (papers.isEmpty()) { |
|||
System.out.println("没有论文需要保存"); |
|||
return; |
|||
} |
|||
|
|||
int savedCount = 0; |
|||
for (Paper paper : papers) { |
|||
String title = paper.getTitle(); |
|||
String fileName = Utils.cleanTitleForFileName(title) + ".json"; |
|||
String filePath = subDir + File.separator + fileName; |
|||
|
|||
List<Paper> singlePaperList = new ArrayList<>(); |
|||
singlePaperList.add(paper); |
|||
|
|||
objectMapper.writeValue(new File(filePath), singlePaperList); |
|||
savedCount++; |
|||
System.out.println("论文已保存: " + filePath); |
|||
} |
|||
System.out.println("共保存 " + savedCount + " 篇论文到: " + subDir); |
|||
} |
|||
|
|||
public List<Paper> loadPapers() throws IOException { |
|||
List<Paper> allPapers = new ArrayList<>(); |
|||
|
|||
File[] files = new File(subDir).listFiles(); |
|||
if (files != null) { |
|||
for (File file : files) { |
|||
if (file.isFile() && file.getName().endsWith(".json")) { |
|||
Paper[] papers = objectMapper.readValue(file, Paper[].class); |
|||
for (Paper paper : papers) { |
|||
allPapers.add(paper); |
|||
} |
|||
} |
|||
} |
|||
} |
|||
|
|||
return allPapers; |
|||
} |
|||
|
|||
public Map<String, List<Paper>> loadAllPapersGroupedByPlatform() throws IOException { |
|||
Map<String, List<Paper>> papersByPlatform = new HashMap<>(); |
|||
|
|||
File baseDirFile = new File(baseDir); |
|||
if (!baseDirFile.exists()) { |
|||
return papersByPlatform; |
|||
} |
|||
|
|||
File[] platformDirs = baseDirFile.listFiles(); |
|||
if (platformDirs != null) { |
|||
for (File platformDir : platformDirs) { |
|||
if (platformDir.isDirectory()) { |
|||
String platformName = platformDir.getName(); |
|||
List<Paper> platformPapers = new ArrayList<>(); |
|||
|
|||
File[] files = platformDir.listFiles(); |
|||
if (files != null) { |
|||
for (File file : files) { |
|||
if (file.isFile() && file.getName().endsWith(".json")) { |
|||
try { |
|||
Paper[] papers = objectMapper.readValue(file, Paper[].class); |
|||
for (Paper paper : papers) { |
|||
platformPapers.add(paper); |
|||
} |
|||
} catch (IOException e) { |
|||
System.out.println("读取文件失败: " + file.getName()); |
|||
} |
|||
} |
|||
} |
|||
} |
|||
|
|||
if (!platformPapers.isEmpty()) { |
|||
papersByPlatform.put(platformName, platformPapers); |
|||
} |
|||
} |
|||
} |
|||
} |
|||
|
|||
return papersByPlatform; |
|||
} |
|||
} |
|||
@ -0,0 +1,49 @@ |
|||
package command; |
|||
|
|||
import strategy.CrawlerStrategy; |
|||
import strategy.StrategyFactory; |
|||
import view.ConsoleView; |
|||
import java.util.List; |
|||
import repository.PaperRepository; |
|||
|
|||
public class PlatformCommand implements Command { |
|||
private StrategyFactory strategyFactory; |
|||
private ConsoleView view; |
|||
|
|||
public PlatformCommand(ConsoleView view, StrategyFactory strategyFactory) { |
|||
this.view = view; |
|||
this.strategyFactory = strategyFactory; |
|||
} |
|||
|
|||
@Override |
|||
public void execute(String[] args, PaperRepository repository) { |
|||
List<CrawlerStrategy> strategies = strategyFactory.getAllStrategies(); |
|||
|
|||
if (strategies.isEmpty()) { |
|||
view.showInfo("暂不支持任何论文平台"); |
|||
} else { |
|||
view.showInfo("当前支持 " + strategies.size() + " 个论文平台:"); |
|||
System.out.println(); |
|||
|
|||
int index = 1; |
|||
for (CrawlerStrategy strategy : strategies) { |
|||
System.out.println(index + ". " + strategy.getPlatformName()); |
|||
index++; |
|||
} |
|||
|
|||
System.out.println(); |
|||
view.showInfo("使用示例: crawl <平台URL>"); |
|||
view.showInfo("例如: crawl https://arxiv.org/search/?query=machine+learning"); |
|||
} |
|||
} |
|||
|
|||
@Override |
|||
public String getDescription() { |
|||
return "显示支持的论文平台列表"; |
|||
} |
|||
|
|||
@Override |
|||
public String getName() { |
|||
return "platforms"; |
|||
} |
|||
} |
|||
@ -0,0 +1,252 @@ |
|||
package utils; |
|||
|
|||
import org.apache.hc.client5.http.classic.methods.HttpGet; |
|||
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient; |
|||
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse; |
|||
import org.apache.hc.client5.http.impl.classic.HttpClients; |
|||
import org.apache.hc.core5.http.protocol.BasicHttpContext; |
|||
|
|||
import org.apache.hc.core5.http.io.entity.EntityUtils; |
|||
|
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import java.net.URLEncoder; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
import java.util.Random; |
|||
|
|||
|
|||
public class Utils { |
|||
// 随机User-Agent列表
|
|||
private static final List<String> USER_AGENTS = new ArrayList<>(); |
|||
static { |
|||
USER_AGENTS.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"); |
|||
USER_AGENTS.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"); |
|||
USER_AGENTS.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/124.0"); |
|||
USER_AGENTS.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/123.0.0.0"); |
|||
USER_AGENTS.add("Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4 Safari/605.1.15"); |
|||
USER_AGENTS.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"); |
|||
USER_AGENTS.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/123.0"); |
|||
USER_AGENTS.add("Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3 Safari/605.1.15"); |
|||
USER_AGENTS.add("Mozilla/5.0 (Linux; Android 10; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Mobile Safari/537.36"); |
|||
USER_AGENTS.add("Mozilla/5.0 (iPhone; CPU iPhone OS 17_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4 Mobile/15E148 Safari/604.1"); |
|||
} |
|||
|
|||
// 随机Referer列表
|
|||
private static final List<String> REFERERS = new ArrayList<>(); |
|||
static { |
|||
REFERERS.add("https://www.google.com/"); |
|||
REFERERS.add("https://www.bing.com/"); |
|||
REFERERS.add("https://www.baidu.com/"); |
|||
REFERERS.add("https://scholar.google.com/"); |
|||
REFERERS.add("https://www.sciencedirect.com/"); |
|||
REFERERS.add("https://link.springer.com/"); |
|||
REFERERS.add("https://ieeexplore.ieee.org/"); |
|||
REFERERS.add("https://dl.acm.org/"); |
|||
REFERERS.add("https://kns.cnki.net/"); |
|||
REFERERS.add("https://www.google.com/search"); |
|||
} |
|||
|
|||
private static final Random RANDOM = new Random(); |
|||
|
|||
// 发送 HTTP GET 请求
|
|||
public static String sendGetRequest(String urlString) throws Exception { |
|||
System.out.println("正在发送HTTP请求: " + urlString); |
|||
|
|||
// 尝试多次普通HTTP请求,使用不同的User-Agent和Referer
|
|||
for (int i = 0; i < 2; i++) { // 减少重试次数,避免卡住
|
|||
String html = sendHttpGetRequest(urlString); |
|||
if (!html.isEmpty()) { |
|||
return html; |
|||
} |
|||
// 每次失败后添加更长的延迟
|
|||
int delay = 2000 + i * 1000; |
|||
System.out.println("第 " + (i + 1) + " 次请求失败,添加延迟: " + delay + "ms"); |
|||
Thread.sleep(delay); |
|||
} |
|||
|
|||
// 暂时禁用Selenium,因为初始化可能会卡住
|
|||
System.out.println("所有HTTP请求都失败,暂时跳过Selenium..."); |
|||
return ""; |
|||
} |
|||
|
|||
// 使用普通HTTP请求
|
|||
private static String sendHttpGetRequest(String urlString) throws Exception { |
|||
long startTime = System.currentTimeMillis(); |
|||
|
|||
// 设置请求超时时间
|
|||
final int TIMEOUT = 15000; // 15秒
|
|||
|
|||
try { |
|||
// 使用默认的HttpClient
|
|||
CloseableHttpClient httpClient = HttpClients.createDefault(); |
|||
|
|||
HttpGet httpGet = new HttpGet(urlString); |
|||
|
|||
// 随机选择User-Agent
|
|||
String userAgent = USER_AGENTS.get(RANDOM.nextInt(USER_AGENTS.size())); |
|||
// 随机选择Referer
|
|||
String referer = REFERERS.get(RANDOM.nextInt(REFERERS.size())); |
|||
|
|||
// 添加更完整的HTTP头信息,模拟真实浏览器
|
|||
httpGet.setHeader("User-Agent", userAgent); |
|||
httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8"); |
|||
httpGet.setHeader("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3"); |
|||
httpGet.setHeader("Accept-Encoding", "gzip, deflate, br"); |
|||
httpGet.setHeader("Connection", "keep-alive"); |
|||
httpGet.setHeader("Referer", referer); |
|||
httpGet.setHeader("Upgrade-Insecure-Requests", "1"); |
|||
httpGet.setHeader("Sec-Fetch-Dest", "document"); |
|||
httpGet.setHeader("Sec-Fetch-Mode", "navigate"); |
|||
httpGet.setHeader("Sec-Fetch-Site", "cross-site"); |
|||
httpGet.setHeader("Sec-Fetch-User", "?1"); |
|||
httpGet.setHeader("Cache-Control", "max-age=0"); |
|||
httpGet.setHeader("DNT", "1"); |
|||
httpGet.setHeader("TE", "trailers"); |
|||
|
|||
// 执行请求
|
|||
System.out.println("开始执行HTTP请求..."); |
|||
System.out.println("请求超时设置: " + TIMEOUT + "ms"); |
|||
|
|||
CloseableHttpResponse response = null; |
|||
try { |
|||
// 使用Future来处理超时
|
|||
java.util.concurrent.Future<CloseableHttpResponse> future = java.util.concurrent.Executors.newSingleThreadExecutor().submit(new java.util.concurrent.Callable<CloseableHttpResponse>() { |
|||
@Override |
|||
public CloseableHttpResponse call() throws Exception { |
|||
try { |
|||
return (CloseableHttpResponse) httpClient.executeOpen(null, httpGet, new BasicHttpContext()); |
|||
} catch (Exception e) { |
|||
throw new RuntimeException(e); |
|||
} |
|||
} |
|||
}); |
|||
|
|||
try { |
|||
response = future.get(TIMEOUT, java.util.concurrent.TimeUnit.MILLISECONDS); |
|||
} catch (java.util.concurrent.TimeoutException e) { |
|||
System.out.println("HTTP请求超时: " + e.getMessage()); |
|||
future.cancel(true); |
|||
return ""; |
|||
} |
|||
|
|||
// 获取响应状态码
|
|||
int statusCode = response.getCode(); |
|||
System.out.println("HTTP响应状态码: " + statusCode); |
|||
System.out.println("使用的User-Agent: " + userAgent); |
|||
System.out.println("使用的Referer: " + referer); |
|||
|
|||
if (statusCode != 200) { |
|||
System.out.println("HTTP请求失败,状态码: " + statusCode); |
|||
return ""; |
|||
} |
|||
|
|||
// 读取响应内容
|
|||
System.out.println("正在读取响应内容..."); |
|||
// 限制读取的内容长度,避免程序卡住
|
|||
String html = EntityUtils.toString(response.getEntity(), "UTF-8"); |
|||
// 如果内容长度超过100000字符,只保留前100000字符
|
|||
if (html.length() > 100000) { |
|||
html = html.substring(0, 100000); |
|||
System.out.println("响应内容过长,已截断为100000字符"); |
|||
} |
|||
|
|||
long endTime = System.currentTimeMillis(); |
|||
System.out.println("HTTP请求完成,耗时: " + (endTime - startTime) + "ms"); |
|||
System.out.println("响应内容长度: " + html.length() + " 字符"); |
|||
|
|||
// 检查响应内容是否为空或包含反爬信息
|
|||
if (html == null || html.isEmpty()) { |
|||
System.out.println("响应内容为空"); |
|||
return ""; |
|||
} |
|||
|
|||
// 检查是否是反爬页面
|
|||
boolean isAntiCrawl = false; |
|||
String[] antiCrawlKeywords = {"captcha", "verify", "robot", "Robot", "reCAPTCHA", "blocked", "Blocked"}; |
|||
for (String keyword : antiCrawlKeywords) { |
|||
if (html.contains(keyword)) { |
|||
isAntiCrawl = true; |
|||
break; |
|||
} |
|||
} |
|||
|
|||
// 特殊处理arXiv,因为它的页面可能包含一些被误判为反爬的关键词
|
|||
if (urlString.contains("arxiv.org")) { |
|||
isAntiCrawl = false; // 对于arXiv,我们信任它返回的内容
|
|||
} |
|||
|
|||
if (isAntiCrawl) { |
|||
System.out.println("检测到反爬页面"); |
|||
return ""; |
|||
} |
|||
|
|||
// 智能延迟,模拟真实用户行为,使用更随机的延迟时间
|
|||
int delay = RANDOM.nextInt(1500) + 800; // 800-2300ms
|
|||
System.out.println("添加随机延迟: " + delay + "ms"); |
|||
Thread.sleep(delay); |
|||
|
|||
return html; |
|||
} finally { |
|||
// 确保响应和客户端被关闭
|
|||
if (response != null) { |
|||
try { |
|||
response.close(); |
|||
} catch (Exception e) { |
|||
System.out.println("关闭响应时出错: " + e.getMessage()); |
|||
} |
|||
} |
|||
try { |
|||
httpClient.close(); |
|||
} catch (Exception e) { |
|||
System.out.println("关闭HTTP客户端时出错: " + e.getMessage()); |
|||
} |
|||
} |
|||
} catch (java.net.SocketTimeoutException e) { |
|||
System.out.println("HTTP请求超时: " + e.getMessage()); |
|||
return ""; |
|||
} catch (java.io.IOException e) { |
|||
System.out.println("HTTP请求IO错误: " + e.getMessage()); |
|||
return ""; |
|||
} catch (Exception e) { |
|||
System.out.println("发送HTTP请求时出错: " + e.getMessage()); |
|||
e.printStackTrace(); |
|||
return ""; |
|||
} |
|||
} |
|||
|
|||
// 解析 HTML
|
|||
public static Document parseHtml(String html) { |
|||
return Jsoup.parse(html); |
|||
} |
|||
|
|||
// URL 编码
|
|||
public static String urlEncode(String value) throws Exception { |
|||
return URLEncoder.encode(value, "UTF-8"); |
|||
} |
|||
|
|||
// 生成唯一文件名
|
|||
public static String generateFileName(String keyword) { |
|||
return keyword + "_" + System.currentTimeMillis() + ".json"; |
|||
} |
|||
|
|||
// 清理文件名中的非法字符
|
|||
public static String cleanFileName(String fileName) { |
|||
return fileName.replaceAll("[\\/:*?\"<>|]", "_"); |
|||
} |
|||
|
|||
// 清理论文标题用于文件名
|
|||
public static String cleanTitleForFileName(String title) { |
|||
if (title == null || title.isEmpty()) { |
|||
return "untitled"; |
|||
} |
|||
String cleaned = title.trim() |
|||
.replaceAll("[\\\\/:*?\"<>|]", "_") |
|||
.replaceAll("\\s+", "_") |
|||
.replaceAll("_+", "_"); |
|||
if (cleaned.length() > 100) { |
|||
cleaned = cleaned.substring(0, 100); |
|||
} |
|||
return cleaned; |
|||
} |
|||
} |
|||
Loading…
Reference in new issue