Browse Source

上传文件至 'w10'

main
Zhengjie 1 month ago
parent
commit
58c2936cbb
  1. 56
      w10/CrawlerController.java
  2. 45
      w10/Paper.java
  3. 145
      w10/PaperRepository.java
  4. 49
      w10/PlatformCommand.java
  5. 252
      w10/Utils.java

56
w10/CrawlerController.java

@ -0,0 +1,56 @@
package controller;
import command.Command;
import command.CrawlCommand;
import command.ListCommand;
import command.HelpCommand;
import command.ExitCommand;
import command.PlatformCommand;
import view.ConsoleView;
import repository.PaperRepository;
import strategy.StrategyFactory;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
/**
 * Console front controller: owns the registry of {@link Command}s and runs the
 * read-dispatch loop that maps the first token of each input line to a command.
 */
public class CrawlerController {
    private final ConsoleView view;
    private final PaperRepository repository;
    /** Registered commands keyed by the value returned from {@code Command.getName()}. */
    private final Map<String, Command> commands = new HashMap<>();

    /**
     * Creates the controller and registers the built-in commands.
     *
     * @param view            console view used for all input and output
     * @param repository      repository handed to every executed command
     * @param strategyFactory factory passed to the commands that need crawler strategies
     */
    public CrawlerController(ConsoleView view, PaperRepository repository, StrategyFactory strategyFactory) {
        this.view = view;
        this.repository = repository;
        register(new CrawlCommand(view, strategyFactory));
        register(new ListCommand(view));
        register(new PlatformCommand(view, strategyFactory));
        register(new ExitCommand(view));
        // The help command receives a snapshot of the commands registered so far,
        // so it must be registered last (the snapshot does not contain help itself).
        register(new HelpCommand(view, new ArrayList<>(commands.values())));
    }

    /** Adds a command to the registry under its own name. */
    private void register(Command command) {
        commands.put(command.getName(), command);
    }

    /**
     * Shows the welcome banner and then reads and dispatches commands until the
     * input source is exhausted or a command terminates the program.
     */
    public void run() {
        view.displayWelcome();
        while (true) {
            String input = view.getInput();
            if (input == null) {
                // NOTE(review): assumes a null line means end of input — confirm against ConsoleView.
                return;
            }
            // Trim first: splitting a whitespace-only line yields an empty array, and a
            // leading space yields an empty first token; both used to break dispatch.
            String line = input.trim();
            if (line.isEmpty()) {
                continue;
            }
            String[] parts = line.split("\\s+");
            // Locale.ROOT keeps command matching independent of the user's default locale.
            String commandName = parts[0].toLowerCase(java.util.Locale.ROOT);
            Command command = commands.get(commandName);
            if (command == null) {
                view.showError("未知命令,请输入 help 查看可用命令");
                continue;
            }
            command.execute(parts, repository);
        }
    }
}

45
w10/Paper.java

@ -0,0 +1,45 @@
package model;
/**
 * Mutable bean describing one crawled paper. It keeps a no-argument constructor
 * and a getter/setter pair per field.
 */
public class Paper {
    private String title;
    private String authors;
    private String abstractText;
    private String url;
    private String platform;

    /** Creates an empty paper; every field starts out as {@code null}. */
    public Paper() {
    }

    /**
     * Creates a fully populated paper.
     *
     * @param title        paper title
     * @param authors      author list as a single string
     * @param abstractText abstract of the paper
     * @param url          link to the paper
     * @param platform     name of the platform the paper was taken from
     */
    public Paper(String title, String authors, String abstractText, String url, String platform) {
        setTitle(title);
        setAuthors(authors);
        setAbstractText(abstractText);
        setUrl(url);
        setPlatform(platform);
    }

    /** @return the paper title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the paper title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the authors string */
    public String getAuthors() {
        return this.authors;
    }

    /** @param authors the authors string */
    public void setAuthors(String authors) {
        this.authors = authors;
    }

    /** @return the abstract text */
    public String getAbstractText() {
        return this.abstractText;
    }

    /** @param abstractText the abstract text */
    public void setAbstractText(String abstractText) {
        this.abstractText = abstractText;
    }

    /** @return the paper URL */
    public String getUrl() {
        return this.url;
    }

    /** @param url the paper URL */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the platform name */
    public String getPlatform() {
        return this.platform;
    }

    /** @param platform the platform name */
    public void setPlatform(String platform) {
        this.platform = platform;
    }

    /**
     * Renders title, authors, url and platform; the abstract is intentionally
     * left out of the text form.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Paper{");
        text.append("title='").append(title).append('\'');
        text.append(", authors='").append(authors).append('\'');
        text.append(", url='").append(url).append('\'');
        text.append(", platform='").append(platform).append('\'');
        text.append('}');
        return text.toString();
    }
}

145
w10/PaperRepository.java

@ -0,0 +1,145 @@
package repository;
import model.Paper;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;
import utils.Utils;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
 * File-based store for crawled papers. Each paper is written as a one-element
 * JSON array to {@code <baseDir>/<platform>/<cleaned title>.json}.
 *
 * <p>{@link #init(String)} selects the platform directory and must be called
 * before {@link #removeDuplicates(List)}, {@link #savePapers(List)} with a
 * non-empty list, or {@link #loadPapers()}.
 */
public class PaperRepository {
    private final String baseDir = "论文爬取";
    /** Platform directory chosen by {@link #init(String)}; {@code null} until then. */
    private String subDir;
    private final ObjectMapper objectMapper;

    /** Creates a repository whose JSON output is pretty-printed. */
    public PaperRepository() {
        objectMapper = new ObjectMapper();
        objectMapper.enable(SerializationFeature.INDENT_OUTPUT);
    }

    /**
     * Selects (and creates if necessary) the directory for the given platform.
     *
     * @param platformName platform name; it is cleaned with {@code Utils.cleanFileName}
     *                     before being used as a directory name
     */
    public void init(String platformName) {
        this.subDir = baseDir + File.separator + Utils.cleanFileName(platformName);
        File dir = new File(subDir);
        // mkdirs() returns false both on failure and when the directory already exists,
        // so only report when the directory is really missing afterwards.
        if (!dir.exists() && !dir.mkdirs() && !dir.isDirectory()) {
            System.out.println("创建目录失败: " + subDir);
        }
    }

    /**
     * Filters out papers whose title is already stored in the current platform
     * directory or appears earlier in the given list.
     *
     * @param papers candidate papers
     * @return the papers with a not-yet-seen title, in their original order
     * @throws IllegalStateException if {@link #init(String)} has not been called
     */
    public List<Paper> removeDuplicates(List<Paper> papers) {
        Set<String> knownTitles = new HashSet<>();
        for (File file : listJsonFiles(requireInitializedDir())) {
            try {
                for (Paper stored : readPapers(file)) {
                    knownTitles.add(stored.getTitle());
                }
            } catch (IOException e) {
                // An unreadable file must not abort de-duplication; report it and move on.
                System.out.println("读取文件失败: " + file.getName());
            }
        }
        List<Paper> uniquePapers = new ArrayList<>();
        for (Paper paper : papers) {
            // Set.add returns false when the title was already present.
            if (knownTitles.add(paper.getTitle())) {
                uniquePapers.add(paper);
            }
        }
        return uniquePapers;
    }

    /**
     * Writes every paper to its own JSON file in the current platform directory.
     * NOTE(review): two titles that clean to the same file name overwrite each other.
     *
     * @param papers papers to save; {@code null} or empty saves nothing
     * @throws Exception             if a file cannot be written
     * @throws IllegalStateException if {@link #init(String)} has not been called
     */
    public void savePapers(List<Paper> papers) throws Exception {
        if (papers == null || papers.isEmpty()) {
            System.out.println("没有论文需要保存");
            return;
        }
        requireInitializedDir();
        int savedCount = 0;
        for (Paper paper : papers) {
            String fileName = Utils.cleanTitleForFileName(paper.getTitle()) + ".json";
            String filePath = subDir + File.separator + fileName;
            // Stored as a one-element array so the files can be read back as Paper[].
            objectMapper.writeValue(new File(filePath), java.util.Collections.singletonList(paper));
            savedCount++;
            System.out.println("论文已保存: " + filePath);
        }
        System.out.println("共保存 " + savedCount + " 篇论文到: " + subDir);
    }

    /**
     * Loads every paper stored in the current platform directory.
     *
     * @return all stored papers; empty if the directory has no JSON files
     * @throws IOException           if any JSON file cannot be read or parsed
     * @throws IllegalStateException if {@link #init(String)} has not been called
     */
    public List<Paper> loadPapers() throws IOException {
        List<Paper> allPapers = new ArrayList<>();
        for (File file : listJsonFiles(requireInitializedDir())) {
            allPapers.addAll(readPapers(file));
        }
        return allPapers;
    }

    /**
     * Loads the papers of every platform directory below the base directory.
     * Files that cannot be read are reported and skipped.
     *
     * @return papers keyed by platform directory name; platforms without papers are omitted
     * @throws IOException declared for callers; read failures of single files are not propagated
     */
    public Map<String, List<Paper>> loadAllPapersGroupedByPlatform() throws IOException {
        Map<String, List<Paper>> papersByPlatform = new HashMap<>();
        // listFiles() yields null when the base directory does not exist or is unreadable.
        File[] platformDirs = new File(baseDir).listFiles();
        if (platformDirs == null) {
            return papersByPlatform;
        }
        for (File platformDir : platformDirs) {
            if (!platformDir.isDirectory()) {
                continue;
            }
            List<Paper> platformPapers = new ArrayList<>();
            for (File file : listJsonFiles(platformDir)) {
                try {
                    platformPapers.addAll(readPapers(file));
                } catch (IOException e) {
                    System.out.println("读取文件失败: " + file.getName());
                }
            }
            if (!platformPapers.isEmpty()) {
                papersByPlatform.put(platformDir.getName(), platformPapers);
            }
        }
        return papersByPlatform;
    }

    /** Returns the current platform directory, failing clearly when init() was never called. */
    private File requireInitializedDir() {
        if (subDir == null) {
            throw new IllegalStateException("PaperRepository.init(platformName) must be called first");
        }
        return new File(subDir);
    }

    /** Lists the regular files ending in ".json" directly inside the directory; never null. */
    private static List<File> listJsonFiles(File dir) {
        List<File> jsonFiles = new ArrayList<>();
        File[] entries = dir.listFiles();
        if (entries != null) {
            for (File entry : entries) {
                if (entry.isFile() && entry.getName().endsWith(".json")) {
                    jsonFiles.add(entry);
                }
            }
        }
        return jsonFiles;
    }

    /** Parses one JSON file as a Paper array, dropping a null document and null elements. */
    private List<Paper> readPapers(File file) throws IOException {
        List<Paper> papers = new ArrayList<>();
        Paper[] parsed = objectMapper.readValue(file, Paper[].class);
        if (parsed != null) {
            for (Paper paper : parsed) {
                if (paper != null) {
                    papers.add(paper);
                }
            }
        }
        return papers;
    }
}

49
w10/PlatformCommand.java

@ -0,0 +1,49 @@
package command;
import strategy.CrawlerStrategy;
import strategy.StrategyFactory;
import view.ConsoleView;
import java.util.List;
import repository.PaperRepository;
/**
 * Command that prints the paper platforms known to the {@link StrategyFactory},
 * followed by a usage hint for the crawl command.
 */
public class PlatformCommand implements Command {
    private StrategyFactory strategyFactory;
    private ConsoleView view;

    /**
     * @param view            view used for the informational messages
     * @param strategyFactory source of the available crawler strategies
     */
    public PlatformCommand(ConsoleView view, StrategyFactory strategyFactory) {
        this.strategyFactory = strategyFactory;
        this.view = view;
    }

    /** @return the keyword that selects this command */
    @Override
    public String getName() {
        return "platforms";
    }

    /** @return a one-line description of this command */
    @Override
    public String getDescription() {
        return "显示支持的论文平台列表";
    }

    /**
     * Prints a numbered list of platform names, or a notice when no strategy is
     * available. Neither argument is used.
     *
     * @param args       tokens of the command line (ignored)
     * @param repository paper repository (ignored)
     */
    @Override
    public void execute(String[] args, PaperRepository repository) {
        List<CrawlerStrategy> available = strategyFactory.getAllStrategies();

        // Nothing registered: a single notice is all there is to show.
        if (available.isEmpty()) {
            view.showInfo("暂不支持任何论文平台");
            return;
        }

        view.showInfo("当前支持 " + available.size() + " 个论文平台:");
        System.out.println();

        // Numbering starts at 1.
        int position = 0;
        for (CrawlerStrategy current : available) {
            position += 1;
            String platformName = current.getPlatformName();
            System.out.println(position + ". " + platformName);
        }

        System.out.println();
        view.showInfo("使用示例: crawl <平台URL>");
        view.showInfo("例如: crawl https://arxiv.org/search/?query=machine+learning");
    }
}

252
w10/Utils.java

@ -0,0 +1,252 @@
package utils;
import org.apache.hc.client5.http.classic.methods.HttpGet;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.protocol.BasicHttpContext;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
/**
 * Static helpers for the crawler: HTTP GET with browser-like headers and retry,
 * HTML parsing, URL encoding and file-name sanitising.
 */
public class Utils {
    // Pool of User-Agent strings; one is picked at random for every request.
    private static final String[] USER_AGENTS = {
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/124.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/123.0.0.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4 Safari/605.1.15",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/123.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3 Safari/605.1.15",
        "Mozilla/5.0 (Linux; Android 10; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Mobile Safari/537.36",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 17_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4 Mobile/15E148 Safari/604.1"
    };

    // Pool of Referer values; one is picked at random for every request.
    private static final String[] REFERERS = {
        "https://www.google.com/",
        "https://www.bing.com/",
        "https://www.baidu.com/",
        "https://scholar.google.com/",
        "https://www.sciencedirect.com/",
        "https://link.springer.com/",
        "https://ieeexplore.ieee.org/",
        "https://dl.acm.org/",
        "https://kns.cnki.net/",
        "https://www.google.com/search"
    };

    // Substrings whose presence marks a response as an anti-crawler page.
    // NOTE(review): "robot" and "verify" are common in ordinary pages — confirm this heuristic.
    private static final String[] ANTI_CRAWL_KEYWORDS = {
        "captcha", "verify", "robot", "Robot", "reCAPTCHA", "blocked", "Blocked"
    };

    private static final int MAX_ATTEMPTS = 2;          // kept low so a dead site does not stall the CLI
    private static final int REQUEST_TIMEOUT_MS = 15000; // overall limit for opening the response
    private static final int MAX_HTML_CHARS = 100000;    // longer bodies are truncated

    private static final Random RANDOM = new Random();

    /**
     * Fetches a URL with HTTP GET, retrying with a fresh random User-Agent and
     * Referer when an attempt yields nothing usable.
     *
     * @param urlString URL to fetch
     * @return the page HTML, or an empty string when every attempt failed
     * @throws Exception if the thread is interrupted while waiting between attempts
     */
    public static String sendGetRequest(String urlString) throws Exception {
        System.out.println("正在发送HTTP请求: " + urlString);
        for (int attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
            String html = sendHttpGetRequest(urlString);
            if (!html.isEmpty()) {
                return html;
            }
            if (attempt == MAX_ATTEMPTS - 1) {
                // Last attempt: there is nothing left to wait for, so do not sleep.
                System.out.println("第 " + (attempt + 1) + " 次请求失败");
                break;
            }
            // Back off a little longer after every failure.
            int delay = 2000 + attempt * 1000;
            System.out.println("第 " + (attempt + 1) + " 次请求失败,添加延迟: " + delay + "ms");
            Thread.sleep(delay);
        }
        // Selenium fallback is disabled for now because its start-up may hang.
        System.out.println("所有HTTP请求都失败,暂时跳过Selenium...");
        return "";
    }

    /**
     * Performs one GET attempt. Every failure (timeout, non-200 status, empty or
     * anti-crawler response, I/O error) is reported on stdout and mapped to "".
     */
    private static String sendHttpGetRequest(String urlString) throws Exception {
        long startTime = System.currentTimeMillis();
        CloseableHttpClient httpClient = null;
        CloseableHttpResponse response = null;
        try {
            httpClient = HttpClients.createDefault();
            String userAgent = USER_AGENTS[RANDOM.nextInt(USER_AGENTS.length)];
            String referer = REFERERS[RANDOM.nextInt(REFERERS.length)];
            HttpGet httpGet = buildBrowserLikeRequest(urlString, userAgent, referer);

            System.out.println("开始执行HTTP请求...");
            System.out.println("请求超时设置: " + REQUEST_TIMEOUT_MS + "ms");
            response = executeWithTimeout(httpClient, httpGet, REQUEST_TIMEOUT_MS);
            if (response == null) {
                return ""; // timed out; already reported
            }

            int statusCode = response.getCode();
            System.out.println("HTTP响应状态码: " + statusCode);
            System.out.println("使用的User-Agent: " + userAgent);
            System.out.println("使用的Referer: " + referer);
            if (statusCode != 200) {
                System.out.println("HTTP请求失败,状态码: " + statusCode);
                return "";
            }

            System.out.println("正在读取响应内容...");
            String html = response.getEntity() == null
                    ? null
                    : EntityUtils.toString(response.getEntity(), "UTF-8");
            if (html == null) {
                // Normalise a missing body so the checks below cannot hit a null.
                html = "";
            }
            // Cap the size so later processing of a huge page cannot stall the program.
            if (html.length() > MAX_HTML_CHARS) {
                html = html.substring(0, MAX_HTML_CHARS);
                System.out.println("响应内容过长,已截断为" + MAX_HTML_CHARS + "字符");
            }
            long endTime = System.currentTimeMillis();
            System.out.println("HTTP请求完成,耗时: " + (endTime - startTime) + "ms");
            System.out.println("响应内容长度: " + html.length() + " 字符");
            if (html.isEmpty()) {
                System.out.println("响应内容为空");
                return "";
            }

            // arXiv pages are trusted because they contain words the keyword check would flag.
            if (!urlString.contains("arxiv.org") && looksLikeAntiCrawlPage(html)) {
                System.out.println("检测到反爬页面");
                return "";
            }

            // Random pause (800-2299 ms) so consecutive requests look less mechanical.
            int delay = RANDOM.nextInt(1500) + 800;
            System.out.println("添加随机延迟: " + delay + "ms");
            Thread.sleep(delay);
            return html;
        } catch (InterruptedException e) {
            // Keep the interrupt visible to the caller instead of swallowing it.
            Thread.currentThread().interrupt();
            System.out.println("HTTP请求被中断");
            return "";
        } catch (java.net.SocketTimeoutException e) {
            System.out.println("HTTP请求超时: " + e.getMessage());
            return "";
        } catch (java.io.IOException e) {
            System.out.println("HTTP请求IO错误: " + e.getMessage());
            return "";
        } catch (Exception e) {
            System.out.println("发送HTTP请求时出错: " + e.getMessage());
            e.printStackTrace();
            return "";
        } finally {
            // Always release the response and the client, whatever happened above.
            closeQuietly(response, "关闭响应时出错: ");
            closeQuietly(httpClient, "关闭HTTP客户端时出错: ");
        }
    }

    /** Builds a GET request carrying headers that imitate a real browser navigation. */
    private static HttpGet buildBrowserLikeRequest(String urlString, String userAgent, String referer) {
        HttpGet httpGet = new HttpGet(urlString);
        httpGet.setHeader("User-Agent", userAgent);
        httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8");
        httpGet.setHeader("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3");
        // No manual Accept-Encoding: the client advertises the encodings it can decode
        // itself; forcing "br" could yield a brotli body that is never decompressed.
        httpGet.setHeader("Connection", "keep-alive");
        httpGet.setHeader("Referer", referer);
        httpGet.setHeader("Upgrade-Insecure-Requests", "1");
        httpGet.setHeader("Sec-Fetch-Dest", "document");
        httpGet.setHeader("Sec-Fetch-Mode", "navigate");
        httpGet.setHeader("Sec-Fetch-Site", "cross-site");
        httpGet.setHeader("Sec-Fetch-User", "?1");
        httpGet.setHeader("Cache-Control", "max-age=0");
        httpGet.setHeader("DNT", "1");
        httpGet.setHeader("TE", "trailers");
        return httpGet;
    }

    /**
     * Opens the response on a worker thread and waits at most timeoutMs for it.
     * Returns null when the wait times out; the worker thread is always shut down.
     */
    private static CloseableHttpResponse executeWithTimeout(
            final CloseableHttpClient httpClient, final HttpGet httpGet, int timeoutMs) throws Exception {
        java.util.concurrent.ExecutorService executor = java.util.concurrent.Executors.newSingleThreadExecutor();
        try {
            java.util.concurrent.Future<CloseableHttpResponse> future = executor.submit(
                    new java.util.concurrent.Callable<CloseableHttpResponse>() {
                        @Override
                        public CloseableHttpResponse call() throws Exception {
                            return (CloseableHttpResponse) httpClient.executeOpen(null, httpGet, new BasicHttpContext());
                        }
                    });
            try {
                return future.get(timeoutMs, java.util.concurrent.TimeUnit.MILLISECONDS);
            } catch (java.util.concurrent.TimeoutException e) {
                System.out.println("HTTP请求超时: " + e.getMessage());
                future.cancel(true);
                return null;
            }
        } finally {
            // Without this every request would leave an idle worker thread behind.
            executor.shutdownNow();
        }
    }

    /** Tells whether the HTML contains any of the anti-crawler keywords. */
    private static boolean looksLikeAntiCrawlPage(String html) {
        for (String keyword : ANTI_CRAWL_KEYWORDS) {
            if (html.contains(keyword)) {
                return true;
            }
        }
        return false;
    }

    /** Closes the resource if present, reporting (not propagating) a close failure. */
    private static void closeQuietly(AutoCloseable resource, String errorPrefix) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (Exception e) {
            System.out.println(errorPrefix + e.getMessage());
        }
    }

    /**
     * Parses an HTML string with Jsoup.
     *
     * @param html HTML source
     * @return the parsed document
     */
    public static Document parseHtml(String html) {
        return Jsoup.parse(html);
    }

    /**
     * URL-encodes a value using UTF-8.
     *
     * @param value text to encode
     * @return the encoded text
     * @throws Exception if the encoding is not supported
     */
    public static String urlEncode(String value) throws Exception {
        return URLEncoder.encode(value, "UTF-8");
    }

    /**
     * Builds a file name from the keyword and the current time in milliseconds.
     *
     * @param keyword prefix of the file name (used as given)
     * @return {@code <keyword>_<millis>.json}
     */
    public static String generateFileName(String keyword) {
        return keyword + "_" + System.currentTimeMillis() + ".json";
    }

    /**
     * Replaces each of the characters {@code \ / : * ? " < > |} with an underscore.
     *
     * @param fileName raw name
     * @return the name with those characters replaced
     */
    public static String cleanFileName(String fileName) {
        // Four backslashes in the literal = one literal backslash inside the regex class.
        return fileName.replaceAll("[\\\\/:*?\"<>|]", "_");
    }

    /**
     * Turns a paper title into a file-name stem: trims it, replaces the characters
     * {@code \ / : * ? " < > |} and whitespace runs with a single underscore and
     * limits the result to 100 characters.
     *
     * @param title paper title, may be {@code null}
     * @return the cleaned stem, or {@code "untitled"} for a null or blank title
     */
    public static String cleanTitleForFileName(String title) {
        if (title == null) {
            return "untitled";
        }
        String cleaned = title.trim()
                .replaceAll("[\\\\/:*?\"<>|]", "_")
                .replaceAll("\\s+", "_")
                .replaceAll("_+", "_");
        if (cleaned.isEmpty()) {
            // A blank title would otherwise produce the file name ".json".
            return "untitled";
        }
        if (cleaned.length() > 100) {
            cleaned = cleaned.substring(0, 100);
        }
        return cleaned;
    }
}
Loading…
Cancel
Save