5 changed files with 547 additions and 0 deletions
@ -0,0 +1,56 @@ |
|||||
|
package controller; |
||||
|
|
||||
|
import command.Command; |
||||
|
import command.CrawlCommand; |
||||
|
import command.ListCommand; |
||||
|
import command.HelpCommand; |
||||
|
import command.ExitCommand; |
||||
|
import command.PlatformCommand; |
||||
|
import view.ConsoleView; |
||||
|
import repository.PaperRepository; |
||||
|
import strategy.StrategyFactory; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.HashMap; |
||||
|
import java.util.Map; |
||||
|
|
||||
|
public class CrawlerController { |
||||
|
private final ConsoleView view; |
||||
|
private final PaperRepository repository; |
||||
|
private final Map<String, Command> commands = new HashMap<>(); |
||||
|
|
||||
|
public CrawlerController(ConsoleView view, PaperRepository repository, StrategyFactory strategyFactory) { |
||||
|
this.view = view; |
||||
|
this.repository = repository; |
||||
|
|
||||
|
register(new CrawlCommand(view, strategyFactory)); |
||||
|
register(new ListCommand(view)); |
||||
|
register(new PlatformCommand(view, strategyFactory)); |
||||
|
register(new ExitCommand(view)); |
||||
|
register(new HelpCommand(view, new ArrayList<>(commands.values()))); |
||||
|
} |
||||
|
|
||||
|
private void register(Command command) { |
||||
|
commands.put(command.getName(), command); |
||||
|
} |
||||
|
|
||||
|
public void run() { |
||||
|
view.displayWelcome(); |
||||
|
|
||||
|
while (true) { |
||||
|
String input = view.getInput(); |
||||
|
if (input.isEmpty()) continue; |
||||
|
|
||||
|
String[] parts = input.split("\\s+"); |
||||
|
String commandName = parts[0].toLowerCase(); |
||||
|
|
||||
|
if (!commands.containsKey(commandName)) { |
||||
|
view.showError("未知命令,请输入 help 查看可用命令"); |
||||
|
continue; |
||||
|
} |
||||
|
|
||||
|
Command command = commands.get(commandName); |
||||
|
command.execute(parts, repository); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,45 @@ |
|||||
|
package model; |
||||
|
|
||||
|
/**
 * Mutable bean describing one crawled paper.
 *
 * <p>The no-argument constructor and the getter/setter pairs are kept so the class can be
 * populated property by property. Declaration order is left untouched on purpose.
 */
public class Paper {
    private String title;
    private String authors;
    private String abstractText;
    private String url;
    private String platform;

    /** Creates a paper with every field unset ({@code null}). */
    public Paper() {
    }

    /**
     * Creates a fully populated paper.
     *
     * @param title        paper title
     * @param authors      author list as a single string
     * @param abstractText abstract of the paper
     * @param url          address of the paper
     * @param platform     name of the platform the paper came from
     */
    public Paper(String title, String authors, String abstractText, String url, String platform) {
        this.title = title;
        this.authors = authors;
        this.abstractText = abstractText;
        this.url = url;
        this.platform = platform;
    }

    /** @return the title, possibly {@code null} */
    public String getTitle() {
        return this.title;
    }

    /** @param title the new title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the authors, possibly {@code null} */
    public String getAuthors() {
        return this.authors;
    }

    /** @param authors the new authors string */
    public void setAuthors(String authors) {
        this.authors = authors;
    }

    /** @return the abstract, possibly {@code null} */
    public String getAbstractText() {
        return this.abstractText;
    }

    /** @param abstractText the new abstract */
    public void setAbstractText(String abstractText) {
        this.abstractText = abstractText;
    }

    /** @return the URL, possibly {@code null} */
    public String getUrl() {
        return this.url;
    }

    /** @param url the new URL */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the platform name, possibly {@code null} */
    public String getPlatform() {
        return this.platform;
    }

    /** @param platform the new platform name */
    public void setPlatform(String platform) {
        this.platform = platform;
    }

    /**
     * Renders title, authors, url and platform; the abstract is not part of the output.
     *
     * @return a single-line description of this paper
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Paper{");
        text.append("title='").append(title).append('\'');
        text.append(", authors='").append(authors).append('\'');
        text.append(", url='").append(url).append('\'');
        text.append(", platform='").append(platform).append('\'');
        text.append('}');
        return text.toString();
    }
}
||||
@ -0,0 +1,145 @@ |
|||||
|
package repository; |
||||
|
|
||||
|
import model.Paper; |
||||
|
import com.fasterxml.jackson.databind.ObjectMapper; |
||||
|
import com.fasterxml.jackson.databind.SerializationFeature; |
||||
|
import utils.Utils; |
||||
|
import java.io.File; |
||||
|
import java.io.IOException; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.HashMap; |
||||
|
import java.util.HashSet; |
||||
|
import java.util.List; |
||||
|
import java.util.Map; |
||||
|
import java.util.Set; |
||||
|
|
||||
|
public class PaperRepository { |
||||
|
private String baseDir = "论文爬取"; |
||||
|
private String subDir; |
||||
|
private ObjectMapper objectMapper; |
||||
|
|
||||
|
public PaperRepository() { |
||||
|
objectMapper = new ObjectMapper(); |
||||
|
objectMapper.enable(SerializationFeature.INDENT_OUTPUT); |
||||
|
} |
||||
|
|
||||
|
public void init(String platformName) { |
||||
|
this.subDir = baseDir + File.separator + Utils.cleanFileName(platformName); |
||||
|
|
||||
|
File dir = new File(subDir); |
||||
|
if (!dir.exists()) { |
||||
|
dir.mkdirs(); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public List<Paper> removeDuplicates(List<Paper> papers) { |
||||
|
Set<String> existingTitles = new HashSet<>(); |
||||
|
List<Paper> uniquePapers = new ArrayList<>(); |
||||
|
|
||||
|
File[] files = new File(subDir).listFiles(); |
||||
|
if (files != null) { |
||||
|
for (File file : files) { |
||||
|
if (file.isFile() && file.getName().endsWith(".json")) { |
||||
|
try { |
||||
|
Paper[] existingPapers = objectMapper.readValue(file, Paper[].class); |
||||
|
for (Paper paper : existingPapers) { |
||||
|
existingTitles.add(paper.getTitle()); |
||||
|
} |
||||
|
} catch (IOException e) { |
||||
|
e.printStackTrace(); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
for (Paper paper : papers) { |
||||
|
if (!existingTitles.contains(paper.getTitle())) { |
||||
|
uniquePapers.add(paper); |
||||
|
existingTitles.add(paper.getTitle()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return uniquePapers; |
||||
|
} |
||||
|
|
||||
|
public void savePapers(List<Paper> papers) throws Exception { |
||||
|
if (papers.isEmpty()) { |
||||
|
System.out.println("没有论文需要保存"); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
int savedCount = 0; |
||||
|
for (Paper paper : papers) { |
||||
|
String title = paper.getTitle(); |
||||
|
String fileName = Utils.cleanTitleForFileName(title) + ".json"; |
||||
|
String filePath = subDir + File.separator + fileName; |
||||
|
|
||||
|
List<Paper> singlePaperList = new ArrayList<>(); |
||||
|
singlePaperList.add(paper); |
||||
|
|
||||
|
objectMapper.writeValue(new File(filePath), singlePaperList); |
||||
|
savedCount++; |
||||
|
System.out.println("论文已保存: " + filePath); |
||||
|
} |
||||
|
System.out.println("共保存 " + savedCount + " 篇论文到: " + subDir); |
||||
|
} |
||||
|
|
||||
|
public List<Paper> loadPapers() throws IOException { |
||||
|
List<Paper> allPapers = new ArrayList<>(); |
||||
|
|
||||
|
File[] files = new File(subDir).listFiles(); |
||||
|
if (files != null) { |
||||
|
for (File file : files) { |
||||
|
if (file.isFile() && file.getName().endsWith(".json")) { |
||||
|
Paper[] papers = objectMapper.readValue(file, Paper[].class); |
||||
|
for (Paper paper : papers) { |
||||
|
allPapers.add(paper); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return allPapers; |
||||
|
} |
||||
|
|
||||
|
public Map<String, List<Paper>> loadAllPapersGroupedByPlatform() throws IOException { |
||||
|
Map<String, List<Paper>> papersByPlatform = new HashMap<>(); |
||||
|
|
||||
|
File baseDirFile = new File(baseDir); |
||||
|
if (!baseDirFile.exists()) { |
||||
|
return papersByPlatform; |
||||
|
} |
||||
|
|
||||
|
File[] platformDirs = baseDirFile.listFiles(); |
||||
|
if (platformDirs != null) { |
||||
|
for (File platformDir : platformDirs) { |
||||
|
if (platformDir.isDirectory()) { |
||||
|
String platformName = platformDir.getName(); |
||||
|
List<Paper> platformPapers = new ArrayList<>(); |
||||
|
|
||||
|
File[] files = platformDir.listFiles(); |
||||
|
if (files != null) { |
||||
|
for (File file : files) { |
||||
|
if (file.isFile() && file.getName().endsWith(".json")) { |
||||
|
try { |
||||
|
Paper[] papers = objectMapper.readValue(file, Paper[].class); |
||||
|
for (Paper paper : papers) { |
||||
|
platformPapers.add(paper); |
||||
|
} |
||||
|
} catch (IOException e) { |
||||
|
System.out.println("读取文件失败: " + file.getName()); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
if (!platformPapers.isEmpty()) { |
||||
|
papersByPlatform.put(platformName, platformPapers); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return papersByPlatform; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,49 @@ |
|||||
|
package command; |
||||
|
|
||||
|
import strategy.CrawlerStrategy; |
||||
|
import strategy.StrategyFactory; |
||||
|
import view.ConsoleView; |
||||
|
import java.util.List; |
||||
|
import repository.PaperRepository; |
||||
|
|
||||
|
public class PlatformCommand implements Command { |
||||
|
private StrategyFactory strategyFactory; |
||||
|
private ConsoleView view; |
||||
|
|
||||
|
public PlatformCommand(ConsoleView view, StrategyFactory strategyFactory) { |
||||
|
this.view = view; |
||||
|
this.strategyFactory = strategyFactory; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute(String[] args, PaperRepository repository) { |
||||
|
List<CrawlerStrategy> strategies = strategyFactory.getAllStrategies(); |
||||
|
|
||||
|
if (strategies.isEmpty()) { |
||||
|
view.showInfo("暂不支持任何论文平台"); |
||||
|
} else { |
||||
|
view.showInfo("当前支持 " + strategies.size() + " 个论文平台:"); |
||||
|
System.out.println(); |
||||
|
|
||||
|
int index = 1; |
||||
|
for (CrawlerStrategy strategy : strategies) { |
||||
|
System.out.println(index + ". " + strategy.getPlatformName()); |
||||
|
index++; |
||||
|
} |
||||
|
|
||||
|
System.out.println(); |
||||
|
view.showInfo("使用示例: crawl <平台URL>"); |
||||
|
view.showInfo("例如: crawl https://arxiv.org/search/?query=machine+learning"); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getDescription() { |
||||
|
return "显示支持的论文平台列表"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { |
||||
|
return "platforms"; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,252 @@ |
|||||
|
package utils; |
||||
|
|
||||
|
import org.apache.hc.client5.http.classic.methods.HttpGet; |
||||
|
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient; |
||||
|
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse; |
||||
|
import org.apache.hc.client5.http.impl.classic.HttpClients; |
||||
|
import org.apache.hc.core5.http.protocol.BasicHttpContext; |
||||
|
|
||||
|
import org.apache.hc.core5.http.io.entity.EntityUtils; |
||||
|
|
||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import java.net.URLEncoder; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
import java.util.Random; |
||||
|
|
||||
|
|
||||
|
public class Utils { |
||||
|
// 随机User-Agent列表
|
||||
|
private static final List<String> USER_AGENTS = new ArrayList<>(); |
||||
|
static { |
||||
|
USER_AGENTS.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"); |
||||
|
USER_AGENTS.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"); |
||||
|
USER_AGENTS.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/124.0"); |
||||
|
USER_AGENTS.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/123.0.0.0"); |
||||
|
USER_AGENTS.add("Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4 Safari/605.1.15"); |
||||
|
USER_AGENTS.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"); |
||||
|
USER_AGENTS.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/123.0"); |
||||
|
USER_AGENTS.add("Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3 Safari/605.1.15"); |
||||
|
USER_AGENTS.add("Mozilla/5.0 (Linux; Android 10; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Mobile Safari/537.36"); |
||||
|
USER_AGENTS.add("Mozilla/5.0 (iPhone; CPU iPhone OS 17_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4 Mobile/15E148 Safari/604.1"); |
||||
|
} |
||||
|
|
||||
|
// 随机Referer列表
|
||||
|
private static final List<String> REFERERS = new ArrayList<>(); |
||||
|
static { |
||||
|
REFERERS.add("https://www.google.com/"); |
||||
|
REFERERS.add("https://www.bing.com/"); |
||||
|
REFERERS.add("https://www.baidu.com/"); |
||||
|
REFERERS.add("https://scholar.google.com/"); |
||||
|
REFERERS.add("https://www.sciencedirect.com/"); |
||||
|
REFERERS.add("https://link.springer.com/"); |
||||
|
REFERERS.add("https://ieeexplore.ieee.org/"); |
||||
|
REFERERS.add("https://dl.acm.org/"); |
||||
|
REFERERS.add("https://kns.cnki.net/"); |
||||
|
REFERERS.add("https://www.google.com/search"); |
||||
|
} |
||||
|
|
||||
|
private static final Random RANDOM = new Random(); |
||||
|
|
||||
|
// 发送 HTTP GET 请求
|
||||
|
public static String sendGetRequest(String urlString) throws Exception { |
||||
|
System.out.println("正在发送HTTP请求: " + urlString); |
||||
|
|
||||
|
// 尝试多次普通HTTP请求,使用不同的User-Agent和Referer
|
||||
|
for (int i = 0; i < 2; i++) { // 减少重试次数,避免卡住
|
||||
|
String html = sendHttpGetRequest(urlString); |
||||
|
if (!html.isEmpty()) { |
||||
|
return html; |
||||
|
} |
||||
|
// 每次失败后添加更长的延迟
|
||||
|
int delay = 2000 + i * 1000; |
||||
|
System.out.println("第 " + (i + 1) + " 次请求失败,添加延迟: " + delay + "ms"); |
||||
|
Thread.sleep(delay); |
||||
|
} |
||||
|
|
||||
|
// 暂时禁用Selenium,因为初始化可能会卡住
|
||||
|
System.out.println("所有HTTP请求都失败,暂时跳过Selenium..."); |
||||
|
return ""; |
||||
|
} |
||||
|
|
||||
|
// 使用普通HTTP请求
|
||||
|
private static String sendHttpGetRequest(String urlString) throws Exception { |
||||
|
long startTime = System.currentTimeMillis(); |
||||
|
|
||||
|
// 设置请求超时时间
|
||||
|
final int TIMEOUT = 15000; // 15秒
|
||||
|
|
||||
|
try { |
||||
|
// 使用默认的HttpClient
|
||||
|
CloseableHttpClient httpClient = HttpClients.createDefault(); |
||||
|
|
||||
|
HttpGet httpGet = new HttpGet(urlString); |
||||
|
|
||||
|
// 随机选择User-Agent
|
||||
|
String userAgent = USER_AGENTS.get(RANDOM.nextInt(USER_AGENTS.size())); |
||||
|
// 随机选择Referer
|
||||
|
String referer = REFERERS.get(RANDOM.nextInt(REFERERS.size())); |
||||
|
|
||||
|
// 添加更完整的HTTP头信息,模拟真实浏览器
|
||||
|
httpGet.setHeader("User-Agent", userAgent); |
||||
|
httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8"); |
||||
|
httpGet.setHeader("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3"); |
||||
|
httpGet.setHeader("Accept-Encoding", "gzip, deflate, br"); |
||||
|
httpGet.setHeader("Connection", "keep-alive"); |
||||
|
httpGet.setHeader("Referer", referer); |
||||
|
httpGet.setHeader("Upgrade-Insecure-Requests", "1"); |
||||
|
httpGet.setHeader("Sec-Fetch-Dest", "document"); |
||||
|
httpGet.setHeader("Sec-Fetch-Mode", "navigate"); |
||||
|
httpGet.setHeader("Sec-Fetch-Site", "cross-site"); |
||||
|
httpGet.setHeader("Sec-Fetch-User", "?1"); |
||||
|
httpGet.setHeader("Cache-Control", "max-age=0"); |
||||
|
httpGet.setHeader("DNT", "1"); |
||||
|
httpGet.setHeader("TE", "trailers"); |
||||
|
|
||||
|
// 执行请求
|
||||
|
System.out.println("开始执行HTTP请求..."); |
||||
|
System.out.println("请求超时设置: " + TIMEOUT + "ms"); |
||||
|
|
||||
|
CloseableHttpResponse response = null; |
||||
|
try { |
||||
|
// 使用Future来处理超时
|
||||
|
java.util.concurrent.Future<CloseableHttpResponse> future = java.util.concurrent.Executors.newSingleThreadExecutor().submit(new java.util.concurrent.Callable<CloseableHttpResponse>() { |
||||
|
@Override |
||||
|
public CloseableHttpResponse call() throws Exception { |
||||
|
try { |
||||
|
return (CloseableHttpResponse) httpClient.executeOpen(null, httpGet, new BasicHttpContext()); |
||||
|
} catch (Exception e) { |
||||
|
throw new RuntimeException(e); |
||||
|
} |
||||
|
} |
||||
|
}); |
||||
|
|
||||
|
try { |
||||
|
response = future.get(TIMEOUT, java.util.concurrent.TimeUnit.MILLISECONDS); |
||||
|
} catch (java.util.concurrent.TimeoutException e) { |
||||
|
System.out.println("HTTP请求超时: " + e.getMessage()); |
||||
|
future.cancel(true); |
||||
|
return ""; |
||||
|
} |
||||
|
|
||||
|
// 获取响应状态码
|
||||
|
int statusCode = response.getCode(); |
||||
|
System.out.println("HTTP响应状态码: " + statusCode); |
||||
|
System.out.println("使用的User-Agent: " + userAgent); |
||||
|
System.out.println("使用的Referer: " + referer); |
||||
|
|
||||
|
if (statusCode != 200) { |
||||
|
System.out.println("HTTP请求失败,状态码: " + statusCode); |
||||
|
return ""; |
||||
|
} |
||||
|
|
||||
|
// 读取响应内容
|
||||
|
System.out.println("正在读取响应内容..."); |
||||
|
// 限制读取的内容长度,避免程序卡住
|
||||
|
String html = EntityUtils.toString(response.getEntity(), "UTF-8"); |
||||
|
// 如果内容长度超过100000字符,只保留前100000字符
|
||||
|
if (html.length() > 100000) { |
||||
|
html = html.substring(0, 100000); |
||||
|
System.out.println("响应内容过长,已截断为100000字符"); |
||||
|
} |
||||
|
|
||||
|
long endTime = System.currentTimeMillis(); |
||||
|
System.out.println("HTTP请求完成,耗时: " + (endTime - startTime) + "ms"); |
||||
|
System.out.println("响应内容长度: " + html.length() + " 字符"); |
||||
|
|
||||
|
// 检查响应内容是否为空或包含反爬信息
|
||||
|
if (html == null || html.isEmpty()) { |
||||
|
System.out.println("响应内容为空"); |
||||
|
return ""; |
||||
|
} |
||||
|
|
||||
|
// 检查是否是反爬页面
|
||||
|
boolean isAntiCrawl = false; |
||||
|
String[] antiCrawlKeywords = {"captcha", "verify", "robot", "Robot", "reCAPTCHA", "blocked", "Blocked"}; |
||||
|
for (String keyword : antiCrawlKeywords) { |
||||
|
if (html.contains(keyword)) { |
||||
|
isAntiCrawl = true; |
||||
|
break; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// 特殊处理arXiv,因为它的页面可能包含一些被误判为反爬的关键词
|
||||
|
if (urlString.contains("arxiv.org")) { |
||||
|
isAntiCrawl = false; // 对于arXiv,我们信任它返回的内容
|
||||
|
} |
||||
|
|
||||
|
if (isAntiCrawl) { |
||||
|
System.out.println("检测到反爬页面"); |
||||
|
return ""; |
||||
|
} |
||||
|
|
||||
|
// 智能延迟,模拟真实用户行为,使用更随机的延迟时间
|
||||
|
int delay = RANDOM.nextInt(1500) + 800; // 800-2300ms
|
||||
|
System.out.println("添加随机延迟: " + delay + "ms"); |
||||
|
Thread.sleep(delay); |
||||
|
|
||||
|
return html; |
||||
|
} finally { |
||||
|
// 确保响应和客户端被关闭
|
||||
|
if (response != null) { |
||||
|
try { |
||||
|
response.close(); |
||||
|
} catch (Exception e) { |
||||
|
System.out.println("关闭响应时出错: " + e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
try { |
||||
|
httpClient.close(); |
||||
|
} catch (Exception e) { |
||||
|
System.out.println("关闭HTTP客户端时出错: " + e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
} catch (java.net.SocketTimeoutException e) { |
||||
|
System.out.println("HTTP请求超时: " + e.getMessage()); |
||||
|
return ""; |
||||
|
} catch (java.io.IOException e) { |
||||
|
System.out.println("HTTP请求IO错误: " + e.getMessage()); |
||||
|
return ""; |
||||
|
} catch (Exception e) { |
||||
|
System.out.println("发送HTTP请求时出错: " + e.getMessage()); |
||||
|
e.printStackTrace(); |
||||
|
return ""; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// 解析 HTML
|
||||
|
public static Document parseHtml(String html) { |
||||
|
return Jsoup.parse(html); |
||||
|
} |
||||
|
|
||||
|
// URL 编码
|
||||
|
public static String urlEncode(String value) throws Exception { |
||||
|
return URLEncoder.encode(value, "UTF-8"); |
||||
|
} |
||||
|
|
||||
|
// 生成唯一文件名
|
||||
|
public static String generateFileName(String keyword) { |
||||
|
return keyword + "_" + System.currentTimeMillis() + ".json"; |
||||
|
} |
||||
|
|
||||
|
// 清理文件名中的非法字符
|
||||
|
public static String cleanFileName(String fileName) { |
||||
|
return fileName.replaceAll("[\\/:*?\"<>|]", "_"); |
||||
|
} |
||||
|
|
||||
|
// 清理论文标题用于文件名
|
||||
|
public static String cleanTitleForFileName(String title) { |
||||
|
if (title == null || title.isEmpty()) { |
||||
|
return "untitled"; |
||||
|
} |
||||
|
String cleaned = title.trim() |
||||
|
.replaceAll("[\\\\/:*?\"<>|]", "_") |
||||
|
.replaceAll("\\s+", "_") |
||||
|
.replaceAll("_+", "_"); |
||||
|
if (cleaned.length() > 100) { |
||||
|
cleaned = cleaned.substring(0, 100); |
||||
|
} |
||||
|
return cleaned; |
||||
|
} |
||||
|
} |
||||
Loading…
Reference in new issue