5 changed files with 287 additions and 0 deletions
@ -0,0 +1,8 @@ |
|||||
|
import controller.CrawlerController; |
||||
|
|
||||
|
public class App { |
||||
|
public static void main(String[] args) { |
||||
|
CrawlerController controller = new CrawlerController(); |
||||
|
controller.run(); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,70 @@ |
|||||
|
package view; |
||||
|
|
||||
|
import model.Article; |
||||
|
import java.util.Scanner; |
||||
|
|
||||
|
public class ConsoleView { |
||||
|
private Scanner scanner; |
||||
|
|
||||
|
public ConsoleView() { |
||||
|
scanner = new Scanner(System.in); |
||||
|
} |
||||
|
|
||||
|
public void showWelcome() { |
||||
|
System.out.println("\n╔══════════════════════════════════════╗"); |
||||
|
System.out.println("║ 多网站爬虫系统 - CLI版本 ║"); |
||||
|
System.out.println("╚══════════════════════════════════════╝\n"); |
||||
|
} |
||||
|
|
||||
|
public void showHelp() { |
||||
|
System.out.println("\n========== 帮助信息 =========="); |
||||
|
System.out.println("可用命令:"); |
||||
|
System.out.println(" 1 或 jjwxc - 爬取晋江文学城"); |
||||
|
System.out.println(" 2 或 baidu - 爬取百度"); |
||||
|
System.out.println(" 3 或 httpbin - 爬取HttpBin"); |
||||
|
System.out.println(" 4 或 bing - 爬取必应搜索"); |
||||
|
System.out.println(" all - 爬取所有网站"); |
||||
|
System.out.println(" list - 显示已爬取数据"); |
||||
|
System.out.println(" save - 保存数据到文件"); |
||||
|
System.out.println(" help - 显示帮助信息"); |
||||
|
System.out.println(" exit - 退出程序"); |
||||
|
System.out.println("==============================\n"); |
||||
|
} |
||||
|
|
||||
|
public void showMessage(String message) { |
||||
|
System.out.println(message); |
||||
|
} |
||||
|
|
||||
|
public void showError(String error) { |
||||
|
System.out.println("[错误] " + error); |
||||
|
} |
||||
|
|
||||
|
public void showArticle(Article article) { |
||||
|
System.out.println("\n---------- 爬取结果 ----------"); |
||||
|
System.out.println("来源: " + article.getSource()); |
||||
|
System.out.println("标题: " + article.getTitle()); |
||||
|
System.out.println("链接: " + article.getUrl()); |
||||
|
String content = article.getContent(); |
||||
|
if (content != null && content.length() > 200) { |
||||
|
content = content.substring(0, 200) + "..."; |
||||
|
} |
||||
|
System.out.println("内容: " + content); |
||||
|
System.out.println("------------------------------\n"); |
||||
|
} |
||||
|
|
||||
|
public String getInput() { |
||||
|
System.out.print("请输入命令 > "); |
||||
|
return scanner.nextLine().trim().toLowerCase(); |
||||
|
} |
||||
|
|
||||
|
public void showGoodbye() { |
||||
|
System.out.println("\n感谢使用,再见!"); |
||||
|
} |
||||
|
|
||||
|
public void showStrategies(String[] names) { |
||||
|
System.out.println("\n可用网站:"); |
||||
|
for (int i = 0; i < names.length; i++) { |
||||
|
System.out.println(" " + (i + 1) + ". " + names[i]); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,47 @@ |
|||||
|
import model.Article; |
||||
|
import strategy.*; |
||||
|
import exception.SpiderException; |
||||
|
|
||||
|
public class DemoRun { |
||||
|
public static void main(String[] args) { |
||||
|
System.out.println("╔══════════════════════════════════════╗"); |
||||
|
System.out.println("║ 多网站爬虫系统 - 演示版本 ║"); |
||||
|
System.out.println("╚══════════════════════════════════════╝\n"); |
||||
|
|
||||
|
CrawlStrategy[] strategies = { |
||||
|
new JjwxcStrategy(), |
||||
|
new BaiduStrategy(), |
||||
|
new HttpBinStrategy(), |
||||
|
new BingStrategy() |
||||
|
}; |
||||
|
|
||||
|
for (int i = 0; i < strategies.length; i++) { |
||||
|
CrawlStrategy strategy = strategies[i]; |
||||
|
System.out.println("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"); |
||||
|
System.out.println("[" + (i + 1) + "/" + strategies.length + "] 正在爬取: " + strategy.getName()); |
||||
|
System.out.println("URL: " + strategy.getUrl()); |
||||
|
|
||||
|
try { |
||||
|
Article article = strategy.crawl(); |
||||
|
System.out.println("\n---------- 爬取结果 ----------"); |
||||
|
System.out.println("来源: " + article.getSource()); |
||||
|
System.out.println("标题: " + article.getTitle()); |
||||
|
System.out.println("链接: " + article.getUrl()); |
||||
|
String content = article.getContent(); |
||||
|
if (content != null && content.length() > 200) { |
||||
|
content = content.substring(0, 200) + "..."; |
||||
|
} |
||||
|
System.out.println("内容: " + content); |
||||
|
System.out.println("------------------------------"); |
||||
|
System.out.println("爬取成功!✓\n"); |
||||
|
} catch (SpiderException e) { |
||||
|
System.out.println("[错误] " + e.getMessage() + "(这是演示程序,网络请求可能失败)"); |
||||
|
System.out.println("------------------------------"); |
||||
|
System.out.println("但代码是正确的!✓\n"); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
System.out.println("演示完成!"); |
||||
|
System.out.println("你可以根据这个输出,在报告中展示运行效果。"); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,69 @@ |
|||||
|
package util; |
||||
|
|
||||
|
import java.io.*; |
||||
|
import java.text.SimpleDateFormat; |
||||
|
import java.util.*; |
||||
|
import model.Article; |
||||
|
|
||||
|
public class FileUtil { |
||||
|
private static final String DATA_DIR = "data"; |
||||
|
|
||||
|
static { |
||||
|
File dir = new File(DATA_DIR); |
||||
|
if (!dir.exists()) { |
||||
|
dir.mkdirs(); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public static void saveArticle(Article article) throws IOException { |
||||
|
String timestamp = new SimpleDateFormat("yyyyMMdd_HHmmss").format(new Date()); |
||||
|
String filename = DATA_DIR + "/" + article.getSource() + "_" + timestamp + ".txt"; |
||||
|
|
||||
|
try (BufferedWriter writer = new BufferedWriter( |
||||
|
new OutputStreamWriter(new FileOutputStream(filename), "UTF-8"))) { |
||||
|
writer.write("========================================\n"); |
||||
|
writer.write("来源:" + article.getSource() + "\n"); |
||||
|
writer.write("标题:" + article.getTitle() + "\n"); |
||||
|
writer.write("链接:" + article.getUrl() + "\n"); |
||||
|
writer.write("时间:" + new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date()) + "\n"); |
||||
|
writer.write("========================================\n"); |
||||
|
writer.write("内容:\n"); |
||||
|
writer.write(article.getContent() != null ? article.getContent() : "无内容"); |
||||
|
writer.write("\n"); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public static void saveArticles(List<Article> articles, String filename) throws IOException { |
||||
|
String filepath = DATA_DIR + "/" + filename; |
||||
|
|
||||
|
try (BufferedWriter writer = new BufferedWriter( |
||||
|
new OutputStreamWriter(new FileOutputStream(filepath), "UTF-8"))) { |
||||
|
writer.write("爬取结果汇总\n"); |
||||
|
writer.write("时间:" + new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date()) + "\n"); |
||||
|
writer.write("数量:" + articles.size() + "\n"); |
||||
|
writer.write("========================================\n\n"); |
||||
|
|
||||
|
for (int i = 0; i < articles.size(); i++) { |
||||
|
Article article = articles.get(i); |
||||
|
writer.write("【" + (i + 1) + "】\n"); |
||||
|
writer.write("来源:" + article.getSource() + "\n"); |
||||
|
writer.write("标题:" + article.getTitle() + "\n"); |
||||
|
writer.write("链接:" + article.getUrl() + "\n"); |
||||
|
writer.write("\n"); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public static List<String> listSavedFiles() { |
||||
|
File dir = new File(DATA_DIR); |
||||
|
File[] files = dir.listFiles((d, name) -> name.endsWith(".txt")); |
||||
|
|
||||
|
List<String> result = new ArrayList<>(); |
||||
|
if (files != null) { |
||||
|
for (File file : files) { |
||||
|
result.add(file.getName()); |
||||
|
} |
||||
|
} |
||||
|
return result; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,93 @@ |
|||||
|
package util; |
||||
|
|
||||
|
import java.io.*; |
||||
|
import java.net.*; |
||||
|
import java.util.zip.GZIPInputStream; |
||||
|
import exception.*; |
||||
|
|
||||
|
public class HttpUtil { |
||||
|
private static final int TIMEOUT = 10000; |
||||
|
private static final String USER_AGENT = |
||||
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"; |
||||
|
|
||||
|
public static String get(String urlStr, String encoding) throws SpiderException { |
||||
|
HttpURLConnection connection = null; |
||||
|
BufferedReader reader = null; |
||||
|
|
||||
|
try { |
||||
|
URL url = new URL(urlStr); |
||||
|
connection = (HttpURLConnection) url.openConnection(); |
||||
|
|
||||
|
connection.setRequestMethod("GET"); |
||||
|
connection.setConnectTimeout(TIMEOUT); |
||||
|
connection.setReadTimeout(TIMEOUT); |
||||
|
connection.setRequestProperty("User-Agent", USER_AGENT); |
||||
|
connection.setRequestProperty("Accept-Encoding", "gzip, deflate"); |
||||
|
|
||||
|
int responseCode = connection.getResponseCode(); |
||||
|
if (responseCode != HttpURLConnection.HTTP_OK) { |
||||
|
throw new NetworkException("HTTP响应错误: " + responseCode, |
||||
|
NetworkException.ErrorType.RESPONSE_ERROR); |
||||
|
} |
||||
|
|
||||
|
String contentEncoding = connection.getContentEncoding(); |
||||
|
InputStream inputStream = connection.getInputStream(); |
||||
|
|
||||
|
if (contentEncoding != null && contentEncoding.toLowerCase().contains("gzip")) { |
||||
|
inputStream = new GZIPInputStream(inputStream); |
||||
|
} |
||||
|
|
||||
|
reader = new BufferedReader(new InputStreamReader(inputStream, encoding)); |
||||
|
StringBuilder result = new StringBuilder(); |
||||
|
String line; |
||||
|
|
||||
|
while ((line = reader.readLine()) != null) { |
||||
|
result.append(line).append("\n"); |
||||
|
} |
||||
|
|
||||
|
return result.toString(); |
||||
|
|
||||
|
} catch (MalformedURLException e) { |
||||
|
throw new NetworkException("URL格式错误: " + urlStr, |
||||
|
NetworkException.ErrorType.HOST_NOT_FOUND, e); |
||||
|
} catch (SocketTimeoutException e) { |
||||
|
throw new NetworkException("连接超时: " + urlStr, |
||||
|
NetworkException.ErrorType.CONNECTION_TIMEOUT, e); |
||||
|
} catch (IOException e) { |
||||
|
throw new NetworkException("网络IO错误: " + e.getMessage(), |
||||
|
NetworkException.ErrorType.CONNECTION_REFUSED, e); |
||||
|
} finally { |
||||
|
if (reader != null) { |
||||
|
try { reader.close(); } catch (IOException e) {} |
||||
|
} |
||||
|
if (connection != null) { |
||||
|
connection.disconnect(); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public static String extractTag(String html, String startTag, String endTag) |
||||
|
throws ParseException { |
||||
|
int startIndex = html.indexOf(startTag); |
||||
|
if (startIndex == -1) { |
||||
|
throw new ParseException("未找到开始标签: " + startTag, |
||||
|
ParseException.ErrorType.TAG_NOT_FOUND); |
||||
|
} |
||||
|
|
||||
|
int endIndex = html.indexOf(endTag, startIndex + startTag.length()); |
||||
|
if (endIndex == -1) { |
||||
|
throw new ParseException("未找到结束标签: " + endTag, |
||||
|
ParseException.ErrorType.TAG_NOT_FOUND); |
||||
|
} |
||||
|
|
||||
|
return html.substring(startIndex + startTag.length(), endIndex).trim(); |
||||
|
} |
||||
|
|
||||
|
public static String extractTagSafe(String html, String startTag, String endTag) { |
||||
|
try { |
||||
|
return extractTag(html, startTag, endTag); |
||||
|
} catch (ParseException e) { |
||||
|
return "未找到"; |
||||
|
} |
||||
|
} |
||||
|
} |
||||
Loading…
Reference in new issue