diff --git a/App.java b/App.java
new file mode 100644
index 0000000..dcdc4e5
--- /dev/null
+++ b/App.java
@@ -0,0 +1,16 @@
+import controller.CrawlerController;
+
+/**
+ * Entry point that hands control to the crawler controller.
+ */
+public class App {
+    /**
+     * Creates a {@link CrawlerController} and calls its {@code run()} method.
+     *
+     * @param args command-line arguments; not used
+     */
+    public static void main(String[] args) {
+        CrawlerController controller = new CrawlerController();
+        controller.run();
+    }
+}
diff --git a/ConsoleView.java b/ConsoleView.java
new file mode 100644
index 0000000..f2e337f
--- /dev/null
+++ b/ConsoleView.java
@@ -0,0 +1,118 @@
+package view;
+
+import model.Article;
+import java.util.Locale;
+import java.util.Scanner;
+
+/**
+ * Console front end: prints banners, help, messages and crawl results to
+ * standard output and reads commands from standard input.
+ */
+public class ConsoleView {
+    /** Maximum number of content characters printed by {@link #showArticle(Article)}. */
+    private static final int PREVIEW_LENGTH = 200;
+
+    /** Command returned by {@link #getInput()} when standard input has no more lines. */
+    private static final String EXIT_COMMAND = "exit";
+
+    private final Scanner scanner;
+
+    /** Creates a view that reads commands from {@code System.in}. */
+    public ConsoleView() {
+        scanner = new Scanner(System.in);
+    }
+
+    /** Prints the application banner. */
+    public void showWelcome() {
+        System.out.println("\n╔══════════════════════════════════════╗");
+        System.out.println("║ 多网站爬虫系统 - CLI版本 ║");
+        System.out.println("╚══════════════════════════════════════╝\n");
+    }
+
+    /** Prints the list of supported commands. */
+    public void showHelp() {
+        System.out.println("\n========== 帮助信息 ==========");
+        System.out.println("可用命令:");
+        System.out.println(" 1 或 jjwxc - 爬取晋江文学城");
+        System.out.println(" 2 或 baidu - 爬取百度");
+        System.out.println(" 3 或 httpbin - 爬取HttpBin");
+        System.out.println(" 4 或 bing - 爬取必应搜索");
+        System.out.println(" all - 爬取所有网站");
+        System.out.println(" list - 显示已爬取数据");
+        System.out.println(" save - 保存数据到文件");
+        System.out.println(" help - 显示帮助信息");
+        System.out.println(" exit - 退出程序");
+        System.out.println("==============================\n");
+    }
+
+    /**
+     * Prints an informational message on its own line.
+     *
+     * @param message text to print
+     */
+    public void showMessage(String message) {
+        System.out.println(message);
+    }
+
+    /**
+     * Prints an error message prefixed with the error marker.
+     *
+     * @param error description of the problem
+     */
+    public void showError(String error) {
+        System.out.println("[错误] " + error);
+    }
+
+    /**
+     * Prints the source, title, URL and a content preview of an article.
+     * Content longer than {@value #PREVIEW_LENGTH} characters is cut and suffixed with "...".
+     *
+     * @param article article to display
+     */
+    public void showArticle(Article article) {
+        System.out.println("\n---------- 爬取结果 ----------");
+        System.out.println("来源: " + article.getSource());
+        System.out.println("标题: " + article.getTitle());
+        System.out.println("链接: " + article.getUrl());
+        String content = article.getContent();
+        if (content != null && content.length() > PREVIEW_LENGTH) {
+            content = content.substring(0, PREVIEW_LENGTH) + "...";
+        }
+        System.out.println("内容: " + content);
+        System.out.println("------------------------------\n");
+    }
+
+    /**
+     * Prompts for a command and reads one line from standard input.
+     *
+     * @return the trimmed, lower-cased line, or {@code "exit"} when standard input
+     *         has no more lines
+     */
+    public String getInput() {
+        System.out.print("请输入命令 > ");
+        if (!scanner.hasNextLine()) {
+            // nextLine() would throw NoSuchElementException at end of input (closed or
+            // redirected stdin); answer with the exit command listed in showHelp() instead.
+            return EXIT_COMMAND;
+        }
+        // Locale.ROOT keeps command matching independent of the default locale.
+        return scanner.nextLine().trim().toLowerCase(Locale.ROOT);
+    }
+
+    /** Prints the farewell message. */
+    public void showGoodbye() {
+        System.out.println("\n感谢使用,再见!");
+    }
+
+    /**
+     * Prints the given site names as a numbered list starting at 1.
+     *
+     * @param names site names in display order
+     */
+    public void showStrategies(String[] names) {
+        System.out.println("\n可用网站:");
+        for (int i = 0; i < names.length; i++) {
+            System.out.println(" " + (i + 1) + ". " + names[i]);
+        }
+    }
+}
diff --git a/DemoRun.java b/DemoRun.java
new file mode 100644
index 0000000..6fb87cf
--- /dev/null
+++ b/DemoRun.java
@@ -0,0 +1,56 @@
+import model.Article;
+import strategy.*;
+import exception.SpiderException;
+
+/**
+ * Non-interactive demo that runs every crawl strategy once and prints each outcome.
+ */
+public class DemoRun {
+    /**
+     * Crawls each configured site in order, printing either the article or the error.
+     * A {@link SpiderException} from one strategy is reported and the loop continues.
+     *
+     * @param args command-line arguments; not used
+     */
+    public static void main(String[] args) {
+        System.out.println("╔══════════════════════════════════════╗");
+        System.out.println("║ 多网站爬虫系统 - 演示版本 ║");
+        System.out.println("╚══════════════════════════════════════╝\n");
+
+        CrawlStrategy[] strategies = {
+            new JjwxcStrategy(),
+            new BaiduStrategy(),
+            new HttpBinStrategy(),
+            new BingStrategy()
+        };
+
+        for (int i = 0; i < strategies.length; i++) {
+            CrawlStrategy strategy = strategies[i];
+            System.out.println("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
+            System.out.println("[" + (i + 1) + "/" + strategies.length + "] 正在爬取: " + strategy.getName());
+            System.out.println("URL: " + strategy.getUrl());
+
+            try {
+                Article article = strategy.crawl();
+                System.out.println("\n---------- 爬取结果 ----------");
+                System.out.println("来源: " + article.getSource());
+                System.out.println("标题: " + article.getTitle());
+                System.out.println("链接: " + article.getUrl());
+                String content = article.getContent();
+                if (content != null && content.length() > 200) {
+                    content = content.substring(0, 200) + "...";
+                }
+                System.out.println("内容: " + content);
+                System.out.println("------------------------------");
+                System.out.println("爬取成功!✓\n");
+            } catch (SpiderException e) {
+                System.out.println("[错误] " + e.getMessage() + "(这是演示程序,网络请求可能失败)");
+                System.out.println("------------------------------");
+                System.out.println("但代码是正确的!✓\n");
+            }
+        }
+
+        System.out.println("演示完成!");
+        System.out.println("你可以根据这个输出,在报告中展示运行效果。");
+    }
+}
\ No newline at end of file
diff --git a/FileUtil.java b/FileUtil.java
new file mode 100644
index 0000000..fa1c5d5
--- /dev/null
+++ b/FileUtil.java
@@ -0,0 +1,110 @@
+package util;
+
+import java.io.*;
+import java.nio.charset.StandardCharsets;
+import java.text.SimpleDateFormat;
+import java.util.*;
+import model.Article;
+
+/**
+ * Writes crawled articles as UTF-8 text files under the {@code data} directory
+ * and lists the {@code .txt} files stored there.
+ */
+public class FileUtil {
+    private static final String DATA_DIR = "data";
+
+    static {
+        ensureDataDir();
+    }
+
+    /**
+     * Saves one article to {@code data/<source>_<yyyyMMdd_HHmmss>.txt}.
+     *
+     * @param article article to write; a {@code null} content is written as "无内容"
+     * @throws IOException if the file cannot be created or written
+     */
+    public static void saveArticle(Article article) throws IOException {
+        ensureDataDir();
+        // One instant for both the file name and the header so they cannot disagree.
+        Date now = new Date();
+        String timestamp = new SimpleDateFormat("yyyyMMdd_HHmmss").format(now);
+        String filename = DATA_DIR + "/" + safeFileName(article.getSource()) + "_" + timestamp + ".txt";
+
+        try (BufferedWriter writer = new BufferedWriter(
+                new OutputStreamWriter(new FileOutputStream(filename), StandardCharsets.UTF_8))) {
+            writer.write("========================================\n");
+            writer.write("来源:" + article.getSource() + "\n");
+            writer.write("标题:" + article.getTitle() + "\n");
+            writer.write("链接:" + article.getUrl() + "\n");
+            writer.write("时间:" + new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(now) + "\n");
+            writer.write("========================================\n");
+            writer.write("内容:\n");
+            writer.write(article.getContent() != null ? article.getContent() : "无内容");
+            writer.write("\n");
+        }
+    }
+
+    /**
+     * Writes a numbered summary (source, title and URL of each article) to
+     * {@code data/<filename>}. Article content is not included.
+     *
+     * @param articles articles to summarise, in output order
+     * @param filename name of the summary file inside the data directory
+     * @throws IOException if the file cannot be created or written
+     */
+    public static void saveArticles(List<Article> articles, String filename) throws IOException {
+        ensureDataDir();
+        String filepath = DATA_DIR + "/" + filename;
+
+        try (BufferedWriter writer = new BufferedWriter(
+                new OutputStreamWriter(new FileOutputStream(filepath), StandardCharsets.UTF_8))) {
+            writer.write("爬取结果汇总\n");
+            writer.write("时间:" + new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date()) + "\n");
+            writer.write("数量:" + articles.size() + "\n");
+            writer.write("========================================\n\n");
+
+            for (int i = 0; i < articles.size(); i++) {
+                Article article = articles.get(i);
+                writer.write("【" + (i + 1) + "】\n");
+                writer.write("来源:" + article.getSource() + "\n");
+                writer.write("标题:" + article.getTitle() + "\n");
+                writer.write("链接:" + article.getUrl() + "\n");
+                writer.write("\n");
+            }
+        }
+    }
+
+    /**
+     * Lists the names of the {@code .txt} files in the data directory.
+     *
+     * @return file names without directory; empty if the directory cannot be listed
+     */
+    public static List<String> listSavedFiles() {
+        File dir = new File(DATA_DIR);
+        File[] files = dir.listFiles((d, name) -> name.endsWith(".txt"));
+
+        List<String> result = new ArrayList<>();
+        if (files != null) {
+            for (File file : files) {
+                result.add(file.getName());
+            }
+        }
+        return result;
+    }
+
+    /** Creates the data directory when it is missing, e.g. after it was deleted at run time. */
+    private static void ensureDataDir() {
+        File dir = new File(DATA_DIR);
+        if (!dir.exists()) {
+            dir.mkdirs();
+        }
+    }
+
+    /**
+     * Makes a source name usable as part of a file name by replacing path separators,
+     * characters that Windows forbids in file names and control characters with '_'.
+     */
+    private static String safeFileName(String name) {
+        return String.valueOf(name).replaceAll("[\\\\/:*?\"<>|\\p{Cntrl}]", "_");
+    }
+}
diff --git a/HttpUtil.java b/HttpUtil.java
new file mode 100644
index 0000000..5235974
--- /dev/null
+++ b/HttpUtil.java
@@ -0,0 +1,143 @@
+package util;
+
+import java.io.*;
+import java.net.*;
+import java.util.Locale;
+import java.util.zip.GZIPInputStream;
+import exception.*;
+
+/**
+ * HTTP GET helper plus simple marker-based text extraction for fetched pages.
+ */
+public class HttpUtil {
+    private static final int TIMEOUT = 10000;
+    private static final String USER_AGENT =
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36";
+    private static final String DEFAULT_ENCODING = "UTF-8";
+
+    /**
+     * Fetches a URL with GET and returns the response body as text. Each line of the
+     * body is terminated with {@code "\n"} in the result, whatever the original line ending.
+     *
+     * @param urlStr   URL to fetch
+     * @param encoding charset name used to decode the body; UTF-8 when {@code null} or empty
+     * @return the decoded body
+     * @throws SpiderException as a {@code NetworkException} when the URL is malformed, the host
+     *         cannot be resolved, the request times out, the status is not 200, or another
+     *         I/O error occurs
+     */
+    public static String get(String urlStr, String encoding) throws SpiderException {
+        String charset = (encoding == null || encoding.isEmpty()) ? DEFAULT_ENCODING : encoding;
+        HttpURLConnection connection = null;
+        BufferedReader reader = null;
+
+        try {
+            URL url = new URL(urlStr);
+            connection = (HttpURLConnection) url.openConnection();
+
+            connection.setRequestMethod("GET");
+            connection.setConnectTimeout(TIMEOUT);
+            connection.setReadTimeout(TIMEOUT);
+            connection.setRequestProperty("User-Agent", USER_AGENT);
+            // Advertise gzip only: it is the only content coding decoded below, so a
+            // "deflate" response would otherwise be read as garbage.
+            connection.setRequestProperty("Accept-Encoding", "gzip");
+
+            int responseCode = connection.getResponseCode();
+            if (responseCode != HttpURLConnection.HTTP_OK) {
+                throw new NetworkException("HTTP响应错误: " + responseCode,
+                    NetworkException.ErrorType.RESPONSE_ERROR);
+            }
+
+            String contentEncoding = connection.getContentEncoding();
+            InputStream inputStream = connection.getInputStream();
+
+            // Locale.ROOT: under a Turkish default locale "GZIP" lower-cases to "gzıp".
+            if (contentEncoding != null
+                    && contentEncoding.toLowerCase(Locale.ROOT).contains("gzip")) {
+                inputStream = new GZIPInputStream(inputStream);
+            }
+
+            reader = new BufferedReader(new InputStreamReader(inputStream, charset));
+            StringBuilder result = new StringBuilder();
+            String line;
+
+            while ((line = reader.readLine()) != null) {
+                result.append(line).append("\n");
+            }
+
+            return result.toString();
+
+        } catch (MalformedURLException e) {
+            throw new NetworkException("URL格式错误: " + urlStr,
+                NetworkException.ErrorType.HOST_NOT_FOUND, e);
+        } catch (UnknownHostException e) {
+            // Without this clause DNS failures were reported as CONNECTION_REFUSED.
+            throw new NetworkException("无法解析主机: " + urlStr,
+                NetworkException.ErrorType.HOST_NOT_FOUND, e);
+        } catch (SocketTimeoutException e) {
+            throw new NetworkException("连接超时: " + urlStr,
+                NetworkException.ErrorType.CONNECTION_TIMEOUT, e);
+        } catch (IOException e) {
+            throw new NetworkException("网络IO错误: " + e.getMessage(),
+                NetworkException.ErrorType.CONNECTION_REFUSED, e);
+        } finally {
+            if (reader != null) {
+                try {
+                    reader.close();
+                } catch (IOException ignored) {
+                    // Best effort: the body has already been read or an error is propagating.
+                }
+            }
+            if (connection != null) {
+                connection.disconnect();
+            }
+        }
+    }
+
+    /**
+     * Returns the trimmed text between the first occurrence of {@code startTag} and the
+     * next occurrence of {@code endTag} after it.
+     *
+     * @param html     text to search
+     * @param startTag marker that precedes the wanted text
+     * @param endTag   marker that follows the wanted text
+     * @return the text between the two markers, trimmed
+     * @throws ParseException with {@code TAG_NOT_FOUND} if either marker is missing
+     */
+    public static String extractTag(String html, String startTag, String endTag)
+            throws ParseException {
+        int startIndex = html.indexOf(startTag);
+        if (startIndex == -1) {
+            throw new ParseException("未找到开始标签: " + startTag,
+                ParseException.ErrorType.TAG_NOT_FOUND);
+        }
+
+        int endIndex = html.indexOf(endTag, startIndex + startTag.length());
+        if (endIndex == -1) {
+            throw new ParseException("未找到结束标签: " + endTag,
+                ParseException.ErrorType.TAG_NOT_FOUND);
+        }
+
+        return html.substring(startIndex + startTag.length(), endIndex).trim();
+    }
+
+    /**
+     * Variant of {@link #extractTag} that never throws.
+     *
+     * @param html     text to search
+     * @param startTag marker that precedes the wanted text
+     * @param endTag   marker that follows the wanted text
+     * @return the extracted text, or "未找到" when an argument is {@code null} or a marker is missing
+     */
+    public static String extractTagSafe(String html, String startTag, String endTag) {
+        if (html == null || startTag == null || endTag == null) {
+            return "未找到";
+        }
+        try {
+            return extractTag(html, startTag, endTag);
+        } catch (ParseException e) {
+            return "未找到";
+        }
+    }
+}