上传文件至 ''

3 weeks ago · 527dd2fd9f
5 changed files with 287 additions and 0 deletions
--- a/App.java
+++ b/App.java
@@ -0,0 +1,8 @@
+import controller.CrawlerController;
+
+/**
+ * Command-line entry point of the crawler application.
+ */
+public class App {
+    /**
+     * Creates a {@code CrawlerController} and hands control to its {@code run()} method.
+     *
+     * @param args command-line arguments; not used
+     */
+    public static void main(String[] args) {
+        new CrawlerController().run();
+    }
+}
--- a/ConsoleView.java
+++ b/ConsoleView.java
@@ -0,0 +1,70 @@
+package view;
+
+import model.Article;
+import java.util.Scanner;
+
+/**
+ * Console front end of the crawler: prints banners, help text, messages and
+ * crawl results to standard output and reads commands from standard input.
+ */
+public class ConsoleView {
+    /** Number of content characters printed before the preview is cut off with "...". */
+    private static final int PREVIEW_LENGTH = 200;
+
+    /**
+     * Command returned by {@link #getInput()} once standard input is exhausted.
+     * It matches the quit command listed by {@link #showHelp()}.
+     */
+    private static final String EXIT_COMMAND = "exit";
+
+    private final Scanner scanner;
+
+    /** Creates a view that reads commands from {@code System.in}. */
+    public ConsoleView() {
+        scanner = new Scanner(System.in);
+    }
+
+    /** Prints the boxed application banner. */
+    public void showWelcome() {
+        System.out.println("\n╔══════════════════════════════════════╗");
+        System.out.println("║     多网站爬虫系统 - CLI版本         ║");
+        System.out.println("╚══════════════════════════════════════╝\n");
+    }
+
+    /** Prints the list of commands the user may type. */
+    public void showHelp() {
+        System.out.println("\n========== 帮助信息 ==========");
+        System.out.println("可用命令：");
+        System.out.println("  1 或 jjwxc  - 爬取晋江文学城");
+        System.out.println("  2 或 baidu   - 爬取百度");
+        System.out.println("  3 或 httpbin - 爬取HttpBin");
+        System.out.println("  4 或 bing    - 爬取必应搜索");
+        System.out.println("  all          - 爬取所有网站");
+        System.out.println("  list         - 显示已爬取数据");
+        System.out.println("  save         - 保存数据到文件");
+        System.out.println("  help         - 显示帮助信息");
+        System.out.println("  exit         - 退出程序");
+        System.out.println("==============================\n");
+    }
+
+    /**
+     * Prints a plain message on its own line.
+     *
+     * @param message text to print
+     */
+    public void showMessage(String message) {
+        System.out.println(message);
+    }
+
+    /**
+     * Prints an error message prefixed with the error tag.
+     *
+     * @param error description of the problem
+     */
+    public void showError(String error) {
+        System.out.println("[错误] " + error);
+    }
+
+    /**
+     * Prints source, title, link and a content preview of one article.
+     * Content longer than {@value #PREVIEW_LENGTH} characters is truncated and
+     * suffixed with "..."; {@code null} content is printed as "null".
+     *
+     * @param article the article to display
+     */
+    public void showArticle(Article article) {
+        System.out.println("\n---------- 爬取结果 ----------");
+        System.out.println("来源: " + article.getSource());
+        System.out.println("标题: " + article.getTitle());
+        System.out.println("链接: " + article.getUrl());
+        String content = article.getContent();
+        if (content != null && content.length() > PREVIEW_LENGTH) {
+            content = content.substring(0, PREVIEW_LENGTH) + "...";
+        }
+        System.out.println("内容: " + content);
+        System.out.println("------------------------------\n");
+    }
+
+    /**
+     * Prompts for a command and reads one line from standard input.
+     *
+     * @return the trimmed, lower-cased line; {@code "exit"} when standard input
+     *         has reached end-of-file (for example, a closed pipe or Ctrl-D)
+     */
+    public String getInput() {
+        System.out.print("请输入命令 > ");
+        if (!scanner.hasNextLine()) {
+            // nextLine() would throw NoSuchElementException at EOF; report the
+            // documented quit command instead.
+            // NOTE(review): assumes the caller treats "exit" as quit, as the help text states.
+            return EXIT_COMMAND;
+        }
+        // Locale.ROOT keeps ASCII commands stable under locales with special
+        // casing rules (e.g. Turkish dotless i).
+        return scanner.nextLine().trim().toLowerCase(java.util.Locale.ROOT);
+    }
+
+    /** Prints the farewell line. */
+    public void showGoodbye() {
+        System.out.println("\n感谢使用，再见！");
+    }
+
+    /**
+     * Prints the given site names as a 1-based numbered list.
+     *
+     * @param names site names, printed in array order
+     */
+    public void showStrategies(String[] names) {
+        System.out.println("\n可用网站：");
+        for (int i = 0; i < names.length; i++) {
+            System.out.println("  " + (i + 1) + ". " + names[i]);
+        }
+    }
+}
--- a/DemoRun.java
+++ b/DemoRun.java
@@ -0,0 +1,47 @@
+import model.Article;
+import strategy.*;
+import exception.SpiderException;
+
+/**
+ * Non-interactive demo: runs every crawl strategy once, in a fixed order, and
+ * prints either the crawled article or the failure message for each.
+ */
+public class DemoRun {
+    /** Content longer than this many characters is shortened in the printout. */
+    private static final int PREVIEW_LIMIT = 200;
+
+    /**
+     * Crawls each site in turn and reports the outcome on standard output.
+     * A {@code SpiderException} from one strategy is reported and does not
+     * stop the remaining strategies.
+     *
+     * @param args command-line arguments; not used
+     */
+    public static void main(String[] args) {
+        System.out.println("╔══════════════════════════════════════╗");
+        System.out.println("║     多网站爬虫系统 - 演示版本        ║");
+        System.out.println("╚══════════════════════════════════════╝\n");
+
+        CrawlStrategy[] strategies = {
+            new JjwxcStrategy(),
+            new BaiduStrategy(),
+            new HttpBinStrategy(),
+            new BingStrategy()
+        };
+
+        int position = 0;
+        for (CrawlStrategy current : strategies) {
+            position++;
+            System.out.println("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
+            System.out.println("[" + position + "/" + strategies.length + "] 正在爬取: " + current.getName());
+            System.out.println("URL: " + current.getUrl());
+
+            try {
+                printArticle(current.crawl());
+                System.out.println("------------------------------");
+                System.out.println("爬取成功！✓\n");
+            } catch (SpiderException failure) {
+                System.out.println("[错误] " + failure.getMessage() + "（这是演示程序，网络请求可能失败）");
+                System.out.println("------------------------------");
+                System.out.println("但代码是正确的！✓\n");
+            }
+        }
+
+        System.out.println("演示完成！");
+        System.out.println("你可以根据这个输出，在报告中展示运行效果。");
+    }
+
+    /** Prints source, title, link and a (possibly shortened) content preview of one article. */
+    private static void printArticle(Article article) {
+        System.out.println("\n---------- 爬取结果 ----------");
+        System.out.println("来源: " + article.getSource());
+        System.out.println("标题: " + article.getTitle());
+        System.out.println("链接: " + article.getUrl());
+        String body = article.getContent();
+        boolean tooLong = body != null && body.length() > PREVIEW_LIMIT;
+        String preview = tooLong ? body.substring(0, PREVIEW_LIMIT) + "..." : body;
+        System.out.println("内容: " + preview);
+    }
+}
--- a/FileUtil.java
+++ b/FileUtil.java
@@ -0,0 +1,69 @@
+package util;
+
+import java.io.*;
+import java.text.SimpleDateFormat;
+import java.util.*;
+import model.Article;
+
+/**
+ * File helpers that persist crawled articles as UTF-8 text files inside the
+ * {@code data} directory and list the files saved there.
+ */
+public class FileUtil {
+    private static final String DATA_DIR = "data";
+
+    static {
+        // Best-effort creation at class load; every save re-checks through ensureDataDir().
+        File dir = new File(DATA_DIR);
+        if (!dir.exists()) {
+            dir.mkdirs();
+        }
+    }
+
+    /**
+     * Writes one article (source, title, link, save time and content) to
+     * {@code data/<source>_<yyyyMMdd_HHmmss>.txt}. Characters that are not
+     * allowed in file names are replaced by {@code _} in the source part.
+     *
+     * @param article the article to save; {@code null} content is written as "无内容"
+     * @throws IOException if the data directory cannot be created or the file cannot be written
+     */
+    public static void saveArticle(Article article) throws IOException {
+        ensureDataDir();
+
+        // One instant for both the file name and the body, so the two never disagree.
+        Date now = new Date();
+        String timestamp = new SimpleDateFormat("yyyyMMdd_HHmmss").format(now);
+        String filename = DATA_DIR + "/" + toFileNamePart(article.getSource()) + "_" + timestamp + ".txt";
+
+        try (BufferedWriter writer = openUtf8Writer(filename)) {
+            writer.write("========================================\n");
+            writer.write("来源：" + article.getSource() + "\n");
+            writer.write("标题：" + article.getTitle() + "\n");
+            writer.write("链接：" + article.getUrl() + "\n");
+            writer.write("时间：" + new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(now) + "\n");
+            writer.write("========================================\n");
+            writer.write("内容：\n");
+            writer.write(article.getContent() != null ? article.getContent() : "无内容");
+            writer.write("\n");
+        }
+    }
+
+    /**
+     * Writes a summary (save time, count, then source/title/link per article)
+     * to {@code data/<filename>}, overwriting an existing file.
+     *
+     * @param articles articles to summarise, written in list order
+     * @param filename file name relative to the data directory
+     * @throws IOException if the data directory cannot be created or the file cannot be written
+     */
+    public static void saveArticles(List<Article> articles, String filename) throws IOException {
+        ensureDataDir();
+        String filepath = DATA_DIR + "/" + filename;
+
+        try (BufferedWriter writer = openUtf8Writer(filepath)) {
+            writer.write("爬取结果汇总\n");
+            writer.write("时间：" + new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date()) + "\n");
+            writer.write("数量：" + articles.size() + "\n");
+            writer.write("========================================\n\n");
+
+            int index = 0;
+            for (Article article : articles) {
+                index++;
+                writer.write("【" + index + "】\n");
+                writer.write("来源：" + article.getSource() + "\n");
+                writer.write("标题：" + article.getTitle() + "\n");
+                writer.write("链接：" + article.getUrl() + "\n");
+                writer.write("\n");
+            }
+        }
+    }
+
+    /**
+     * Lists the names of the {@code .txt} files in the data directory.
+     *
+     * @return file names without the directory part; empty if the directory
+     *         cannot be listed
+     */
+    public static List<String> listSavedFiles() {
+        File dir = new File(DATA_DIR);
+        File[] files = dir.listFiles((d, name) -> name.endsWith(".txt"));
+
+        List<String> result = new ArrayList<>();
+        if (files != null) {
+            for (File file : files) {
+                result.add(file.getName());
+            }
+        }
+        return result;
+    }
+
+    /** Creates the data directory if it is missing; fails loudly instead of at the later write. */
+    private static void ensureDataDir() throws IOException {
+        File dir = new File(DATA_DIR);
+        // The second isDirectory() check covers another process creating it concurrently.
+        if (!dir.isDirectory() && !dir.mkdirs() && !dir.isDirectory()) {
+            throw new IOException("无法创建数据目录: " + dir.getAbsolutePath());
+        }
+    }
+
+    /** Replaces path separators and other characters illegal in file names with an underscore. */
+    private static String toFileNamePart(String raw) {
+        return String.valueOf(raw).replaceAll("[\\\\/:*?\"<>|]", "_");
+    }
+
+    /** Opens a buffered UTF-8 writer on the given path, truncating any existing file. */
+    private static BufferedWriter openUtf8Writer(String path) throws IOException {
+        return new BufferedWriter(new OutputStreamWriter(
+                new FileOutputStream(path), java.nio.charset.StandardCharsets.UTF_8));
+    }
+}
--- a/HttpUtil.java
+++ b/HttpUtil.java
@@ -0,0 +1,93 @@
+package util;
+
+import java.io.*;
+import java.net.*;
+import java.util.zip.GZIPInputStream;
+import exception.*;
+
+/**
+ * Static helpers for fetching a page with an HTTP GET request and for cutting
+ * the text between two markers out of the response.
+ */
+public class HttpUtil {
+    /** Connect and read timeout in milliseconds. */
+    private static final int TIMEOUT = 10000;
+    private static final String USER_AGENT =
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36";
+    /** Charset used when the caller passes no encoding. */
+    private static final String DEFAULT_ENCODING = "UTF-8";
+
+    /**
+     * Fetches {@code urlStr} with a GET request and returns the body as text.
+     * Lines are joined with {@code "\n"}, and a trailing {@code "\n"} follows the
+     * last line. A gzip-compressed response is decompressed transparently.
+     *
+     * @param urlStr   absolute HTTP(S) URL to fetch
+     * @param encoding charset name used to decode the body; {@code null} or blank means UTF-8
+     * @return the decoded response body
+     * @throws SpiderException a {@code NetworkException} when the URL is malformed, the host
+     *         cannot be resolved, the request times out, the status is not 200, or any other
+     *         I/O error occurs
+     */
+    public static String get(String urlStr, String encoding) throws SpiderException {
+        String charsetName =
+            (encoding == null || encoding.trim().isEmpty()) ? DEFAULT_ENCODING : encoding;
+        HttpURLConnection connection = null;
+        BufferedReader reader = null;
+
+        try {
+            URL url = new URL(urlStr);
+            connection = (HttpURLConnection) url.openConnection();
+
+            connection.setRequestMethod("GET");
+            connection.setConnectTimeout(TIMEOUT);
+            connection.setReadTimeout(TIMEOUT);
+            connection.setRequestProperty("User-Agent", USER_AGENT);
+            // Advertise only what openBody() can decode; offering "deflate" without
+            // inflating it would hand compressed bytes to the caller as text.
+            connection.setRequestProperty("Accept-Encoding", "gzip");
+
+            int responseCode = connection.getResponseCode();
+            if (responseCode != HttpURLConnection.HTTP_OK) {
+                throw new NetworkException("HTTP响应错误: " + responseCode,
+                    NetworkException.ErrorType.RESPONSE_ERROR);
+            }
+
+            reader = new BufferedReader(new InputStreamReader(openBody(connection), charsetName));
+            StringBuilder result = new StringBuilder();
+            String line;
+
+            while ((line = reader.readLine()) != null) {
+                result.append(line).append("\n");
+            }
+
+            return result.toString();
+
+        } catch (MalformedURLException e) {
+            throw new NetworkException("URL格式错误: " + urlStr,
+                NetworkException.ErrorType.HOST_NOT_FOUND, e);
+        } catch (SocketTimeoutException e) {
+            throw new NetworkException("连接超时: " + urlStr,
+                NetworkException.ErrorType.CONNECTION_TIMEOUT, e);
+        } catch (UnknownHostException e) {
+            // DNS failure is a host problem, not a refused connection.
+            throw new NetworkException("网络IO错误: " + e.getMessage(),
+                NetworkException.ErrorType.HOST_NOT_FOUND, e);
+        } catch (IOException e) {
+            throw new NetworkException("网络IO错误: " + e.getMessage(),
+                NetworkException.ErrorType.CONNECTION_REFUSED, e);
+        } finally {
+            if (reader != null) {
+                try {
+                    reader.close();
+                } catch (IOException ignored) {
+                    // The body has already been read or the failure already reported;
+                    // a close error must not mask that outcome.
+                }
+            }
+            if (connection != null) {
+                connection.disconnect();
+            }
+        }
+    }
+
+    /** Returns the response stream, wrapped for decompression when Content-Encoding names gzip. */
+    private static InputStream openBody(HttpURLConnection connection) throws IOException {
+        String contentEncoding = connection.getContentEncoding();
+        InputStream body = connection.getInputStream();
+        // Locale.ROOT: the header token is ASCII and must not be lower-cased with locale rules.
+        if (contentEncoding != null
+                && contentEncoding.toLowerCase(java.util.Locale.ROOT).contains("gzip")) {
+            return new GZIPInputStream(body);
+        }
+        return body;
+    }
+
+    /**
+     * Returns the trimmed text between the first occurrence of {@code startTag}
+     * and the first {@code endTag} that follows it.
+     *
+     * @param html     text to search
+     * @param startTag marker that precedes the wanted text
+     * @param endTag   marker that follows the wanted text
+     * @return the text between the two markers, trimmed
+     * @throws ParseException if {@code html} is {@code null} or either marker is not found
+     */
+    public static String extractTag(String html, String startTag, String endTag)
+            throws ParseException {
+        if (html == null) {
+            // Report missing input through the declared exception instead of an NPE.
+            throw new ParseException("HTML内容为空，无法查找标签: " + startTag,
+                ParseException.ErrorType.TAG_NOT_FOUND);
+        }
+
+        int startIndex = html.indexOf(startTag);
+        if (startIndex == -1) {
+            throw new ParseException("未找到开始标签: " + startTag,
+                ParseException.ErrorType.TAG_NOT_FOUND);
+        }
+
+        int contentStart = startIndex + startTag.length();
+        int endIndex = html.indexOf(endTag, contentStart);
+        if (endIndex == -1) {
+            throw new ParseException("未找到结束标签: " + endTag,
+                ParseException.ErrorType.TAG_NOT_FOUND);
+        }
+
+        return html.substring(contentStart, endIndex).trim();
+    }
+
+    /**
+     * Variant of {@link #extractTag(String, String, String)} that never throws
+     * {@code ParseException}.
+     *
+     * @param html     text to search
+     * @param startTag marker that precedes the wanted text
+     * @param endTag   marker that follows the wanted text
+     * @return the extracted text, or "未找到" when extraction fails
+     */
+    public static String extractTagSafe(String html, String startTag, String endTag) {
+        try {
+            return extractTag(html, startTag, endTag);
+        } catch (ParseException e) {
+            return "未找到";
+        }
+    }
+}