上传文件至 ''

3 weeks ago · 527dd2fd9f
5 changed files with 287 additions and 0 deletions
--- a/App.java
+++ b/App.java
@ -0,0 +1,8 @@
 import controller.CrawlerController;
 public class App {
    public static void main(String[] args) {
        CrawlerController controller = new CrawlerController();
        controller.run();
    }
 }
--- a/ConsoleView.java
+++ b/ConsoleView.java
@ -0,0 +1,70 @@
 package view;
 import model.Article;
 import java.util.Scanner;
 /**
  * Console front end: prints banners, help, messages and crawl results to
  * standard output and reads user commands from standard input.
  */
 public class ConsoleView {
    /** Reader over {@code System.in}; created once and reused for every prompt. */
    private final Scanner scanner;

    /** Creates a view that reads commands from {@code System.in}. */
    public ConsoleView() {
        scanner = new Scanner(System.in);
    }

    /** Prints the boxed welcome banner. */
    public void showWelcome() {
        System.out.println("\n╔══════════════════════════════════════╗");
        System.out.println("║     多网站爬虫系统 - CLI版本         ║");
        System.out.println("╚══════════════════════════════════════╝\n");
    }

    /** Prints the list of supported commands. */
    public void showHelp() {
        System.out.println("\n========== 帮助信息 ==========");
        System.out.println("可用命令：");
        System.out.println("  1 或 jjwxc  - 爬取晋江文学城");
        System.out.println("  2 或 baidu   - 爬取百度");
        System.out.println("  3 或 httpbin - 爬取HttpBin");
        System.out.println("  4 或 bing    - 爬取必应搜索");
        System.out.println("  all          - 爬取所有网站");
        System.out.println("  list         - 显示已爬取数据");
        System.out.println("  save         - 保存数据到文件");
        System.out.println("  help         - 显示帮助信息");
        System.out.println("  exit         - 退出程序");
        System.out.println("==============================\n");
    }

    /**
     * Prints a plain message on its own line.
     *
     * @param message text to print
     */
    public void showMessage(String message) {
        System.out.println(message);
    }

    /**
     * Prints an error message with the error prefix.
     *
     * @param error description of the problem
     */
    public void showError(String error) {
        System.out.println("[错误] " + error);
    }

    /**
     * Prints the source, title, URL and content of an article. Content longer
     * than 200 characters is cut to its first 200 characters followed by
     * {@code "..."}.
     *
     * @param article article to display
     */
    public void showArticle(Article article) {
        System.out.println("\n---------- 爬取结果 ----------");
        System.out.println("来源: " + article.getSource());
        System.out.println("标题: " + article.getTitle());
        System.out.println("链接: " + article.getUrl());
        String content = article.getContent();
        if (content != null && content.length() > 200) {
            content = content.substring(0, 200) + "...";
        }
        System.out.println("内容: " + content);
        System.out.println("------------------------------\n");
    }

    /**
     * Prompts for a command and returns it trimmed and lower-cased.
     *
     * <p>When standard input is exhausted (EOF, closed pipe) there is nothing
     * left to read, so this returns {@code "exit"} — the quit command listed by
     * {@link #showHelp()} — instead of letting {@code Scanner.nextLine()} throw.
     * NOTE(review): the code that consumes this value is outside this file;
     * confirm it treats {@code "exit"} as the quit command.
     *
     * @return the normalized command, or {@code "exit"} at end of input
     */
    public String getInput() {
        System.out.print("请输入命令 > ");
        if (!scanner.hasNextLine()) {
            return "exit";
        }
        // Locale.ROOT keeps ASCII commands stable under locales with special
        // casing rules (e.g. Turkish maps 'I' to dotless 'ı').
        return scanner.nextLine().trim().toLowerCase(java.util.Locale.ROOT);
    }

    /** Prints the farewell message. */
    public void showGoodbye() {
        System.out.println("\n感谢使用，再见！");
    }

    /**
     * Prints the given names as a 1-based numbered list.
     *
     * @param names names to list, in display order
     */
    public void showStrategies(String[] names) {
        System.out.println("\n可用网站：");
        for (int i = 0; i < names.length; i++) {
            System.out.println("  " + (i + 1) + ". " + names[i]);
        }
    }
 }
--- a/DemoRun.java
+++ b/DemoRun.java
@ -0,0 +1,47 @@
 import model.Article;
 import strategy.*;
 import exception.SpiderException;
 /**
  * Non-interactive demo: runs each crawl strategy once, in a fixed order, and
  * prints either the crawled article or the failure message.
  */
 public class DemoRun {
    /** Maximum number of content characters printed before truncating. */
    private static final int PREVIEW_LIMIT = 200;

    /**
     * Runs the demo over all four strategies.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        System.out.println("╔══════════════════════════════════════╗");
        System.out.println("║     多网站爬虫系统 - 演示版本        ║");
        System.out.println("╚══════════════════════════════════════╝\n");

        CrawlStrategy[] strategies = {
            new JjwxcStrategy(),
            new BaiduStrategy(),
            new HttpBinStrategy(),
            new BingStrategy()
        };
        final int total = strategies.length;

        int position = 0;
        for (CrawlStrategy current : strategies) {
            position++;
            System.out.println("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
            System.out.println("[" + position + "/" + total + "] 正在爬取: " + current.getName());
            System.out.println("URL: " + current.getUrl());
            try {
                printResult(current.crawl());
            } catch (SpiderException failure) {
                // A failed crawl is reported and the demo moves on to the next strategy.
                System.out.println("[错误] " + failure.getMessage() + "（这是演示程序，网络请求可能失败）");
                System.out.println("------------------------------");
                System.out.println("但代码是正确的！✓\n");
            }
        }

        System.out.println("演示完成！");
        System.out.println("你可以根据这个输出，在报告中展示运行效果。");
    }

    /**
     * Prints one crawled article followed by the success line; content longer
     * than {@link #PREVIEW_LIMIT} characters is shortened and suffixed with
     * {@code "..."}.
     */
    private static void printResult(Article article) {
        System.out.println("\n---------- 爬取结果 ----------");
        System.out.println("来源: " + article.getSource());
        System.out.println("标题: " + article.getTitle());
        System.out.println("链接: " + article.getUrl());
        String body = article.getContent();
        boolean tooLong = body != null && body.length() > PREVIEW_LIMIT;
        String preview = tooLong ? body.substring(0, PREVIEW_LIMIT) + "..." : body;
        System.out.println("内容: " + preview);
        System.out.println("------------------------------");
        System.out.println("爬取成功！✓\n");
    }
 }
--- a/FileUtil.java
+++ b/FileUtil.java
@ -0,0 +1,69 @@
 package util;
 import java.io.*;
 import java.text.SimpleDateFormat;
 import java.util.*;
 import model.Article;
 /**
  * File helpers that write crawled articles as UTF-8 text files under the
  * {@code data} directory (relative to the working directory) and list the
  * {@code .txt} files found there.
  */
 public class FileUtil {
    /** Directory, relative to the working directory, that receives all output files. */
    private static final String DATA_DIR = "data";

    static {
        // Best-effort creation at class load; every write re-checks the
        // directory so a failure here (or a later deletion) is reported then.
        File dir = new File(DATA_DIR);
        if (!dir.exists()) {
            dir.mkdirs();
        }
    }

    /**
     * Writes one article, including its full content, to
     * {@code data/<source>_<yyyyMMdd_HHmmss>.txt}.
     *
     * <p>Characters of the source that are not safe in a file name (path
     * separators, {@code : * ? " < > |} and control characters) are replaced
     * with {@code _}, so the source can never redirect the file outside the
     * data directory. A {@code null} content is written as a placeholder text.
     *
     * @param article article to save
     * @throws IOException if the data directory cannot be created or the file
     *                     cannot be written
     */
    public static void saveArticle(Article article) throws IOException {
        // One instant for both the file name and the timestamp line inside the
        // file, so the two can never disagree.
        Date now = new Date();
        String timestamp = new SimpleDateFormat("yyyyMMdd_HHmmss").format(now);
        String filename = DATA_DIR + "/" + toSafeFileNamePart(article.getSource())
                + "_" + timestamp + ".txt";
        try (BufferedWriter writer = openUtf8Writer(filename)) {
            writer.write("========================================\n");
            writer.write("来源：" + article.getSource() + "\n");
            writer.write("标题：" + article.getTitle() + "\n");
            writer.write("链接：" + article.getUrl() + "\n");
            writer.write("时间：" + new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(now) + "\n");
            writer.write("========================================\n");
            writer.write("内容：\n");
            writer.write(article.getContent() != null ? article.getContent() : "无内容");
            writer.write("\n");
        }
    }

    /**
     * Writes a summary (source, title and URL of each article, without the
     * content) to {@code data/<filename>}, replacing any existing file.
     *
     * @param articles articles to summarize; must not be {@code null}
     * @param filename name of the summary file inside the data directory
     * @throws IOException if the data directory cannot be created or the file
     *                     cannot be written
     * @throws NullPointerException if {@code articles} is {@code null}
     */
    public static void saveArticles(List<Article> articles, String filename) throws IOException {
        // Fail before the target file is opened, so a bad argument does not
        // truncate an existing summary.
        Objects.requireNonNull(articles, "articles");
        String filepath = DATA_DIR + "/" + filename;
        try (BufferedWriter writer = openUtf8Writer(filepath)) {
            writer.write("爬取结果汇总\n");
            writer.write("时间：" + new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date()) + "\n");
            writer.write("数量：" + articles.size() + "\n");
            writer.write("========================================\n\n");
            for (int i = 0; i < articles.size(); i++) {
                Article article = articles.get(i);
                writer.write("【" + (i + 1) + "】\n");
                writer.write("来源：" + article.getSource() + "\n");
                writer.write("标题：" + article.getTitle() + "\n");
                writer.write("链接：" + article.getUrl() + "\n");
                writer.write("\n");
            }
        }
    }

    /**
     * Lists the names of the {@code .txt} entries in the data directory.
     *
     * @return the file names (without directory); empty when the directory
     *         cannot be listed
     */
    public static List<String> listSavedFiles() {
        File dir = new File(DATA_DIR);
        File[] files = dir.listFiles((d, name) -> name.endsWith(".txt"));
        List<String> result = new ArrayList<>();
        if (files != null) {
            for (File file : files) {
                result.add(file.getName());
            }
        }
        return result;
    }

    /** Makes sure the data directory exists, then opens {@code path} for UTF-8 text output. */
    private static BufferedWriter openUtf8Writer(String path) throws IOException {
        ensureDataDir();
        return new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(path), java.nio.charset.StandardCharsets.UTF_8));
    }

    /** Creates the data directory if it is missing; throws when that is impossible. */
    private static void ensureDataDir() throws IOException {
        File dir = new File(DATA_DIR);
        // The trailing isDirectory() check covers a concurrent creation
        // between the first check and mkdirs().
        if (!dir.isDirectory() && !dir.mkdirs() && !dir.isDirectory()) {
            throw new IOException("无法创建数据目录: " + dir.getAbsolutePath());
        }
    }

    /** Replaces characters that are unsafe in a file name with {@code _}; {@code null} becomes {@code "null"}. */
    private static String toSafeFileNamePart(String raw) {
        return String.valueOf(raw).replaceAll("[\\\\/:*?\"<>|\\p{Cntrl}]", "_");
    }
 }
--- a/HttpUtil.java
+++ b/HttpUtil.java
@ -0,0 +1,93 @@
 package util;
 import java.io.*;
 import java.net.*;
 import java.util.zip.GZIPInputStream;
 import exception.*;
 /**
  * HTTP and HTML helpers: a blocking GET that returns the response body as
  * text, and simple substring extraction between two markers.
  */
 public class HttpUtil {
    /** Timeout in milliseconds, applied to both connecting and reading. */
    private static final int TIMEOUT = 10000;
    /** User-Agent header sent with every request. */
    private static final String USER_AGENT = 
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36";

    /**
     * Performs an HTTP GET and returns the body decoded with {@code encoding}.
     * Each line of the body is terminated with {@code "\n"} in the result,
     * whatever line separator the server used.
     *
     * @param urlStr   URL to fetch
     * @param encoding charset name used to decode the body
     * @return the response body as text
     * @throws SpiderException (as {@code NetworkException}) when the URL is
     *         malformed, the host cannot be resolved, the request times out,
     *         the status is not 200, or any other I/O error occurs
     */
    public static String get(String urlStr, String encoding) throws SpiderException {
        HttpURLConnection connection = null;
        BufferedReader reader = null;
        try {
            URL url = new URL(urlStr);
            connection = (HttpURLConnection) url.openConnection();
            connection.setRequestMethod("GET");
            connection.setConnectTimeout(TIMEOUT);
            connection.setReadTimeout(TIMEOUT);
            connection.setRequestProperty("User-Agent", USER_AGENT);
            // Advertise only what is decoded below: a "deflate" response would
            // otherwise be read as text while still compressed.
            connection.setRequestProperty("Accept-Encoding", "gzip");
            int responseCode = connection.getResponseCode();
            if (responseCode != HttpURLConnection.HTTP_OK) {
                throw new NetworkException("HTTP响应错误: " + responseCode, 
                    NetworkException.ErrorType.RESPONSE_ERROR);
            }
            String contentEncoding = connection.getContentEncoding();
            InputStream inputStream = connection.getInputStream();
            // Locale.ROOT keeps the comparison correct under locales with
            // special casing rules (e.g. Turkish turns "GZIP" into "gzıp").
            if (contentEncoding != null
                    && contentEncoding.toLowerCase(java.util.Locale.ROOT).contains("gzip")) {
                inputStream = new GZIPInputStream(inputStream);
            }
            reader = new BufferedReader(new InputStreamReader(inputStream, encoding));
            StringBuilder result = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                result.append(line).append("\n");
            }
            return result.toString();
        } catch (MalformedURLException e) {
            throw new NetworkException("URL格式错误: " + urlStr, 
                NetworkException.ErrorType.HOST_NOT_FOUND, e);
        } catch (SocketTimeoutException e) {
            throw new NetworkException("连接超时: " + urlStr, 
                NetworkException.ErrorType.CONNECTION_TIMEOUT, e);
        } catch (UnknownHostException e) {
            // DNS failure: report it as an unknown host rather than letting it
            // fall into the generic branch below as a refused connection.
            throw new NetworkException("网络IO错误: " + e.getMessage(), 
                NetworkException.ErrorType.HOST_NOT_FOUND, e);
        } catch (IOException e) {
            throw new NetworkException("网络IO错误: " + e.getMessage(), 
                NetworkException.ErrorType.CONNECTION_REFUSED, e);
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // Best-effort close: the outcome of the request is already
                    // decided and the connection is disconnected just below.
                }
            }
            if (connection != null) {
                connection.disconnect();
            }
        }
    }

    /**
     * Returns the trimmed text between the first occurrence of
     * {@code startTag} and the first {@code endTag} that follows it.
     *
     * @param html     text to search; {@code null} is treated as not
     *                 containing the start tag
     * @param startTag marker that precedes the wanted text
     * @param endTag   marker that follows the wanted text
     * @return the text between the two markers, trimmed
     * @throws ParseException if either marker is not found
     */
    public static String extractTag(String html, String startTag, String endTag) 
            throws ParseException {
        // Null input is reported as a parse failure so that extractTagSafe,
        // which only catches ParseException, stays safe for it too.
        int startIndex = html == null ? -1 : html.indexOf(startTag);
        if (startIndex == -1) {
            throw new ParseException("未找到开始标签: " + startTag, 
                ParseException.ErrorType.TAG_NOT_FOUND);
        }
        int contentStart = startIndex + startTag.length();
        int endIndex = html.indexOf(endTag, contentStart);
        if (endIndex == -1) {
            throw new ParseException("未找到结束标签: " + endTag, 
                ParseException.ErrorType.TAG_NOT_FOUND);
        }
        return html.substring(contentStart, endIndex).trim();
    }

    /**
     * Same as {@link #extractTag(String, String, String)}, but returns a
     * placeholder text instead of throwing when a marker is missing.
     *
     * @param html     text to search
     * @param startTag marker that precedes the wanted text
     * @param endTag   marker that follows the wanted text
     * @return the extracted text, or the placeholder when extraction fails
     */
    public static String extractTagSafe(String html, String startTag, String endTag) {
        try {
            return extractTag(html, startTag, endTag);
        } catch (ParseException e) {
            return "未找到";
        }
    }
 }