Browse Source

上传文件至 ''

main
Hanminxi 3 weeks ago
parent
commit
527dd2fd9f
  1. 8
      App.java
  2. 70
      ConsoleView.java
  3. 47
      DemoRun.java
  4. 69
      FileUtil.java
  5. 93
      HttpUtil.java

8
App.java

@ -0,0 +1,8 @@
import controller.CrawlerController;
public class App {
public static void main(String[] args) {
CrawlerController controller = new CrawlerController();
controller.run();
}
}

70
ConsoleView.java

@ -0,0 +1,70 @@
package view;
import model.Article;
import java.util.Scanner;
/**
 * Console front end: prints banners, help, messages and crawl results to
 * {@code System.out} and reads commands from {@code System.in}.
 */
public class ConsoleView {

    /** Number of content characters shown before the preview is cut off. */
    private static final int PREVIEW_LENGTH = 200;

    /** Reader over standard input; created once and reused for every prompt. */
    private final Scanner scanner;

    /** Creates a view that reads commands from {@code System.in}. */
    public ConsoleView() {
        scanner = new Scanner(System.in);
    }

    /** Prints the start-up banner. */
    public void showWelcome() {
        System.out.println("\n╔══════════════════════════════════════╗");
        System.out.println("║ 多网站爬虫系统 - CLI版本 ║");
        System.out.println("╚══════════════════════════════════════╝\n");
    }

    /** Prints the list of commands the prompt understands. */
    public void showHelp() {
        System.out.println("\n========== 帮助信息 ==========");
        System.out.println("可用命令:");
        System.out.println(" 1 或 jjwxc - 爬取晋江文学城");
        System.out.println(" 2 或 baidu - 爬取百度");
        System.out.println(" 3 或 httpbin - 爬取HttpBin");
        System.out.println(" 4 或 bing - 爬取必应搜索");
        System.out.println(" all - 爬取所有网站");
        System.out.println(" list - 显示已爬取数据");
        System.out.println(" save - 保存数据到文件");
        System.out.println(" help - 显示帮助信息");
        System.out.println(" exit - 退出程序");
        System.out.println("==============================\n");
    }

    /**
     * Prints a plain message on its own line.
     *
     * @param message text to print
     */
    public void showMessage(String message) {
        System.out.println(message);
    }

    /**
     * Prints an error message prefixed with the error marker.
     *
     * @param error description of the problem
     */
    public void showError(String error) {
        System.out.println("[错误] " + error);
    }

    /**
     * Prints source, title, link and a content preview of one article.
     * Content longer than 200 characters is cut to 200 and suffixed with "...".
     *
     * @param article the article to display
     */
    public void showArticle(Article article) {
        System.out.println("\n---------- 爬取结果 ----------");
        System.out.println("来源: " + article.getSource());
        System.out.println("标题: " + article.getTitle());
        System.out.println("链接: " + article.getUrl());
        String content = article.getContent();
        if (content != null && content.length() > PREVIEW_LENGTH) {
            content = content.substring(0, PREVIEW_LENGTH) + "...";
        }
        System.out.println("内容: " + content);
        System.out.println("------------------------------\n");
    }

    /**
     * Prompts for a command and returns it trimmed and lower-cased.
     *
     * <p>When standard input has no further line (closed or redirected stream that
     * reached its end) this returns {@code "exit"} instead of letting
     * {@code Scanner.nextLine()} throw {@code NoSuchElementException}.
     *
     * @return the normalized command, or {@code "exit"} at end of input
     */
    public String getInput() {
        System.out.print("请输入命令 > ");
        if (!scanner.hasNextLine()) {
            // NOTE(review): assumes the caller treats "exit" as the quit command,
            // as listed in showHelp() — confirm against the controller.
            return "exit";
        }
        // Locale.ROOT keeps the mapping stable; a default Turkish locale would turn "LIST" into "lıst".
        return scanner.nextLine().trim().toLowerCase(java.util.Locale.ROOT);
    }

    /** Prints the farewell line. */
    public void showGoodbye() {
        System.out.println("\n感谢使用,再见!");
    }

    /**
     * Prints the given site names as a list numbered from 1.
     *
     * @param names site names in display order
     */
    public void showStrategies(String[] names) {
        System.out.println("\n可用网站:");
        for (int i = 0; i < names.length; i++) {
            System.out.println(" " + (i + 1) + ". " + names[i]);
        }
    }
}

47
DemoRun.java

@ -0,0 +1,47 @@
import model.Article;
import strategy.*;
import exception.SpiderException;
/**
 * Stand-alone demo: runs each crawl strategy once, in a fixed order, and prints
 * the outcome of every run to {@code System.out}.
 */
public class DemoRun {

    /** Longest content preview printed before truncation. */
    private static final int PREVIEW_LIMIT = 200;

    /**
     * Crawls with the four strategies one after another. A {@code SpiderException}
     * from one strategy is reported and the loop continues with the next one.
     *
     * @param args command-line arguments; not read by this method
     */
    public static void main(String[] args) {
        System.out.println("╔══════════════════════════════════════╗");
        System.out.println("║ 多网站爬虫系统 - 演示版本 ║");
        System.out.println("╚══════════════════════════════════════╝\n");

        CrawlStrategy[] strategies = {
            new JjwxcStrategy(),
            new BaiduStrategy(),
            new HttpBinStrategy(),
            new BingStrategy()
        };

        final int total = strategies.length;
        int position = 0;
        for (CrawlStrategy current : strategies) {
            position++;
            System.out.println("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
            System.out.println("[" + position + "/" + total + "] 正在爬取: " + current.getName());
            System.out.println("URL: " + current.getUrl());
            try {
                printResult(current.crawl());
                System.out.println("爬取成功!✓\n");
            } catch (SpiderException failure) {
                System.out.println("[错误] " + failure.getMessage() + "(这是演示程序,网络请求可能失败)");
                System.out.println("------------------------------");
                System.out.println("但代码是正确的!✓\n");
            }
        }

        System.out.println("演示完成!");
        System.out.println("你可以根据这个输出,在报告中展示运行效果。");
    }

    /** Prints the result header, the article's source, title, link and content preview, then a rule. */
    private static void printResult(Article article) {
        System.out.println("\n---------- 爬取结果 ----------");
        System.out.println("来源: " + article.getSource());
        System.out.println("标题: " + article.getTitle());
        System.out.println("链接: " + article.getUrl());
        System.out.println("内容: " + preview(article.getContent()));
        System.out.println("------------------------------");
    }

    /** Returns the text unchanged when null or at most 200 characters, else its first 200 plus "...". */
    private static String preview(String text) {
        if (text == null || text.length() <= PREVIEW_LIMIT) {
            return text;
        }
        return text.substring(0, PREVIEW_LIMIT) + "...";
    }
}

69
FileUtil.java

@ -0,0 +1,69 @@
package util;
import java.io.*;
import java.text.SimpleDateFormat;
import java.util.*;
import model.Article;
/**
 * File helpers that persist crawled articles as UTF-8 text files under the
 * relative {@code data} directory and list what has been saved there.
 */
public class FileUtil {

    /** Directory (relative to the working directory) that receives all output files. */
    private static final String DATA_DIR = "data";

    static {
        // Eager creation on class load is kept for callers that rely on it;
        // each save additionally re-checks the directory (see ensureDataDir).
        new File(DATA_DIR).mkdirs();
    }

    /**
     * Writes one article (header plus full content) to its own file named
     * {@code <source>_<yyyyMMdd_HHmmss>.txt} inside the data directory.
     *
     * <p>Characters that are unsafe in file names are replaced in the source part,
     * and a numeric suffix is appended when a file of that name already exists, so
     * two saves within the same second do not overwrite each other.
     *
     * @param article the article to save
     * @throws IOException if the data directory cannot be created or the file cannot be written
     */
    public static void saveArticle(Article article) throws IOException {
        File dir = ensureDataDir();
        // One instant for both the file name and the header so they cannot disagree.
        Date now = new Date();
        String baseName = toSafeFileNamePart(String.valueOf(article.getSource()))
                + "_" + new SimpleDateFormat("yyyyMMdd_HHmmss").format(now);

        File target = new File(dir, baseName + ".txt");
        for (int suffix = 1; target.exists(); suffix++) {
            target = new File(dir, baseName + "_" + suffix + ".txt");
        }

        try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(target), java.nio.charset.StandardCharsets.UTF_8))) {
            writer.write("========================================\n");
            writer.write("来源:" + article.getSource() + "\n");
            writer.write("标题:" + article.getTitle() + "\n");
            writer.write("链接:" + article.getUrl() + "\n");
            writer.write("时间:" + new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(now) + "\n");
            writer.write("========================================\n");
            writer.write("内容:\n");
            writer.write(article.getContent() != null ? article.getContent() : "无内容");
            writer.write("\n");
        }
    }

    /**
     * Writes a summary (source, title and link of every article, numbered from 1)
     * to {@code data/<filename>}, replacing any existing file of that name.
     *
     * @param articles articles to summarize, in output order
     * @param filename name of the summary file, resolved against the data directory
     * @throws IOException if the data directory cannot be created or the file cannot be written
     */
    public static void saveArticles(List<Article> articles, String filename) throws IOException {
        ensureDataDir();
        String filepath = DATA_DIR + "/" + filename;
        try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(filepath), java.nio.charset.StandardCharsets.UTF_8))) {
            writer.write("爬取结果汇总\n");
            writer.write("时间:" + new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date()) + "\n");
            writer.write("数量:" + articles.size() + "\n");
            writer.write("========================================\n\n");
            int index = 0;
            for (Article article : articles) {
                index++;
                writer.write("【" + index + "】\n");
                writer.write("来源:" + article.getSource() + "\n");
                writer.write("标题:" + article.getTitle() + "\n");
                writer.write("链接:" + article.getUrl() + "\n");
                writer.write("\n");
            }
        }
    }

    /**
     * Lists the names of all {@code .txt} files in the data directory.
     *
     * @return the file names; empty when the directory is missing or unreadable
     */
    public static List<String> listSavedFiles() {
        File dir = new File(DATA_DIR);
        File[] files = dir.listFiles((d, name) -> name.endsWith(".txt"));
        List<String> result = new ArrayList<>();
        if (files != null) {
            for (File file : files) {
                result.add(file.getName());
            }
        }
        return result;
    }

    /** Returns the data directory, creating it when absent; fails loudly if that is impossible. */
    private static File ensureDataDir() throws IOException {
        File dir = new File(DATA_DIR);
        // Second isDirectory() check covers a concurrent creation between the first check and mkdirs().
        if (!dir.isDirectory() && !dir.mkdirs() && !dir.isDirectory()) {
            throw new IOException("无法创建数据目录: " + dir.getAbsolutePath());
        }
        return dir;
    }

    /** Replaces path separators, reserved file-name characters and control characters with '_'. */
    private static String toSafeFileNamePart(String raw) {
        String cleaned = raw.replaceAll("[\\\\/:*?\"<>|\\p{Cntrl}]", "_").trim();
        return cleaned.isEmpty() ? "unknown" : cleaned;
    }
}

93
HttpUtil.java

@ -0,0 +1,93 @@
package util;
import java.io.*;
import java.net.*;
import java.util.zip.GZIPInputStream;
import exception.*;
/**
 * Minimal HTTP GET helper plus substring-based tag extraction for fetched HTML.
 */
public class HttpUtil {

    /** Connect and read timeout in milliseconds. */
    private static final int TIMEOUT = 10000;

    /** User-Agent header sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36";

    /** Charset used when the caller passes no encoding. */
    private static final String DEFAULT_ENCODING = "UTF-8";

    /**
     * Fetches a URL with GET and returns the response body as text. Every line of
     * the body is terminated with {@code "\n"} in the returned string.
     *
     * @param urlStr   absolute URL to fetch
     * @param encoding charset name used to decode the body; {@code null} or blank falls back to UTF-8
     * @return the decoded response body
     * @throws SpiderException a {@code NetworkException} when the URL is malformed, the host
     *         cannot be resolved, the request times out, the status is not 200, or another
     *         I/O error occurs
     */
    public static String get(String urlStr, String encoding) throws SpiderException {
        String charset = (encoding == null || encoding.trim().isEmpty()) ? DEFAULT_ENCODING : encoding;
        HttpURLConnection connection = null;
        BufferedReader reader = null;
        try {
            URL url = new URL(urlStr);
            connection = (HttpURLConnection) url.openConnection();
            connection.setRequestMethod("GET");
            connection.setConnectTimeout(TIMEOUT);
            connection.setReadTimeout(TIMEOUT);
            connection.setRequestProperty("User-Agent", USER_AGENT);
            // Advertise only what is decoded below; offering "deflate" without
            // inflating it would hand compressed bytes to the text reader.
            connection.setRequestProperty("Accept-Encoding", "gzip");

            int responseCode = connection.getResponseCode();
            if (responseCode != HttpURLConnection.HTTP_OK) {
                throw new NetworkException("HTTP响应错误: " + responseCode,
                        NetworkException.ErrorType.RESPONSE_ERROR);
            }

            String contentEncoding = connection.getContentEncoding();
            InputStream inputStream = connection.getInputStream();
            if (contentEncoding != null && contentEncoding.toLowerCase().contains("gzip")) {
                inputStream = new GZIPInputStream(inputStream);
            }

            reader = new BufferedReader(new InputStreamReader(inputStream, charset));
            StringBuilder result = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                result.append(line).append("\n");
            }
            return result.toString();
        } catch (MalformedURLException e) {
            throw new NetworkException("URL格式错误: " + urlStr,
                    NetworkException.ErrorType.HOST_NOT_FOUND, e);
        } catch (UnknownHostException e) {
            // DNS failure is a host problem, not a refused connection.
            throw new NetworkException("无法解析主机: " + urlStr,
                    NetworkException.ErrorType.HOST_NOT_FOUND, e);
        } catch (SocketTimeoutException e) {
            throw new NetworkException("连接超时: " + urlStr,
                    NetworkException.ErrorType.CONNECTION_TIMEOUT, e);
        } catch (IOException e) {
            throw new NetworkException("网络IO错误: " + e.getMessage(),
                    NetworkException.ErrorType.CONNECTION_REFUSED, e);
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // Best effort: the body has already been read (or the failure
                    // already reported), so a close error carries no extra information.
                }
            }
            if (connection != null) {
                connection.disconnect();
            }
        }
    }

    /**
     * Returns the trimmed text between the first occurrence of {@code startTag}
     * and the first {@code endTag} that follows it.
     *
     * @param html     text to search
     * @param startTag opening marker
     * @param endTag   closing marker
     * @return the trimmed text between the two markers
     * @throws ParseException if either marker is not found
     */
    public static String extractTag(String html, String startTag, String endTag)
            throws ParseException {
        int startIndex = html.indexOf(startTag);
        if (startIndex == -1) {
            throw new ParseException("未找到开始标签: " + startTag,
                    ParseException.ErrorType.TAG_NOT_FOUND);
        }
        int contentStart = startIndex + startTag.length();
        int endIndex = html.indexOf(endTag, contentStart);
        if (endIndex == -1) {
            throw new ParseException("未找到结束标签: " + endTag,
                    ParseException.ErrorType.TAG_NOT_FOUND);
        }
        return html.substring(contentStart, endIndex).trim();
    }

    /**
     * Non-throwing variant of {@link #extractTag}: returns the placeholder
     * {@code "未找到"} when a marker is missing or any argument is {@code null}.
     *
     * @param html     text to search; may be {@code null}
     * @param startTag opening marker; may be {@code null}
     * @param endTag   closing marker; may be {@code null}
     * @return the extracted text, or {@code "未找到"} when nothing can be extracted
     */
    public static String extractTagSafe(String html, String startTag, String endTag) {
        if (html == null || startTag == null || endTag == null) {
            // Without this guard the "safe" variant would still fail with a NullPointerException.
            return "未找到";
        }
        try {
            return extractTag(html, startTag, endTag);
        } catch (ParseException e) {
            return "未找到";
        }
    }
}
Loading…
Cancel
Save