From 673243fa3462bfcbad330af0538d3ad70ea0638d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=95=85=E6=98=A5?= <3481369387@qq.com>
Date: Tue, 7 Apr 2026 17:23:51 +0800
Subject: [PATCH] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E5=AE=9E=E9=AA=8C?=
 =?UTF-8?q?=E4=BA=8C?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Review note: hunk sizes and the diffstat below were recomputed after review
edits; the blob IDs on the "index" lines are those of the originally
committed files and were not regenerated.

 实验二/BaseCrawler.java    | 53 +++++++++++++++++++++++++++++++++++++++++
 实验二/CrawlerTest.java    | 17 +++++++++++++
 实验二/ImageCrawler.java   | 53 +++++++++++++++++++++++++++++++++++++++++
 实验二/WebPageCrawler.java | 58 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 181 insertions(+)
 create mode 100644 实验二/BaseCrawler.java
 create mode 100644 实验二/CrawlerTest.java
 create mode 100644 实验二/ImageCrawler.java
 create mode 100644 实验二/WebPageCrawler.java

diff --git a/实验二/BaseCrawler.java b/实验二/BaseCrawler.java
new file mode 100644
index 0000000..cf2f055
--- /dev/null
+++ b/实验二/BaseCrawler.java
@@ -0,0 +1,53 @@
+package crawler;
+
+import org.jsoup.Connection;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import java.io.IOException;
+
+/**
+ * Base class for simple jsoup-backed crawlers.
+ *
+ * <p>Holds the target URL and the connection settings shared by all crawlers;
+ * subclasses decide what to extract by implementing {@link #crawl()}.
+ */
+public abstract class BaseCrawler {
+    /** User-Agent header sent with every request. */
+    protected static final String USER_AGENT =
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36";
+
+    /** Address of the page to fetch. */
+    protected String url;
+
+    /** Request timeout in seconds; converted to milliseconds for jsoup. */
+    protected int timeout = 10;
+
+    /**
+     * Creates a crawler for the given address.
+     *
+     * @param url address of the page to fetch
+     */
+    public BaseCrawler(String url) {
+        this.url = url;
+    }
+
+    /**
+     * Fetches {@link #url} with an HTTP GET and returns the parsed document.
+     *
+     * <p>{@code ignoreContentType(true)} makes jsoup accept responses whose
+     * content type it would otherwise reject.
+     *
+     * @return the parsed document
+     * @throws IOException if the request fails
+     */
+    protected Document getDocument() throws IOException {
+        Connection connect = Jsoup.connect(url)
+                .userAgent(USER_AGENT)
+                .timeout(timeout * 1000)
+                .ignoreContentType(true);
+        return connect.get();
+    }
+
+    /** Performs the crawl; implementations decide what to extract and report. */
+    public abstract void crawl();
+}
\ No newline at end of file
diff --git a/实验二/CrawlerTest.java b/实验二/CrawlerTest.java
new file mode 100644
index 0000000..afeecd8
--- /dev/null
+++ b/实验二/CrawlerTest.java
@@ -0,0 +1,17 @@
+package crawler;
+
+/** Manual smoke test that runs each crawler once against a live site. */
+public class CrawlerTest {
+    /** Page fetched by both crawlers. */
+    private static final String TARGET_URL = "https://www.baidu.com";
+
+    public static void main(String[] args) {
+        // Crawl the web page
+        BaseCrawler webCrawler = new WebPageCrawler(TARGET_URL);
+        webCrawler.crawl();
+
+        // Crawl the images
+        BaseCrawler imgCrawler = new ImageCrawler(TARGET_URL);
+        imgCrawler.crawl();
+    }
+}
\ No newline at end of file
diff --git a/实验二/ImageCrawler.java b/实验二/ImageCrawler.java
new file mode 100644
index 0000000..9a8a86b
--- /dev/null
+++ b/实验二/ImageCrawler.java
@@ -0,0 +1,53 @@
+package crawler;
+
+import org.jsoup.nodes.Document;
+import org.jsoup.select.Elements;
+
+/** Crawler that lists the images referenced by a page. */
+public class ImageCrawler extends BaseCrawler {
+
+    /** Maximum number of image URLs printed per crawl. */
+    private static final int MAX_IMAGES_SHOWN = 5;
+
+    /**
+     * Creates an image crawler for the given address.
+     *
+     * @param url address of the page to scan for images
+     */
+    public ImageCrawler(String url) {
+        super(url);
+    }
+
+    /**
+     * Fetches the page and prints how many {@code img} elements with a
+     * {@code src} attribute it contains, followed by the first few image URLs.
+     *
+     * <p>Failures are reported on standard error instead of being thrown.
+     */
+    @Override
+    public void crawl() {
+        try {
+            Document doc = getDocument();
+            Elements imgs = doc.select("img[src]");
+
+            System.out.println("\n===== 图片爬取完成 =====");
+            System.out.println("URL:" + url);
+            System.out.println("找到图片数量:" + imgs.size());
+
+            int shown = Math.min(MAX_IMAGES_SHOWN, imgs.size());
+            for (int i = 0; i < shown; i++) {
+                String src = imgs.get(i).attr("abs:src");
+                if (src.isEmpty()) {
+                    // The absolute form is empty when jsoup cannot resolve the
+                    // value; show the raw attribute instead of a blank entry.
+                    src = imgs.get(i).attr("src");
+                }
+                System.out.println("图片" + (i + 1) + ":" + src);
+            }
+        } catch (Exception e) {
+            // Print the exception itself: getMessage() can be null and omits
+            // the exception type, which made failures hard to diagnose.
+            System.err.println("图片爬取失败:" + e);
+        }
+    }
+}
\ No newline at end of file
diff --git a/实验二/WebPageCrawler.java b/实验二/WebPageCrawler.java
new file mode 100644
index 0000000..4a5b43f
--- /dev/null
+++ b/实验二/WebPageCrawler.java
@@ -0,0 +1,58 @@
+package crawler;
+
+import org.jsoup.nodes.Document;
+
+/** Crawler that prints a page's title and a short preview of its text. */
+public class WebPageCrawler extends BaseCrawler {
+
+    /** Maximum number of chars of page text shown in the preview. */
+    private static final int PREVIEW_LENGTH = 200;
+
+    /**
+     * Creates a page crawler for the given address.
+     *
+     * @param url address of the page to fetch
+     */
+    public WebPageCrawler(String url) {
+        super(url);
+    }
+
+    /**
+     * Fetches the page and prints its URL, title and a text preview.
+     *
+     * <p>Failures are reported on standard error instead of being thrown.
+     */
+    @Override
+    public void crawl() {
+        try {
+            Document doc = getDocument();
+            String title = doc.title();
+            // NOTE(review): body() may be null for documents without an HTML
+            // structure in some jsoup versions; fall back to the whole
+            // document's text rather than failing with a NullPointerException.
+            String text = doc.body() != null ? doc.body().text() : doc.text();
+
+            System.out.println("===== 网页爬取完成 =====");
+            System.out.println("URL:" + url);
+            System.out.println("标题:" + title);
+            System.out.println("内容预览:" + preview(text));
+        } catch (Exception e) {
+            // Print the exception itself: getMessage() can be null and omits
+            // the exception type, which made failures hard to diagnose.
+            System.err.println("网页爬取失败:" + e);
+        }
+    }
+
+    /** Truncates text to the preview length, appending "..." when shortened. */
+    private static String preview(String text) {
+        if (text.length() <= PREVIEW_LENGTH) {
+            return text;
+        }
+        int end = PREVIEW_LENGTH;
+        if (Character.isHighSurrogate(text.charAt(end - 1))) {
+            // Do not cut a surrogate pair in half.
+            end--;
+        }
+        return text.substring(0, end) + "...";
+    }
+}
\ No newline at end of file