diff --git a/实验二/BaseCrawler.java b/实验二/BaseCrawler.java
new file mode 100644
--- /dev/null
+++ b/实验二/BaseCrawler.java
@@ -0,0 +1,51 @@
+package crawler;
+
+import org.jsoup.Connection;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import java.io.IOException;
+
+/**
+ * Base class for crawlers that download one page with jsoup and report on it.
+ *
+ * <p>Subclasses implement {@link #crawl()} and obtain the parsed page through
+ * {@link #getDocument()}.
+ */
+public abstract class BaseCrawler {
+    /** User-Agent header sent with every request. */
+    protected static final String USER_AGENT =
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36";
+
+    /** Address of the page to fetch. */
+    protected String url;
+
+    /** Request timeout in seconds; multiplied by 1000 before being handed to jsoup. */
+    protected int timeout = 10;
+
+    /**
+     * Creates a crawler for the given address.
+     *
+     * @param url address of the page to fetch
+     */
+    public BaseCrawler(String url) {
+        this.url = url;
+    }
+
+    /**
+     * Fetches {@link #url} with an HTTP GET and parses the response.
+     *
+     * @return the parsed document
+     * @throws IOException if the request fails
+     */
+    protected Document getDocument() throws IOException {
+        Connection connect = Jsoup.connect(url)
+            .userAgent(USER_AGENT)
+            .timeout(timeout * 1000)
+            // Do not fail on response content types jsoup would otherwise reject.
+            .ignoreContentType(true);
+        return connect.get();
+    }
+
+    /** Runs the crawl. Implementations decide what to extract and how to report it. */
+    public abstract void crawl();
+}
diff --git a/实验二/CrawlerTest.java b/实验二/CrawlerTest.java
new file mode 100644
--- /dev/null
+++ b/实验二/CrawlerTest.java
@@ -0,0 +1,14 @@
+package crawler;
+
+/** Manual smoke run: exercises both crawlers against one live page. */
+public class CrawlerTest {
+    public static void main(String[] args) {
+        // Crawl the web page.
+        BaseCrawler webCrawler = new WebPageCrawler("https://www.baidu.com");
+        webCrawler.crawl();
+
+        // Crawl the images.
+        BaseCrawler imgCrawler = new ImageCrawler("https://www.baidu.com");
+        imgCrawler.crawl();
+    }
+}
diff --git a/实验二/ImageCrawler.java b/实验二/ImageCrawler.java
new file mode 100644
--- /dev/null
+++ b/实验二/ImageCrawler.java
@@ -0,0 +1,48 @@
+package crawler;
+
+import org.jsoup.nodes.Document;
+import org.jsoup.select.Elements;
+
+/** Counts the {@code <img src>} elements of a page and prints the first few addresses. */
+public class ImageCrawler extends BaseCrawler {
+
+    /** Upper bound on the number of image addresses printed. */
+    private static final int MAX_LISTED = 5;
+
+    /**
+     * Creates an image crawler for the given page.
+     *
+     * @param url address of the page to scan for images
+     */
+    public ImageCrawler(String url) {
+        super(url);
+    }
+
+    /**
+     * Fetches the page and prints the image count plus up to {@code MAX_LISTED}
+     * image addresses to standard output. Failures are reported on standard error
+     * instead of being thrown.
+     */
+    @Override
+    public void crawl() {
+        try {
+            Document doc = getDocument();
+            Elements imgs = doc.select("img[src]");
+
+            System.out.println("\n===== 图片爬取完成 =====");
+            System.out.println("URL:" + url);
+            System.out.println("找到图片数量:" + imgs.size());
+
+            int listed = Math.min(MAX_LISTED, imgs.size());
+            for (int i = 0; i < listed; i++) {
+                // The abs: prefix asks jsoup to resolve the address against the page's base URI.
+                String src = imgs.get(i).attr("abs:src");
+                System.out.println("图片" + (i + 1) + ":" + src);
+            }
+        } catch (Exception e) {
+            // Best-effort boundary: report and continue. The exception itself is
+            // printed because getMessage() may be null and never names the failure type.
+            System.err.println("图片爬取失败:" + e);
+        }
+    }
+}
diff --git a/实验二/WebPageCrawler.java b/实验二/WebPageCrawler.java
new file mode 100644
--- /dev/null
+++ b/实验二/WebPageCrawler.java
@@ -0,0 +1,58 @@
+package crawler;
+
+import org.jsoup.nodes.Document;
+
+/** Prints the title and a short text preview of a page. */
+public class WebPageCrawler extends BaseCrawler {
+
+    /** Maximum number of chars of body text shown before the preview is cut. */
+    private static final int PREVIEW_LENGTH = 200;
+
+    /**
+     * Creates a page crawler for the given address.
+     *
+     * @param url address of the page to summarize
+     */
+    public WebPageCrawler(String url) {
+        super(url);
+    }
+
+    /**
+     * Fetches the page and prints its URL, title and a text preview to standard
+     * output. Failures are reported on standard error instead of being thrown.
+     */
+    @Override
+    public void crawl() {
+        try {
+            Document doc = getDocument();
+            String title = doc.title();
+            String text = preview(doc.body().text());
+
+            System.out.println("===== 网页爬取完成 =====");
+            System.out.println("URL:" + url);
+            System.out.println("标题:" + title);
+            System.out.println("内容预览:" + text);
+        } catch (Exception e) {
+            // Best-effort boundary: report and continue. The exception itself is
+            // printed because getMessage() may be null and never names the failure type.
+            System.err.println("网页爬取失败:" + e);
+        }
+    }
+
+    /**
+     * Shortens text longer than {@code PREVIEW_LENGTH} chars and appends "...".
+     * The cut moves back one char when it would fall between the two halves of a
+     * surrogate pair, so the preview never ends in a broken character.
+     */
+    private static String preview(String text) {
+        if (text.length() <= PREVIEW_LENGTH) {
+            return text;
+        }
+        int end = PREVIEW_LENGTH;
+        if (Character.isHighSurrogate(text.charAt(end - 1))
+                && Character.isLowSurrogate(text.charAt(end))) {
+            end--;
+        }
+        return text.substring(0, end) + "...";
+    }
+}