Browse Source

feat: 添加实验二

main
故春 1 week ago
parent
commit
673243fa34
  1. 28
      实验二/BaseCrawler.java
  2. 13
      实验二/CrawlerTest.java
  3. 30
      实验二/ImageCrawler.java
  4. 30
      实验二/WebPageCrawler.java

28
实验二/BaseCrawler.java

@ -0,0 +1,28 @@
package crawler;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.IOException;
/**
 * Common base for simple jsoup-backed crawlers.
 *
 * <p>Holds the target URL and request settings, and provides a single
 * helper that fetches and parses the page. Subclasses implement
 * {@link #crawl()} to decide what to extract and report.
 */
public abstract class BaseCrawler {

    /** Browser-like User-Agent sent with every request. */
    protected static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36";

    /** Target URL to crawl. */
    protected String url;

    /** Request timeout, in seconds. */
    protected int timeout = 10;

    /**
     * Creates a crawler for the given URL.
     *
     * @param url the page to fetch
     */
    public BaseCrawler(String url) {
        this.url = url;
    }

    /**
     * Fetches the target URL and parses it into a jsoup {@link Document}.
     *
     * @return the parsed document
     * @throws IOException if the connection fails or times out
     */
    protected Document getDocument() throws IOException {
        // ignoreContentType lets non-"text/html" responses through to the parser.
        return Jsoup.connect(url)
                .userAgent(USER_AGENT)
                .timeout(timeout * 1000) // jsoup expects milliseconds
                .ignoreContentType(true)
                .get();
    }

    /** Runs the crawl; each subclass defines what is extracted and printed. */
    public abstract void crawl();
}

13
实验二/CrawlerTest.java

@ -0,0 +1,13 @@
package crawler;
/**
 * Small demo driver: runs the web-page crawler and the image crawler
 * against the same site, in that order.
 */
public class CrawlerTest {

    public static void main(String[] args) {
        String target = "https://www.baidu.com";
        // Page text first, then images — same order as before.
        BaseCrawler[] crawlers = {
                new WebPageCrawler(target),
                new ImageCrawler(target),
        };
        for (BaseCrawler crawler : crawlers) {
            crawler.crawl();
        }
    }
}

30
实验二/ImageCrawler.java

@ -0,0 +1,30 @@
package crawler;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
/**
 * Crawler that lists the images referenced by a page.
 *
 * <p>Selects all {@code <img>} elements with a {@code src} attribute and
 * prints a count plus a short preview of absolute image URLs.
 */
public class ImageCrawler extends BaseCrawler {

    /** Maximum number of image URLs printed in the preview. */
    private static final int MAX_PREVIEW = 5;

    /**
     * Creates an image crawler for the given URL.
     *
     * @param url the page whose images are listed
     */
    public ImageCrawler(String url) {
        super(url);
    }

    /**
     * Fetches the page and prints up to {@link #MAX_PREVIEW} image URLs.
     * Failures are reported to stderr rather than thrown, so the demo
     * driver can continue with other crawlers.
     */
    @Override
    public void crawl() {
        try {
            Document doc = getDocument();
            Elements imgs = doc.select("img[src]");
            System.out.println("\n===== 图片爬取完成 =====");
            System.out.println("URL:" + url);
            System.out.println("找到图片数量:" + imgs.size());
            int limit = Math.min(MAX_PREVIEW, imgs.size());
            for (int i = 0; i < limit; i++) {
                // "abs:src" resolves relative paths against the page URL.
                String src = imgs.get(i).attr("abs:src");
                System.out.println("图片" + (i + 1) + ":" + src);
            }
        } catch (Exception e) {
            // Fix: print the exception itself (class + message). getMessage()
            // alone is null for many runtime exceptions, hiding the cause.
            System.err.println("图片爬取失败:" + e);
        }
    }
}

30
实验二/WebPageCrawler.java

@ -0,0 +1,30 @@
package crawler;
import org.jsoup.nodes.Document;
/**
 * Crawler that prints a page's title and a short preview of its body text.
 */
public class WebPageCrawler extends BaseCrawler {

    /** Maximum number of characters shown in the content preview. */
    private static final int PREVIEW_LENGTH = 200;

    /**
     * Creates a web-page crawler for the given URL.
     *
     * @param url the page to summarize
     */
    public WebPageCrawler(String url) {
        super(url);
    }

    /**
     * Fetches the page and prints its URL, title, and a text preview
     * truncated to {@link #PREVIEW_LENGTH} characters. Failures are
     * reported to stderr rather than thrown.
     */
    @Override
    public void crawl() {
        try {
            Document doc = getDocument();
            String title = doc.title();
            // Fix: body() can be null (e.g. non-HTML fetched via
            // ignoreContentType); previously that NPE was swallowed as "null".
            String text = doc.body() == null ? "" : doc.body().text();
            if (text.length() > PREVIEW_LENGTH) {
                text = text.substring(0, PREVIEW_LENGTH) + "...";
            }
            System.out.println("===== 网页爬取完成 =====");
            System.out.println("URL:" + url);
            System.out.println("标题:" + title);
            System.out.println("内容预览:" + text);
        } catch (Exception e) {
            // Fix: print the exception itself (class + message). getMessage()
            // alone is null for many runtime exceptions, hiding the cause.
            System.err.println("网页爬取失败:" + e);
        }
    }
}
Loading…
Cancel
Save