4 changed files with 101 additions and 0 deletions
@@ -0,0 +1,28 @@
package crawler;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.IOException;

public abstract class BaseCrawler {
    protected static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36";

    protected String url;
    protected int timeout = 10;

    public BaseCrawler(String url) {
        this.url = url;
    }

    protected Document getDocument() throws IOException {
        Connection connect = Jsoup.connect(url)
                .userAgent(USER_AGENT)
                .timeout(timeout * 1000)
                .ignoreContentType(true);
        return connect.get();
    }

    public abstract void crawl();
}
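Note (not part of the diff): each subclass only implements crawl() and reuses getDocument() for the fetch. As a minimal sketch of that extension pattern, a hypothetical third crawler (LinkCrawler is not part of this change) could list a page's outbound links:

package crawler;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

// Hypothetical example, not part of this change: lists outbound links of a page.
public class LinkCrawler extends BaseCrawler {

    public LinkCrawler(String url) {
        super(url);
    }

    @Override
    public void crawl() {
        try {
            Document doc = getDocument();
            // "abs:href" resolves relative links against the page's base URL
            for (Element link : doc.select("a[href]")) {
                System.out.println(link.attr("abs:href"));
            }
        } catch (Exception e) {
            System.err.println("Link crawl failed: " + e.getMessage());
        }
    }
}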
@@ -0,0 +1,13 @@
package crawler;

public class CrawlerTest {
    public static void main(String[] args) {
        // Crawl a web page
        BaseCrawler webCrawler = new WebPageCrawler("https://www.baidu.com");
        webCrawler.crawl();

        // Crawl images
        BaseCrawler imgCrawler = new ImageCrawler("https://www.baidu.com");
        imgCrawler.crawl();
    }
}
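Note (not part of the diff): CrawlerTest hard-codes the target site. A hypothetical variant (CrawlerMain is not part of this change) could read the start URL from the command line, so the same entry point can crawl any site:

package crawler;

// Hypothetical variant, not part of this change: takes the start URL as an argument.
public class CrawlerMain {
    public static void main(String[] args) {
        String url = args.length > 0 ? args[0] : "https://www.baidu.com";
        new WebPageCrawler(url).crawl();
        new ImageCrawler(url).crawl();
    }
}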
@@ -0,0 +1,30 @@
package crawler;

import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

public class ImageCrawler extends BaseCrawler {

    public ImageCrawler(String url) {
        super(url);
    }

    @Override
    public void crawl() {
        try {
            Document doc = getDocument();
            Elements imgs = doc.select("img[src]");

            System.out.println("\n===== Image crawl complete =====");
            System.out.println("URL: " + url);
            System.out.println("Images found: " + imgs.size());

            for (int i = 0; i < Math.min(5, imgs.size()); i++) {
                String src = imgs.get(i).attr("abs:src");
                System.out.println("Image " + (i + 1) + ": " + src);
            }
        } catch (Exception e) {
            System.err.println("Image crawl failed: " + e.getMessage());
        }
    }
}
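Note (not part of the diff): ImageCrawler only prints the image URLs. If the image bytes themselves are needed, jsoup can fetch them directly via execute() and bodyAsBytes(); a minimal sketch, assuming the URL points at an image (ImageDownloader is a hypothetical helper, not part of this change):

package crawler;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

// Hypothetical helper, not part of this change: downloads a single image to disk.
public final class ImageDownloader {

    private ImageDownloader() {}

    public static void saveImage(String imgUrl, Path target) throws IOException {
        Connection.Response resp = Jsoup.connect(imgUrl)
                .ignoreContentType(true) // the body is binary image data, not HTML
                .execute();
        Files.write(target, resp.bodyAsBytes());
    }
}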
@@ -0,0 +1,30 @@
package crawler;

import org.jsoup.nodes.Document;

public class WebPageCrawler extends BaseCrawler {

    public WebPageCrawler(String url) {
        super(url);
    }

    @Override
    public void crawl() {
        try {
            Document doc = getDocument();
            String title = doc.title();
            String text = doc.body().text();

            if (text.length() > 200) {
                text = text.substring(0, 200) + "...";
            }

            System.out.println("===== Web page crawl complete =====");
            System.out.println("URL: " + url);
            System.out.println("Title: " + title);
            System.out.println("Content preview: " + text);
        } catch (Exception e) {
            System.err.println("Web page crawl failed: " + e.getMessage());
        }
    }
}
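Note (not part of the diff): the 200-character preview uses String.substring, which cuts at a char index and can split a surrogate pair (e.g. an emoji) exactly at the boundary. A codepoint-safe truncation, as a hypothetical alternative (Previews is not part of this change):

package crawler;

// Hypothetical helper, not part of this change: truncates without splitting surrogate pairs.
final class Previews {

    private Previews() {}

    static String preview(String text, int maxCodePoints) {
        if (text.codePointCount(0, text.length()) <= maxCodePoints) {
            return text; // already short enough, no ellipsis needed
        }
        int end = text.offsetByCodePoints(0, maxCodePoints);
        return text.substring(0, end) + "...";
    }
}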