You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
42 lines
1.3 KiB
42 lines
1.3 KiB
package com.crawler.crawler.impl;
|
|
|
|
import com.crawler.crawler.BaseCrawler;
|
|
import com.crawler.model.CrawlerData;
|
|
|
|
import java.util.ArrayList;
|
|
import java.util.List;
|
|
import java.util.regex.Matcher;
|
|
import java.util.regex.Pattern;
|
|
|
|
public class ExampleCrawler extends BaseCrawler {
|
|
@Override
|
|
public String getCrawlerName() {
|
|
return "ExampleCrawler";
|
|
}
|
|
|
|
@Override
|
|
protected List<CrawlerData> parseHtml(String html) {
|
|
List<CrawlerData> results = new ArrayList<>();
|
|
|
|
String cleanHtml = html.replaceAll("\\s+", " ").replaceAll(">\\s*<", "><");
|
|
|
|
Pattern linkPattern = Pattern.compile("<a\\s+[^>]*href\\s*=\\s*[\"']([^\"']+)[\"'][^>]*>([^<]+)</a>", Pattern.CASE_INSENSITIVE);
|
|
Matcher matcher = linkPattern.matcher(cleanHtml);
|
|
|
|
int count = 0;
|
|
while (matcher.find() && count < 10) {
|
|
CrawlerData data = new CrawlerData();
|
|
data.setTitle(cleanText(matcher.group(2)));
|
|
data.setUrl(matcher.group(1));
|
|
results.add(data);
|
|
count++;
|
|
}
|
|
|
|
return results;
|
|
}
|
|
|
|
private String cleanText(String text) {
|
|
if (text == null) return null;
|
|
return text.replaceAll("<[^>]+>", "").trim();
|
|
}
|
|
}
|