You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

42 lines
1.3 KiB

package com.crawler.crawler.impl;
import com.crawler.crawler.BaseCrawler;
import com.crawler.model.CrawlerData;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Sample crawler that extracts hyperlinks from an HTML document using regular expressions.
 *
 * <p>Only anchors whose {@code href} value is quoted and whose content is plain text (no nested
 * tags) are recognised. At most {@link #MAX_RESULTS} links are returned per document.
 */
public class ExampleCrawler extends BaseCrawler {

    /** Upper bound on the number of links extracted from a single document. */
    private static final int MAX_RESULTS = 10;

    /** Any run of whitespace; collapsed to a single space before matching. */
    private static final Pattern WHITESPACE_RUN = Pattern.compile("\\s+");

    /** Whitespace sitting between a closing {@code >} and the next opening {@code <}. */
    private static final Pattern INTER_TAG_WHITESPACE = Pattern.compile(">\\s*<");

    /** Anchor tag with a quoted href (group 1) and tag-free link text (group 2). */
    private static final Pattern LINK_PATTERN = Pattern.compile(
            "<a\\s+[^>]*href\\s*=\\s*[\"']([^\"']+)[\"'][^>]*>([^<]+)</a>",
            Pattern.CASE_INSENSITIVE);

    /** Any markup tag; stripped from extracted text. */
    private static final Pattern TAG = Pattern.compile("<[^>]+>");

    /**
     * Returns the name of this crawler.
     *
     * @return the constant string {@code "ExampleCrawler"}
     */
    @Override
    public String getCrawlerName() {
        return "ExampleCrawler";
    }

    /**
     * Extracts up to {@link #MAX_RESULTS} links from the given HTML.
     *
     * <p>Whitespace is normalised first (runs collapsed to one space, whitespace between adjacent
     * tags removed) so that anchors spanning several lines can still be matched.
     *
     * @param html the raw HTML text; may be {@code null} or empty
     * @return a new mutable list with one entry per matched link, in document order, holding the
     *     link text as title and the href value as URL; empty when {@code html} is {@code null},
     *     empty, or contains no matching anchors
     */
    @Override
    protected List<CrawlerData> parseHtml(String html) {
        List<CrawlerData> results = new ArrayList<>();
        if (html == null || html.isEmpty()) {
            return results;
        }

        String collapsed = WHITESPACE_RUN.matcher(html).replaceAll(" ");
        String cleanHtml = INTER_TAG_WHITESPACE.matcher(collapsed).replaceAll("><");

        Matcher matcher = LINK_PATTERN.matcher(cleanHtml);
        // Check the cap before calling find() so no extra regex search runs once the list is full.
        while (results.size() < MAX_RESULTS && matcher.find()) {
            CrawlerData data = new CrawlerData();
            data.setTitle(cleanText(matcher.group(2)));
            data.setUrl(matcher.group(1));
            results.add(data);
        }
        return results;
    }

    /**
     * Strips markup tags from the given text and trims surrounding whitespace.
     *
     * @param text the text to clean; may be {@code null}
     * @return the cleaned text, or {@code null} if {@code text} is {@code null}
     */
    private String cleanText(String text) {
        if (text == null) {
            return null;
        }
        return TAG.matcher(text).replaceAll("").trim();
    }
}