22 changed files with 616 additions and 0 deletions
|
After Width: | Height: | Size: 718 KiB |
@ -0,0 +1,4 @@ |
|||
*.jar |
|||
*.jar |
|||
*.class |
|||
*.log |
|||
@ -0,0 +1,52 @@ |
|||
<!-- Maven build for the datacollect CLI: compiles for Java 11 and packages a runnable fat jar. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.example</groupId>
    <artifactId>datacollect-cli</artifactId>
    <version>0.1.0</version>

    <properties>
        <maven.compiler.source>11</maven.compiler.source>
        <maven.compiler.target>11</maven.compiler.target>
        <!-- Pin the source encoding: the sources contain non-ASCII string literals, and without
             this property the compiler falls back to the platform default charset. -->
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
        <!-- HTML fetching and parsing used by the crawl/analyze commands and the strategies. -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.17.2</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.8.1</version>
            </plugin>
            <!-- Builds an additional jar-with-dependencies during "package", with Main as entry point. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>3.3.0</version>
                <configuration>
                    <archive>
                        <manifest>
                            <mainClass>com.example.datacollect.Main</mainClass>
                        </manifest>
                    </archive>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
|||
@ -0,0 +1,21 @@ |
|||
package com.example.datacollect; |
|||
|
|||
import com.example.datacollect.controller.CrawlerController; |
|||
import com.example.datacollect.repository.ArticleRepository; |
|||
import com.example.datacollect.strategy.StrategyFactory; |
|||
import com.example.datacollect.view.ConsoleView; |
|||
|
|||
public class Main { |
|||
|
|||
public static void main(String[] args) { |
|||
ConsoleView view = new ConsoleView(); |
|||
ArticleRepository repository = new ArticleRepository(); |
|||
StrategyFactory strategyFactory = new StrategyFactory(); |
|||
CrawlerController controller = new CrawlerController(view, repository, strategyFactory); |
|||
|
|||
view.printSuccess("Welcome to CLI Crawler (w10_3)! Type help for commands."); |
|||
while (true) { |
|||
controller.handle(view.readLine()); |
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,64 @@ |
|||
package com.example.datacollect.command; |
|||
|
|||
import com.example.datacollect.repository.ArticleRepository; |
|||
import com.example.datacollect.strategy.CrawlStrategy; |
|||
import com.example.datacollect.strategy.StrategyFactory; |
|||
import com.example.datacollect.view.ConsoleView; |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
|
|||
public class AnalyzeCommand implements Command { |
|||
private final ConsoleView view; |
|||
private final StrategyFactory strategyFactory; |
|||
|
|||
public AnalyzeCommand(ConsoleView view, StrategyFactory strategyFactory) { |
|||
this.view = view; |
|||
this.strategyFactory = strategyFactory; |
|||
} |
|||
|
|||
@Override |
|||
public String getName() { |
|||
return "analyze"; |
|||
} |
|||
|
|||
@Override |
|||
public void execute(String[] args, ArticleRepository repository) { |
|||
if (args.length < 2) { |
|||
view.printError("Usage: analyze <url>"); |
|||
return; |
|||
} |
|||
String url = args[1]; |
|||
|
|||
CrawlStrategy strategy = strategyFactory.getStrategy(url); |
|||
|
|||
try { |
|||
view.printInfo("Analyzing: " + url); |
|||
Document doc = Jsoup.connect(url).get(); |
|||
var articles = strategy.parse(url, doc); |
|||
|
|||
int count = articles.size(); |
|||
int totalTitleLength = 0; |
|||
int totalContentLength = 0; |
|||
|
|||
for (var article : articles) { |
|||
if (article.getTitle() != null) { |
|||
totalTitleLength += article.getTitle().length(); |
|||
} |
|||
if (article.getContent() != null) { |
|||
totalContentLength += article.getContent().length(); |
|||
} |
|||
} |
|||
|
|||
double avgTitleLength = count > 0 ? (double) totalTitleLength / count : 0; |
|||
double avgContentLength = count > 0 ? (double) totalContentLength / count : 0; |
|||
|
|||
view.printSuccess("Analysis Results:"); |
|||
view.printInfo(" Total Articles: " + count); |
|||
view.printInfo(" Average Title Length: " + String.format("%.2f", avgTitleLength)); |
|||
view.printInfo(" Average Content Length: " + String.format("%.2f", avgContentLength)); |
|||
view.printInfo(" Strategy Used: " + strategy.getClass().getSimpleName()); |
|||
} catch (Exception e) { |
|||
view.printError("Failed to analyze: " + e.getMessage()); |
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,8 @@ |
|||
package com.example.datacollect.command; |
|||
|
|||
import com.example.datacollect.repository.ArticleRepository; |
|||
|
|||
public interface Command { |
|||
String getName(); |
|||
void execute(String[] args, ArticleRepository repository); |
|||
} |
|||
@ -0,0 +1,44 @@ |
|||
package com.example.datacollect.command; |
|||
|
|||
import com.example.datacollect.repository.ArticleRepository; |
|||
import com.example.datacollect.strategy.CrawlStrategy; |
|||
import com.example.datacollect.strategy.StrategyFactory; |
|||
import com.example.datacollect.view.ConsoleView; |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
|
|||
public class CrawlCommand implements Command { |
|||
private final ConsoleView view; |
|||
private final StrategyFactory strategyFactory; |
|||
|
|||
public CrawlCommand(ConsoleView view, StrategyFactory strategyFactory) { |
|||
this.view = view; |
|||
this.strategyFactory = strategyFactory; |
|||
} |
|||
|
|||
@Override |
|||
public String getName() { |
|||
return "crawl"; |
|||
} |
|||
|
|||
@Override |
|||
public void execute(String[] args, ArticleRepository repository) { |
|||
if (args.length < 2) { |
|||
view.printError("Usage: crawl <url>"); |
|||
return; |
|||
} |
|||
String url = args[1]; |
|||
|
|||
CrawlStrategy strategy = strategyFactory.getStrategy(url); |
|||
|
|||
try { |
|||
view.printInfo("Crawling: " + url); |
|||
Document doc = Jsoup.connect(url).get(); |
|||
var articles = strategy.parse(url, doc); |
|||
repository.addAll(articles); |
|||
view.printSuccess("Crawled " + articles.size() + " articles."); |
|||
} catch (Exception e) { |
|||
view.printError("Failed to crawl: " + e.getMessage()); |
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,23 @@ |
|||
package com.example.datacollect.command; |
|||
|
|||
import com.example.datacollect.repository.ArticleRepository; |
|||
import com.example.datacollect.view.ConsoleView; |
|||
|
|||
public class ExitCommand implements Command { |
|||
private final ConsoleView view; |
|||
|
|||
public ExitCommand(ConsoleView view) { |
|||
this.view = view; |
|||
} |
|||
|
|||
@Override |
|||
public String getName() { |
|||
return "exit"; |
|||
} |
|||
|
|||
@Override |
|||
public void execute(String[] args, ArticleRepository repository) { |
|||
view.printSuccess("Bye!"); |
|||
System.exit(0); |
|||
} |
|||
} |
|||
@ -0,0 +1,22 @@ |
|||
package com.example.datacollect.command; |
|||
|
|||
import com.example.datacollect.repository.ArticleRepository; |
|||
import com.example.datacollect.view.ConsoleView; |
|||
|
|||
public class HelpCommand implements Command { |
|||
private final ConsoleView view; |
|||
|
|||
public HelpCommand(ConsoleView view) { |
|||
this.view = view; |
|||
} |
|||
|
|||
@Override |
|||
public String getName() { |
|||
return "help"; |
|||
} |
|||
|
|||
@Override |
|||
public void execute(String[] args, ArticleRepository repository) { |
|||
view.printInfo("Commands: crawl <url>, analyze <url>, list, help, exit"); |
|||
} |
|||
} |
|||
@ -0,0 +1,22 @@ |
|||
package com.example.datacollect.command; |
|||
|
|||
import com.example.datacollect.repository.ArticleRepository; |
|||
import com.example.datacollect.view.ConsoleView; |
|||
|
|||
public class ListCommand implements Command { |
|||
private final ConsoleView view; |
|||
|
|||
public ListCommand(ConsoleView view) { |
|||
this.view = view; |
|||
} |
|||
|
|||
@Override |
|||
public String getName() { |
|||
return "list"; |
|||
} |
|||
|
|||
@Override |
|||
public void execute(String[] args, ArticleRepository repository) { |
|||
view.display(repository.getAll()); |
|||
} |
|||
} |
|||
@ -0,0 +1,49 @@ |
|||
package com.example.datacollect.controller; |
|||
|
|||
import com.example.datacollect.command.AnalyzeCommand; |
|||
import com.example.datacollect.command.Command; |
|||
import com.example.datacollect.command.CrawlCommand; |
|||
import com.example.datacollect.command.ExitCommand; |
|||
import com.example.datacollect.command.HelpCommand; |
|||
import com.example.datacollect.command.ListCommand; |
|||
import com.example.datacollect.repository.ArticleRepository; |
|||
import com.example.datacollect.strategy.StrategyFactory; |
|||
import com.example.datacollect.view.ConsoleView; |
|||
import java.util.HashMap; |
|||
import java.util.Map; |
|||
|
|||
public class CrawlerController { |
|||
private final Map<String, Command> commands = new HashMap<>(); |
|||
private final ConsoleView view; |
|||
private final ArticleRepository repository; |
|||
|
|||
public CrawlerController(ConsoleView view, ArticleRepository repository, StrategyFactory strategyFactory) { |
|||
this.view = view; |
|||
this.repository = repository; |
|||
register(new HelpCommand(view)); |
|||
register(new ListCommand(view)); |
|||
register(new CrawlCommand(view, strategyFactory)); |
|||
register(new AnalyzeCommand(view, strategyFactory)); |
|||
register(new ExitCommand(view)); |
|||
} |
|||
|
|||
private void register(Command command) { |
|||
commands.put(command.getName(), command); |
|||
} |
|||
|
|||
public void handle(String input) { |
|||
String text = input == null ? "" : input.trim(); |
|||
if (text.isEmpty()) { |
|||
return; |
|||
} |
|||
|
|||
String[] args = text.split("\\s+"); |
|||
String cmdName = args[0].toLowerCase(); |
|||
Command command = commands.get(cmdName); |
|||
if (command == null) { |
|||
view.printError("Unknown command: " + cmdName); |
|||
return; |
|||
} |
|||
command.execute(args, repository); |
|||
} |
|||
} |
|||
@ -0,0 +1,45 @@ |
|||
package com.example.datacollect.model; |
|||
|
|||
public class Article { |
|||
private String title; |
|||
private String url; |
|||
private String content; |
|||
|
|||
public Article(String title, String url, String content) { |
|||
this.title = title; |
|||
this.url = url; |
|||
this.content = content; |
|||
} |
|||
|
|||
public String getTitle() { |
|||
return title; |
|||
} |
|||
|
|||
public void setTitle(String title) { |
|||
this.title = title; |
|||
} |
|||
|
|||
public String getUrl() { |
|||
return url; |
|||
} |
|||
|
|||
public void setUrl(String url) { |
|||
this.url = url; |
|||
} |
|||
|
|||
public String getContent() { |
|||
return content; |
|||
} |
|||
|
|||
public void setContent(String content) { |
|||
this.content = content; |
|||
} |
|||
|
|||
@Override |
|||
public String toString() { |
|||
return "Article{" |
|||
+ "title='" + title + '\'' |
|||
+ ", url='" + url + '\'' |
|||
+ '}'; |
|||
} |
|||
} |
|||
@ -0,0 +1,41 @@ |
|||
package com.example.datacollect.repository; |
|||
|
|||
import com.example.datacollect.model.Article; |
|||
import java.util.ArrayList; |
|||
import java.util.Collections; |
|||
import java.util.List; |
|||
|
|||
public class ArticleRepository { |
|||
private final List<Article> articles = new ArrayList<>(); |
|||
|
|||
public void add(Article article) { |
|||
if (article == null) { |
|||
throw new IllegalArgumentException("Article cannot be null"); |
|||
} |
|||
articles.add(article); |
|||
} |
|||
|
|||
public void addAll(List<Article> articleList) { |
|||
if (articleList == null) { |
|||
throw new IllegalArgumentException("Article list cannot be null"); |
|||
} |
|||
for (Article article : articleList) { |
|||
if (article == null) { |
|||
throw new IllegalArgumentException("Article in list cannot be null"); |
|||
} |
|||
} |
|||
articles.addAll(articleList); |
|||
} |
|||
|
|||
public List<Article> getAll() { |
|||
return Collections.unmodifiableList(articles); |
|||
} |
|||
|
|||
public int size() { |
|||
return articles.size(); |
|||
} |
|||
|
|||
public void clear() { |
|||
articles.clear(); |
|||
} |
|||
} |
|||
@ -0,0 +1,27 @@ |
|||
package com.example.datacollect.strategy; |
|||
|
|||
import com.example.datacollect.model.Article; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class BlogStrategy extends PriorityStrategy { |
|||
private static final int PRIORITY = 100; |
|||
private static final String URL_PATTERN = ".*blog\\.example\\.com.*"; |
|||
|
|||
public BlogStrategy() { |
|||
super(PRIORITY, URL_PATTERN); |
|||
} |
|||
|
|||
@Override |
|||
public List<Article> parse(String url, Document doc) { |
|||
List<Article> articles = new ArrayList<>(); |
|||
Elements titles = doc.select(".post-title"); |
|||
for (Element e : titles) { |
|||
articles.add(new Article(e.text(), url, "")); |
|||
} |
|||
return articles; |
|||
} |
|||
} |
|||
@ -0,0 +1,10 @@ |
|||
package com.example.datacollect.strategy; |
|||
|
|||
import com.example.datacollect.model.Article; |
|||
import org.jsoup.nodes.Document; |
|||
import java.util.List; |
|||
|
|||
public interface CrawlStrategy { |
|||
List<Article> parse(String url, Document doc); |
|||
boolean supports(String url); |
|||
} |
|||
@ -0,0 +1,37 @@ |
|||
package com.example.datacollect.strategy; |
|||
|
|||
import com.example.datacollect.model.Article; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class DefaultStrategy implements CrawlStrategy { |
|||
|
|||
@Override |
|||
public boolean supports(String url) { |
|||
return true; |
|||
} |
|||
|
|||
@Override |
|||
public List<Article> parse(String url, Document doc) { |
|||
List<Article> articles = new ArrayList<>(); |
|||
|
|||
Elements links = doc.select("a[href]"); |
|||
for (Element link : links) { |
|||
String title = link.text().trim(); |
|||
String href = link.attr("abs:href"); |
|||
|
|||
if (!title.isEmpty() && title.length() > 5) { |
|||
articles.add(new Article(title, href.isEmpty() ? url : href, "")); |
|||
} |
|||
|
|||
if (articles.size() >= 20) { |
|||
break; |
|||
} |
|||
} |
|||
|
|||
return articles; |
|||
} |
|||
} |
|||
@ -0,0 +1,51 @@ |
|||
package com.example.datacollect.strategy; |
|||
|
|||
import com.example.datacollect.model.Article; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class HnuNewsStrategy extends PriorityStrategy { |
|||
private static final int PRIORITY = 200; |
|||
private static final String URL_PATTERN = ".*news\\.hnu\\.edu\\.cn.*"; |
|||
|
|||
public HnuNewsStrategy() { |
|||
super(PRIORITY, URL_PATTERN); |
|||
} |
|||
|
|||
@Override |
|||
public List<Article> parse(String url, Document doc) { |
|||
List<Article> articles = new ArrayList<>(); |
|||
Elements listItems = doc.select("ul.list11 li"); |
|||
|
|||
for (Element li : listItems) { |
|||
Element link = li.selectFirst("a"); |
|||
if (link == null) continue; |
|||
|
|||
String articleUrl = link.attr("href"); |
|||
if (!articleUrl.startsWith("http")) { |
|||
articleUrl = "https://news.hnu.edu.cn" + articleUrl.replace("..", ""); |
|||
} |
|||
|
|||
String title = ""; |
|||
Element titleEl = link.selectFirst("h4.l2.h4s2"); |
|||
if (titleEl != null) { |
|||
title = titleEl.text().trim(); |
|||
} |
|||
|
|||
String content = ""; |
|||
Element contentEl = link.selectFirst("p.l3.ps3"); |
|||
if (contentEl != null) { |
|||
content = contentEl.text().trim(); |
|||
} |
|||
|
|||
if (!title.isEmpty()) { |
|||
articles.add(new Article(title, articleUrl, content)); |
|||
} |
|||
} |
|||
|
|||
return articles; |
|||
} |
|||
} |
|||
@ -0,0 +1,27 @@ |
|||
package com.example.datacollect.strategy; |
|||
|
|||
import com.example.datacollect.model.Article; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class NewsStrategy extends PriorityStrategy { |
|||
private static final int PRIORITY = 100; |
|||
private static final String URL_PATTERN = ".*news\\.example\\.com.*"; |
|||
|
|||
public NewsStrategy() { |
|||
super(PRIORITY, URL_PATTERN); |
|||
} |
|||
|
|||
@Override |
|||
public List<Article> parse(String url, Document doc) { |
|||
List<Article> articles = new ArrayList<>(); |
|||
Elements items = doc.select(".article-headline"); |
|||
for (Element e : items) { |
|||
articles.add(new Article(e.text(), url, "")); |
|||
} |
|||
return articles; |
|||
} |
|||
} |
|||
@ -0,0 +1,27 @@ |
|||
package com.example.datacollect.strategy; |
|||
|
|||
import java.util.regex.Pattern; |
|||
|
|||
public abstract class PriorityStrategy implements CrawlStrategy, Comparable<PriorityStrategy> { |
|||
private final int priority; |
|||
private final Pattern urlPattern; |
|||
|
|||
public PriorityStrategy(int priority, String regexPattern) { |
|||
this.priority = priority; |
|||
this.urlPattern = Pattern.compile(regexPattern); |
|||
} |
|||
|
|||
@Override |
|||
public boolean supports(String url) { |
|||
return urlPattern.matcher(url).matches(); |
|||
} |
|||
|
|||
@Override |
|||
public int compareTo(PriorityStrategy other) { |
|||
return Integer.compare(other.priority, this.priority); |
|||
} |
|||
|
|||
public int getPriority() { |
|||
return priority; |
|||
} |
|||
} |
|||
@ -0,0 +1,42 @@ |
|||
package com.example.datacollect.view; |
|||
|
|||
import com.example.datacollect.model.Article; |
|||
import java.util.List; |
|||
import java.util.Scanner; |
|||
|
|||
public class ConsoleView { |
|||
private static final String ANSI_RESET = "\u001B[0m"; |
|||
private static final String ANSI_GREEN = "\u001B[32m"; |
|||
private static final String ANSI_RED = "\u001B[31m"; |
|||
private static final String ANSI_BLUE = "\u001B[34m"; |
|||
|
|||
private final Scanner scanner = new Scanner(System.in); |
|||
|
|||
public String readLine() { |
|||
System.out.print("> "); |
|||
return scanner.nextLine(); |
|||
} |
|||
|
|||
public void printSuccess(String msg) { |
|||
System.out.println(ANSI_GREEN + msg + ANSI_RESET); |
|||
} |
|||
|
|||
public void printError(String msg) { |
|||
System.out.println(ANSI_RED + msg + ANSI_RESET); |
|||
} |
|||
|
|||
public void printInfo(String msg) { |
|||
System.out.println(ANSI_BLUE + msg + ANSI_RESET); |
|||
} |
|||
|
|||
public void display(List<Article> articles) { |
|||
if (articles.isEmpty()) { |
|||
printInfo("暂无文章,请先执行 crawl。"); |
|||
return; |
|||
} |
|||
for (int i = 0; i < articles.size(); i++) { |
|||
Article a = articles.get(i); |
|||
System.out.println((i + 1) + ". " + a.getTitle() + " | " + a.getUrl()); |
|||
} |
|||
} |
|||
} |
|||
|
After Width: | Height: | Size: 386 KiB |
|
After Width: | Height: | Size: 902 KiB |
Loading…
Reference in new issue