在 Java 中使用 CompletableFuture 实现的多线程基本 Web 爬虫返回空响应
下面是我使用 CompletableFuture 为模拟网络爬虫编写的代码,但它总是返回空响应。有人能帮我看一下吗?
class Solution {
public List<String> crawl(String startUrl, HtmlParser htmlParser) {
final Set<String> doneSet = ConcurrentHashMap.newKeySet();
final String calledDomain = getDomain(startUrl);
try {
return this.crawlHelper(startUrl, htmlParser, doneSet, calledDomain);
} catch (final Exception e) {
return Collections.emptyList();
}
}
public List<String> crawlHelper(final String url,
final HtmlParser parser,
final Set<String> doneSet,
final String calledDomain) {
if (doneSet.contains(url)) {
return Collections.emptyList();
}
doneSet.add(url);
try {
return CompletableFuture.supplyAsync(() -> parser.getUrls(url))
.thenApplyAsync(childUrls -> crawlChildren(childUrls, parser, doneSet, calledDomain)).get();
} catch (final Exception e) {
return Collections.emptyList();
}
}
public List<String> crawlChildren(final List<String> childUrls,
final HtmlParser parser,
final Set<String> doneSet,
final String calledDomain) {
try {
return childUrls.stream()
.filter(curl -> !doneSet.contains(curl))
.filter(curl -> getDomain(curl).equals(calledDomain))
.map(curl -> CompletableFuture.supplyAsync(() -> crawlHelper(curl, parser, doneSet, calledDomain)))
.collect(Collectors.toList())
.stream()
.map(CompletableFuture::join)
.flatMap(List::stream)
.collect(Collectors.toList());
} catch (final Exception e) {
return Collections.emptyList();
}
}
private String getDomain(final String url) {
return url.replace("http://", "").split("/")[0];
}
public static void main(String[] args) {
Solution sol = new Solution();
HtmlParser parser = new HtmlParser();
System.out.println(sol.crawl("http://news.yahoo.com/news/topics/", parser));
}
}
以下是模拟 HTML 解析器的代码,仅供参考以便更好地理解:
public class HtmlParser {

    // Mock link graph: page URL -> list of URLs found on that page.
    // Static initializer replaces the original double-brace initialization
    // (an anonymous subclass per instance — a known leak/equals hazard).
    // NOTE(review): some entries use "new.yahoo.com" while others use
    // "news.yahoo.com" — presumably typos in the mock data; kept verbatim,
    // confirm the intended domains.
    private static final Map<String, List<String>> ret = new HashMap<String, List<String>>();
    static {
        ret.put("http://new.yahoo.com", Arrays.asList("http://news.yahoo.com/news/topics/", "http://new.yahoo.com"));
        ret.put("http://news.yahoo.com/news", Arrays.asList("http://news.yahoo.com/news/topics/", "http://new.yahoo.com/news"));
        ret.put("http://news.yahoo.com/news/topics/", Arrays.asList("http://news.yahoo.com/news/topics/", "http://news.google.com"));
        ret.put("http://news.google.com", Arrays.asList("http://news.google.com", "http://news.yahoo.com/news"));
        ret.put("http://news.yahoo.com/us", Arrays.asList("http://news.yahoo.com/us", "http://news.yahoo.com"));
    }

    /**
     * Returns the links found on the given page.
     *
     * @param url the page to look up
     * @return the page's outgoing links, or an empty list for unknown URLs.
     *         BUG FIX: the original returned {@code null} here, which caused
     *         an NPE (silently swallowed) in the caller's stream pipeline.
     */
    public List<String> getUrls(String url) {
        return ret.getOrDefault(url, Arrays.asList());
    }
}
共 0 个答案