Spring boot+webMagic实现自动化爬取网站内容
作者:互联网
本文摘自:https://funyan.cn/p/6861.html
前言
在网站中,内容才是王道。如果你的网站功能很多、也很好看,但是没有内容,那么将毫无意义;而如果全靠站长自己写,那将是一段非常漫长的过程。所以这时候就需要借鉴其他站的内容来丰富自己的网站,爬虫也就应运而生了。今天就教大家如何使用Spring Boot+WebMagic实现自动化爬取网站内容。
开发过程
第一步:引入webMagic
<!-- WebMagic extension module, version 0.7.3. -->
<!-- NOTE(review): the crawler class below uses BloomFilterDuplicateRemover, which is built on
     Guava's BloomFilter; the WebMagic docs say Guava has to be declared as a separate
     dependency for it. Confirm com.google.guava:guava is on the classpath. -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.3</version>
</dependency>
第二步:编写工具类
package cn.funyan.blog.task;
import cn.funyan.blog.Article;
import cn.funyan.blog.ArticleSpider;
import cn.funyan.blog.service.ArticleSpiderService;
import cn.funyan.utils.SpringUtil;
import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
import com.baomidou.mybatisplus.core.conditions.update.LambdaUpdateWrapper;
import com.baomidou.mybatisplus.core.toolkit.Wrappers;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang.StringUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;
import java.util.List;
/**
* @Description: 自动化爬虫工具类
* @Author: Chris.Ren
* @Date: 2021-06-29
*/
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@Slf4j
@Component
public class WebMagicUtil implements PageProcessor
{
//id
private Integer id;
//url
private String url;
//文章列表dom值
private String listDom;
//文章分页dom值
private String pageDom;
//文章详情dom值
private String articleDom;
//文章标题dom值
private String headDom;
//文章内容dom值
private String contentDom;
//文章标签dom值
private String tagDom;
//文章标签类型
private Integer tagType;
//文章题图
private String img;
//这里需要autowird,因为我们在创建爬虫的时候需要用到这个保存
@Autowired
private SaveArticle saveArticle;
public WebMagicUtil(WebMagicUtil webMagicUtil) {
this.url = webMagicUtil.getUrl();
this.listDom = webMagicUtil.getListDom();
this.pageDom = webMagicUtil.getPageDom();
this.articleDom = webMagicUtil.getArticleDom();
this.headDom = webMagicUtil.getHeadDom();
this.contentDom = webMagicUtil.getContentDom();
this.tagDom = webMagicUtil.getTagDom();
this.tagType = webMagicUtil.getTagType();
this.img = webMagicUtil.getImg();
saveArticle = SpringUtil.getBean(SaveArticle.class);
articleSpiderService = SpringUtil.getBean(ArticleSpiderService.class);
}
@Override
public void process(Page page) {
Html html = page.getHtml();
//获取详情页链接
List<Selectable> linkList = html.css(this.listDom).nodes();
if (linkList.size() <= 1) {
//保存文章
preSaveArticle(page);
} else {
//遍历文章列表,获取详情
for (Selectable link : linkList) {
//详情页图片
String artLink = link.css(this.articleDom).links().toString();
page.addTargetRequest(artLink);
}
//找到下一页
String bkUrl = html.css(this.pageDom).links().toString();
if (!StringUtils.isEmpty(bkUrl)) {
page.addTargetRequest(bkUrl);
}
}
}
//获取文章详情
private void preSaveArticle(Page page) {
try {
Html html = page.getHtml();
//到这里已经有15个详情页
Article article = new Article();
article.setTitlePhoto(this.img);
//标题
article.setHead(html.css(this.headDom, "text").toString());
log.info("正在解析文章:{}", article.getHead());
//内容
String content = html.css(this.contentDom).toString();
article.setContent(content);
//标签
String tag = "";
if (tagType.equals(1)) {
List<Selectable> tagList = html.css(this.tagDom, "text").nodes();
for (Selectable tags : tagList) {
tag = tag + tags.toString() + ",";
}
article.setTags(tag.substring(0, tag.lastIndexOf(",")));
} else {
tag = this.tagDom;
article.setTags(tag);
}
//类型
article.setArticleClassify(3);
article.setCheckStatus(0);
article.setArticleType(2);
article.setDescription(article.getHead());
article.setUserId((int) (Math.random() * 5 + 1));
page.putField("artInfo", article);
} catch (Exception e) {
e.printStackTrace();
}
}
private Site site = new Site()
.setCharset("utf8")
.setTimeOut(10000)
.setRetrySleepTime(3000)
.setRetryTimes(3);
@Override
public Site getSite() {
return site;
}
public void processor(WebMagicUtil webMagicUtil) {
this.url = webMagicUtil.getUrl();
log.info("正在解析网站:{}", url);
Spider.create(new WebMagicUtil(webMagicUtil))
.addUrl(url)
.setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(100000)))
.thread(10)
.addPipeline(saveArticle)
.run();
}
public static void main(String[] args) {
WebMagicUtil wb = new WebMagicUtil();
wb.processor(WebMagicUtil
.builder()
.articleDom("111")
.contentDom("www")
.url("https://funyan.cn")
.build());
}
}
第三步:编写持久化类,就是把你爬取的东西保存下来
package cn.funyan.blog.task;
import cn.funyan.blog.Article;
import cn.funyan.blog.service.ArticleService;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
/**
 * WebMagic {@link Pipeline} that persists crawled articles.
 *
 * <p>Reads the object stored under the result key {@code "artInfo"} and, when present,
 * passes it to {@code ArticleService.sendNewArticle}.
 */
@Slf4j
@Component
public class SaveArticle implements Pipeline {

    @Autowired
    private ArticleService articleService;

    /**
     * Saves the article extracted from one page, if the page produced one.
     *
     * @param resultItems extraction results of the processed page
     * @param task        the running crawl task (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        final Article parsed = resultItems.get("artInfo");
        if (parsed == null) {
            // Pages without an "artInfo" result carry nothing to store.
            return;
        }
        log.info("正在保存文章:{}", parsed.getHead());
        articleService.sendNewArticle(parsed);
    }
}
第四步:调用,因为我们把工具类交给spring管理,所以我们只需要注入到需要的地方即可
// Spring-managed WebMagicUtil bean; its processor(...) method is called to start a crawl.
@Autowired
private WebMagicUtil webMagicUtil;
/**
 * Starts a crawl for the stored crawl configuration with the given id.
 *
 * @param id id of the ArticleSpider record describing the target site
 * @return a failed result when the id is missing, the record does not exist, or its status
 *         is 1 (already crawled); otherwise a default result, returned after the crawl call
 */
@PostMapping("spiderArticle")
public Res spiderArticle(Integer id) {
    if (id == null) {
        return new Res().failed();
    }
    ArticleSpider as = service.getById(id);
    if (as == null) {
        return new Res().failed();
    }
    // Constant-first comparison: a record whose status is null counts as "not crawled yet"
    // instead of causing a NullPointerException.
    if (Integer.valueOf(1).equals(as.getStatus())) {
        return new Res().failed().setMsg("该文章已爬取过,请勿重复爬取");
    }
    webMagicUtil.processor(WebMagicUtil
            .builder()
            .id(as.getId())
            .url(as.getUrl())
            .img(as.getImg())
            .listDom(as.getListDom())
            .articleDom(as.getArticleDom())
            .pageDom(as.getPageDom())
            .headDom(as.getHeadDom())
            .contentDom(as.getContentDom())
            .tagDom(as.getTagDom())
            .tagType(as.getTagType())
            .build());
    return new Res();
}
至此就完成了使用Spring Boot+WebMagic自动化爬取网站内容的功能。后期可以在后台管理系统中调用该接口,填入目标网站对应的DOM选择器,即可自动爬取内容。
本文摘自:https://funyan.cn/p/6861.html
标签:codecraft,Spring,boot,private,webMagicUtil,爬取,article,import,webmagic 来源: https://blog.csdn.net/renhandong321/article/details/118336934