前言
- 上次写到bilibili爬虫,今天上一个Tumblr爬虫,使用的依然是webmagic. Tumblr是什么?老司机必须懂。
注意
-
常规网页展示部分一般分为列表页和详情页,Tumblr站是请求后台api返回json,例如整站搜索接口分页请求,参数加密暂时无解,但是对应blog的列表,可采用不需加密接口,所以还是有办法做对应blog列表的爬虫处理。
-
Tumblr站后端json返回格式各种乱,有json内含有html字符串,格式化时要注意容错,各种坑。
-
还有一个简单的去重处理:每个列表信息均做入库处理,但下载时会用视频封面做一个 md5 比较去重,重复则不下载当前视频,并将重复视频数据进行入库。(下期会细说不同来源视频下载的区别,以及汇总处理)
-
本人的 SpringBoot 采用多线程定时器,分别定时跑爬虫数据和下载爬虫资源,以后会在这里慢慢列出解决方法。
-
SpiderTumblrService为一些数据的入库处理,SslDownloader为webmagic获取https,请求处理。
pom.xml配置
us.codecraft webmagic-core 0.5.2 org.slf4j slf4j-log4j12
process
package win.raychow.modules.spider.base.processor;

import com.alibaba.fastjson.JSON;
import org.json.XML;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import win.raychow.core.base.dao.CacheKey;
import win.raychow.core.base.service.HtmlTool;
import win.raychow.demo.spider.tool.SslDownloader;
import win.raychow.modules.spider.base.dao.SpiderTumblr;
import win.raychow.modules.spider.base.domain.TumblrRecModel;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * Crawls Tumblr per-blog video lists through the unauthenticated
 * {@code /api/read?type=video} endpoint, extracts one {@link SpiderTumblr}
 * record per video post and hands the batch to {@link TumblrPipeLine}.
 *
 * <p>The endpoint answers XML; it is converted to JSON ({@code XML.toJSONObject})
 * and then bound to {@link TumblrRecModel} with fastjson. The embedded
 * {@code video_player} HTML is mined with {@link HtmlTool#match}.
 *
 * Created by ray on 2017/11/19.
 */
@Service
public class TumblrProcessor implements PageProcessor {

    private final Logger logger = LoggerFactory.getLogger(this.getClass());

    @Autowired
    TumblrPipeLine pipeLine;

    // Comma-separated blog-name prefixes per category, injected from config.
    @Value("${spider.tumblr.prefixSexList}")
    private String prefixSexList;

    @Value("${spider.tumblr.prefixAnimalList}")
    private String prefixAnimalList;

    // NOTE(review): name "bashUrl" looks like a "baseUrl" typo; kept as-is
    // because it is a public constant that callers may reference.
    public final static String bashUrl = ".tumblr.com/api/read?type=video&num=20&start=";

    /**
     * Maps a request URL to a category constant by matching the configured
     * blog-name prefixes; {@code Category_Null} when no prefix matches.
     */
    private String getCategory(String url) {
        // 性
        for (String id : prefixSexList.split(CacheKey.Split)) {
            if (url.contains(id)) {
                return SpiderTumblr.Category_AV;
            }
        }
        // 动物
        for (String id : prefixAnimalList.split(CacheKey.Split)) {
            if (url.contains(id)) {
                return SpiderTumblr.Category_Animal;
            }
        }
        return SpiderTumblr.Category_Null;
    }

    @Override
    public Site getSite() {
        //HttpHost httpHost = new HttpHost("127.0.0.1",1087);
        return Site.me()
                //.setHttpProxy(httpHost)
                .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36")
                .setSleepTime(30 * 1000)   // be polite: 30 s between requests
                .setTimeOut(20 * 1000)
                .setRetryTimes(3)
                .setCycleRetryTimes(3);
    }

    @Override
    public void process(Page page) {
        String pageUrl = page.getUrl().toString();
        logger.info(pageUrl);
        if (!pageUrl.contains(bashUrl)) {
            return;
        }
        try {
            // The API answers XML; convert to JSON, then bind to the model.
            String xml = page.getJson().toString();
            String json = XML.toJSONObject(xml).toString();
            TumblrRecModel rec = JSON.parseObject(json, TumblrRecModel.class);
            List<TumblrRecModel.Post> posts = rec.getTumblr().getPosts().getPost();
            List<SpiderTumblr> list = new ArrayList<>();
            TumblrRecModel.Tumblelog tumblelog = rec.getTumblr().getTumblelog();

            // "1&fffff=0" marks the seed (first) page: fan out one request per
            // remaining page of 20 posts, replacing the marker with the offset.
            if (pageUrl.contains("1&fffff=0")) {
                List<String> requestUrls = new ArrayList<>();
                long total = Long.parseLong(rec.getTumblr().getPosts().getTotal());
                long pageMax = total / 20 + 1;
                for (int j = 1; j < pageMax; j++) {
                    requestUrls.add(pageUrl.replace("1&fffff=0", String.valueOf(20 * j)));
                }
                page.addTargetRequests(requestUrls);
            }

            if (posts.isEmpty()) {
                return;
            }
            for (TumblrRecModel.Post post : posts) {
                String str = "";
                try {
                    // video_player is an HTML snippet; normalize quotes so the
                    // attribute matcher sees a single quoting style.
                    str = post.getVideoPlayer().get(0).replace("\"", "'");
                    // Extracted but unused downstream; kept so malformed posts
                    // (no <video id=...>) fail here and are skipped via catch.
                    String id = HtmlTool.match(str, "video", "id").get(0);
                    String poster = HtmlTool.match(str, "video", "poster").get(0);
                    String optionsJson = HtmlTool.match(str, "video", "data-crt-options").get(0);
                    TumblrRecModel.Options optionsRec =
                            JSON.parseObject(optionsJson, TumblrRecModel.Options.class);
                    String file = HtmlTool.match(str, "source", "src").get(0);

                    // 类型
                    String type = "";
                    if (str.toLowerCase().contains("video/mp4")) {
                        type = "mp4";
                    } else if (str.toLowerCase().contains("video/ogg")) {
                        type = "ogg";
                    } else if (str.toLowerCase().contains("video/webm")) {
                        type = "webm";
                    }

                    // Category was appended to the seed URL by run() as &ggggg=<cat>.
                    String category = pageUrl.split("&ggggg=")[1].toLowerCase();
                    // Prefer the HD source when the options carry a usable URL.
                    if (optionsRec.getHdUrl().length() > 10) {
                        file = optionsRec.getHdUrl();
                    }
                    //String type = post.getVideoSource().getExtension();
                    String videoCaption = HtmlTool.removeHtmlTag(post.getVideoCaption());
                    String videoId = "tumblr_"
                            + post.getUrl().substring(post.getUrl().lastIndexOf("/")).substring(1);

                    SpiderTumblr tumblr = new SpiderTumblr();
                    tumblr.setVideoId(videoId);
                    tumblr.setPosterImage(poster);
                    tumblr.setVideoImage(optionsRec.getFilmstrip().getUrl());
                    tumblr.setVideoUrl(file);
                    tumblr.setVideoType(type);
                    tumblr.setTitle(videoCaption);
                    tumblr.setBaseUrl(post.getUrl());
                    tumblr.setCategory(category);
                    tumblr.setBlogTitle(tumblelog.getTitle());
                    list.add(tumblr);
                } catch (Exception e) {
                    // Keep the cause: the raw snippet alone rarely explains the failure.
                    logger.error("xml to data error :" + str, e);
                }
            }
            if (!list.isEmpty()) {
                page.putField("type", 0);
                page.putField("data", list);
            }
        } catch (Exception e) {
            logger.error("url:" + pageUrl, e);
        }
    }

    /** Seeds one first-page URL per configured blog prefix and runs the spider. */
    public void run() {
        // Use the Spring-managed instance (this) so injected fields are present;
        // a fresh `new TumblrProcessor()` would have null @Value properties.
        Spider spider = Spider.create(this)
                //.setDownloader(new HttpClientDownloader())
                //.setDownloader(new HttpDownloader())
                .setDownloader(new SslDownloader()) // https support for webmagic
                //.addPipeline(new ConsolePipeline())//打印到控制台
                .addPipeline(pipeLine);
        //animal
        for (String prefix : prefixAnimalList.split(CacheKey.Split)) {
            spider.addUrl("https://" + prefix + bashUrl + "1&fffff=0"
                    + "&ggggg=" + this.getCategory(prefix));
        }
        //sex
        for (String prefix : prefixSexList.split(CacheKey.Split)) {
            spider.addUrl("https://" + prefix + bashUrl + "1&fffff=0"
                    + "&ggggg=" + this.getCategory(prefix));
        }
        spider.run();
    }
}
PipeLine
package win.raychow.modules.spider.base.processor;import org.springframework.beans.factory.annotation.Autowired;import org.springframework.stereotype.Service;import us.codecraft.webmagic.ResultItems;import us.codecraft.webmagic.Task;import us.codecraft.webmagic.pipeline.Pipeline;import win.raychow.modules.spider.base.dao.SpiderTumblr;import win.raychow.modules.spider.base.dao.SpiderTumblrService;import java.util.List;/** * Created by ray on 2017/11/19. */@Servicepublic class TumblrPipeLine implements Pipeline { @Autowired SpiderTumblrService service; @Override public void process(ResultItems resultItems, Task task){ if (resultItems.getAll().isEmpty() == false) { int type = resultItems.get("type"); if (type == 0){ //列表内容 Listlist = resultItems.get("data"); for (SpiderTumblr tumblr: list) { try { String blogName = tumblr.getBaseUrl().replace("https://","").replace("http://","").split("\\.")[0]; tumblr.setBlogName(blogName); } catch (Exception e){ } service.updateBySpider(tumblr); } } else if(type == 1){ } } }}
原文: