package com.chao.crawler; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Set;

import com.chao.util.ListUtil;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.pipeline.ConsolePipeline; import us.codecraft.webmagic.processor.PageProcessor; public class PageProcesserProduct implements PageProcessor { // private Site site = null; // // public PageProcesser(String domain, String startUrl) { // // site = Site.me().setDomain(domain).addStartUrl(startUrl); // // }
//调试用 private Site site = Site.me().setDomain("http://www.babysittersnow.com") .addStartUrl("http://www.babysittersnow.com.au/babysitters/search"); @Override public void process(Page page) { //System.out.println(page.getUrl()); String Title= page.getHtml().xpath("//div[@class='profile-panel-main']/h1").toString().replaceAll("<[^>]*>", "");; page.putField("Title",Title); String Info=page.getHtml().xpath("//div[@class='profile-panel-details']").toString().replaceAll("<[^>]*>", "");; page.putField("Info",Info); String Review=page.getHtml().xpath("//div[@class='review']/p").toString(); page.putField("Review",Review); String Introduction=page.getHtml().xpath("//div[@id='profile-tab-introduction']").toString().replaceAll("<[^>]*>", "");; page.putField("Introduction",Introduction); String Details=page.getHtml().xpath("//div[@id='profile-tab-details']").toString().replaceAll("<[^>]*>", "");; page.putField("Details",Details); String Insights=page.getHtml().xpath("//div[@id='profile-tab-insights']").toString().replaceAll("<[^>]*>", "");; page.putField("Insights",Insights); System.out.println("商品筛选完毕,准备执行存储"); // page.putField("author", page.getHtml().$("div.Resume").toString()); // page.putField("info", page.getHtml().xpath("//p[@class='profile-panel-details']/p/label/text()").toString()); Product product = new Product(); product.setTitle(Title); product.setInfo(Info); product.setReview(Review); product.setIntroduction(Introduction); product.setDetails(Details); product.setInsights(Insights); page.putField("product", product); System.out.println("----------------------------------------------------"); } @Override public Site getSite() { return site; } public static void main(String[] args) { Spider.create(new PageProcesserProduct()) .pipeline(new ConsolePipeline()).thread(10).run(); }
讯享网
}

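// Copyright notice: this content was contributed voluntarily by internet users, and the views expressed are solely those of the author. This site only provides information storage services, holds no ownership of the content, and assumes no related legal liability. If you find content on this site that is suspected of infringement or of violating laws or regulations, please contact us; once verified, it will be removed immediately.
// When reposting, please retain the source: https://51itzy.com/kjqy/193550.html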