Scraping Beijing Municipal Citizen Letters from the Capital Window (首都之窗) Portal with HtmlUnit and WebMagic (source code and HtmlUnit/WebMagic JAR packages attached)
The first and second pages of the Capital Window site share the same URL, which makes the site quite different from the one in Lin Ziyu's (林子雨) tutorial. This code therefore uses HtmlUnit to simulate clicks: jumping to a page by its number still displays the first page, so the crawler simply keeps clicking the "next page" button.
Target URL: http://www.beijing.gov.cn/hudong/hdjl/com.web.search.mailList.flow
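Stripped of the configuration details, the approach boils down to loading the list page once and then repeatedly clicking the nextPage button while re-reading the refreshed DOM. The minimal sketch below illustrates just that pattern; the button name nextPage is the one used in the full program further down, while the five-page limit and the 3-second AJAX wait are arbitrary values chosen only for the sketch.

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlElement;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

public class NextPageSketch {
    public static void main(String[] args) throws Exception {
        try (WebClient client = new WebClient(BrowserVersion.CHROME)) {
            client.getOptions().setThrowExceptionOnScriptError(false);
            client.setAjaxController(new NicelyResynchronizingAjaxController()); // let AJAX calls finish

            HtmlPage page = client.getPage("http://www.beijing.gov.cn/hudong/hdjl/com.web.search.mailList.flow");
            HtmlElement nextButton = page.getElementByName("nextPage");          // the "next page" button

            for (int i = 0; i < 5; i++) {                  // arbitrary number of pages for the sketch
                System.out.println(page.asXml().length()); // the DOM of the page currently shown
                page = nextButton.click();                 // the URL stays the same; only the DOM changes
                client.waitForBackgroundJavaScript(3_000); // give the AJAX refresh time to complete
            }
        }
    }
}

Clicking returns the updated HtmlPage, which is why the loop reassigns page on every iteration.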
Code for collecting the letter list:
package util;

import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.ImmediateRefreshHandler;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlElement;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

public class 首都之窗 {
    static List<String> lines_zi = new LinkedList<String>();
    static List<String> lines_jian = new LinkedList<String>();
    static List<String> lines_tou = new LinkedList<String>();

    public static void Value_start() {
        WebClient webClient = new WebClient(BrowserVersion.CHROME);            // headless browser client
        try {
            webClient.getOptions().setActiveXNative(false);
            webClient.getOptions().setJavaScriptEnabled(true);                 // the list is rendered by JavaScript
            webClient.getOptions().setDoNotTrackEnabled(true);
            webClient.getOptions().setThrowExceptionOnScriptError(false);      // do not fail on JS errors
            webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
            webClient.getOptions().setCssEnabled(false);                       // CSS is not needed
            webClient.getOptions().setUseInsecureSSL(true);                    // accept the site's certificate
            webClient.getOptions().setTimeout(2 * 1000);                       // page timeout in ms
            webClient.getCache().setMaxSize(100);
            webClient.getCache().clear();
            webClient.getCookieManager().setCookiesEnabled(true);
            webClient.setAjaxController(new NicelyResynchronizingAjaxController()); // wait for AJAX requests
            webClient.setRefreshHandler(new ImmediateRefreshHandler());
            webClient.setJavaScriptTimeout(600 * 1000);                        // JavaScript timeout in ms
            webClient.waitForBackgroundJavaScript(60 * 1000);

            HtmlPage page = webClient.getPage("http://www.beijing.gov.cn/hudong/hdjl/com.web.search.mailList.flow");
            HtmlElement nextButton = page.getElementByName("nextPage");        // the "next page" button

            FileHandle fh = new FileHandle();        // author's file helper (not shown); the boolean argument of outFile presumably toggles append mode
            StringHandle sh = new StringHandle();    // author's regex helper (not shown)
            List<String> lastInfo_zi = new ArrayList<String>();
            // truncate list.txt before writing
            fh.outFile("" + "\r\n", "E:\\578095023\\FileRecv\\寒假作业\\大三寒假作业\\北京市政百姓信件分析实战\\list.txt", false);

            int j = 1;
            while (j != 600) {                       // crawl 600 list pages
                String nowInfo = page.asXml();
                // every letter link carries an onclick of the form letterdetail('<type>','<originalId>')
                List<String> infoList_zi = sh.getExpString("letterdetail\\('.*?','.*?'\\)", nowInfo);
                int g_size_zi = infoList_zi.size();
                // proceed only when the page differs from the previous one and all 7 letters of a page were found
                if (sh.StringListSameOutStringList(infoList_zi, lastInfo_zi).size() != g_size_zi && g_size_zi == 7) {
                    for (int i = 0; i < g_size_zi; i++) {
                        // keep only the originalId
                        String theWeb = infoList_zi.get(i).replaceAll("letterdetail\\('.*?','", "").replace("')", "");
                        System.out.println(theWeb);
                        lines_zi.add(theWeb);
                        fh.outFile(theWeb + "\r\n", "E:\\578095023\\FileRecv\\寒假作业\\大三寒假作业\\北京市政百姓信件分析实战\\list.txt", true);
                        if (i == g_size_zi - 1) {
                            lastInfo_zi = infoList_zi;
                            System.out.println(j);
                            j++;
                            break;
                        }
                    }
                    page = nextButton.click();       // click "next page"; the URL never changes
                }
            }
        } catch (FailingHttpStatusCodeException | IOException e) {
            e.printStackTrace();
        } finally {
            webClient.close();                       // release the client
        }
    }

    public static void main(String[] args) {
        Value_start();
    }
}
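Each letter on the list page is rendered with an onclick handler of the form letterdetail('<type>','<originalId>'), and the program above extracts those fragments with the author's StringHandle helper, which is not included in the post. Assuming getExpString simply returns every match of the given regular expression, a stand-alone equivalent of the extraction step could look like this (the id value in the example is made up):

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class LetterIdExtractor {
    // return every substring of html that matches regex
    // (assumed behaviour of the author's StringHandle.getExpString)
    static List<String> getExpString(String regex, String html) {
        List<String> matches = new ArrayList<String>();
        Matcher m = Pattern.compile(regex).matcher(html);
        while (m.find()) {
            matches.add(m.group());
        }
        return matches;
    }

    public static void main(String[] args) {
        String html = "<a onclick=\"letterdetail('consult','AH20012345')\">...</a>";
        for (String hit : getExpString("letterdetail\\('.*?','.*?'\\)", html)) {
            // strip everything except the originalId, exactly as the crawler does
            String id = hit.replaceAll("letterdetail\\('.*?','", "").replace("')", "");
            System.out.println(id); // prints the made-up id AH20012345
        }
    }
}

The two-step replaceAll/replace chain could also be collapsed into a single capture group; the sketch mirrors the original chain so its output matches the crawler's.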
Code for scraping the detail pages:
package util;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

import com.dao.InfoDao;
import util.SslUtils;

public class pa2 implements PageProcessor {
    static int num = 0;
    static String Id;
    static String Question;
    static String Question_user;
    static String Question_date;
    static String Question_info;
    static String Answer;
    static String Answer_user;
    static String Answer_date;
    static String Answer_info;
    static String Url;
    static String aa = ""; // replacement for stripped characters; "" removes them outright

    // crawl settings: 3 retries, 100 ms between requests
    private Site site = Site.me().setRetryTimes(3).setSleepTime(100);
    private static int count = 0;

    @Override
    public Site getSite() {
        return site;
    }

    // list page
    public void parent(Page page) {
        System.out.println("抓取的内容\n"
                + page.getHtml().xpath("//span[@name='cutStr' and @dispLength='68']//text()").get());
    }

    // read the originalIds collected by the HtmlUnit crawler and queue the three possible detail URLs for each one
    public void child(Page page) throws IOException {
        BufferedReader br = new BufferedReader(new InputStreamReader(
                new FileInputStream(new File("E:\\578095023\\FileRecv\\寒假作业\\大三寒假作业\\list.txt")), "UTF-8"));
        String line = null;
        while ((line = br.readLine()) != null) {
            // a letter is either a consultation, a suggestion or a complaint; request all three detail pages
            String url = "http://www.beijing.gov.cn/hudong/hdjl/com.web.consult.consultDetail.flow?originalId=" + line;
            System.out.println(url);
            page.addTargetRequest(url);

            url = "http://www.beijing.gov.cn/hudong/hdjl/com.web.consult.suggesDetail.flow?originalId=" + line;
            System.out.println(url);
            page.addTargetRequest(url);

            url = "http://www.beijing.gov.cn/hudong/hdjl/com.web.consult.complainDetail.flow?originalId=" + line;
            System.out.println(url);
            page.addTargetRequest(url);
        }
        br.close();

        if (page.getUrl().regex("http://www.beijing.gov.cn/hudong/hdjl/com.web.search.mailList.flow").match()) {
            parent(page);
        } else {
            parseDetail(page);
        }
        count++;
    }

    // extract the fields of one letter detail page and store them
    private void parseDetail(Page page) {
        Question = page.getHtml().xpath("//div[contains(@class, 'col-xs-10')]/strong//text()").get().trim();

        Question_user = page.getHtml().xpath("//div[contains(@class, 'col-xs-12') and contains(@class, 'my-3')]/div[contains(@class, 'col-xs-10') and contains(@class, 'text-muted')]//text()").get().trim();
        Question_user = Question_user.replaceAll("来信人", aa).trim();
        Question_user = Question_user.replaceAll(":", aa).trim();

        Question_date = page.getHtml().xpath("//div[contains(@class, 'col-xs-12')]/div[contains(@class, 'col-xs-5')]//text()").get();
        Question_date = Question_date.replaceAll("时间", aa).trim();
        Question_date = Question_date.replaceAll(":", aa).trim();

        Question_info = page.getHtml().xpath("//div[contains(@class, 'col-xs-12') and contains(@class, 'mx-2')]//text()").get();

        Answer = page.getHtml().xpath("//div[contains(@class, 'col-xs-9') and contains(@class, 'my-2')]//text()").get();
        Answer_user = page.getHtml().xpath("//div[contains(@class, 'col-xs-9') and contains(@class, 'my-2')]//text()").get();

        Answer_date = page.getHtml().xpath("//div[contains(@class, 'col-xs-12') and contains(@class, 'col-sm-3') and contains(@class, 'col-md-3') and contains(@class, 'my-2')]//text()").get();
        Answer_date = Answer_date.replaceAll("答复时间", aa).trim();
        Answer_date = Answer_date.replaceAll(":", aa).trim();

        // the answer body can span several nested elements, so collect every text node
        List<String> values = page.getHtml().xpath("//div[contains(@class, 'col-xs-12') and contains(@class, 'my-3') and contains(@class, 'p-4')]//*//text()").all();
        Answer_info = "";
        for (String value : values) {
            Answer_info += value;
        }
        if (Answer_info.isEmpty()) {
            Answer_info = page.getHtml().xpath("//div[contains(@class, 'col-xs-12') and contains(@class, 'my-3') and contains(@class, 'p-4')]//text()").get();
        }
        Answer_info = Answer_info.replaceAll("\\?", aa).trim(); // "?" must be escaped inside a regex

        Url = page.getUrl().get();
        System.out.println("抓取的内容\n"
                + page.getHtml().xpath("//div[contains(@class, 'col-xs-10')]/strong//text()").get());

        System.out.println("Id:" + Id
                + "\n Question:" + Question
                + "\n Question_user:" + Question_user
                + "\n Question_date:" + Question_date
                + "\n Question_info:" + Question_info
                + "\n Answer:" + Answer
                + "\n Answer_user:" + Answer_user
                + "\n Answer_date:" + Answer_date
                + "\n Answer_info:" + Answer_info
                + "\n Url:" + Url);
        InfoDao.add(Question, Question_user, Question_date, Question_info, Answer, Answer_user, Answer_date, Answer_info, Url);
    }

    @Override
    public void process(Page page) {
        num = num + 1;
        if (num == 1) {          // the first page processed is the list page
            try {
                child(page);
            } catch (IOException e) {
                e.printStackTrace();
            }
        } else {                 // every later page is a letter detail page
            parseDetail(page);
            count++;
        }
    }

    public static void main(String[] args) {
        try {
            SslUtils.ignoreSsl(); // author's helper that disables SSL certificate checks (not shown)
        } catch (Exception e) {
            e.printStackTrace();
        }
        System.out.println("开始爬取...");
        InfoDao.delete();         // clear the previous run's records
        long startTime = System.currentTimeMillis();
        Spider.create(new pa2())
                .addUrl("http://www.beijing.gov.cn/hudong/hdjl/com.web.search.mailList.flow")
                .thread(5)
                .run();
        long endTime = System.currentTimeMillis();
        System.out.println("爬取结束,耗时约" + ((endTime - startTime) / 1000) + "秒,抓取了" + count + "条记录");
    }
}
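InfoDao, InfoBean and SslUtils come from the author's own project and are not listed in this post. Purely as an illustration, assuming the records go into a MySQL table named info with one text column per field, InfoDao could be sketched roughly as follows (table name, column names and connection settings are assumptions, not taken from the original):

package com.dao;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;

// hypothetical stand-in for the author's InfoDao; all schema and connection details are assumed
public class InfoDao {
    private static final String URL = "jdbc:mysql://localhost:3306/letters?useUnicode=true&characterEncoding=utf8";
    private static final String USER = "root";
    private static final String PASSWORD = "password";

    private static Connection open() throws SQLException {
        return DriverManager.getConnection(URL, USER, PASSWORD);
    }

    // remove all rows before a new crawl
    public static void delete() {
        try (Connection c = open(); PreparedStatement ps = c.prepareStatement("DELETE FROM info")) {
            ps.executeUpdate();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    // insert one letter record
    public static void add(String question, String questionUser, String questionDate, String questionInfo,
                           String answer, String answerUser, String answerDate, String answerInfo, String url) {
        String sql = "INSERT INTO info (question, question_user, question_date, question_info, "
                + "answer, answer_user, answer_date, answer_info, url) VALUES (?,?,?,?,?,?,?,?,?)";
        try (Connection c = open(); PreparedStatement ps = c.prepareStatement(sql)) {
            String[] values = {question, questionUser, questionDate, questionInfo,
                               answer, answerUser, answerDate, answerInfo, url};
            for (int i = 0; i < values.length; i++) {
                ps.setString(i + 1, values[i]);
            }
            ps.executeUpdate();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }
}

With a table like this in place, the fields printed by pa2 map one-to-one onto the insert parameters.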
Download link:
Tags: info, htmlunit, contains, Question, date, Answer, source code, class, webmagic. Source: https://www.cnblogs.com/smartisn/p/12237534.html