2021-03-27
Author: Internet
A Java crawler (HttpClient + Jsoup): a worked example that downloads high-resolution wallpapers
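The program below walks the wallpaper listing on umei.cc: HttpClient fetches each list page, Jsoup pulls the per-image detail links out of the HTML, a second request loads each detail page to find the full-size image URL, and Commons IO streams the bytes to disk. Three libraries are needed on the classpath, matching the imports below: org.apache.httpcomponents:httpclient, org.jsoup:jsoup, and commons-io:commons-io (any reasonably recent versions should work).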
package practice;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class BaidouCrawler {
    // Running total of images downloaded across all pages
    static int sum = 0;

    public static void main(String[] args) throws ClientProtocolException, IOException {
        downLoadPicture(1); // start at list page 1; the method recurses through later pages
    }
    public static void downLoadPicture(int page) throws ClientProtocolException, IOException {
        // Create an HTTP client; it is reused below for the detail pages and the images
        CloseableHttpClient httpClient = HttpClients.createDefault();
        // Request one list page of thumbnails
        HttpGet httpGet = new HttpGet("https://www.umei.cc/bizhitupian/diannaobizhi/" + page + ".htm");
        CloseableHttpResponse httpResponse = httpClient.execute(httpGet);
        // The entity wraps the response body
        HttpEntity httpEntity = httpResponse.getEntity();
        // Decode the page source as UTF-8 so the Chinese text is not garbled
        String content = EntityUtils.toString(httpEntity, "UTF-8");
        httpResponse.close();
        // The image links are buried in the raw HTML; Jsoup does the cleaning.
        // Passing the base URI lets absUrl() resolve relative links later on.
        Document document = Jsoup.parse(content, "https://www.umei.cc/");
        // One <a> per thumbnail on the list page
        Elements elements = document.select("div.TypeList ul li a");
        int i = 0;
        try {
            // A full list page carries 30 thumbnails; the last page has fewer,
            // so elements.get(i) eventually throws and the catch below ends the crawl
            for (i = 0; i < 30; i++) {
                Element element = elements.get(i);
                // href points at the detail page that hosts the full-size image
                String attr = element.absUrl("href");
                // Repeat the request/response steps for the detail page, reusing the client
                HttpGet httpGet2 = new HttpGet(attr);
                CloseableHttpResponse httpResponse2 = httpClient.execute(httpGet2);
                HttpEntity httpEntity2 = httpResponse2.getEntity();
                String content2 = EntityUtils.toString(httpEntity2, "UTF-8");
                httpResponse2.close();
                // Parse the detail page and pick out the <img> holding the picture
                Document document2 = Jsoup.parse(content2, attr);
                Elements elements2 = document2.select("div.wrap div.ImageBody p img");
                Element element2 = elements2.get(0);
                // src is the address of the full-resolution image
                String attr3 = element2.absUrl("src");
                // Fetch the image bytes and stream them into a local file
                HttpGet httpGet3 = new HttpGet(attr3);
                CloseableHttpResponse httpResponse3 = httpClient.execute(httpGet3);
                HttpEntity httpEntity3 = httpResponse3.getEntity();
                InputStream stream = httpEntity3.getContent();
                // copyInputStreamToFile closes the input stream when it finishes
                FileUtils.copyInputStreamToFile(stream, new File("C://爬虫图片//" + page + "-" + i + ".png"));
                httpResponse3.close();
                sum++;
                System.out.println("Page " + page + ", image " + (i + 1) + " downloaded; "
                        + sum + " images so far, saved under C:/爬虫图片");
            }
        } catch (Exception e) {
            // Fewer than 30 thumbnails (or any other failure) means we are done: stop recursing
            System.out.println("Finished: " + sum + " images downloaded in total");
            return;
        } finally {
            httpClient.close(); // release the connection pool on every exit path
        }
        page++; // move on to the next list page
        downLoadPicture(page);
    }
}
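The select/attr calls above are ordinary CSS selection, independent of the crawling itself. Here is a minimal self-contained sketch of the same idea, using inline HTML that imitates the (assumed) list-page markup:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class SelectorDemo {
    public static void main(String[] args) {
        // Hypothetical markup shaped like the thumbnail list the crawler targets
        String html = "<div class=\"TypeList\"><ul>"
                + "<li><a href=\"https://www.umei.cc/bizhitupian/diannaobizhi/1234.htm\">pic</a></li>"
                + "</ul></div>";
        Document doc = Jsoup.parse(html);
        // Same selector as the crawler: every <a> under div.TypeList ul li
        for (Element a : doc.select("div.TypeList ul li a")) {
            System.out.println(a.attr("href")); // prints the detail-page link
        }
    }
}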
A sample run (the original screenshot is not reproduced here) shows the crawler fetching desktop wallpapers page by page into C:/爬虫图片.
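For comparison, Jsoup can also do the fetching itself, which makes HttpClient and the manual entity handling unnecessary. The following is a sketch for a single list page, under the assumption that the selectors above still match the site; Jsoup's connect() API issues the requests, and ignoreContentType(true) lets it download raw image bytes:

package practice;

import java.io.File;
import java.io.IOException;
import org.apache.commons.io.FileUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class JsoupOnlyCrawler {
    public static void main(String[] args) throws IOException {
        int page = 1; // first list page, as in the original
        Document listPage = Jsoup.connect("https://www.umei.cc/bizhitupian/diannaobizhi/" + page + ".htm").get();
        int i = 0;
        for (Element a : listPage.select("div.TypeList ul li a")) {
            // Load the detail page behind each thumbnail
            Document detail = Jsoup.connect(a.absUrl("href")).get();
            Element img = detail.selectFirst("div.wrap div.ImageBody p img");
            if (img == null) continue; // skip entries whose markup differs
            byte[] bytes = Jsoup.connect(img.absUrl("src"))
                    .ignoreContentType(true) // the response is an image, not HTML
                    .maxBodySize(0)          // lift the default body-size cap; images can exceed it
                    .execute()
                    .bodyAsBytes();
            FileUtils.writeByteArrayToFile(new File("C://爬虫图片//" + page + "-" + (i++) + ".png"), bytes);
        }
        System.out.println(i + " images downloaded from page " + page);
    }
}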
Source: https://blog.csdn.net/qq_22812043/article/details/115274136