java爬虫爬取高清图片
作者:互联网
代码1:
package com.xy;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
* 爬取网页面图篇
*/
/**
 * Crawls detail pages on pic.netbian.com (https://pic.netbian.com/tupian/270NN.html
 * for NN in 10..98) and downloads the full-size image referenced by each page's
 * ".photo-pic #img img" element into D://img//image//.
 */
public class Test1 {
public static void main(String[] args) throws ClientProtocolException, IOException {
    // try-with-resources guarantees the client is closed even if an iteration throws
    // (the original closed it manually and could NPE on never-assigned responses)
    try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
        for (int i = 10; i < 99; i++) {
            String pageUrl = "https://pic.netbian.com/tupian/270" + i + ".html";
            System.out.println(pageUrl);
            HttpGet httpget = new HttpGet(pageUrl);
            // close each page response per iteration; the original leaked every one
            try (CloseableHttpResponse response = httpclient.execute(httpget)) {
                HttpEntity entity = response.getEntity();
                // decode the page body; NOTE(review): the site may actually serve GBK — confirm
                String content = EntityUtils.toString(entity, "utf-8");
                // parse the HTML and locate the main image element
                Document doc = Jsoup.parse(content);
                Elements elements = doc.select(".photo-pic #img img");
                if (elements.isEmpty()) {
                    // page missing or layout changed — skip instead of relying on
                    // an IndexOutOfBoundsException caught by a broad catch
                    continue;
                }
                // the src attribute is site-relative; prefix the host to build the image URL
                String src = elements.get(0).attr("src");
                String url = "https://pic.netbian.com" + src;
                System.out.println("第" + (i - 9) + "张" + url);
                HttpGet picGet = new HttpGet(url);
                // download the image; stream and response auto-close on every path
                try (CloseableHttpResponse pictureResponse = httpclient.execute(picGet);
                     InputStream inputStream = pictureResponse.getEntity().getContent()) {
                    // file names are unique per i; fixed the "imsge" directory typo
                    FileUtils.copyToFile(inputStream, new File("D://img//image//" + i + "" + 1 + ".jpg"));
                } catch (Exception e) {
                    // best-effort: log and keep crawling the remaining pages
                    e.printStackTrace();
                }
            }
        }
    }
}
}
代码2:
package com.xy;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
* 爬取彼岸图网页面预览图
*/
/**
 * Crawls search-result pages on pic.netbian.com (pages 0..10 of searchid=2453)
 * and downloads every thumbnail found under ".slist ul li a img" into D://img//.
 */
public class Test2 {
public static void main(String[] args) throws ClientProtocolException, IOException {
    // try-with-resources guarantees the client is closed even if an iteration throws
    // (the original closed it manually and could NPE on never-assigned responses)
    try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
        for (int i = 0; i <= 10; i++) {
            String pageUrl = "https://pic.netbian.com/e/search/result/index.php?page=" + i + "&searchid=2453";
            System.out.println(pageUrl);
            HttpGet httpget = new HttpGet(pageUrl);
            // close each page response per iteration; the original leaked every one
            try (CloseableHttpResponse response = httpclient.execute(httpget)) {
                HttpEntity entity = response.getEntity();
                // decode the page body; NOTE(review): the site may actually serve GBK — confirm
                String content = EntityUtils.toString(entity, "utf-8");
                // parse the HTML and collect the result-list items
                Document doc = Jsoup.parse(content);
                Elements elements = doc.select(".slist ul li");
                // iterate over what is actually present instead of a hard-coded 15,
                // which threw IndexOutOfBoundsException on short result pages
                for (int j = 0; j < elements.size(); j++) {
                    Element element = elements.get(j);
                    String src = element.select("a img").attr("src");
                    if (src.isEmpty()) {
                        continue; // list item without a thumbnail — nothing to download
                    }
                    // the src attribute is site-relative; prefix the host directly
                    // (replaces the original round-trip through HttpGet.toString()
                    //  + substring + split on " HTTP/1.1")
                    String imageUrl = "https://pic.netbian.com" + src;
                    System.out.println(imageUrl);
                    HttpGet picGet = new HttpGet(imageUrl);
                    // download the image; stream and response auto-close on every path
                    try (CloseableHttpResponse pictureResponse = httpclient.execute(picGet);
                         InputStream inputStream = pictureResponse.getEntity().getContent()) {
                        // file names are unique per (page i, item j) pair
                        FileUtils.copyToFile(inputStream, new File("D://img//" + i + "" + j + ".jpg"));
                    } catch (Exception e) {
                        // best-effort: log and keep crawling the remaining thumbnails
                        e.printStackTrace();
                    }
                }
            }
        }
    }
}
}
标签:HttpGet,java,String,http,爬虫,爬取,org,apache,import 来源: https://blog.csdn.net/weixin_45902973/article/details/115253031