做个爬虫看看豆瓣电影排行的电影的各项信息
作者:互联网
做个爬虫看看豆瓣电影排行的电影的各项信息
合理利用正则表达式,并记得在请求中加上 request header(例如 User-Agent),否则请求会被网站拦截。
package zhengze;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.net.URL;
import java.net.MalformedURLException;
import java.net.HttpURLConnection;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.BufferedReader;
/**
 * Small HTTP helper that downloads the body of a page as a single string.
 */
class Hello{
    /** Browser-like User-Agent sent with every request instead of the default Java agent. */
    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 Edg/90.0.818.56";

    /**
     * Issues a GET request to {@code url} and returns the response body decoded as UTF-8.
     *
     * <p>Lines of the response are concatenated WITHOUT line separators, so the result is
     * one long line. Failures are reported on {@code System.err} and never thrown.
     *
     * @param url absolute HTTP(S) URL to fetch
     * @return the body with line breaks removed; an empty string if the URL is malformed,
     *         the response code is not 200, or nothing could be read; a partial body if
     *         an I/O error occurs midway through reading
     */
    public String httpRequest(String url){
        StringBuilder content = new StringBuilder();
        // Kept local: a connection belongs to exactly one request and must not be shared.
        HttpURLConnection connection = null;
        try{
            URL u = new URL(url);
            connection = (HttpURLConnection)u.openConnection();
            connection.setRequestMethod("GET");
            connection.setRequestProperty("User-Agent", USER_AGENT);
            if(connection.getResponseCode() == HttpURLConnection.HTTP_OK){
                // try-with-resources closes the reader (and the underlying stream) even on failure.
                try(BufferedReader reader = new BufferedReader(
                        new InputStreamReader(connection.getInputStream(), "utf-8"))){
                    String line;
                    while((line = reader.readLine()) != null){
                        content.append(line);
                    }
                }
            }
        }catch(MalformedURLException e){
            System.err.println("Invalid URL '" + url + "': " + e);
        }catch(IOException e){
            System.err.println("Request to '" + url + "' failed: " + e);
        }finally{
            if(connection != null){
                connection.disconnect();
            }
        }
        return content.toString();
    }
}
/**
 * Entry point of the crawler: downloads the Douban movie chart page, prints the
 * fields captured for every movie entry, then follows each entry's detail link and
 * prints the short comments found on that page.
 */
public class Test {
    /** Captures per entry: (1) title attribute, (2) small-font span text, (3) first paragraph text, (4) rating. */
    private static final Pattern MOVIE_PATTERN = Pattern.compile(
            "<a class=.+? href=.+? title=\"(.+?)\">.+? <span style=\"font-size:13px;\">(.+?)</span>.+?<p class=.+?>(.+?)</p>.+?<span class=\"rating_nums\">(.+?)</span>");

    /** Captures the href of the link inside a {@code <div class="pl2">} element. */
    private static final Pattern LINK_PATTERN = Pattern.compile(
            "<div class=\"pl2\">.+?<a href=\"(.+?)\" class=\".+?\">");

    /** Captures the text of a {@code <span class="short">} element on a detail page. */
    private static final Pattern SHORT_COMMENT_PATTERN = Pattern.compile(
            " <span class=\"short\">(.+?)</span>");

    /**
     * Runs the crawl and writes everything to standard output.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Hello x = new Hello();
        String fts = x.httpRequest("https://movie.douban.com/chart");
        Matcher m = MOVIE_PATTERN.matcher(fts);
        while (m.find()) {
            System.out.println(m.group(1) + " " + m.group(2) + " " + m.group(4) + " : " + m.group(3));
            // Look for the detail link only inside the text matched for THIS entry.
            // Scanning the whole page here would fetch every detail page once per entry.
            Matcher m2 = LINK_PATTERN.matcher(m.group());
            while (m2.find()) {
                String detailUrl = m2.group(1);
                System.out.println(detailUrl);
                String fts2 = x.httpRequest(detailUrl);
                Matcher m3 = SHORT_COMMENT_PATTERN.matcher(fts2);
                while (m3.find()) {
                    System.out.println(m3.group(1));
                }
            }
        }
    }
}
放个运行截图
是不是很简单?
标签:.+,java,String,Pattern,电影,爬虫,connection,豆瓣,import 来源: https://blog.csdn.net/hhuhgfhggy/article/details/116753566