建方公寓挂牌房源信息爬取
作者:互联网
爬取建方公寓挂牌房源信息
背景:自从爬取青客公寓分城市挂牌房源和优客逸家挂牌房源之后,发现爬虫也挺有趣的,于是今天又拿建方公寓练手,差点栽了跟头,且听我慢慢道来。有了前两次爬虫经验,发现爬取青客时设计的半自动逻辑较好,所以这次也采用了只要输入城市名称、城市代码以及总网页数 3 个参数再执行程序的方式,我挺喜欢这种互动式的模式,有参与感。但是打印整个解析网页的时候总提示没找到我要找的东西,经过一番折腾,发现是请求头出问题了:最初只构造了一个 User-Agent,很可能被别人家服务器识别为爬虫程序,于是在网页源码 Network 下面把 headers 原原本本写下来:
header={"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
"Accept-Encoding": "gzip, deflate",
"Accept-Language": "zh-CN,zh;q=0.9",
"Cache-Control": "max-age=0",
"Connection": "keep-alive",
"Cookie": "_site_id_cookie=1; clientlanguage=zh_CN; SESSION=62a74a27387f4f4a9ca7cf4e45768631; _cookie_city_name=%E5%B9%BF%E5%B7%9E",
"Host": "www.giantfind.com.cn",
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36"} #构造请求头
修改之后再打印整个解析网页,发现要找的东西都出来了,再也没有提示找不到,心情瞬间大好,完整代码如下。
完整代码

# -*- coding: utf-8 -*-
"""
project_name:giantfind
@author: 帅帅de三叔
Created on Tue Aug 6 09:21:11 2019
"""
import requests #导入请求模块
from bs4 import BeautifulSoup #导入网页解析模块
import urllib.parse #url中文编码
import re #导入正则模块
import pymysql #导入数据库功能模块
import time #导入时间模块
host="http://www.giantfind.com.cn" # base domain; prepended to the relative hrefs found on listing pages
header={"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
"Accept-Encoding": "gzip, deflate",
"Accept-Language": "zh-CN,zh;q=0.9",
"Cache-Control": "max-age=0",
"Connection": "keep-alive",
"Cookie": "_site_id_cookie=1; clientlanguage=zh_CN; SESSION=62a74a27387f4f4a9ca7cf4e45768631; _cookie_city_name=%E5%B9%BF%E5%B7%9E",
"Host": "www.giantfind.com.cn",
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36"} # full browser-like request headers copied from DevTools; a bare User-Agent alone was rejected by the server
print("connecting mysql……\n")
db=pymysql.connect("localhost","root","123456","giantfind",charset='utf8') # connect to local MySQL db "giantfind" (positional: host, user, password, db)
print("connect successfully\n")
cursor=db.cursor() # single cursor shared by the whole script (table setup + row inserts)
cursor.execute("drop table if exists giantfind_gz\n") # drop and recreate the table on every run
print("start creating table giantfind_gz")
c_sql="""CREATE TABLE giantfind_gz(
district varchar(8),
title varchar(20),
area varchar(6),
price varchar(6),
house_type varchar(6),
floor varchar(6),
towards_or_style varchar(4),
address varchar(30)
)Engine=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=UTF8"""
cursor.execute(c_sql)
print("table giantfind_gz has been created,please insert into data\n")
def generate_page(page_num,city,cityCode):
    """Yield one listing-page URL per page, from page 1 through page_num.

    page_num -- total number of pages to crawl (str from input() or int)
    city     -- percent-encoded city name, placed in the ``city`` query param
    cityCode -- site-specific city code, placed in the ``cityCode`` query param
    """
    url="http://www.giantfind.com.cn/findRoomPc/index_{}.jhtml?city={}&cityCode={}&reservationChannel=21"
    for next_page in range(1,int(page_num)+1):
        # Fix: the original passed next_page a second time as a fourth
        # argument; str.format silently ignores extras, so the output is
        # unchanged but the redundant argument is dropped here.
        yield url.format(next_page,city,cityCode)
def get_detail_item(generate_page): # NOTE(review): parameter name shadows the sibling generator function; kept for interface compatibility
    """Fetch one listing page, then scrape and persist every detail page on it.

    generate_page -- URL of one listing page (as yielded by generate_page()).
    Side effects: HTTP GETs via the module-level ``header``, INSERTs into
    giantfind_gz through the module-level ``cursor``/``db``.
    """
    pattern_price=re.compile(r"\d+") # hoisted out of the loop and made a raw string; compiling per listing was wasted work
    response=requests.get(generate_page,headers=header) # fetch the listing page
    time.sleep(1) # throttle: pause 1s between listing pages
    soup=BeautifulSoup(response.text,'lxml') # parse the listing page
    detail_list=soup.find("div","content").find("div",class_="list-life list-lifen").findAll("a",class_="list-la list-lb stat") # all listing anchors on this page
    insert_data=("INSERT INTO giantfind_gz(district,title,area,price,house_type,floor,towards_or_style,address)""VALUES(%s,%s,%s,%s,%s,%s,%s,%s)") # parameterized insert, built once per page
    for content in detail_list:
        detail_url=host+content['href'] # hrefs are relative; prepend the base domain
        answer=requests.get(detail_url,headers=header) # fetch the detail page
        answer_json=BeautifulSoup(answer.text,'lxml') # parse the detail page
        card=answer_json.find("div",class_="hos-csho") # hoisted: every field lives under this one card div
        info_items=card.find("ul",class_="hos-clist").findAll("li") # hoisted: the <li> attribute list, indexed below
        district=card.find("p").get_text().replace("建方·家","").replace("建方·寓","").strip() # district (brand prefixes stripped)
        title=card.find("h2").find("span").get_text() # listing title
        type_and_area=info_items[0].find("i").find("span").get_text().split(" ") # "<type> <area>㎡" split once instead of twice
        house_type=type_and_area[0] # room layout
        area=type_and_area[1].replace("㎡","") # floor area, unit stripped
        price=re.search(pattern_price,card.find("div").find("strong").get_text()).group(0) # monthly rent, digits only
        floor=info_items[1].find("i").get_text().replace("层","") # floor number, unit stripped
        towards_or_style=info_items[2].find("i").get_text().strip() # orientation (or decoration style)
        address=info_items[4].find("i").get_text().replace(">","").strip() # full address, breadcrumb arrows removed
        print(district,title,area,price,house_type,floor,towards_or_style,address) # field sanity check
        cursor.execute(insert_data,[district,title,area,price,house_type,floor,towards_or_style,address]) # insert one row
        db.commit() # commit per row so a crash keeps earlier rows
def main():
    """Interactive entry point: prompt for crawl parameters, then visit every
    listing page and scrape its detail pages."""
    encoded_city=urllib.parse.quote(input("please input city name:")) # percent-encode the (Chinese) city name for the URL
    code_of_city=input("please input city code:") # site-specific numeric city code
    total_pages=input("please input total pages num:") # how many listing pages to walk
    for listing_url in generate_page(total_pages,encoded_city,code_of_city):
        get_detail_item(listing_url) # scrape and store every house on this page
if __name__=="__main__":
    main()
后话
谨以此篇记录遇到的header请求头问题,不做代码解析,爬虫仅作为交流,如有冒犯,请告知删。
标签:get,text,find,page,爬取,建方,房源,class,hos 来源: https://blog.51cto.com/u_15255081/2862973