
Scraping Giantfind (建方公寓) Apartment Listings



Background

After scraping Qingke Apartments (青客公寓) listings city by city and Uoko (优客逸家) listings, I found that web crawling is quite fun, so today I practiced on Giantfind (建方公寓) and nearly tripped up; let me explain. The semi-interactive design of the Qingke crawler had worked well, so this time the program takes just three inputs before running: the city name, the city code, and the total number of pages. I rather like this interactive style, since it gives a sense of participation. But when I printed the parsed page, it kept telling me the elements I was looking for were not there. After some digging, the culprit turned out to be the request headers: at first I had constructed only a User-Agent, which the server most likely flagged as a crawler, so I copied the headers verbatim from the Network tab of the browser's developer tools:

header={"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
         "Accept-Encoding": "gzip, deflate",
         "Accept-Language": "zh-CN,zh;q=0.9",
         "Cache-Control": "max-age=0",
         "Connection": "keep-alive",
         "Cookie": "_site_id_cookie=1; clientlanguage=zh_CN; SESSION=62a74a27387f4f4a9ca7cf4e45768631; _cookie_city_name=%E5%B9%BF%E5%B7%9E",
         "Host": "www.giantfind.com.cn",
         "Upgrade-Insecure-Requests": "1",
         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36"} #构造请求头

After this change, printing the parsed page showed everything I was after, the "not found" complaints were gone, and my mood lifted instantly.
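To confirm that the headers really are the culprit in a case like this, a quick comparison along the lines of the sketch below helps: request the same listing page once with a bare User-Agent and once with the full header dict above, then check whether the listing container is present in each response. This is only an illustrative sketch; the cityCode value in the URL is a placeholder, not a value confirmed from the site.

import requests #HTTP requests
from bs4 import BeautifulSoup #HTML parsing

url=("http://www.giantfind.com.cn/findRoomPc/index_1.jhtml"
     "?city=%E5%B9%BF%E5%B7%9E&cityCode=4401&reservationChannel=21") #cityCode is a placeholder value
ua_only={"User-Agent": header["User-Agent"]} #bare User-Agent, reusing the dict above
for name,hdrs in (("UA only",ua_only),("full headers",header)):
    resp=requests.get(url,headers=hdrs) #same page, two header sets
    soup=BeautifulSoup(resp.text,'lxml')
    found=soup.find("div",class_="list-life list-lifen") is not None #listing container present?
    print(name,"-> status:",resp.status_code,", container found:",found)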

Full code
# -*- coding: utf-8 -*-
"""
project_name:giantfind
@author: 帅帅de三叔
Created on Tue Aug  6 09:21:11 2019
"""
import requests #HTTP request module
from bs4 import BeautifulSoup #HTML parsing module
import urllib.parse #URL-encodes the Chinese city name
import re #regular expressions
import pymysql #MySQL access
import time #used to throttle requests
host="http://www.giantfind.com.cn" #site root, used to build detail-page URLs
header={"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "Cookie": "_site_id_cookie=1; clientlanguage=zh_CN; SESSION=62a74a27387f4f4a9ca7cf4e45768631; _cookie_city_name=%E5%B9%BF%E5%B7%9E",
        "Host": "www.giantfind.com.cn",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36"} #request headers copied from the browser's Network tab

print("connecting mysql……\n")
db=pymysql.connect("localhost","root","123456","giantfind",charset='utf8') #链接数据库
print("connect successfully\n")
cursor=db.cursor() #获取游标
cursor.execute("drop table if exists giantfind_gz\n") #重新创建表

print("start creating table giantfind_gz")
c_sql="""CREATE TABLE giantfind_gz(
         district varchar(8),
         title varchar(20),
         area varchar(6),
         price varchar(6),
         house_type varchar(6),
         floor varchar(6),
         towards_or_style varchar(4),
         address varchar(30)        
          )Engine=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=UTF8"""
cursor.execute(c_sql)
print("table giantfind_gz has been created,please insert into data\n")

def generate_page(page_num,city,cityCode): #generate the URL of every listing page
    url="http://www.giantfind.com.cn/findRoomPc/index_{}.jhtml?city={}&cityCode={}&reservationChannel=21"
    for next_page in range(1,int(page_num)+1):
        yield url.format(next_page,city,cityCode) #three placeholders, three arguments

def get_detail_item(generate_page): #fetch one listing page and scrape every detail page on it
    response=requests.get(generate_page,headers=header) #request the listing page
    time.sleep(1) #sleep 1 second to throttle requests
    soup=BeautifulSoup(response.text,'lxml') #parse the listing page
    detail_list=soup.find("div","content").find("div",class_="list-life list-lifen").findAll("a",class_="list-la list-lb stat") #all listings on this page
    for content in detail_list:
        detail_url=host+content['href'] #build the detail-page URL
        answer=requests.get(detail_url,headers=header) #request the detail page
        answer_json=BeautifulSoup(answer.text,'lxml') #parse the detail page
        district=answer_json.find("div",class_="hos-csho").find("p").get_text().replace("建方·家","").replace("建方·寓","").strip() #district
        title=answer_json.find("div",class_="hos-csho").find("h2").find("span").get_text() #listing title
        area=answer_json.find("div",class_="hos-csho").find("ul",class_="hos-clist").findAll("li")[0].find("i").find("span").get_text().split("  ")[1].replace("㎡","") #floor area
        house_type=answer_json.find("div",class_="hos-csho").find("ul",class_="hos-clist").findAll("li")[0].find("i").find("span").get_text().split("  ")[0] #layout
        pattern_price=re.compile(r"\d+") #regex for the price digits
        price=re.search(pattern_price,answer_json.find("div",class_="hos-csho").find("div").find("strong").get_text()).group(0) #price
        floor=answer_json.find("div",class_="hos-csho").find("ul",class_="hos-clist").findAll("li")[1].find("i").get_text().replace("层","") #floor
        towards_or_style=answer_json.find("div",class_="hos-csho").find("ul",class_="hos-clist").findAll("li")[2].find("i").get_text().strip() #orientation
        address=answer_json.find("div",class_="hos-csho").find("ul",class_="hos-clist").findAll("li")[4].find("i").get_text().replace(">","").strip() #full address
        print(district,title,area,price,house_type,floor,towards_or_style,address) #field check
        insert_data=("INSERT INTO giantfind_gz(district,title,area,price,house_type,floor,towards_or_style,address) ""VALUES(%s,%s,%s,%s,%s,%s,%s,%s)") #parameterized insert statement
        giantfind_data=([district,title,area,price,house_type,floor,towards_or_style,address]) #row to insert
        cursor.execute(insert_data,giantfind_data) #run the insert
        db.commit() #commit explicitly
    
def main(): #entry point tying the other functions together
    city=urllib.parse.quote(input("please input city name:")) #percent-encode the Chinese city name
    cityCode=input("please input city code:") #city code used by the site
    page_num=input("please input total pages num:")
    for page_link in generate_page(page_num,city,cityCode):
        get_detail_item(page_link)
    db.close() #close the database connection when done

if __name__=="__main__":
    main()
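Note that urllib.parse.quote percent-encodes the city name into exactly the form seen in the _cookie_city_name cookie above:

import urllib.parse #URL encoding
print(urllib.parse.quote("广州")) #prints %E5%B9%BF%E5%B7%9E

A run is then fully interactive; the city code below is a placeholder chosen for illustration, not a value confirmed from the site:

please input city name:广州
please input city code:4401
please input total pages num:5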
Afterword

This post is simply a record of the request-header problem I ran into, not a line-by-line code walkthrough. The crawler is for learning and exchange only; if it causes any offence, please let me know and I will take it down.

Further reading

Scraping Qingke Apartments (青客公寓) listings city by city
Scraping Uoko (优客逸家) listings
