# 爬虫爬取房王 — web crawler that scrapes the 房王 (gz.ihk.cn) new-home listings
# 作者: 互联网 (author: "the Internet")
from selenium import webdriver
from lxml import etree
from selenium.webdriver.chrome.options import Options
import re

# Listing page that enumerates the projects to scrape.
LIST_URL = "https://gz.ihk.cn/myxf/houselist"
# Prefix of every detail page; the id taken from the listing is appended.
DETAIL_URL_PREFIX = "https://gz.ihk.cn/myxfdetail/main/"
# Raw strings: the backslashes are Windows path separators, not escapes.
CHROME_BINARY = r'C:\Program Files\Google\Chrome\Application\chrome.exe'
CHROMEDRIVER_PATH = r'C:\Program Files\Google\Chrome\Application\chromedriver.exe'
# Records are appended to this file, one block per detail page.
OUTPUT_FILE = 'zy.txt'

# Captures the "(...)" argument text of listPageClick in a listing card.
# `[^"]*` stops at the attribute's closing quote, so several cards on one
# physical line are matched separately instead of being swallowed together.
_DETAIL_ID_RE = re.compile(
    r'<div class="ihknewconlist" onclick="listPageClick([^"]*)">'
)
# Characters stripped from the captured argument text: parentheses and quotes.
_ID_JUNK = str.maketrans('', '', "()'")
# Characters stripped from each info text node: newlines, spaces, quotes.
_TEXT_JUNK = str.maketrans('', '', "\n '")

_TITLE_XPATH = '//title/text()'
_INFO_XPATH = '//*[@id="main-rightbox"]/div/text()'
_NAME_XPATH = '//*[@id="main-rightbox"]/div[11]/dl[1]/dd[1]/strong/text()'
_PHONE_XPATH = '//*[@id="main-rightbox"]/div[11]/dl[1]/dd[3]/a/text()'
_CONTACT_XPATH = '//div[@class="ind-r07"]/dl[1]//text()'


def extract_detail_ids(html):
    """Return the detail-page ids referenced by the listing page *html*.

    Each id is the argument of a ``listPageClick(...)`` onclick handler with
    the surrounding parentheses and single quotes removed. Ids are returned
    in document order; an empty list means no listing card was found.
    """
    return [raw.translate(_ID_JUNK) for raw in _DETAIL_ID_RE.findall(html)]


def clean_lines(raw_texts):
    """Return *raw_texts* with newlines, spaces and single quotes removed.

    Text nodes that are empty after stripping are dropped.
    """
    stripped = (text.translate(_TEXT_JUNK) for text in raw_texts)
    return [text for text in stripped if text]


def format_record(title, info_lines, contact_text):
    """Build the text block written to the output file for one detail page.

    Layout: a ``----------title----------`` header line, one line per entry
    of *info_lines*, then *contact_text* followed by a newline. Passing an
    empty *contact_text* leaves a blank line in its place.
    """
    parts = ["----------" + title + "----------\n"]
    parts.extend(line + "\n" for line in info_lines)
    parts.append(contact_text)
    parts.append("\n")
    return ''.join(parts)


def build_driver():
    """Start a headless Chrome WebDriver using the configured binaries."""
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.binary_location = CHROME_BINARY
    # NOTE(review): a positional chromedriver path is the Selenium 3 / early
    # Selenium 4 calling convention; newer Selenium 4 releases expect
    # service=Service(path) instead - confirm against the installed version.
    return webdriver.Chrome(CHROMEDRIVER_PATH, options=chrome_options)


def fetch_html(driver, url):
    """Navigate *driver* to *url* and return the resulting page source."""
    driver.get(url)
    return driver.page_source


def scrape_detail(driver, detail_id):
    """Load one detail page and return its formatted record text.

    The contact text is included in the record only when both the name and
    the phone nodes are present on the page; otherwise the record carries a
    blank line in its place.
    """
    url = DETAIL_URL_PREFIX + detail_id
    tree = etree.HTML(fetch_html(driver, url))

    titles = tree.xpath(_TITLE_XPATH)
    # Fall back to the URL so a page without <title> stays identifiable
    # instead of aborting the whole run with an IndexError.
    title = titles[0] if titles else url

    info_lines = clean_lines(tree.xpath(_INFO_XPATH))
    contact_text = ''.join(tree.xpath(_CONTACT_XPATH))
    print(contact_text)
    print(info_lines)

    has_contact = bool(tree.xpath(_NAME_XPATH)) and bool(tree.xpath(_PHONE_XPATH))
    return format_record(title, info_lines, contact_text if has_contact else '')


def main():
    """Scrape every project linked from the listing page into OUTPUT_FILE."""
    driver = build_driver()
    try:
        listing_html = fetch_html(driver, LIST_URL)
        for detail_id in extract_detail_ids(listing_html):
            record = scrape_detail(driver, detail_id)
            # Append per page so finished records survive a later failure.
            with open(OUTPUT_FILE, 'a', encoding='utf-8') as out:
                out.write(record)
    finally:
        # One browser is reused for every page and always shut down, so no
        # Chrome/chromedriver processes are left behind.
        driver.quit()


if __name__ == "__main__":
    main()
# 标签 (tags): write, detail, tree1, list, 爬虫, replace, 爬取, 房王, text
# 来源 (source): https://blog.csdn.net/weixin_48502798/article/details/118603486