Scraping JD.com Product Data with a Python Crawler
Author: Internet
# -*- coding: utf-8 -*-
# ---
# @Software: PyCharm
# @Site:
# @File: day1.py
# @Author: ---SamXu
# @E-mail: ---xuhongwu1993@gmail.com
# @Time: May 22, 2020
# Import modules
from bs4 import BeautifulSoup                       # HTML parsing
import re                                           # regular expressions for text matching
import urllib.request, urllib.error, urllib.parse   # build URLs, fetch pages, percent-encode Chinese text
import xlwt                                         # Excel output
import sqlite3                                      # SQLite access (imported but unused here)
import ssl                                          # work around SSL certificate verification
# ssl._create_default_https_context = ssl._create_unverified_context  # global: disable verification process-wide
context = ssl._create_unverified_context()          # local: an unverified context passed to individual requests
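The two options above differ in scope: reassigning ssl._create_default_https_context turns off certificate checks for every HTTPS request in the process, while a context object only affects calls that receive it explicitly. A minimal sketch of the per-request form (the URL is just a placeholder):

import ssl, urllib.request

unverified = ssl._create_unverified_context()
# Only this call would skip certificate verification; other requests are unaffected.
# html = urllib.request.urlopen("https://example.com", context=unverified).read()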
def main():
    user_input = input("Enter the product to search for:\n")
    baseurl = userinput(user_input)
    datalist = GetData(baseurl)
    # askURL(baseurl)
    savepath = "JD_data.xls"
    Savedata(datalist, savepath)
# Patterns for extracting fields from each product block
findList = re.compile(r'<em>(.*?)\n(.*?)class="promo-words"', re.S)  # the span holding name, price, and link
findLink = re.compile(r'href="//(.*?)"')                             # product link
findName = re.compile(r"n<em>(.*?)<font")                            # product name
findMoney = re.compile(r'</em><i>(\d+\.?\d*)</i>')                   # price
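The three field patterns are later matched against str(FindList), the repr of findList's results, rather than the raw HTML; in a repr, a newline inside a captured string shows up as the two characters \n, which is why findName begins with a literal n. A quick sanity check of the price pattern (the fragment is invented, not captured from JD):

>>> import re
>>> re.findall(r'</em><i>(\d+\.?\d*)</i>', '<em>¥</em><i>5999.00</i>')
['5999.00']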
# User input
def userinput(user_input):
    enter = urllib.parse.quote(user_input)  # percent-encode the Chinese keyword (UTF-8) for the URL
    baseurl = "https://search.jd.com/Search?keyword=" + enter + "&wq=" + enter + "&page="
    return baseurl
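For reference, quote() UTF-8-encodes each character and percent-escapes the bytes; with the sample keyword "手机" ("mobile phone"):

>>> import urllib.parse
>>> urllib.parse.quote("手机")
'%E6%89%8B%E6%9C%BA'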
# Crawl the page data
def GetData(baseurl):
    datalist = []
    for i in range(1, 2, 2):  # step 2 because JD's search uses odd page numbers; as written only page 1 is fetched
        url1 = baseurl + str(i)
        html = askURL(url1)  # the page's HTML source
        # Parse the data
        soup = BeautifulSoup(html, "lxml")  # BeautifulSoup(source, parser)
        for item in soup.find_all('div', class_="gl-i-wrap"):  # collect matching tags; class_ takes a trailing '_' because class is a keyword
            data = []
            item = str(item)
            FindList = re.findall(findList, item)
            # print(FindList)
            Link = ''.join(re.findall(findLink, str(FindList)))
            name = ''.join(re.findall(findName, str(FindList)))
            money = ''.join(re.findall(findMoney, str(FindList)))
            data.append(name)
            data.append(money)
            data.append(Link)
            datalist.append(data)
    # print(datalist)
    return datalist
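A self-contained sketch of the same extraction flow, run against fabricated markup shaped roughly like a JD result card (the snippet is invented for illustration, and the patterns here run directly on the tag's HTML instead of on a repr):

from bs4 import BeautifulSoup
import re

sample = '''
<div class="gl-i-wrap">
  <div class="p-price"><em>¥</em><i>5999.00</i></div>
  <div class="p-name"><a href="//item.jd.com/100000000000.html"><em>Sample Phone</em></a></div>
</div>
'''

soup = BeautifulSoup(sample, "lxml")
for item in soup.find_all('div', class_="gl-i-wrap"):
    text = str(item)
    print(''.join(re.findall(r'href="//(.*?)"', text)))            # item.jd.com/100000000000.html
    print(''.join(re.findall(r'</em><i>(\d+\.?\d*)</i>', text)))   # 5999.00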
# Fetch the HTML content of a given URL
def askURL(url):
    head = {
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
    }  # user agent: pose as a regular browser
    request = urllib.request.Request(url, headers=head)
    try:
        response = urllib.request.urlopen(request, context=context)
        html = response.read().decode("utf-8")
        return html
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
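On failure askURL falls through and returns None, which would crash the BeautifulSoup call in GetData. A minimal hardening sketch, assuming a 10-second timeout and up to 3 attempts (both values are arbitrary choices, not from the original):

import time

def askURL_retry(url, attempts=3, timeout=10):
    request = urllib.request.Request(url, headers={"user-agent": "Mozilla/5.0"})
    for n in range(attempts):
        try:
            response = urllib.request.urlopen(request, context=context, timeout=timeout)
            return response.read().decode("utf-8")
        except urllib.error.URLError as e:
            print("attempt", n + 1, "failed:", getattr(e, "reason", e))
            time.sleep(1)  # brief pause before retrying
    return None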
# Save the data
def Savedata(datalist, savepath):
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('JD data')
    col = ('Name', 'Price', 'Link')
    for i in range(0, 3):
        sheet.write(0, i, col[i])
    for i in range(len(datalist)):  # iterate over what was actually scraped; a hard-coded 30 raises IndexError on short results
        data = datalist[i]
        for j in range(0, 3):
            sheet.write(i + 1, j, data[j])
    book.save(savepath)
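If the xlwt dependency is unwanted, the standard-library csv module writes the same rows; this is an alternative to the script's .xls output, not part of the original:

import csv

def save_csv(datalist, savepath="JD_data.csv"):
    with open(savepath, "w", newline="", encoding="utf-8-sig") as f:  # utf-8-sig so Excel detects the encoding
        writer = csv.writer(f)
        writer.writerow(("Name", "Price", "Link"))
        writer.writerows(datalist)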
if __name__ == "__main__":  # run only when executed as a script
    main()
Source: https://blog.51cto.com/u_13184683/2703830