今日成果:爬取百度贴吧
作者:互联网
"""Crawl Baidu Tieba forum list pages and save each page's HTML locally.

URL pattern (50 threads per page, `pn` is the thread offset):
    https://tieba.baidu.com/f?kw=python&ie=utf-8&pn=0    # page 1
    https://tieba.baidu.com/f?kw=python&ie=utf-8&pn=50   # page 2
    https://tieba.baidu.com/f?kw=python&ie=utf-8&pn=100  # page 3
"""
from urllib.parse import urlencode          # build the query string
from urllib.request import Request, urlopen # Request object + opener


def get_html(url):
    """Fetch *url* and return the raw response body as bytes.

    A desktop Chrome User-Agent is sent so the server serves the normal
    page instead of rejecting the request as a bot.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36'
    }
    request = Request(url, headers=headers)
    # `with` closes the underlying socket deterministically (the original
    # never closed the response object).
    with urlopen(request) as response:
        return response.read()  # raw bytes; caller decides how to decode


def save_html(filename, html_bytes):
    """Write *html_bytes* to *filename* in binary ('wb') mode."""
    with open(filename, 'wb') as f:
        f.write(html_bytes)


def main():
    """Prompt for a keyword and a page count, then download each page."""
    base_url = 'https://tieba.baidu.com/f?ie=utf-8&{}'  # tieba list URL
    content = input("请输入要进行查询的内容:")  # keyword to search for
    num = int(input("请输入要下载的页数:"))      # number of pages to fetch
    for pn in range(num):
        print("正在下载第{}页".format(pn + 1))
        # kw = keyword, pn = thread offset (50 threads per page)
        args = urlencode({'kw': content, 'pn': pn * 50})
        html_bytes = get_html(base_url.format(args))
        filename = "第" + str(pn + 1) + "页.html"  # local output file name
        save_html(filename, html_bytes)


if __name__ == '__main__':
    main()
# Demo: the urlopen() helper from urllib.request.
from urllib.request import urlopen  # opens a URL and returns a response object

url = 'https://www.baidu.com/'  # URL to visit

response = urlopen(url)         # send the GET request
print(response.getcode())       # HTTP status code, e.g. 200
print(response.geturl())        # the URL actually visited: https://www.baidu.com/
print(response.info())          # the server's HTTP response headers

body = response.read()          # read the body as bytes
print(body.decode())            # print it as decoded text
# Demo: the Request object (fragment — assumes `url`, `headers`, and the
# `Request`/`urlopen` imports are defined earlier in the post; incomplete here).
request = Request(url = url,headers = headers) # attach a UA header to the request
print(request.get_header("User-agent")) # read back a request-header value
response = urlopen(request)
info = response.read()
# Demo: the third-party fake_useragent library — canned User-Agent strings.
# Install first: pip install fake_useragent
from fake_useragent import UserAgent

ua = UserAgent()
print(ua.chrome)   # a Chrome User-Agent string
print(ua.ie)       # an IE User-Agent string
print(ua.firefox)  # a Firefox User-Agent string
# Demo: a GET request whose query value is percent-encoded with quote().
from urllib.request import urlopen, Request  # Request carries the UA header
from urllib.parse import quote               # percent-encode a single value

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36'
}

# quote('瀚阳的小驿站') -> '%E7%80%9A%E9%98%B3%E7%9A%84%E5%B0%8F%E9%A9%BF%E7%AB%99'
url = 'https://www.baidu.com/s?wd={}'.format(quote('瀚阳的小驿站'))

request = Request(url, headers=headers)
response = urlopen(request)
print(response.read().decode())
# Demo: urlencode a whole query dict, with a random UA from fake_useragent.
from urllib.request import urlopen, Request
from urllib.parse import urlencode           # dict -> 'wd=...&ie=utf-8'
from fake_useragent import UserAgent         # third-party: random UA strings

args = {
    'wd': "瀚阳的小驿站",
    "ie": "utf-8",
}

headers = {
    'User-Agent': UserAgent().random
}

# BUG FIX: urlencode(args) already yields 'wd=...&ie=utf-8', so the base URL
# must not repeat 'wd=' — the original '...s?wd={}' produced 's?wd=wd=...'.
url = 'https://www.baidu.com/s?{}'.format(urlencode(args))
print(url)

request = Request(url, headers=headers)
response = urlopen(request)
info = response.read()
print(info.decode())
2020-06-20
标签:headers,url,request,成果,爬取,urlopen,print,response,百度 来源: https://www.cnblogs.com/hany-postq473111315/p/13170170.html