Simple Web Crawlers
Author: the Internet
The crawler code here is for learning purposes only.
1. Wallpaper download
import requests

url = 'https://pic.netbian.com/uploads/allimg/210519/003255-1621355575e57d.jpg'
res = requests.get(url)
with open(r'D:\Downloads\女仆.jpg', 'wb') as f:
    f.write(res.content)  # res.content holds the raw bytes of the image
print('Saved successfully...')
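A slightly more defensive variant is sketched below: it checks the HTTP status before writing anything to disk and derives the filename from the last URL path segment. The Referer header is an assumption on my part; some image hosts reject hotlinked requests without one.

import os
import requests

url = 'https://pic.netbian.com/uploads/allimg/210519/003255-1621355575e57d.jpg'
# Assumption: a Referer may be needed, since some image hosts block hotlinking
headers = {'Referer': 'https://pic.netbian.com/'}

res = requests.get(url, headers=headers, timeout=10)
res.raise_for_status()  # fail loudly on 4xx/5xx instead of saving an error page

# Derive the file name from the last path segment of the URL
filename = os.path.join(r'D:\Downloads', url.rsplit('/', 1)[-1])
with open(filename, 'wb') as f:
    f.write(res.content)
print('Saved to', filename)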
2. Image links from Chinaz (sc.chinaz.com)
import requests
from bs4 import BeautifulSoup
import csv
import time

file = open('图片链接.csv', 'a', encoding='utf-8-sig', newline='')
file_csv = csv.writer(file)
file_csv.writerow(['图片名称', '图片链接'])  # header row: image name, image link

# Target URL
url = "https://sc.chinaz.com/tupian/"
# Request headers
headers = {
    # Browser identification
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
}
# Send the request
response = requests.get(url, headers=headers)
# Set the encoding explicitly to avoid garbled text
response.encoding = 'utf-8'
# Create the bs4 object
soup = BeautifulSoup(response.text, 'html.parser')  # or 'lxml'
# Locate the block that holds all the category links
all_a_list = soup.find('div', attrs={"class": "mt10 feilei"}).find_all('a')
for a in all_a_list:
    time.sleep(1)
    # Category links are relative to https://sc.chinaz.com
    href = a.get('href')
    name = a.get('title')
    temp = href.split('.')
    for i in range(1, 4):
        if i == 1:
            res_href = 'https://sc.chinaz.com' + href
            print('Page 1 ------------------------')
        else:
            # Page 2 onwards follows the pattern <path>_<n>.html
            res_href = 'https://sc.chinaz.com' + temp[0] + '_' + str(i) + '.html'
            print('Page {} -----'.format(i))
        # Request the category page
        feilei_response = requests.get(res_href, headers=headers)
        feilei_response.encoding = 'utf-8'
        # Create the bs4 object
        feilei_soup = BeautifulSoup(feilei_response.text, 'html.parser')  # or 'lxml'
        all_p_list = feilei_soup.find('div', attrs={'id': 'container'}).find_all('p')
        for p in all_p_list:
            a_tag = p.find('a')  # renamed from 'a' to avoid shadowing the outer loop variable
            a_href = a_tag.get('href')
            a_name = a_tag.get('alt')  # note: 'alt' may actually live on the nested <img> tag
            a_href_res = 'https:' + a_href
            # Request the detail page that hosts the full-size image (div.imga)
            img_response = requests.get(a_href_res, headers=headers)
            img_response.encoding = 'utf-8'
            # Create the bs4 object
            img_soup = BeautifulSoup(img_response.text, 'html.parser')  # or 'lxml'
            imga = img_soup.find('div', attrs={"class": "imga"}).find('a').get('href')
            img = 'https:' + imga
            print(img)  # the final image link
            file_csv.writerow([a_name, img])
    break  # only crawl the first category; remove this to crawl them all
file.close()
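Once the CSV has been populated, the links can be fetched in a second pass. The sketch below is one assumed way to consume the file produced above; the save directory is hypothetical, while the filename and column layout match the writerow calls in the previous snippet.

import csv
import os
import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
}
save_dir = r'D:\Downloads\chinaz'  # hypothetical output directory
os.makedirs(save_dir, exist_ok=True)

with open('图片链接.csv', encoding='utf-8-sig', newline='') as f:
    reader = csv.reader(f)
    next(reader)  # skip the header row
    for name, link in reader:
        res = requests.get(link, headers=headers, timeout=10)
        if res.status_code != 200:
            continue  # skip dead links
        # Reuse the extension from the URL; fall back to .jpg
        ext = os.path.splitext(link)[1] or '.jpg'
        with open(os.path.join(save_dir, (name or 'unnamed') + ext), 'wb') as out:
            out.write(res.content)
        print('Saved', name)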
3. Video download
With stream=True (the default is False), requests does not download the response body immediately; instead, the body is fetched in chunks of the given size as you iterate over it with iter_content. This keeps memory usage bounded when downloading large files.
import requests
import os

root = r'D:\Downloads\movie'
path = root + os.sep + 'Beyond-喜欢你.mp4'
# Note: this is a signed URL with an expiry (the deadline parameter), so it will
# likely no longer work; substitute a current link when testing.
url = 'https://upos-sz-mirrorcos.bilivideo.com/upgcxcode/70/10/3711070/3711070-1-208.mp4?e=ig8euxZM2rNcNbNB7WdVhwdlhbUBhwdVhoNvNC8BqJIzNbfq9rVEuxTEnE8L5F6VnEsSTx0vkX8fqJeYTj_lta53NCM=&uipk=5&nbs=1&deadline=1631641391&gen=playurlv2&os=cosbv&oi=2054344254&trid=0386c5cd40b84401a8c0485a286ac1c0T&platform=html5&upsig=67a2ddc104c6b555ce2bac0d32e32980&uparams=e,uipk,nbs,deadline,gen,os,oi,trid,platform&mid=0&bvc=vod&nettype=0&orderid=0,1&logo=80000000#vp'
hd = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
}
os.makedirs(root, exist_ok=True)  # os.mkdir would raise if the directory already exists
r = requests.get(url, headers=hd, stream=True)
content_size = int(r.headers['content-length'])
n = 1
with open(path, "wb") as f:
    for chunk in r.iter_content(chunk_size=1024):
        rate = n * 1024 / content_size
        print("\rProgress: {0:.2%}".format(rate), end='')
        f.write(chunk)
        n += 1
print("\nDownload complete")
The code is updated from time to time.
Source: https://blog.csdn.net/de1eteU/article/details/120297350