Batch-Downloading Wallpapers with Python, Saving Them, and Building a Wallpaper Site with Flask
Author: 互联网
Tip: swap in fresh request headers of your own, and create an img folder alongside the script before downloading, to hold the images.
1. Fetching the wallpaper data
""" 思路 1. 请求网页,拿到源代码 ok requests.get(网址) 2. 将源代码解析成网页 lxml的etree.HTML(源代码) 3. 从网页中提取数据 HTML.xpath('复制来的xpath/text()') 4. 翻页,全站数据爬取 翻页一般修改url就可以实现 5. 数据保存 """ from pprint import pprint import csv """ 1. 获取小图片页面的大图页面的地址 2. 将域名拼接上去,构成大图片的页面地址 3. 请求大图片的地址,将大图片的网址保存 4. 下载图片 """ import requests from lxml import etree baseUrl = 'http://www.netbian.com' # 伪装成浏览器 需要更换请求头 cookies = { '__yjs_duid': '1_33e223172d0308c509f12b4f304f2d491651476976719', 'yjs_js_security_passport': 'f75d121a02abe7650a974a503fabb1b5f24977f8_1652958780_js', 'Hm_lvt_0f461eb489c245a31c209d36e41fcc0f': '1652672709,1652774098,1652870042,1652958781', 'Hm_lpvt_0f461eb489c245a31c209d36e41fcc0f': '1652958781', } headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7', 'Cache-Control': 'max-age=0', 'Connection': 'keep-alive', 'Referer': 'http://www.netbian.com/', 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36', } def page(url): res = requests.get(url, headers=headers, cookies=cookies) html = res.text HTML = etree.HTML(html) # 提取数据 href属性 for i in range(1, 21): if i == 3: continue href = HTML.xpath('//*[@id="main"]/div[3]/ul/li[{}]/a/@href'.format(i)) # 提取小图片地址 small = HTML.xpath('//*[@id="main"]/div[3]/ul/li[{}]/a/img/@src'.format(i)) # 将域名拼接上去 detail_url = baseUrl + href[0] # print(detail_url) # 请求详情页,拿大图地址 detail = requests.get(detail_url, headers=headers, cookies=cookies) detail.encoding = "gbk" detail_html = detail.text detail_HTML = etree.HTML(detail_html) # 提取图片 big = detail_HTML.xpath('//*[@id="main"]/div[3]/div/p/a/img/@src') # 提取图片标题 title = detail_HTML.xpath('//*[@id="main"]/div[3]/div/p/a/img/@title') print(title, small, big) data.append( {"title": title[0], "small": small[0], "big": big[0], "category": category} ) if __name__ == '__main__': category = "动漫" data = [] for p in range(2, 5): main_url = "http://www.netbian.com/dongman/" url = main_url + 'index_{}.htm'.format(p) page(url) pprint(data) # 保存数据 # 1. 创建表头 header_list = ["title", "small", "big", "category"] # 打开文件 with open("img_data_a.csv", 'w', encoding="utf-8-sig", newline="") as f: # 创建csv的写对象 writer = csv.DictWriter(f, header_list) # 写入表头(a模式 第一次爬的需要写入表头,第二次不需要表头,请把写入表头注释了) writer.writeheader() # 写入数据 writer.writerows(data)获取数据
2. Batch download
# Import the csv library
import csv
import requests
import time

# Pretend to be a browser; replace these cookies/headers with fresh ones of your own
cookies = {
    '__yjs_duid': '1_33e223172d0308c509f12b4f304f2d491651476976719',
    'yjs_js_security_passport': 'f75d121a02abe7650a974a503fabb1b5f24977f8_1652958780_js',
    'Hm_lvt_0f461eb489c245a31c209d36e41fcc0f': '1652672709,1652774098,1652870042,1652958781',
    'Hm_lpvt_0f461eb489c245a31c209d36e41fcc0f': '1652958781',
}
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Referer': 'http://www.netbian.com/',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36',
}


def progressbar(url, filepath='./output-with-extension'):  # filepath must include the file extension
    start = time.time()  # download start time
    # stream=True is required so the body can be read in chunks
    response = requests.get(url, stream=True, headers=headers, cookies=cookies)
    size = 0  # bytes downloaded so far
    chunk_size = 1024  # bytes per chunk
    content_size = int(response.headers['content-length'])  # total file size
    try:
        if response.status_code == 200:  # did the request succeed?
            print('Starting download, [file size]: {size:.2f} MB'.format(
                size=content_size / chunk_size / 1024))
            # filepath = 'downloads/222.mp4'  # note: must include the extension
            with open(filepath, 'wb') as file:
                # Write chunk by chunk, redrawing a progress bar on one line
                for data in response.iter_content(chunk_size=chunk_size):
                    file.write(data)
                    size += len(data)
                    print('\r' + '[progress]: %s%.2f%%' % (
                        '>' * int(size * 50 / content_size),
                        float(size / content_size * 100)), end=' ')
            end = time.time()  # download end time
            print('Done! Elapsed: %.2f s' % (end - start))
    except Exception:
        pass  # swallow download errors and move on to the next file


# Open the CSV produced in step 1
with open("img_data_a.csv", encoding="utf-8-sig", mode="r") as f:
    # Build a csv.DictReader over the open file
    reader = csv.DictReader(f)
    for row in reader:
        title = row.get("title")
        big_url = row.get("big")
        print("Downloading:", title)
        # Plain download, without a progress bar:
        # res = requests.get(big_url, headers=headers, cookies=cookies)
        # with open("img/{}.jpg".format(title), 'wb') as f:
        #     f.write(res.content)
        # Download with the progress bar; an img folder must exist in the
        # current directory to hold the downloaded images
        u = big_url
        progressbar(url=u, filepath="img/{}.jpg".format(title))
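Two practical hardening points for this loop, as a short sketch (the helper name is illustrative, not from the original post): scraped titles can contain characters that are illegal in filenames, and the img folder can be created in code instead of by hand:

import os
import re

# Create the img/ folder if it is missing, instead of requiring a manual step
os.makedirs("img", exist_ok=True)

def safe_name(title):
    # Replace characters that are forbidden in Windows filenames
    return re.sub(r'[\\/:*?"<>|]', '_', title)

# ...then inside the reader loop:
#     progressbar(url=big_url, filepath="img/{}.jpg".format(safe_name(title)))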
3. Fetching all wallpaper addresses site-wide with a thread pool
import requests
from lxml import etree
from concurrent.futures import ThreadPoolExecutor
import csv

baseUrl = 'http://www.netbian.com'

# Pretend to be a browser; replace with fresh values of your own
cookies = {
    '__yjs_duid': '1_33e223172d0308c509f12b4f304f2d491651476976719',
    'Hm_lvt_0f461eb489c245a31c209d36e41fcc0f': '1652672709,1652774098,1652870042',
    'trenvecookieclassrecord': '^%^2C4^%^2C',
    'trenvecookieinforecord': '^%^2C4-14978^%^2C',
    'yjs_js_security_passport': '9376682f8d181fc1c094828cbcf9858097ffe69e_1652876557_js',
    'Hm_lpvt_0f461eb489c245a31c209d36e41fcc0f': '1652876558',
}
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Referer': 'http://www.netbian.com/fengjing/index_3.htm',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36',
}


def home_page(url, current):
    res = requests.get(url, headers=headers, cookies=cookies)
    html = res.text
    HTML = etree.HTML(html)
    lis = HTML.xpath('//*[@id="main"]/div[3]/ul//li')
    # print(lis)
    for i in range(1, len(lis) + 1):
        # Pull each item's href attribute; skip the 3rd slot (not a wallpaper)
        if i == 3:
            continue
        href = HTML.xpath('//*[@id="main"]/div[3]/ul/li[{}]/a/@href'.format(i))
        small = HTML.xpath('//*[@id="main"]/div[3]/ul/li[{}]/a/img/@src'.format(i))[0]
        # print(href)
        # Prepend the domain
        detail_url = baseUrl + href[0]
        print(detail_url)
        detail_date = {'small': small}
        detail(detail_url, detail_date, current)


def detail(detail_url, detail_date, current):
    res = requests.get(detail_url, headers=headers, cookies=cookies)
    res.encoding = "gbk"
    html = res.text
    HTML = etree.HTML(html)
    img_url = HTML.xpath('//*[@id="main"]/div[3]/div/p/a/img/@src')[0]
    title = HTML.xpath('//*[@id="main"]/div[3]/div/p/a/img/@title')[0]
    # print(img_url)
    detail_date["title"] = title
    detail_date["big"] = img_url
    detail_date['category'] = current
    print(current)
    print(title)
    data.append(detail_date)


def job(url, current):
    print(url)
    for page in range(1, 10000):
        try:
            if page == 1:
                home_page(url, current)
            else:
                url_next = url + 'index_{}.htm'.format(page)
                home_page(url_next, current)
        except Exception:
            print("No more pages")
            return


def save():
    # Header row
    header_list = ["title", "category", "small", 'big']
    with open("img_data_all.csv", 'w', encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, header_list)
        writer.writeheader()
        # Write the data rows
        writer.writerows(data)


if __name__ == '__main__':
    category = [
        "/rili/", "/dongman/", "/fengjing/", "/meinv/", "/youxi/", "/yingshi/",
        "/dongtai/", "/weimei/", "/sheji/", "/keai/", "/qiche/", "/huahui/",
        "/dongwu/", "/jieri/", "/renwu/", "/meishi/", "/shuiguo/", "/jianzhu/",
        "/tiyu/", "/junshi/", "/feizhuliu/", "/qita/",
        "/s/wangzherongyao/", "/s/huyan/", "/s/lol/",
    ]
    data = []
    pool = ThreadPoolExecutor(50)
    for lei in category:
        url = baseUrl + lei
        # job(url, lei)  # single-threaded version
        # Pass the category into the job; a shared global would be overwritten
        # by the main loop while worker threads are still running
        pool.submit(job, url, lei)
    pool.shutdown()
    print("-- crawl finished --".center(20, "*"))
    print("Starting to write")
    save()
    print("Write complete")
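One design note on the pool: data is a plain list shared by 50 worker threads. CPython's list.append is atomic under the GIL, so this works in practice, but returning rows from each job and merging them in the main thread is the more conventional pattern. A sketch of that variant, assuming job has been reworked to build and return its own list of row dicts rather than appending to a global:

from concurrent.futures import ThreadPoolExecutor, as_completed

# Sketch: merge per-job results in the main thread instead of sharing a list.
# Assumes job(url, current) *returns* a list of row dicts.
data = []
with ThreadPoolExecutor(50) as pool:
    futures = [pool.submit(job, baseUrl + lei, lei) for lei in category]
    for fut in as_completed(futures):
        data.extend(fut.result())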
4. Saving to a database
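The original post stops at this heading. As a minimal sketch of the step, assuming SQLite (the post never names a database), the CSV from step 3 can be loaded into a wallpapers table for the site to query; the table and column names below are assumptions matching the CSV header:

import csv
import sqlite3

# Sketch: load img_data_all.csv into a SQLite table (names are assumptions)
conn = sqlite3.connect("wallpapers.db")
conn.execute("""
    CREATE TABLE IF NOT EXISTS wallpapers (
        title TEXT, category TEXT, small TEXT, big TEXT
    )
""")
with open("img_data_all.csv", encoding="utf-8") as f:
    rows = [(r["title"], r["category"], r["small"], r["big"])
            for r in csv.DictReader(f)]
conn.executemany("INSERT INTO wallpapers VALUES (?, ?, ?, ?)", rows)
conn.commit()
conn.close()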
5. Building the wallpaper site with Flask
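This section is likewise only a heading in the original. A minimal Flask sketch, assuming the wallpapers.db table from the previous sketch and a templates/index.html that loops over the rows it receives (both are assumptions, not the author's code):

import sqlite3
from flask import Flask, render_template, request

app = Flask(__name__)

@app.route("/")
def index():
    category = request.args.get("category")  # e.g. /?category=动漫
    conn = sqlite3.connect("wallpapers.db")
    conn.row_factory = sqlite3.Row  # rows behave like dicts in the template
    if category:
        rows = conn.execute(
            "SELECT title, small, big FROM wallpapers WHERE category = ?",
            (category,)).fetchall()
    else:
        rows = conn.execute(
            "SELECT title, small, big FROM wallpapers").fetchall()
    conn.close()
    return render_template("index.html", wallpapers=rows)

if __name__ == "__main__":
    app.run(debug=True)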
Source: https://www.cnblogs.com/zwnsyw/p/16295765.html