爬取性感小姐姐
作者:互联网
网站地址:http://www.meizitu.com/a/more_1.html
# Scraper script: collects gallery links from meizitu.com listing pages,
# fetches every gallery page, and saves each .jpg URL found inside the
# page's id="picture" element to a local directory.
import os
import random
import re

import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:58.0) Gecko/20100101",
    'Referer': "http://i.meizitu.net"
}

# Default download directory; same value the script originally hard-coded.
DEFAULT_ROOT = "//Users//apple//Desktop//meizitu//"

# Seconds to wait for a server before giving up. requests applies no timeout
# unless one is passed, so a stalled connection would otherwise hang forever.
REQUEST_TIMEOUT = 30


def home_page(num, num2, headers):
    """Collect gallery links from listing pages ``num`` through ``num2``.

    Args:
        num: First listing page number (inclusive).
        num2: Last listing page number (inclusive).
        headers: HTTP headers sent with every request.

    Returns:
        A list of ``href`` values taken from the first ``<a>`` inside each
        element with class ``pic``, in the order the pages were visited.
        Elements without an anchor or without an ``href`` are skipped.
    """
    list_url = []
    for page in range(num, num2 + 1):
        url = "http://www.meizitu.com/a/more_%d.html" % page
        req = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        req.encoding = req.apparent_encoding
        bf = BeautifulSoup(req.text, 'lxml')
        for each in bf.find_all(class_="pic"):
            anchor = each.a
            if anchor is None:
                # A "pic" element with no link has nothing to follow.
                continue
            href = anchor.get('href')
            if href:
                list_url.append(href)
    return list_url


def deal_page(headers, list_url):
    """Fetch every gallery page and extract its ``id="picture"`` elements.

    Args:
        headers: HTTP headers sent with every request.
        list_url: Iterable of gallery page URLs.

    Returns:
        A list with one entry per URL; each entry is the result of
        ``find_all(id="picture")`` on that page.
    """
    list_url2 = []
    for page_url in list_url:
        req = requests.get(page_url, headers=headers, timeout=REQUEST_TIMEOUT)
        req.encoding = "utf-8"
        bf2 = BeautifulSoup(req.text, 'lxml')
        list_url2.append(bf2.find_all(id="picture"))
    return list_url2


def download(headers, list_url2, root=DEFAULT_ROOT):
    """Download every ``.jpg`` URL found in ``list_url2`` into ``root``.

    URLs are located by running the regex ``http.*?jpg`` over the string
    form of ``list_url2``. Each file name is built from the last three
    ``/``-separated URL segments joined together, prefixed with a random
    integer in ``[0, 10000)``. If a file with the resulting path already
    exists, that image is skipped without being fetched.

    Args:
        headers: HTTP headers sent with every request.
        list_url2: Parsed "picture" elements, as returned by ``deal_page``.
        root: Directory to save images into; created if it does not exist.
    """
    print(list_url2)
    urls = re.findall(r'http.*?jpg', str(list_url2))
    print(urls, len(urls))
    if not urls:
        return

    # open() does not create missing directories, so make sure it exists.
    os.makedirs(root, exist_ok=True)

    for endurl in urls:
        # Slicing (instead of indexing -3/-2/-1) tolerates matches that have
        # fewer than three segments.
        filename = "".join(endurl.split('/')[-3:])
        print(endurl)
        print(filename)
        path = os.path.join(root, str(random.randrange(10000)) + filename)
        if os.path.exists(path):
            # Check before fetching so a skipped file costs no request.
            continue
        req3 = requests.get(endurl, headers=headers, timeout=REQUEST_TIMEOUT)
        with open(path, 'wb') as f:
            f.write(req3.content)
        print("下载完成")


if __name__ == '__main__':
    num = int(input("请输入要爬取的起始页:"))
    num2 = int(input("请输入终止页:"))
    a = home_page(num, num2, headers)
    b = deal_page(headers, a)
    download(headers, b)
标签:小姐姐,headers,性感,req,list,爬取,url,url2,targets 来源: https://www.cnblogs.com/Chen-MJ/p/11726674.html