day03 爬豌豆荚
作者:互联网
'''
主页:
名称、下载次数、大小、详情页地址
https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=1&ctoken=FRsWKgWBqMBZLdxLaK4iem9B
https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=2&ctoken=FRsWKgWBqMBZLdxLaK4iem9B
https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=3&ctoken=FRsWKgWBqMBZLdxLaK4iem9B
'''
import re

import requests
from bs4 import BeautifulSoup

# Category listing API; only the ``page`` query parameter varies per request.
_INDEX_API = (
    "https://www.wandoujia.com/wdjweb/api/category/more"
    "?catId=6001&subCatId=0&page={page}&ctoken=FRsWKgWBqMBZLdxLaK4iem9B"
)
# Pages 1..32 inclusive are crawled.
_FIRST_PAGE = 1
_LAST_PAGE = 32

# Matches span text containing digits followed by "MB" (e.g. "35MB").
# Compiled once at import time instead of once per app card.
_SIZE_PATTERN = re.compile(r"\d+MB")


def get_page(url, timeout=10):
    """Send a GET request to *url* and return the response object.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the crawl forever.

    Returns:
        The ``requests.Response`` returned by ``requests.get``.
    """
    return requests.get(url, timeout=timeout)


def _text_of(tag, default=""):
    """Return the text of *tag*, or *default* when the tag was not found."""
    return tag.text if tag is not None else default


def parse_index(data):
    """Parse the listing HTML and print each app's details.

    For every ``<li class="card">`` element, four lines are printed in this
    order: app name, download count, size and detail-page URL. A field whose
    element is missing from a card is printed as an empty line instead of
    aborting the whole page.

    Args:
        data: HTML markup containing the app cards.
    """
    soup = BeautifulSoup(data, 'lxml')

    # Every app is rendered as an <li class="card"> element.
    for app in soup.find_all(name='li', attrs={"class": "card"}):
        # All lookups are scoped to the current card (``app``). Searching
        # the whole document (``soup``) would return the first app's value
        # for every card.
        app_name = _text_of(app.find(name="a", attrs={"class": "name"}))
        print(app_name)

        # Download count: text of <span class="install-count">.
        down_num = _text_of(
            app.find(name='span', attrs={"class": "install-count"})
        )
        print(down_num)

        # Size: the span whose text matches digits followed by "MB".
        size = _text_of(app.find(name='span', string=_SIZE_PATTERN))
        print(size)

        # Detail page URL: href of the first <a> inside the card.
        link = app.find(name='a')
        detail_url = link.get('href', '') if link is not None else ''
        print(detail_url)


def main():
    """Crawl every listing page of the category API and print its apps."""
    for page in range(_FIRST_PAGE, _LAST_PAGE + 1):
        url = _INDEX_API.format(page=page)

        # 1. Request the app listing API.
        response = get_page(url)
        # Fail with a clear HTTP error rather than an obscure JSON/KeyError.
        response.raise_for_status()
        print('*' * 1000)

        # Deserialize the JSON payload; the card markup lives under
        # data -> content.
        data = response.json()
        app_li = data['data']['content']

        # 2. Parse the app card markup.
        parse_index(app_li)


if __name__ == '__main__':
    main()
标签:name,day03,app,find,url,print,page,豌豆荚 来源: https://www.cnblogs.com/cl007/p/11127872.html