bs4模块
作者:互联网
requests+bs4爬取汽车之家
# Scrape the first page of the Autohome news listing with requests + bs4:
# print title / summary / image URL / article URL for every news entry and
# save each entry's thumbnail into ./img/.
import os
from urllib.parse import urljoin, urlsplit

import requests
# pip3 install beautifulsoup4
from bs4 import BeautifulSoup

# Listing page; also used as the base when resolving relative links.
PAGE_URL = 'https://www.autohome.com.cn/news/1/#liststart'
IMG_DIR = './img'
# Seconds. Without a timeout requests may block forever on a stalled server.
TIMEOUT = 10

res = requests.get(PAGE_URL, timeout=TIMEOUT)
# Fail loudly on 4xx/5xx instead of parsing an error page.
res.raise_for_status()
# print(res.text)

# 'html.parser' is the parser that ships with the Python standard library.
soup = BeautifulSoup(res.text, 'html.parser')

# open() below does not create missing directories.
os.makedirs(IMG_DIR, exist_ok=True)

# Search with bs4: every <ul class="article"> holds a batch of news entries.
ul_list = soup.find_all(name='ul', class_='article')
# print(len(ul_list))
for ul in ul_list:
    # All <li> tags under this <ul>.
    for li in ul.find_all(name='li'):
        h3 = li.find(name='h3')
        p_tag = li.find(name='p')
        img_tag = li.find(name='img')
        a_tag = li.find('a')
        # Some <li> items are not news entries and lack these tags; find()
        # returns None for them, so skip the item rather than crash.
        if any(tag is None for tag in (h3, p_tag, img_tag, a_tag)):
            continue

        src = img_tag.get('src')
        href = a_tag.get('href')
        if not src or not href:
            continue

        title = h3.text  # text content of the <h3> tag
        desc = p_tag.text
        # urljoin copes with protocol-relative ("//host/x"), path-relative and
        # already-absolute URLs alike, so nothing gets a scheme prepended twice.
        img = urljoin(PAGE_URL, src)
        url = urljoin(PAGE_URL, href)
        print('''
新闻标题:%s
新闻摘要:%s
新闻图片:%s
新闻地址:%s
''' % (title, desc, img, url))

        # Save the thumbnail locally, named after the last path segment of
        # its URL (query string excluded).
        img_name = os.path.basename(urlsplit(img).path)
        if not img_name:
            continue
        res_img = requests.get(img, timeout=TIMEOUT)
        if not res_img.ok:
            # Do not store an HTTP error page under an image file name.
            continue
        with open(os.path.join(IMG_DIR, img_name), 'wb') as f:
            for chunk in res_img.iter_content(1024):
                f.write(chunk)

# To store the data in a database instead: use pymysql --> create the
# database and table --> cursor.execute(insert ...) --> commit.
bs4遍历文档树
# Demo: navigating a parsed document tree with BeautifulSoup.
# Only section 8 is live; the earlier sections are kept as commented
# examples that can be enabled one at a time.
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" id="id_p">lqz<b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# html.parser: built in, average speed, tolerant of malformed markup.
# lxml: third-party, fast, tolerant of malformed markup.
# soup = BeautifulSoup(html_doc, 'html.parser')
# pip3 install lxml
soup = BeautifulSoup(html_doc, 'lxml')
# print(soup.prettify())  # pretty-print the HTML

# 1. Navigate the tree with dot access (fast; yields the first matching tag)
# print(soup.title)
# print(soup.body.p)
# print(soup.body.p.b)

# 2. Get a tag's name
# print(soup.title.name)
# print(soup.body.name)

# 3. Get a tag's attributes
# print(soup.body.p)
# print(soup.p['class'])  # a list, because class may hold several values
# print(soup.p['id'])
# print(soup.p.attrs)  # all attributes as a dict

# 4. Get a tag's text content
# print(soup.p.text)  # text of the tag and all its descendants, concatenated
# print(soup.p.string)  # only set when the tag holds just text or a single
#                       # text-bearing child; None when there are several
# print(list(soup.p.strings))  # generator over every descendant's text

# 5. Nested selection: dot access can be chained
# print(soup.head.title.string)

# 6. Children and descendants
# print(soup.p.contents)  # list of all direct children of <p>
# print(list(soup.p.children))  # iterator over the same direct children;
#                               # like contents, but saves memory
# print(list(soup.p.descendants))  # every nested node under <p>, at any depth
# for i, child in enumerate(soup.p.children):
#     print(i, child)
# for i, child in enumerate(soup.p.descendants):
#     print(i, child)

# 7. Parent and ancestors
# print(soup.a.parent)  # parent of the first <a> tag
# print(list(soup.a.parents))  # all ancestors of <a>: parent, grandparent, ...

# 8. Siblings
print(soup.a.next_sibling)  # next sibling
print(soup.a.previous_sibling)  # previous sibling
# The *_siblings attributes are generators; wrap them in list() so the
# siblings themselves are printed rather than the generator object.
print(list(soup.a.next_siblings))  # all following siblings
print(list(soup.a.previous_siblings))  # all preceding siblings

# Summary:
#   navigate         -> dot access
#   read attributes  -> tag['name'] or tag.attrs.get('name')
#   read text        -> .text / .string / .strings
bs4搜索文档树
# Demo: searching a parsed document tree with find() / find_all().
# Only the limit example at the end of the script is live; the earlier
# calls are kept as commented examples that can be enabled one at a time.
from bs4 import BeautifulSoup


def has_class_but_no_id(tag):
    """Return True for a tag that has a ``class`` attribute but no ``id``.

    Used as a function filter: find()/find_all() call it with each tag and
    keep the tags for which it returns a truthy value.
    """
    return tag.has_attr('class') and not tag.has_attr('id')


html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" id="id_p">lqz<b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'lxml')

# 1. Five kinds of filter: string, regular expression, list, True, function.
#    find: first match only.  find_all: every match.

# String filter ---> the value is a string
# res = soup.find_all(name='p')
# res = soup.find(id='id_p')
# res = soup.find_all(class_='story')
# res = soup.find_all(name='p', class_='story')  # both conditions must match (AND)
# res = soup.find(name='a', id='link2').text
# res = soup.find(name='a', id='link2').attrs.get('href')
# res = soup.find(attrs={'id': 'link2', 'class': 'sister'}).attrs.get('href')
# print(res)

# Regular-expression filter ---> the value is a compiled pattern
# import re
#
# res = soup.find_all(name=re.compile('^b'))
# res = soup.find_all(href=re.compile('^http'))
# res = soup.find_all(class_=re.compile('^s'))
# print(res)

# List filter ---> the value is a list; any element may match
# res = soup.find_all(name=['body', 'a'])
# res = soup.find_all(class_=['sister', 'story'])
# res = soup.find_all(id=['link2', 'link3'])
# print(res)

# True filter ---> the value is True; matches wherever the attribute exists
# res = soup.find_all(name=True)
# res = soup.find_all(id=True)
# res = soup.find_all(href=True)
# print(res)

# Function filter ---> the value is a callable taking a tag
# print(soup.find_all(name=has_class_but_no_id))  # tags with class but no id

# Note 1: anything present in the HTML page can be extracted with bs4.

# Note 2: tree navigation and tree searching can be mixed
# print(soup.find(name=has_class_but_no_id).a.text)

# 3. Other find_all parameters:
#    limit: maximum number of results.  recursive: whether to search descendants.
res = soup.find_all(name=has_class_but_no_id, limit=1)
print(res)

# res = soup.find_all(name='a', recursive=False)  # non-recursive: faster, looks one level down only
# print(res)
标签:name,bs4,res,id,soup,模块,print,find 来源: https://www.cnblogs.com/892572624A/p/16251343.html