【BOOK】解析库--pyquery
作者:互联网
CSS选择器
1、初始化
# Three ways to build a PyQuery document: from a string, a URL, or a file.
from pyquery import PyQuery as pq

# Sample markup shared by the examples below. Adjacent string literals are
# concatenated, so the value is a single line of HTML.
html = (
    ' <div> <ul>'
    ' <li class="item-0">first item</li>'
    ' <li class="item-1"><a href="link2.html">second item</a></li>'
    ' <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>'
    ' <li class="item-1 active"><a href="link4.html">fourth item</a></li>'
    ' <li class="item-0 "><a href="link5.html">fifth item</a></li>'
    ' </ul> </div> '
)

# Initialize from a string.
doc = pq(html)
print(doc('li'))

# Initialize from a URL.
doc = pq(url='https://cuiqingcai.com')
print(doc('title'))

# Initialize from a file.
doc = pq(filename='test.html')
print(doc('li'))
2、CSS选择器
# CSS selectors.
from pyquery import PyQuery as pq

# The selector below needs an element with id "container" that contains an
# element with class "list"; the markup from the initialization example has
# neither, so the snippet defines markup that does.
html = (
    ' <div id="container"> <ul class="list">'
    ' <li class="item-0">first item</li>'
    ' <li class="item-1"><a href="link2.html">second item</a></li>'
    ' <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>'
    ' <li class="item-1 active"><a href="link4.html">fourth item</a></li>'
    ' <li class="item-0 "><a href="link5.html">fifth item</a></li>'
    ' </ul> </div> '
)

doc = pq(html)

# Use `#` to select by id and `.` to select by class.
print(doc('#container .list li'))
3、查找节点
# Finding nodes relative to a selection.
from pyquery import PyQuery as pq

# Adjacent string literals are concatenated into one line of HTML.
html = (
    ' <div id="container"> <ul class="list">'
    ' <li class="item-0">first item</li>'
    ' <li class="item-1"><a href="link2.html">second item</a></li>'
    ' <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>'
    ' <li class="item-1 active"><a href="link4.html">fourth item</a></li>'
    ' <li class="item-0 "><a href="link5.html">fifth item</a></li>'
    ' </ul> </div> '
)

doc = pq(html)
items = doc('.list')
print(items)

# find() searches all descendant nodes.
lis = items.find('a')
print(lis)

# children() searches direct child nodes only.
lis = items.children('.active')
print(lis)

# parent() returns the parent node.
container = items.parent()
print(container)

# parents() returns the ancestor nodes.
ancestor = items.parents()
print(ancestor)

# siblings() returns the sibling nodes.
li = doc('.item-0.active')
print(li.siblings('.active'))
4、遍历
# Iterating over a multi-node selection.
from pyquery import PyQuery as pq

# `html` is the markup string defined in the preceding example.
doc = pq(html)

# items() yields the matched nodes one at a time; walk them with for-in.
lis = doc('li').items()
for li in lis:
    print(li)
5、获取内容
# Reading attributes, text and inner HTML from a selection.
from pyquery import PyQuery as pq

# `html` is the markup string defined in an earlier example.
doc = pq(html)
a = doc('.item-0 a')
print(a)

# Read the href attribute of the <a> nodes.
# attr() only returns the result for the first matched node.
print(a.attr('href'))
print(a.attr.href)

# Iterate over the matches to read the attribute of every node.
for item in a.items():
    print(item.attr('href'))

# Text only; the text of multiple nodes is joined with spaces.
print(a.text())  # third item fifth item

# Everything inside the <a> node, markup included; only the inner HTML of
# the first <a> node is returned.
print(a.html())  # <span class="bold">third item</span>
6、节点操作
# Node manipulation: remove().
from pyquery import PyQuery as pq

html = ' <div class="wrap"> Hello World <p>呱呱呱</p> </div> '

# Goal: obtain only "Hello World".
doc = pq(html)
wrap = doc('.wrap')

# Remove the <p> node so its text is no longer part of the wrapper's text.
wrap.find('p').remove()
print(wrap.text())
7、伪类选择器
# Pseudo-class selectors.
from pyquery import PyQuery as pq

# Adjacent string literals are concatenated into one line of HTML.
html = (
    ' <div id="container"> <ul class="list">'
    ' <li class="item-0">first item</li>'
    ' <li class="item-1"><a href="link2.html">second item</a></li>'
    ' <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>'
    ' <li class="item-1 active"><a href="link4.html">fourth item</a></li>'
    ' <li class="item-0 "><a href="link5.html">fifth item</a></li>'
    ' </ul> </div> '
)

doc = pq(html)

# The first <li> node.
li = doc('li:first-child')
print(li)

# The last <li> node.
li = doc('li:last-child')
print(li)

# The second <li> node.
li = doc('li:nth-child(2)')
print(li)

# The <li> node whose text contains "second".
li = doc('li:contains(second)')
print(li)
标签:pq,pyquery,##,doc,li,item,BOOK,print,解析 来源: https://www.cnblogs.com/motoharu/p/12557447.html