python 包之 PyQuery 网页解析教程
作者:互联网
一、安装
- PyQuery 是一个非常强大又灵活的网页解析库
- PyQuery 是 Python 仿照 jQuery 的严格实现
- 语法与 jQuery 几乎完全相同,更多操作可以参考 jQuery 文档
pip install pyquery
二、字符串初始化
# Example: build a PyQuery document directly from an HTML string.
from pyquery import PyQuery as pq

html = '''
<ul id="container">
<li class="wow fadeIn">
<div class="d-flex latest-small-thumb">
<div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
<a class="color-white" href="single.html" tabindex="0">
<img src="assets/imgs/news/thumb-11.jpg" alt="">
</a>
</div>
<div class="post-content media-body align-self-center">
<h5 class="post-title mb-15 text-limit-3-row font-medium">
<a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
</h5>
</div>
</div>
</li>
</ul>
'''
doc = pq(html)
# First the parsed document itself, then the type of the object pq() returned.
for shown in (doc, type(doc)):
    print(shown)
# Calling the document with a selector string picks out the <li> elements.
print(doc('li'))
三、url初始化
# Example: build a PyQuery document from a URL instead of a string.
from pyquery import PyQuery as pq

# The url= keyword tells pq() where to load the page from (needs network access).
doc = pq(url="http://www.baidu.com", encoding='utf-8')
# Print only the <head> element of the downloaded page.
print(doc('head'))
四、文件初始化
# Example: build a PyQuery document from a file on disk.
from pyquery import PyQuery as pq

# Relative path of the HTML file to parse; the file must already exist.
page_path = 'index.html'
doc = pq(filename=page_path)
print(doc)
五、css选择器
# Example: select elements with a CSS selector.
from pyquery import PyQuery as pq

html = '''
<ul id="container">
<li class="wow fadeIn">
<div class="d-flex latest-small-thumb">
<div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
<a class="color-white" href="single.html" tabindex="0">
<img src="assets/imgs/news/thumb-11.jpg" alt="">
</a>
</div>
<div class="post-content media-body align-self-center">
<h5 class="post-title mb-15 text-limit-3-row font-medium">
<a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
</h5>
</div>
</div>
</li>
</ul>
'''
doc = pq(html)
# Descendant selector: elements with class "fadeIn" inside the element
# whose id is "container" (here, the single <li>).
matched = doc('#container .fadeIn')
print(matched)
六、查找子元素
# Example: look up elements nested inside an already-selected element.
from pyquery import PyQuery as pq

html = '''
<ul id="container">
<li class="wow fadeIn">
<div class="d-flex latest-small-thumb">
<div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
<a class="color-white" href="single.html" tabindex="0">
<img src="assets/imgs/news/thumb-11.jpg" alt="">
</a>
</div>
<div class="post-content media-body align-self-center">
<h5 class="post-title mb-15 text-limit-3-row font-medium">
<a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
</h5>
</div>
</div>
</li>
</ul>
'''
doc = pq(html)
container = doc('#container')
# find() is called on the selection, so the search starts from the <ul>.
entries = container.find('li')
# One print call with a newline separator: the type on the first line,
# the matched markup after it.
print(type(entries), entries, sep='\n')
七、兄弟元素
# Example: get the sibling elements of a selection.
from pyquery import PyQuery as pq

html = '''
<ul id="container">
<li class="wow fadeIn">
<div class="d-flex latest-small-thumb">
<div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
<a class="color-white" href="single.html" tabindex="0">
<img src="assets/imgs/news/thumb-11.jpg" alt="">
</a>
</div>
<div class="post-content media-body align-self-center">
<h5 class="post-title mb-15 text-limit-3-row font-medium">
<a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
</h5>
</div>
</div>
</li>
</ul>
'''
doc = pq(html)
# Start from the thumbnail <div>; in this markup its only sibling is the
# "post-content" <div> next to it.
thumb = doc('#container .post-thumb')
neighbours = thumb.siblings()
print(neighbours)
八、获取属性
# Example: read an attribute from a selected element.
from pyquery import PyQuery as pq

html = '''
<ul id="container">
<li class="wow fadeIn">
<div class="d-flex latest-small-thumb">
<div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
<a class="color-white" href="single.html" tabindex="0">
<img src="assets/imgs/news/thumb-11.jpg" alt="">
</a>
</div>
<div class="post-content media-body align-self-center">
<h5 class="post-title mb-15 text-limit-3-row font-medium">
<a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
</h5>
</div>
</div>
</li>
</ul>
'''
doc = pq(html)
link = doc('#container .post-content a')
print(link)
# Two spellings for reading the href attribute: call form and dotted form.
href_by_call = link.attr('href')
href_by_dot = link.attr.href
print(href_by_call)
print(href_by_dot)
九、获取文本
# Example: extract the text content of a selected element.
from pyquery import PyQuery as pq

html = '''
<ul id="container">
<li class="wow fadeIn">
<div class="d-flex latest-small-thumb">
<div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
<a class="color-white" href="single.html" tabindex="0">
<img src="assets/imgs/news/thumb-11.jpg" alt="">
</a>
</div>
<div class="post-content media-body align-self-center">
<h5 class="post-title mb-15 text-limit-3-row font-medium">
<a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
</h5>
</div>
</div>
</li>
</ul>
'''
doc = pq(html)
# Select the title link first, then ask it for its text.
link = doc('#container .post-content a')
title = link.text()
print(title)
十、类操作
# Example: remove and re-add a CSS class on a selected element.
from pyquery import PyQuery as pq

html = '''
<ul id="container">
<li class="wow fadeIn">
<div class="d-flex latest-small-thumb">
<div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
<a class="color-white" href="single.html" tabindex="0">
<img src="assets/imgs/news/thumb-11.jpg" alt="">
</a>
</div>
<div class="post-content media-body align-self-center">
<h5 class="post-title mb-15 text-limit-3-row font-medium">
<a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
</h5>
</div>
</div>
</li>
</ul>
'''
doc = pq(html)
item = doc('#container li')
# Starting state: the <li> carries both "wow" and "fadeIn".
print(item)
# The return values are not used: the same selection is printed again
# after each call to show the change.
item.removeClass('fadeIn')
print(item)
item.addClass('fadeIn')
print(item)
标签:pq,PyQuery,包之,python,doc,html,print,pyquery 来源: https://www.cnblogs.com/autofelix/p/16177575.html