编程语言
首页 > 编程语言 > python 包之 PyQuery 网页解析教程

python 包之 PyQuery 网页解析教程

作者:互联网

一、安装

pip install pyquery

 

二、字符串初始化

html = '''
<ul id="container">
    <li class="wow fadeIn">
        <div class="d-flex latest-small-thumb">
            <div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
                <a class="color-white" href="single.html" tabindex="0">
                    <img src="assets/imgs/news/thumb-11.jpg" alt="">
                </a>
            </div>
            <div class="post-content media-body align-self-center">
                <h5 class="post-title mb-15 text-limit-3-row font-medium">
                    <a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
                </h5>
            </div>
        </div>
    </li>
</ul>
'''

# Build a PyQuery document directly from the HTML string held in `html`.
from pyquery import PyQuery as pq

doc = pq(html)

# Run the tag selector up front; it is printed last, after the whole
# document and the type of the wrapper object.
list_items = doc('li')
print(doc)
print(type(doc))
print(list_items)

 

三、url初始化

# Build a PyQuery document by fetching a remote page (requires network access).
from pyquery import PyQuery as pq

doc = pq(url="http://www.baidu.com", encoding='utf-8')
# Select the <head> element of the fetched page and print it.
print(doc('head'))

 

四、文件初始化

# Build a PyQuery document from a local HTML file and print it.
from pyquery import PyQuery as pq

source_path = 'index.html'  # resolved relative to the current working directory
doc = pq(filename=source_path)
print(doc)

 

五、css选择器

html = '''
<ul id="container">
    <li class="wow fadeIn">
        <div class="d-flex latest-small-thumb">
            <div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
                <a class="color-white" href="single.html" tabindex="0">
                    <img src="assets/imgs/news/thumb-11.jpg" alt="">
                </a>
            </div>
            <div class="post-content media-body align-self-center">
                <h5 class="post-title mb-15 text-limit-3-row font-medium">
                    <a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
                </h5>
            </div>
        </div>
    </li>
</ul>
'''

# Query the document with a CSS selector, jQuery style.
from pyquery import PyQuery as pq

doc = pq(html)
# "#container .fadeIn": elements with class "fadeIn" nested inside the
# element whose id is "container".
matched = doc('#container .fadeIn')
print(matched)

 

六、查找子元素

html = '''
<ul id="container">
    <li class="wow fadeIn">
        <div class="d-flex latest-small-thumb">
            <div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
                <a class="color-white" href="single.html" tabindex="0">
                    <img src="assets/imgs/news/thumb-11.jpg" alt="">
                </a>
            </div>
            <div class="post-content media-body align-self-center">
                <h5 class="post-title mb-15 text-limit-3-row font-medium">
                    <a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
                </h5>
            </div>
        </div>
    </li>
</ul>
'''

# Narrow a selection with find(), which searches beneath the current match.
from pyquery import PyQuery as pq

doc = pq(html)
container = doc('#container')
# Look for <li> elements underneath the #container selection.
list_items = container.find('li')
print(type(list_items))
print(list_items)

 

七、兄弟元素

html = '''
<ul id="container">
    <li class="wow fadeIn">
        <div class="d-flex latest-small-thumb">
            <div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
                <a class="color-white" href="single.html" tabindex="0">
                    <img src="assets/imgs/news/thumb-11.jpg" alt="">
                </a>
            </div>
            <div class="post-content media-body align-self-center">
                <h5 class="post-title mb-15 text-limit-3-row font-medium">
                    <a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
                </h5>
            </div>
        </div>
    </li>
</ul>
'''

# Fetch the sibling elements of a selected node.
from pyquery import PyQuery as pq

doc = pq(html)
thumb = doc('#container .post-thumb')
# siblings() yields the other elements sharing the same parent as .post-thumb.
neighbours = thumb.siblings()
print(neighbours)

 

八、获取属性

html = '''
<ul id="container">
    <li class="wow fadeIn">
        <div class="d-flex latest-small-thumb">
            <div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
                <a class="color-white" href="single.html" tabindex="0">
                    <img src="assets/imgs/news/thumb-11.jpg" alt="">
                </a>
            </div>
            <div class="post-content media-body align-self-center">
                <h5 class="post-title mb-15 text-limit-3-row font-medium">
                    <a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
                </h5>
            </div>
        </div>
    </li>
</ul>
'''

# Read an attribute from a selected element in two equivalent ways.
from pyquery import PyQuery as pq

doc = pq(html)
link = doc('#container .post-content a')
print(link)
# Call style: attr('<name>').
print(link.attr('href'))
# Attribute-access style: attr.<name>.
print(link.attr.href)

 

九、获取文本

html = '''
<ul id="container">
    <li class="wow fadeIn">
        <div class="d-flex latest-small-thumb">
            <div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
                <a class="color-white" href="single.html" tabindex="0">
                    <img src="assets/imgs/news/thumb-11.jpg" alt="">
                </a>
            </div>
            <div class="post-content media-body align-self-center">
                <h5 class="post-title mb-15 text-limit-3-row font-medium">
                    <a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
                </h5>
            </div>
        </div>
    </li>
</ul>
'''

# Extract the text content of a selected element.
from pyquery import PyQuery as pq

doc = pq(html)
link = doc('#container .post-content a')
# text() gives the text inside the <a> tag, without the markup.
title = link.text()
print(title)

 

十、类操作

html = '''
<ul id="container">
    <li class="wow fadeIn">
        <div class="d-flex latest-small-thumb">
            <div class="post-thumb d-flex mr-15 border-radius-10 img-hover-scale overflow-hidden">
                <a class="color-white" href="single.html" tabindex="0">
                    <img src="assets/imgs/news/thumb-11.jpg" alt="">
                </a>
            </div>
            <div class="post-content media-body align-self-center">
                <h5 class="post-title mb-15 text-limit-3-row font-medium">
                    <a href="single.html" tabindex="0">9 Things I Love About Shaving My Head During Quarantine</a>
                </h5>
            </div>
        </div>
    </li>
</ul>
'''

# Remove and re-add a CSS class on a selected element, printing after each step.
from pyquery import PyQuery as pq

doc = pq(html)
item = doc('#container li')
print(item)

# Drop the "fadeIn" class; the selection is modified in place.
item.removeClass('fadeIn')
print(item)

# Put the "fadeIn" class back.
item.addClass('fadeIn')
print(item)

 

标签:pq,PyQuery,包之,python,doc,html,print,pyquery
来源: https://www.cnblogs.com/autofelix/p/16177575.html