编程语言
首页 > 编程语言> > Python爬虫入门:pyquery库基础

Python爬虫入门:pyquery库基础

作者:互联网

Python爬虫入门:pyquery库基础

pyquery基础使用方法

html = """
<div>
<ul class="list">
<li class="item-0">one</li>
<li class="item-1"><a href="www.csdn.net">two</a></li>
<li class="item-0" id="three"><span class="bold"><a href="www.baidu.com">three</a></span></li>
<li class="item-1 active"><a href="www.csdn.net">four</a></li>
<li class="item-0"><a href="www.csdn.net">five</a></li>
</ul>
</div>
"""
import requests
from pyquery import PyQuery as pq
doc = pq('https://www.qq.com')
print(doc('title'))
doc = pq(html)
print(doc.find('ul').children('.item-0'))
print('*'*20+'text'+'*'*20)
print(doc.find('ul').children('.item-0')[0].text)
print('*'*20+'parent'+'*'*20)
print(doc.find('ul').children('.item-0').parent())
print('*'*20+'parents'+'*'*20)
print(doc.find('ul').children('.item-0').parents())
print('*'*20+'siblings'+'*'*20)
print(doc.find('ul').children('.item-1.active').siblings())
print('*'*20+'通过items()来遍历'+'*'*20)
for item in doc.find('ul').children('.item-0').items():
    print(item)
print('*'*20+'通过attr方法获取属性'+'*'*20)
print(doc.find('ul').children('.item-1.active').attr('class'))
print(doc.find('ul').children('.item-1.active').attr['class'])
print('*'*20+'通过text方法获取文字内容'+'*'*20)
print(doc.find('ul').children('.item-1.active').text())
print('*'*20+'通过html方法获取内容'+'*'*20)
print(doc.find('ul').children('.item-1.active').html())

节点操作

html = """
<div>
<ul class="list">
<li class="item-0">one</li>
<li class="item-1"><a href="www.csdn.net">two</a></li>
<li class="item-0" id="three"><span class="bold"><a href="www.baidu.com">three</a></span></li>
<li class="item-1 active"><a href="www.csdn.net">four</a></li>
<li class="item-0"><a href="www.csdn.net">five</a></li>
</ul>
</div>
"""
import requests
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-1.active')
print(li)
li.removeClass('active')
print(li)
li.addClass('activate')
print(li)
li.attr('name','link')
print(li)
li.text('changed item')
print(li)
li.html('<span>changed item</span>')
print(li)
li.append('<img src="img.png"/>')
print(li)
li.prepend('<img src="img2.png"/>')
print(li)
li.remove('img')
print(li)
li.empty()
print(li)

伪类操作

html = """
<div class="wrap">
<div class="container">
<ul class="list">
<li class="item-0">one</li>
<li class="item-1"><a href="www.csdn.net">two</a></li>
<li class="item-0" id="three"><span class="bold"><a href="www.baidu.com">three</a></span></li>
<li class="item-1 active"><a href="www.csdn.net">four</a></li>
<li class="item-0"><a href="www.csdn.net">five</a></li>
</ul>
</div>
</div>
"""
import requests
from pyquery import PyQuery as pq
doc = pq(html)
# 获取第一个元素
li = doc('li:first-child')
print(li)
# 获取最后一个元素
li = doc('li:last-child')
print(li)
# 获取第二个元素
li = doc('li:nth-child(2)')
print(li)
# 获取第三个以后的元素
li = doc('li:gt(2)')
print(li)
# 设置步长为2获取元素
li = doc('li:nth-child(2n)')
print(li)
# 获取包含某内容的元素
li = doc('li:contains("four")')
print(li)

标签:pyquery,Python,doc,爬虫,li,item,html,print,20
来源: https://blog.csdn.net/qq_36365528/article/details/99947186