爬取猫眼和纵横中文网的榜单信息
作者:互联网
猫眼电影 Top100:
import re
import requests
import json
def getpage(url, timeout=10):
    """Download ``url`` with browser-like headers and return the page text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds ``requests`` waits for the server before giving up,
            so a stalled connection cannot hang the script forever.

    Returns:
        The response body as text when the status code is 200, otherwise
        ``None``.

    Raises:
        Exceptions raised by ``requests.get`` (connection errors, timeouts)
        propagate to the caller.
    """
    # NOTE(review): this is a captured browser session cookie hard-coded in
    # source; it is probably stale and should not live in the repository.
    raw_cookie = '''__mta=150214537.1607495152037.1607513645218.1607513649349.10; uuid_n_v=v1;
    uuid=606FF27039E711EB81C837695837D31FA9C7894AC0F94110AA7CA5C3F1FA097F;
    _lxsdk_cuid=176462d6abdc8-0c3f995e3e7e64-5a301348-144000-176462d6abdc8;
    _lxsdk=606FF27039E711EB81C837695837D31FA9C7894AC0F94110AA7CA5C3F1FA097F;
    _csrf=ea8f3bd61b09e5dc9b1ed979c23977d03a368ddd96b7f4984b2ad0b18dae8241;
    Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1607495150,1607513322;
    __mta=150214537.1607495152037.1607495190317.1607513633276.8;
    Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1607513649; _lxsdk_s=1764767217b-813-94f-283%7C%7C2'''
    # The literal spans several lines, but an HTTP header value must not
    # contain line breaks. Glue the lines back together, then rebuild the
    # standard "name=value; name=value" separator.
    joined = ''.join(line.strip() for line in raw_cookie.splitlines())
    cookie = '; '.join(part.strip() for part in joined.split(';') if part.strip())
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.57',
        'Cookie': cookie,
    }
    response = requests.get(url=url, headers=headers, timeout=timeout)
    if response.status_code == 200:
        return response.text
    return None
def gomessage(html):
    """Extract the ranking entries from a Maoyan board page, print and save them.

    Args:
        html: Page source as returned by ``getpage``. ``None`` or an empty
            string (failed download) is accepted and simply yields nothing.

    Every match is a 4-tuple of the captured groups: the text after the
    ``board-index`` marker, the ``title=`` attribute value, the content of
    the ``star`` paragraph and the content of the ``releasetime`` paragraph.
    Each tuple is printed and handed to ``inputfile``.
    """
    if not html:
        # getpage returns None for non-200 responses; re.findall would raise
        # TypeError on it, so there is nothing to parse here.
        return
    items = re.findall(r'<dd>.*?board-index.*?>(.*?)</i>.*?title=(.*?) class=.*?<p class="star">(.*?)</p>.*?releasetime">(.*?)</p>.*?', html, re.S)
    for item in items:
        print(item)
        inputfile(item)
def inputfile(what):
    """Append ``what`` as one JSON-encoded line to the Maoyan ranking file.

    Args:
        what: Any JSON-serialisable value (the caller passes regex match
            tuples, which are written as JSON arrays).
    """
    with open('猫眼电影排行榜.txt', mode='a', encoding='utf-8') as out:
        # ensure_ascii=False keeps non-ASCII text readable instead of
        # emitting \uXXXX escapes.
        record = json.dumps(what, ensure_ascii=False)
        out.write(record + '\n')
def main():
    """Crawl ten pages of the Maoyan board and store every parsed entry.

    The ``offset`` query parameter advances in steps of 10 (0, 10, ..., 90).
    Pages that could not be downloaded are reported and skipped.
    """
    for page in range(10):
        url = f'https://maoyan.com/board/4?offset={10*page}'
        html = getpage(url)
        if html is None:
            # getpage signals a non-200 response with None; do not hand that
            # to the parser.
            print(f'skipped {url}: page could not be downloaded')
            continue
        gomessage(html)


# Run the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
纵横中文网:
import requests
import re
import time,json
def getpage(url, timeout=10):
    """Download ``url`` with browser-like headers and return the page text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds ``requests`` waits for the server before giving up,
            so a stalled connection cannot hang the script forever.

    Returns:
        The response body as text when the status code is 200, otherwise
        ``None``.

    Raises:
        Exceptions raised by ``requests.get`` (connection errors, timeouts)
        propagate to the caller.
    """
    # NOTE(review): this is a captured browser session cookie hard-coded in
    # source; it is probably stale and should not live in the repository.
    raw_cookie = '''ZHID=AD632CDD590D53418030A9491C6E21C7;
    ver=2018; zhffr=www.baidu.com; sajssdk_2015_cross_new_user=1;
    sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221764791c0d3cc5-0c2
    8a4f47b4783-5a301d45-1327104-1764791c0d4724%22%2C%22%24device_id%22%3A%2217647
    91c0d3cc5-0c28a4f47b4783-5a301d45-1327104-1764791c0d4724%22%2C%22pro
    ps%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6
    %90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22https%
    3A%2F%2Fwww.baidu.com%2Flink%22%2C%22%24latest_referrer_host%22%3A%22www.baid
    u.com%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E
    5%80%BC%22%7D%7D; v_user=https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3D0dWqAJU6v
    kkdGTF8yBhy6WNh_nTqhnxs-62kVf1bo6RQ_fajtUDK2q4aabIUp0oL%26wd%3D%26eqid%3Da20a3
    f87000308dc000000065fd0c923%7Chttp%3A%2F%2Fwww.zongheng.com%2F%7C52738807; zh_
    visitTime=1607518503190; Hm_lvt_c202865d524849216eea846069349eb9=1607518503;
    Hm_up_c202865d524849216eea846069349eb9=%7B%22uid_%22%3A%7B%22value%22%3A%22AD6
    32CDD590D53418030A9491C6E21C7%22%2C%22scope%22%3A1%7D%7D; JSESSIONID=abcDTubi
    qnrXX-gBDKhzx; zh_rba=true; Hm_lpvt_c202865d524849216eea846069349eb9=1607518541'''
    # The literal is wrapped in the middle of cookie values, and an HTTP
    # header value must not contain line breaks. Glue the lines back together
    # without a separator, then rebuild the standard "name=value; name=value"
    # form (the values are percent-encoded, so ';' only occurs as separator).
    joined = ''.join(line.strip() for line in raw_cookie.splitlines())
    cookie = '; '.join(part.strip() for part in joined.split(';') if part.strip())
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.57',
        'Cookie': cookie,
    }
    response = requests.get(url=url, headers=headers, timeout=timeout)
    if response.status_code == 200:
        return response.text
    return None
def getmessage(html):
    """Extract the ranking entries from a Zongheng rank page, print and save them.

    Args:
        html: Page source as returned by ``getpage``. ``None`` or an empty
            string (failed download) is accepted and simply yields nothing.

    Every match is a 2-tuple of the captured groups: the text between
    ``bookName = `` and ``bookId``, and the content of the
    ``rank_d_b_info`` div. Each tuple is printed and handed to ``writefile``.
    """
    if not html:
        # getpage returns None for non-200 responses; re.findall would raise
        # TypeError on it, so there is nothing to parse here.
        return
    pattern = re.compile(r'<div class="rank_d_list.*?bookName = (.*?)bookId.*?<div class="rank_d_b_info">(.*?)</div>.*?', re.S)
    for item in pattern.findall(html):
        print(item)
        writefile(item)
def writefile(what):
    """Append ``what`` as one JSON-encoded line to the Zongheng ranking file.

    Args:
        what: Any JSON-serialisable value (the caller passes regex match
            tuples, which are written as JSON arrays).
    """
    with open('纵横中文网榜单.txt', mode='a', encoding='utf-8') as out:
        # ensure_ascii=False keeps non-ASCII text readable instead of
        # emitting \uXXXX escapes.
        record = json.dumps(what, ensure_ascii=False)
        out.write(record + '\n')
def main():
    """Crawl pages 1 through 9 of the Zongheng ranking and store every entry.

    Pages that could not be downloaded are reported and skipped. The script
    pauses two seconds after every request, successful or not.
    """
    for page in range(1, 10):
        url = f"http://www.zongheng.com/rank/details.html?rt=5&d=1&p={page}"
        html = getpage(url)
        if html is None:
            # getpage signals a non-200 response with None; do not hand that
            # to the parser.
            print(f'skipped {url}: page could not be downloaded')
        else:
            getmessage(html)
        # Throttle between requests regardless of the outcome.
        time.sleep(2)


# Run the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
标签:url,22%,3A%,爬取,re,中文网,html,def,猫眼 来源: https://blog.csdn.net/weixin_47001721/article/details/110941255