Web Scraper: Guba (the East Money stock forum)
Author: Internet
import requests, re, json
from fake_useragent import UserAgent

def request_html(url):
    # Fetch a page with a randomized User-Agent so simple anti-crawler
    # checks are less likely to block us.
    ua = UserAgent()
    headers = {'User-Agent': ua.random}
    response = requests.get(url=url, headers=headers).text
    # Debugging aids: print the raw HTML, or dump it to a file for inspection.
    # print(response)
    # with open('1.html', 'w', encoding='utf-8') as f:
    #     f.write(response)
    return response
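
The bare requests.get above has no timeout, so one stalled connection hangs the whole crawl. Below is a minimal hardened sketch (request_html_safe and its timeout parameter are my additions, not part of the original script) that bounds the wait and fails fast on HTTP errors instead of handing an error page to the parser:

def request_html_safe(url, timeout=10):
    # Sketch (not in the original): bounded timeout plus explicit error check.
    headers = {'User-Agent': UserAgent().random}
    resp = requests.get(url, headers=headers, timeout=timeout)
    resp.raise_for_status()                  # fail fast on 4xx/5xx
    resp.encoding = resp.apparent_encoding   # the page may not declare UTF-8
    return resp.text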
def parse_html(response):
    # Grab the inner HTML of the news-list <ul>; the tracker-eventcode value
    # (including its embedded space) is copied verbatim from the page source.
    news = re.compile(r'<ul class="newlist" tracker-eventcode="gb_xgbsy_ lbqy_rmlbdj">(.*?)</ul>', re.S)
    news_list = news.findall(response)[0]
    # Split the list into individual <li> rows.
    pattern = re.compile(r'<li>(.*?)</li>', re.S)
    pattern_list = pattern.findall(news_list)
    lis = []
    for i in pattern_list:
        dic = {}
        # The first <cite> holds the click count, the second the reply count;
        # re.S because the cite content spans multiple lines.
        clk = re.compile(r'<cite>(.*?)</cite>', re.S)
        dic['clk'] = clk.findall(i)[0].strip()  # clicks
        dic['rev'] = clk.findall(i)[1].strip()  # replies
        # Board name: either a regular board link or a bounty-post icon label.
        sub1 = re.compile(r'class="balink">(.*?)</a>|class="icon icon_list_xuanshang">(.*?)</em>', re.S)
        sub1_list = sub1.findall(i)[0]
        # Exactly one alternative matches; the other capture group is ''.
        dic['sub1'] = sub1_list[0] + sub1_list[1]
        # Post title: the first title="..." attribute in the row.
        sub2 = re.compile(r'title="(.*?)"')
        dic['sub2'] = sub2.findall(i)[0]
        # Author name.
        aut = re.compile(r'<font>(.*?)</font>')
        dic['aut'] = aut.findall(i)[0]
        # Time of the last reply.
        last = re.compile(r'class="last">(.*?)<')
        dic['last'] = last.findall(i)[0]
        lis.append(dic)
    return lis
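
Regexes over raw HTML break as soon as the markup shifts, and every findall(...)[0] raises IndexError on a row that lacks the expected tag. Here is a more tolerant sketch using BeautifulSoup; parse_html_bs is my name, the selectors are inferred from the regexes above, and the live page markup may differ, so treat it as an illustration rather than a drop-in replacement:

from bs4 import BeautifulSoup

def parse_html_bs(response):
    # Sketch (not in the original): skip malformed rows instead of crashing.
    soup = BeautifulSoup(response, 'html.parser')
    rows = []
    for li in soup.select('ul.newlist li'):
        cites = li.find_all('cite')
        if len(cites) < 2:
            continue  # row lacks click/reply counts; skip it
        balink = li.find(class_='balink') or li.find(class_='icon_list_xuanshang')
        title_tag = li.find(title=True)
        font_tag = li.find('font')
        last_tag = li.find(class_='last')
        rows.append({
            'clk': cites[0].get_text(strip=True),
            'rev': cites[1].get_text(strip=True),
            'sub1': balink.get_text(strip=True) if balink else '',
            'sub2': title_tag['title'] if title_tag else '',
            'aut': font_tag.get_text(strip=True) if font_tag else '',
            'last': last_tag.get_text(strip=True) if last_tag else '',
        })
    return rows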
if __name__ == '__main__':
    for page in range(1, 13):
        # Paginated list pages, e.g.:
        # http://guba.eastmoney.com/default,99_1.html
        # http://guba.eastmoney.com/default,99_2.html
        url = 'http://guba.eastmoney.com/default,99_{}.html'.format(page)
        response = request_html(url)
        try:
            lis = parse_html(response)
        except IndexError:
            # The site occasionally serves an anti-crawler page with no news
            # list, which makes findall(...)[0] raise IndexError; retry once.
            response = request_html(url)
            lis = parse_html(response)
        print(url)
        # Save each page of results as its own JSON file.
        file_name = 'guba{}.json'.format(page)
        with open(file_name, 'w', encoding='utf-8') as f:
            json.dump(lis, f, ensure_ascii=False)
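
If the anti-crawler page comes back twice in a row, the single retry above still fails. A bounded retry with a randomized back-off is sturdier and also spaces out requests; fetch_with_retry below is my sketch, not part of the original script:

import time, random

def fetch_with_retry(url, attempts=3):
    # Sketch (not in the original): retry up to `attempts` times with a
    # random pause instead of a single blind retry.
    for _ in range(attempts):
        try:
            return parse_html(request_html(url))
        except IndexError:
            time.sleep(random.uniform(1, 3))  # back off before retrying
    raise RuntimeError('could not parse {} after {} attempts'.format(url, attempts))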