博客园 文章爬取(乱写的,有的爬不下来)
作者:互联网
博客爬取(乱写的)
import re
import requests
# Blogs to scan: a display name plus the cnblogs home page to download.
web = [
    {"name": "张三", "博客地址": "http://www.cnblogs.com/bladecheng/"},
    {"name": "甲", "博客地址": "http://www.cnblogs.com/pythonywy/"},
    # NOTE(review): same address as the previous entry - confirm it is intended.
    {"name": "乙", "博客地址": "http://www.cnblogs.com/pythonywy/"},
    {"name": "丙", "博客地址": "http://www.cnblogs.com/zrx19960128/"},
    {"name": "丁", "博客地址": "http://www.cnblogs.com/itboy-newking/"},
    {"name": "帅哥", "博客地址": "http://www.cnblogs.com/chuwanliu/"},
    {"name": "浪哥", "博客地址": "http://www.cnblogs.com/einsam/"},
    {"name": "强哥", "博客地址": "http://www.cnblogs.com/wsxiaoyao"},
    {"name": "云哥", "博客地址": "http://www.cnblogs.com/yellowcloud/"},
]

# Seconds to wait for each blog before giving up. Without a timeout,
# requests.get() can block indefinitely on an unresponsive server.
REQUEST_TIMEOUT = 10

# Captures the text between a post link's href=" and its closing </a>,
# i.e. `URL">TITLE`. "." does not cross newlines, so a link whose markup is
# split over several lines is not matched.
POST_LINK_PATTERN = re.compile(r'postTitle2" href="(.*?)</a>')

for blog in web:
    print("%s的博客文章地址如下:" % blog["name"])
    try:
        response = requests.get(blog["博客地址"], timeout=REQUEST_TIMEOUT)
        # Treat HTTP error statuses as failures instead of parsing an error page.
        response.raise_for_status()
    except requests.RequestException as exc:
        # Best-effort scrape: report the failure and continue with the next
        # blog rather than aborting the whole run.
        print("请求失败,已跳过:%s" % exc)
        continue
    for link in POST_LINK_PATTERN.findall(response.text):
        # Only the first '">' terminates the URL; any later occurrence belongs
        # to markup inside the title and must be left untouched.
        print(link.replace('">', ' 文章标题:', 1))

# NOTE(review): the original indentation was lost; this is assumed to run once,
# after every blog has been processed.
print("爬取完毕!")
标签:www,http,name,乱写,博客园,爬取,博客地址,com,cnblogs 来源: https://www.cnblogs.com/bladecheng/p/10883555.html