# 【待解决】爬取指定关键词的文章(Python3)
# 作者:互联网
# -*- coding: utf-8 -*-
# http://weixin.sogou.com/
import re
import urllib.request
import time # sleep()方法 实现延时
import urllib.error
# 为使用代理服务器爬一个网址
def use_proxy(proxy_addr, url):
    """Fetch *url* through the HTTP proxy *proxy_addr* and return the raw bytes.

    Args:
        proxy_addr: proxy in "host:port" form, e.g. "127.0.0.1:8888".
        url: the absolute URL to download.

    Returns:
        The response body as bytes, or None when the request failed
        (the error is printed and a back-off sleep is performed so the
        caller can simply retry on its next iteration).
    """
    try:
        # Spoof a desktop browser so the site serves normal pages.
        req = urllib.request.Request(url)
        req.add_header("User-Agent","Mozilla/5.0(Windows NT 6.1;WOW64) AppleWebKit/537.36(KHTML,like Google Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0")
        # Route the request through the given HTTP proxy.
        proxy = urllib.request.ProxyHandler({'http': proxy_addr})
        opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
        # NOTE: install_opener is a process-global side effect; kept for
        # compatibility with the original behavior.
        urllib.request.install_opener(opener)
        return urllib.request.urlopen(req).read()
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        # Back off 10s on network/HTTP errors before the caller retries.
        time.sleep(10)
        return None  # explicit: callers must check before using the data
    except Exception as e:
        print("exception:" + str(e))
        # Shorter back-off for unexpected (non-network) errors.
        time.sleep(1)
        return None
# 设置关键词 (search keyword to crawl)
key = "Python"
# 设置代理服务器,该代理服务器可能失效,失效后需更新有效代理服务器
proxy = "127.0.0.1:8888"
# URL-encode the keyword ONCE, outside the loop. The original quoted the
# already-quoted value on every iteration, which double-encodes any keyword
# containing non-ASCII or reserved characters.
quoted_key = urllib.request.quote(key)
# 爬取的页数 (number of result pages to crawl)
for i in range(10):
    thisPageUrl = ("http://weixin.sogou.com/weixin?type=2&query="
                   + quoted_key + "&page=" + str(i))
    thisPageData = use_proxy(proxy, thisPageUrl)
    # use_proxy returns None on failure; skip the page instead of
    # scanning the string "None" for links.
    if thisPageData is None:
        print("此次("+str(i)+"页)没成功")
        continue
    print(len(str(thisPageData)))
    # Extract every href target from the result page.
    pat1 = '<a href="(.*?)"'
    rs1 = re.compile(pat1, re.S).findall(str(thisPageData))
    if len(rs1) == 0:
        print("此次("+str(i)+"页)没成功")
        continue
    for j in range(len(rs1)):
        # Undo HTML entity-escaping of '&' in the extracted URL.
        thisUrl = rs1[j].replace("amp;", "")
        file = "F:/爬虫信息/result/第"+str(i)+"页第"+str(j)+"篇文章.html"
        thisData = use_proxy(proxy, thisUrl)
        try:
            # 'with' guarantees the file handle is closed even on a
            # write error; fetch failures (thisData is None) raise here
            # and are reported by the except below, as before.
            with open(file, "wb") as fh:
                fh.write(thisData)
            print("第"+str(i)+"页第"+str(j)+"篇文章成功!")
        except Exception as e:
            print(e)
            print("第"+str(i)+"页第"+str(j)+"篇文章失败!")
# 问题如下:
# 标签:关键词,request,urllib,爬取,print,opener,proxy,代理服务器,Python3 来源: https://blog.51cto.com/u_13696685/2991686