编程语言
首页 > 编程语言 > 【Python学习之旅】---多线程爬取段子

【Python学习之旅】---多线程爬取段子

作者:互联网

 1 import requests
 2 import time
 3 import threading
 4 import queue
 5 from lxml import etree
 6 # "https://ishuo.cn/duanzi"
 7 # header={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36"}
 8 #爬取网页线程---爬取段子列表所在的网页,放入队列
 9 class Mythread1(threading.Thread):
10     def __init__(self,threaName,pageQueue,dataQueue):
11         threading.Thread.__init__(self)
12         self.threaName=threaName  #线程名
13         self.pageQueue = pageQueue   #页码队列
14         self.dataQueue = dataQueue   #数据队列
15         self.headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36"}
16 
17     def run(self):
18         print("启动线程",self.threaName)
19         while not vlog1:
20             try:
21                 page=self.pageQueue.get()
22                 url="https://ishuo.cn/duanzi"
23                 data=requests.get(url,headers=self.headers).text
24                 time.sleep(0.5)
25                 self.dataQueue.put(data)   #将数据放入到数据队列中
26             except Exception as e:
27                 pass
28         print("结束线程",self.threaName)
29 
30 
31 #解析网页线程---从队列中拿出列表网页,进行解析,并存储到本地
32 class Mythread2(threading.Thread):
33     def __init__(self,threaName,dataQueue,filename):
34         threading.Thread.__init__(self)
35         self.threaName=threaName
36         self.dataQueue = dataQueue
37         self.filename = filename
38 
39     def run(self):
40         print("启动线程",self.threaName)
41         while not vlog1:
42             try:
43                 data1=self.dataQueue.get()  #取出数据队列中的数据
44                 html=etree.HTML(data1)
45                 node_list=html.xpath('//div[@class="info"]/a') #获取这一页网页中所有符合条件的a标签
46                 # print(node_list)
47                 for node in node_list:
48                     data2=node.text  #依次获取a标签的信息
49                     self.filename.write(data2+"\n") #将信息写入文件
50             except Exception as e:
51                 pass
52         print("结束线程",self.threaName)
53 
54 
55 
56 
57 vlog1=False  #判断页码队列中是否为空
58 vlog2=False  #判断数据队列中是否为空
59 
60 
61 def main():
62     #页码队列
63     pageQueue=queue.Queue(1)
64     pageQueue.put(1)
65     #存放采集结果的数据队列
66     dataQueue=queue.Queue()
67     #保存到文件
68     filename=open(r"D:\软件\python\python_work\Python_day18\123.txt","a")
69     #启动线程
70     t1=Mythread1("采集线程",pageQueue,dataQueue)
71     t1.start()
72     t2=Mythread2("解析线程",dataQueue,filename)
73     t2.start()
74     #结束主线程
75     #当pageQueue为空时,结束采集线程
76     while not pageQueue.empty():
77         pass
78     global vlog1
79     vlog1=True
80 
81     # 当dataQueue为空时,结束解析线程
82     while not pageQueue.empty():
83         pass
84     global vlog2  #定义全局变量
85     vlog2 =True
86 
87     t1.join()
88     t2.join()
89     filename.close()  #当2个线程执行完之后关闭文件
90     print("结束!")
91 
92 
93 
94 if __name__=='__main__':
95     main()

 

标签:__,Python,self,爬取,线程,pageQueue,dataQueue,多线程,threaName
来源: https://www.cnblogs.com/chenyuxia/p/12535596.html