【Python学习之旅】---多线程爬取段子
作者:互联网
"""Multithreaded joke scraper.

A fetch thread downloads the joke list page and hands the raw HTML to a
parse thread through a queue; the parse thread extracts the joke text and
appends it to a local file.
"""
import queue
import threading
import time

import requests
from lxml import etree

# Stop flags polled by the worker threads at the top of every loop iteration.
# main() flips each flag once the corresponding queue has been drained.
vlog1 = False  # True -> fetch threads (Mythread1) should stop
vlog2 = False  # True -> parse threads (Mythread2) should stop

# How long a worker blocks on an empty queue before re-checking its stop flag.
_POLL_SECONDS = 0.2
# Upper bound for a single HTTP request so a dead server cannot hang a worker.
_REQUEST_TIMEOUT = 10

DEFAULT_OUTPUT_PATH = r"D:\软件\python\python_work\Python_day18\123.txt"


class Mythread1(threading.Thread):
    """Fetch thread: takes page numbers and puts downloaded HTML on a queue.

    Args:
        threaName: Human-readable thread label used in log output.
        pageQueue: ``queue.Queue`` of page numbers still to fetch.
        dataQueue: ``queue.Queue`` that receives the downloaded HTML text.
    """

    def __init__(self, threaName, pageQueue, dataQueue):
        threading.Thread.__init__(self)
        self.threaName = threaName  # thread label
        self.pageQueue = pageQueue  # page-number queue
        self.dataQueue = dataQueue  # downloaded-HTML queue
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36"}

    def run(self):
        """Fetch pages until ``vlog1`` becomes true."""
        print("启动线程", self.threaName)
        while not vlog1:
            try:
                # A timed get lets the loop notice the stop flag instead of
                # blocking forever on an empty queue.
                page = self.pageQueue.get(timeout=_POLL_SECONDS)
            except queue.Empty:
                continue
            try:
                # The list URL carries no page parameter, so every page
                # number maps to the same address.
                url = "https://ishuo.cn/duanzi"
                data = requests.get(url, headers=self.headers, timeout=_REQUEST_TIMEOUT).text
                time.sleep(0.5)  # be polite to the server between requests
                self.dataQueue.put(data)  # hand the HTML to the parse thread
            except Exception as e:
                # Worker-loop boundary: report and keep going (best effort)
                # rather than silently discarding the failure.
                print("采集失败", self.threaName, page, e)
        print("结束线程", self.threaName)


class Mythread2(threading.Thread):
    """Parse thread: extracts joke text from queued HTML and writes it out.

    Args:
        threaName: Human-readable thread label used in log output.
        dataQueue: ``queue.Queue`` of HTML strings to parse.
        filename: An open, writable text file object (despite the name, not
            a path) to which one joke per line is appended.
    """

    def __init__(self, threaName, dataQueue, filename):
        threading.Thread.__init__(self)
        self.threaName = threaName
        self.dataQueue = dataQueue
        self.filename = filename

    def run(self):
        """Parse queued pages until ``vlog2`` becomes true."""
        print("启动线程", self.threaName)
        # Poll vlog2 (the data-queue flag), not vlog1: the parser must keep
        # running after the fetcher has been told to stop.
        while not vlog2:
            try:
                data1 = self.dataQueue.get(timeout=_POLL_SECONDS)
            except queue.Empty:
                continue
            try:
                html = etree.HTML(data1)
                # Every <a> directly inside <div class="info"> on this page.
                node_list = html.xpath('//div[@class="info"]/a')
                for node in node_list:
                    data2 = node.text
                    if data2 is None:
                        # An <a> with no leading text has text == None;
                        # skip it instead of failing the whole page.
                        continue
                    self.filename.write(data2 + "\n")
            except Exception as e:
                # Worker-loop boundary: report and continue with the next page.
                print("解析失败", self.threaName, e)
        print("结束线程", self.threaName)


def _wait_until_drained(work_queue, worker):
    """Sleep-poll until *work_queue* is empty or *worker* has died."""
    while not work_queue.empty() and worker.is_alive():
        time.sleep(_POLL_SECONDS)


def main(output_path=DEFAULT_OUTPUT_PATH):
    """Run one fetch thread and one parse thread to completion.

    Args:
        output_path: File to which the scraped jokes are appended. Defaults
            to the original hard-coded location.
    """
    global vlog1, vlog2
    # Reset so main() can be called more than once in the same process.
    vlog1 = False
    vlog2 = False

    # Page-number queue (a single page in this script).
    pageQueue = queue.Queue(1)
    pageQueue.put(1)
    # Queue carrying downloaded HTML from the fetcher to the parser.
    dataQueue = queue.Queue()

    # Explicit UTF-8 so jokes containing characters outside the platform's
    # default code page can still be written.
    with open(output_path, "a", encoding="utf-8") as filename:
        t1 = Mythread1("采集线程", pageQueue, dataQueue)
        t2 = Mythread2("解析线程", dataQueue, filename)
        t1.start()
        t2.start()
        try:
            # Stop the fetcher once every page number has been taken. Workers
            # only check their flag between items, so the page currently
            # being fetched is always finished.
            _wait_until_drained(pageQueue, t1)
            vlog1 = True
            # After this join every fetched page is guaranteed to be on
            # dataQueue, so "dataQueue is empty" really means "all parsed
            # or being parsed".
            t1.join()
            _wait_until_drained(dataQueue, t2)
        finally:
            # Always release both workers and wait for them before the file
            # is closed, even if the wait above was interrupted.
            vlog1 = True
            vlog2 = True
            t1.join()
            t2.join()
    print("结束!")


if __name__ == '__main__':
    main()
标签:__,Python,self,爬取,线程,pageQueue,dataQueue,多线程,threaName 来源: https://www.cnblogs.com/chenyuxia/p/12535596.html