Multithreaded Crawler (JD)
Author: Internet
import json
import time

import requests
from queue import Queue, Empty
from threading import Thread, Lock

from aip import AipNlp
from pymysql import connect
import pymongo

lock = Lock()  # serializes writes through the shared MySQL cursor
conn = connect(host="localhost", port=3306, database="lala",
               user="root", password="", charset="utf8")
cur = conn.cursor()
f = pymongo.MongoClient("localhost", 27017)
jihe = f.sjk.jh  # MongoDB collection (opened but not used below)


class CrawlThread(Thread):
    def __init__(self, threadname, pageQueue, dataQueue):
        super().__init__()
        self.threadname = threadname
        self.pageQueue = pageQueue
        self.dataQueue = dataQueue
        self.headers = {
            "Referer": "https://item.jd.com/6051045.html",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/76.0.3809.100 Safari/537.36"
        }

    def run(self):
        while True:
            try:
                page = self.pageQueue.get(False)  # non-blocking; raises Empty when drained
            except Empty:
                break
            url = ("https://sclub.jd.com/comment/productPageComments.action"
                   "?callback=fetchJSON_comment98vv6445&productId=6051045"
                   "&score=0&sortType=5&page={}&pageSize=10"
                   "&isShadowSku=0&rid=0&fold=1").format(page)
            print('Crawl thread %s started working' % self.threadname)
            response = requests.get(url, headers=self.headers)
            self.dataQueue.put(response.text)
            print('Crawl thread %s finished working' % self.threadname)


class ParseThread(Thread):
    def __init__(self, threadname, dataQueue):
        super().__init__()
        self.threadname = threadname
        self.dataQueue = dataQueue

    def run(self):
        while True:
            try:
                html = self.dataQueue.get(False)
            except Empty:
                break
            time.sleep(2)
            print("Parse thread %s started working" % self.threadname)
            self.parse(html)
            print("Parse thread %s finished working" % self.threadname)

    def parse(self, html):
        # Strip the JSONP wrapper: the 26 characters of
        # "fetchJSON_comment98vv6445(" at the front and ");" at the end
        # leave plain JSON.
        res = json.loads(html[26:-2])
        datalist = []
        for i in res['comments']:
            con = i['content']
            self.baidu(con)
            datalist.append({"comment": con})
        # json.dump(datalist, open('京东.json', 'a', encoding='utf-8'),
        #           ensure_ascii=False, indent=4)

    def baidu(self, con):
        """Your APP_ID / API_KEY / SECRET_KEY go here."""
        try:
            APP_ID = '17019478'
            API_KEY = '8Grs25GiR4Qt2Khlnk0L2ekX'
            SECRET_KEY = 'wLvFGKaWgebZtRTmvLFWGYU1P5E9uQUH'
            client = AipNlp(APP_ID, API_KEY, SECRET_KEY)
            client.setConnectionTimeoutInMillis(5 * 1000)  # connection timeout
            time.sleep(0.5)  # throttle calls to the sentiment API
            res = client.sentimentClassify(con)  # sentiment classification
            print(res)
            text = json.dumps(res['text'])
            sentiment = json.dumps(res['items'][-1]['sentiment'])
            self.cun(text, sentiment)
        except Exception:
            pass  # a failed API call silently drops this comment

    def cun(self, text, sentiment):
        sql = "insert into mg values(0, %s, %s)"
        with lock:  # pymysql connections are not thread-safe
            cur.execute(sql, (text, sentiment))
            conn.commit()  # without a commit the inserts are never persisted


def main():
    pageQueue = Queue()
    for i in range(11, 20):
        pageQueue.put(i)
    dataQueue = Queue()

    # 20 crawl threads drain the 9-page queue; surplus threads exit at once.
    threadcrawl = []
    for x in range(1, 21):
        thread = CrawlThread('crawl-{}'.format(x), pageQueue, dataQueue)
        thread.start()  # start the thread
        threadcrawl.append(thread)
    for t in threadcrawl:
        t.join()

    # Parsing starts only after every crawl thread has finished.
    threadparse = []
    for x in range(1, 21):
        thread = ParseThread('parse-{}'.format(x), dataQueue)
        thread.start()  # start the thread
        threadparse.append(thread)
    for t in threadparse:
        t.join()


if __name__ == '__main__':
    main()
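The slice html[26:-2] silently depends on the callback name in the URL staying exactly fetchJSON_comment98vv6445. A more defensive variant (a minimal sketch; strip_jsonp is an illustrative helper, not part of the original script) peels the wrapper off with a regex, so a renamed callback fails loudly instead of producing corrupt JSON:

    import json
    import re

    def strip_jsonp(payload):
        # Extract the JSON body from a JSONP response such as
        # "fetchJSON_comment98vv6445({...});".
        match = re.search(r'^\s*[\w$.]+\((.*)\)\s*;?\s*$', payload, re.S)
        if match is None:
            raise ValueError("response is not a JSONP payload")
        return json.loads(match.group(1))

    # Usage inside parse(): res = strip_jsonp(html); comments = res['comments']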
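Note also that main() joins all crawl threads before the first parser starts, so the two pools never overlap, and the get(False)-plus-Empty exit test is only safe because of that: a consumer running alongside producers could see a momentarily empty queue and quit early. A common alternative (a sketch using the same standard-library Queue; the names and stand-in data are illustrative) lets both sides run concurrently and shuts consumers down with a sentinel value:

    from queue import Queue
    from threading import Thread

    SENTINEL = None  # marker that tells a consumer to exit

    def consume(dataQueue):
        while True:
            item = dataQueue.get()       # blocks until a page or the sentinel arrives
            if item is SENTINEL:
                dataQueue.put(SENTINEL)  # pass the marker on to sibling consumers
                break
            print("parsing one page of", len(item), "chars")

    q = Queue()
    workers = [Thread(target=consume, args=(q,)) for _ in range(4)]
    for w in workers:
        w.start()
    for page_html in ("<html>1</html>", "<html>2</html>"):  # stand-ins for crawled pages
        q.put(page_html)
    q.put(SENTINEL)  # signal that production is done
    for w in workers:
        w.join()

With this shape the parse threads begin work as soon as the first response lands, instead of waiting for the whole crawl phase to finish.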
Source: https://www.cnblogs.com/liuxiaomo/p/12002314.html