Multithreaded Crawler (JD)
Author: Internet
import json
import time

import requests
from queue import Queue, Empty
from threading import Thread, Lock

from aip import AipNlp
from pymysql import connect
import pymongo

lock = Lock()  # serializes writes through the shared MySQL cursor
conn = connect(host="localhost", port=3306, database="lala",
               user="root", password="", charset="utf8")
cur = conn.cursor()
f = pymongo.MongoClient("localhost", 27017)
jihe = f.sjk.jh  # MongoDB collection (opened but not used below)


class CrawlThread(Thread):
    def __init__(self, threadname, pageQueue, dataQueue):
        super().__init__()
        self.threadname = threadname
        self.pageQueue = pageQueue
        self.dataQueue = dataQueue
        self.headers = {
            "Referer": "https://item.jd.com/6051045.html",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/76.0.3809.100 Safari/537.36"
        }

    def run(self):
        while True:
            try:
                page = self.pageQueue.get(False)  # non-blocking; raises Empty when drained
            except Empty:
                break
            url = ("https://sclub.jd.com/comment/productPageComments.action"
                   "?callback=fetchJSON_comment98vv6445&productId=6051045"
                   "&score=0&sortType=5&page={}&pageSize=10"
                   "&isShadowSku=0&rid=0&fold=1").format(page)
            print('Crawl thread %s started working' % self.threadname)
            response = requests.get(url, headers=self.headers)
            self.dataQueue.put(response.text)
            print('Crawl thread %s finished working' % self.threadname)


class ParseThread(Thread):
    def __init__(self, threadname, dataQueue):
        super().__init__()
        self.threadname = threadname
        self.dataQueue = dataQueue

    def run(self):
        while True:
            try:
                html = self.dataQueue.get(False)
            except Empty:
                break
            time.sleep(2)
            print("Parse thread %s started working" % self.threadname)
            self.parse(html)
            print("Parse thread %s finished working" % self.threadname)

    def parse(self, html):
        # Strip the JSONP wrapper: the 26 characters of
        # "fetchJSON_comment98vv6445(" at the front and ");" at the end
        # leave plain JSON.
        res = json.loads(html[26:-2])
        datalist = []
        for i in res['comments']:
            con = i['content']
            self.baidu(con)
            datalist.append({"comment": con})
        # json.dump(datalist, open('京东.json', 'a', encoding='utf-8'),
        #           ensure_ascii=False, indent=4)

    def baidu(self, con):
        """Your APP_ID / API_KEY / SECRET_KEY go here."""
        try:
            APP_ID = '17019478'
            API_KEY = '8Grs25GiR4Qt2Khlnk0L2ekX'
            SECRET_KEY = 'wLvFGKaWgebZtRTmvLFWGYU1P5E9uQUH'
            client = AipNlp(APP_ID, API_KEY, SECRET_KEY)
            client.setConnectionTimeoutInMillis(5 * 1000)  # connection timeout
            time.sleep(0.5)  # throttle calls to the sentiment API
            res = client.sentimentClassify(con)  # sentiment classification
            print(res)
            text = json.dumps(res['text'])
            sentiment = json.dumps(res['items'][-1]['sentiment'])
            self.cun(text, sentiment)
        except Exception:
            pass  # a failed API call silently drops this comment

    def cun(self, text, sentiment):
        sql = "insert into mg values(0, %s, %s)"
        with lock:  # pymysql connections are not thread-safe
            cur.execute(sql, (text, sentiment))
            conn.commit()  # without a commit the inserts are never persisted


def main():
    pageQueue = Queue()
    for i in range(11, 20):
        pageQueue.put(i)
    dataQueue = Queue()

    # 20 crawl threads drain the 9-page queue; surplus threads exit at once.
    threadcrawl = []
    for x in range(1, 21):
        thread = CrawlThread('crawl-{}'.format(x), pageQueue, dataQueue)
        thread.start()  # start the thread
        threadcrawl.append(thread)
    for t in threadcrawl:
        t.join()

    # Parsing starts only after every crawl thread has finished.
    threadparse = []
    for x in range(1, 21):
        thread = ParseThread('parse-{}'.format(x), dataQueue)
        thread.start()  # start the thread
        threadparse.append(thread)
    for t in threadparse:
        t.join()


if __name__ == '__main__':
    main()
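The slice html[26:-2] silently depends on the callback name in the URL staying exactly fetchJSON_comment98vv6445. A more defensive variant (a minimal sketch; strip_jsonp is an illustrative helper, not part of the original script) peels the wrapper off with a regex, so a renamed callback fails loudly instead of producing corrupt JSON:

    import json
    import re

    def strip_jsonp(payload):
        # Extract the JSON body from a JSONP response such as
        # "fetchJSON_comment98vv6445({...});".
        match = re.search(r'^\s*[\w$.]+\((.*)\)\s*;?\s*$', payload, re.S)
        if match is None:
            raise ValueError("response is not a JSONP payload")
        return json.loads(match.group(1))

    # Usage inside parse(): res = strip_jsonp(html); comments = res['comments']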
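Note also that main() joins all crawl threads before the first parser starts, so the two pools never overlap, and the get(False)-plus-Empty exit test is only safe because of that: a consumer running alongside producers could see a momentarily empty queue and quit early. A common alternative (a sketch using the same standard-library Queue; the names and stand-in data are illustrative) lets both sides run concurrently and shuts consumers down with a sentinel value:

    from queue import Queue
    from threading import Thread

    SENTINEL = None  # marker that tells a consumer to exit

    def consume(dataQueue):
        while True:
            item = dataQueue.get()       # blocks until a page or the sentinel arrives
            if item is SENTINEL:
                dataQueue.put(SENTINEL)  # pass the marker on to sibling consumers
                break
            print("parsing one page of", len(item), "chars")

    q = Queue()
    workers = [Thread(target=consume, args=(q,)) for _ in range(4)]
    for w in workers:
        w.start()
    for page_html in ("<html>1</html>", "<html>2</html>"):  # stand-ins for crawled pages
        q.put(page_html)
    q.put(SENTINEL)  # signal that production is done
    for w in workers:
        w.join()

With this shape the parse threads begin work as soon as the first response lands, instead of waiting for the whole crawl phase to finish.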
Source: https://www.cnblogs.com/liuxiaomo/p/12002314.html