
皮皮辉's Web Scraping Learning Journey


Scraping Tmall Review Data

Prerequisites

The script below assumes the requests library is installed (pip install requests), plus basic familiarity with regular expressions and with using the browser's developer tools to capture the review API's URL, cookie, and headers.

The Process of Scraping Tmall Product Reviews

The overall flow: build the paginated review-API URLs, request each one with browser-like headers, extract the wanted fields with regular expressions, and append them to a text file.
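Before reading the code, it helps to know what the endpoint returns. The rate.tmall.com list_detail_rate.htm URL answers with JSONP: a JSON payload wrapped in the callback named in the URL (here callback=jsonp1150). Roughly, with the field names taken from the regexes in the script and purely illustrative values:

jsonp1150({"rateDetail": {"rateList": [
    {"displayUserNick": "t***1",
     "auctionSku": "color:black;size:M",
     "rateContent": "Good quality, fast shipping",
     "rateDate": "2020-03-10 12:00:00"},
    ...
]}})

The script simply downloads this text page by page and pulls the four quoted fields out with regular expressions.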

# Import the required libraries
import requests
import json  # unused by the regex approach below; see the json-based variant at the end
import re

# Module-level list that stores the target review-API URLs
COMMENT_PAGE_URL = []


# Build the list of page URLs
def Get_Url(num):
    COMMENT_PAGE_URL.clear()
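    # The commented-out line below is the full example URL as captured from the
    # browser; it is split into urlFront (ending at currentPage=) and urlRear: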
    # urlFront ='https://rate.tmall.com/list_detail_rate.htm?itemId=591204720926&spuId=1196687088&sellerId=2057500938&order=3&currentPage=2&append=0&content=1&tagId=&posi=&picture=&groupId=&ua=098%23E1hvpvvfvw6vUvCkvvvvvjiPn2Fvzji2R2SWtj3mPmPwtjl8P2zZzj1PP2MhsjEhdphvmpvZ8vi3OppjoUhCvvswN8iy0YMwzPAQ6DItvpvhvvvvvUhCvvswPv1yMKMwzPQphHurvpvEvCo5vVYSCHDDdphvmQ9ZCQmj3QBBn4hARphvCvvvvvmrvpvEvvQRvy3UvRoi2QhvCvvvMMGCvpvVvvpvvhCvmphvLv99ApvjwYcEKOms5k9vibmXYC97W3dhA8oQrEtlB%2BFy%2BnezrmphQRAn3feAOHFIAXcBKFyK2ixrsj7J%2B3%2BdafmxfBkKNB3rsj7Q%2Bu0ivpvUvvmvRE8X69TEvpvVmvvC9jahKphv8vvvvvCvpvvvvvmm86CvmWZvvUUdphvWvvvv9krvpv3Fvvmm86CvmVWtvpvhvvvvv8wCvvpvvUmm3QhvCvvhvvmrvpvEvvFyvrzavm9VdphvhCQpVUCZxvvC7g0znsBBKaVCvpvLSH1a7z2SznswEjE4GDRi4IkisIhCvvswN8340nMwzPs5OHItvpvhvvvvv86Cvvyvh2%2BHj1GhPDervpvEvv1LCNL6Chi19phvHNlwM7L7qYswM22v7SEL4OVUTGqWgIhCvvswN83KTRMwzPQZ9DuCvpvZz2AufpfNznsGDnrfY%2FjwZr197Ih%3D&needFold=0&_ksTS=1584072063659_1149&callback=jsonp1150'
    urlFront = 'https://rate.tmall.com/list_detail_rate.htm?itemId=591204720926&spuId=1196687088&sellerId=2057500938&order=3&currentPage='
    urlRear ='&append=0&content=1&tagId=&posi=&picture=&groupId=&ua=098%23E1hvpvvfvw6vUvCkvvvvvjiPn2Fvzji2R2SWtj3mPmPwtjl8P2zZzj1PP2MhsjEhdphvmpvZ8vi3OppjoUhCvvswN8iy0YMwzPAQ6DItvpvhvvvvvUhCvvswPv1yMKMwzPQphHurvpvEvCo5vVYSCHDDdphvmQ9ZCQmj3QBBn4hARphvCvvvvvmrvpvEvvQRvy3UvRoi2QhvCvvvMMGCvpvVvvpvvhCvmphvLv99ApvjwYcEKOms5k9vibmXYC97W3dhA8oQrEtlB%2BFy%2BnezrmphQRAn3feAOHFIAXcBKFyK2ixrsj7J%2B3%2BdafmxfBkKNB3rsj7Q%2Bu0ivpvUvvmvRE8X69TEvpvVmvvC9jahKphv8vvvvvCvpvvvvvmm86CvmWZvvUUdphvWvvvv9krvpv3Fvvmm86CvmVWtvpvhvvvvv8wCvvpvvUmm3QhvCvvhvvmrvpvEvvFyvrzavm9VdphvhCQpVUCZxvvC7g0znsBBKaVCvpvLSH1a7z2SznswEjE4GDRi4IkisIhCvvswN8340nMwzPs5OHItvpvhvvvvv86Cvvyvh2%2BHj1GhPDervpvEvv1LCNL6Chi19phvHNlwM7L7qYswM22v7SEL4OVUTGqWgIhCvvswN83KTRMwzPQZ9DuCvpvZz2AufpfNznsGDnrfY%2FjwZr197Ih%3D&needFold=0&_ksTS=1584072063659_1149&callback=jsonp1150'
    for i in range(num):
        # Review pages are 1-indexed, so request pages 1 through num
        COMMENT_PAGE_URL.append(urlFront + str(i + 1) + urlRear)
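# e.g. Get_Url(3) leaves three URLs in COMMENT_PAGE_URL, for currentPage=1, 2 and 3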


# Fetch the review data and extract the wanted fields
def GetInfo(num):
    # Lists to hold the fields we want
    nickname = []
    auctionSku = []
    ratecontent = []
    ratedate = []
    # Loop over every page of reviews
    for i in range(num):
        # Request headers; without them the server returns an error response instead of the real JSONP
        headers = {
            'cookie':'cna=AZIzFWjRuyICAToTAygn5OEJ; sm4=429006; hng=CN%7Czh-CN%7CCNY%7C156; lid=%E6%98%9F%E8%BE%89%E7%81%BF%E7%83%82%E4%B9%8B%E7%82%8E%E7%84%B1%E7%87%9A; t=1e17c56d1530f801b4c5dd9bc8793aa2; tracknick=%5Cu6',
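            # NOTE: the cookie value above is session-specific and truncated here;
            # replace it with a fresh cookie copied from your own browser session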
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
            'referer': 'https://detail.tmall.com/item.htm?spm=a220m.1000858.1000725.1.3d067a51ue6QgE&id=591204720926&skuId=4065121357065&areaId=420100&user_id=2057500938&cat_id=50025174&is_b=1&rn=f3dfc9236475de95757ce169d42558a0',
            'accept': '*/*',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9'
        }  # Masquerade as a normal browser to avoid garbled output or a refused request
        # Fetch the JSONP content for this page
        print(COMMENT_PAGE_URL[i])
        content = requests.get(COMMENT_PAGE_URL[i], headers=headers).text  # call the HTTP endpoint and take the response text
        print(content)
        # To drop a field you don't need, delete its line below; to capture an
        # extra field, add another re.findall with a regex of the same shape
        nickname.extend(re.findall('"displayUserNick":"(.*?)"', content))  # regex-match and append to the list
        auctionSku.extend(re.findall('"auctionSku":"(.*?)"', content))
        ratecontent.extend(re.findall('"rateContent":"(.*?)"', content))
        ratedate.extend(re.findall('"rateDate":"(.*?)"', content))
    # Write the collected data to a text file (opened once, appended to)
    with open(r"D:\python\python\taobao_info\TaoBao.txt", 'a+', encoding='UTF-8') as file:
        for i in range(len(nickname)):
            text = ','.join((nickname[i], ratedate[i], auctionSku[i], ratecontent[i])) + '\n'
            file.write(text)
            print(i + 1, ": written")


# Entry point
if __name__ == "__main__":
    Page_Num = 20
    Get_Url(Page_Num)
    GetInfo(Page_Num)  # the original passed a hard-coded 10 here, silently fetching only half of the generated URLs
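As written, the script imports json but never uses it: every field is scraped out of the raw response text with regular expressions. A sturdier variant strips the JSONP callback wrapper and parses the payload as JSON. The following is a minimal, untested sketch of that idea; it assumes the rateDetail.rateList structure implied by the field names in the regexes above, so adjust the keys if the live API differs.

# Minimal sketch: parse the JSONP response with json instead of regexes
# (assumes a rateDetail.rateList payload; verify against a real response)
def Parse_Jsonp(content):
    # Strip the callback wrapper, e.g. jsonp1150( ... ), leaving bare JSON
    match = re.search(r'\((.*)\)', content, re.S)
    if not match:
        return []
    data = json.loads(match.group(1))
    rows = []
    for rate in data.get('rateDetail', {}).get('rateList', []):
        rows.append((rate.get('displayUserNick', ''),
                     rate.get('rateDate', ''),
                     rate.get('auctionSku', ''),
                     rate.get('rateContent', '')))
    return rows

The advantage over the .findall approach: an escaped quote inside a review or a reordered key no longer silently corrupts the match, and a malformed response raises a clear error from json.loads instead of producing misaligned lists.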

Source: https://blog.csdn.net/qq_38147101/article/details/104898425