
Crawling Macy's user reviews, log (3): fetching product info by url (part 2): downloading product images and saving them into per-product folders


1. Method description:

    GitHub repo: https://github.com/AtwoodZhang/Crawler_of_Product_Comment

    After obtaining a product url, send a request to it and parse the response, match the image urls inside the HTML, then request each image url and save the downloaded image locally.

    1) All images are saved into a folder created per web id. Each page exposes several image urls, so we need to keep track of which image is currently being downloaded.

    2) For the image urls, the XPath expression works when tested with XPath Helper, but matches nothing against the HTML fetched with requests (likely because the browser-rendered DOM differs from the raw response), so the re module, i.e. regular-expression matching, is used for matching and downloading instead; see the sketch after this list.

    3) The web id is the serial number Macy's assigns to each product, e.g. ID=13029272 in the product url used for testing in section 2.
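    A minimal sketch of the two regex matches described above, reusing the same pattern strings as the code in section 2; the sample url and HTML fragment here are made up for illustration:

import re

# Extract the web id: the value of the ID query parameter in a product url.
url = "https://www.macys.com/shop/product/some-shirt?ID=13029272&CategoryID=255"
web_id = re.findall(r'https://www.macys.com/.*?ID=(.*?)&.*?', url, re.S)[0].strip()
print(web_id)  # -> 13029272

# Extract image urls: capture every img src that is followed by a name= attribute.
html = '<img src="https://images.example.com/prod/0.jpg" alt="view 0" name="main">'
img_urls = re.findall(r'<img src="(.*?)".*?name=', html, re.S)
print(img_urls)  # -> ['https://images.example.com/prod/0.jpg']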

2. Full code:

_04_spider_of_rank4_prod_info_for_pic.py
# step1.4 Download product images and save them straight to local disk.
import time
import os
import sys  # needed for the stdout/stderr redirection in __main__
import random
from concurrent.futures import ThreadPoolExecutor  # thread pool for the downloads
from _00_record_of_small_functions import *  # provides Logger, among other helpers
from _04_mysql_of_rank4 import MacyRank4Mysql
import request_test_04_get_comment as rc  # the crawler module (second file below)


def run_crawl_img():
    # step1. Fetch the urls that still need to be requested from the database;
    r4_img_sql = MacyRank4Mysql()
    r4_img_sql.select_unrequest_prod_img(table_name='rank4_prod_specific_info')
    r4_img_list = list(r4_img_sql.cursor.fetchall())
    r4_img_sql.database_commit_close()
    print(len(r4_img_list))

    # step1.2. Smoke-test with a single record first;
    # r4_img_list = [r4_img_list[21]]
    # r4_img_list = r4_img_list[0:2]
    # print(r4_img_list)
    # print(len(r4_img_list))

    # step2. Send a crawl request for each record in the url list;
    # a pool of size 1 keeps the downloads effectively serial;
    with ThreadPoolExecutor(1) as t:
        for i in r4_img_list:
            case = [i[0], i[2]]  # i[0] = prod_id (web id), i[2] = product url
            t.submit(send_request, case)
            time.sleep(random.uniform(1, 3))  # random delay to throttle requests


def send_request(url_address):
    # url_address = [prod_id, url]
    print(url_address[1], url_address[0])
    rc.get_comment(url=url_address[1], x=url_address[0])


if __name__ == "__main__":
    # step1. Set up the crawl log
    log_path = './prod_crawl_log/'
    if not os.path.exists(log_path):
        os.makedirs(log_path)
    log_file_name = log_path + 'crawl_img_' + 'log-' + time.strftime("%Y%m%d-%H%M%S", time.localtime()) + '.log'
    sys.stdout = Logger(log_file_name)  # Logger comes from _00_record_of_small_functions
    sys.stderr = Logger(log_file_name)

    # step2. Run the crawl;
    start = time.time()
    run_crawl_img()
    end = time.time()
    spend_time = end - start
    print("finish crawl prod_img:", spend_time)
request_test_04_get_comment.py
import requests
import pymysql
from _00_record_of_agent_pool import ua_list  # pool of User-Agent strings
import random
import re
import os


def get_comment(url, x):
    # url is the product page to request; x is the product's web id.
    resp_status = False

    try:
        resp_status = support_request(url=url, x=x)
    except Exception as e:
        print(e)
        # The request timed out; retry a few more times.
        for i in range(1, 5):
            print('Request timed out, retry attempt %s' % i)
            try:
                resp_status = support_request(url=url, x=x)
            except Exception as e:
                print(e)
                continue
            break

    # If the request returned but failed (non-200 status), retry as well.
    count = 5
    while resp_status is False and count > 0:
        try:
            resp_status = support_request(url=url, x=x)
        except Exception as e:
            print(e)
        count -= 1


def support_request(url, x):
    headers = {'User-Agent': random.choice(ua_list)}  # rotate User-Agent strings
    response = requests.get(url=url, headers=headers, timeout=3)
    # print(response.text)
    if response.status_code == 200 and response.text:
        response.encoding = "utf-8"
        parse_html(response.text, url, x)
        response.close()
        resp_status = True
    else:
        print("This request failed!")
        resp_status = False
    return resp_status


def parse_html(html_, url, x):
    # Capture every img src that is followed by a name= attribute;
    # the looser pattern below would match every img on the page.
    re_string_rank4_img = '<img src="(.*?)".*?name='
    # re_string_rank4_img = '<img src="(.*?)"'
    pattern4 = re.compile(re_string_rank4_img, re.S)
    img_url = pattern4.findall(html_)
    img_url = list(set(img_url))  # de-duplicate

    # One folder per web id.
    folder_name = os.path.join("prod_img", x)
    os.makedirs(folder_name, exist_ok=True)
    for count, i in enumerate(img_url):
        result = requests.get(i)
        result.raise_for_status()
        filename = os.path.join(folder_name, x + "_" + str(count) + ".png")
        print("img_name:", filename)
        with open(filename, "wb") as f:
            f.write(result.content)

    print("img_url:", img_url)
    print("Images for prod", x, "downloaded successfully.")

    # Mark this product as done; use a parameterized query to avoid SQL injection.
    sql = "update rank4_prod_specific_info set img_request_situation='True' " \
          "where prod_id=%s"

    db = pymysql.connect(
        host="localhost",  # database server address
        user='root',       # database user
        passwd='root',     # database password
        db='macy',         # database name
        charset='utf8')
    cursor = db.cursor()  # create a cursor object
    cursor.execute(sql, (x,))
    db.commit()
    cursor.close()
    db.close()
    print(sql)
    print("Updated the img_request_situation flag for prod", x, "\n")


if __name__ == "__main__":
    url_ = "https://www.macys.com/shop/product/charter-club-crew-neck-heart-print-t-shirt-created-for-macys?ID=13029272&CategoryID=255"

    webid_string = 'https://www.macys.com/.*?ID=(.*?)&.*?'
    webid_pattern = re.compile(webid_string, re.S)
    result_webid = webid_pattern.findall(url_)[0].strip()

    print("web_id: ", result_webid)
    x = result_webid
    print(url_, x)
    # print(x)
    get_comment(url_, x)

 

Source: https://www.cnblogs.com/cainiaoxuexi2017-ZYA/p/15903028.html