其他分享
首页 > 其他分享> > 今日头条街拍美图爬取

今日头条街拍美图爬取

作者:互联网

 

实验目的

熟悉Ajax的使用

实验内容

以今日头条为例来尝试通过分析Ajax请求来抓取网页数据的方法。这次要抓取的目标是今日头条的街拍美图,抓取完成之后,将每组图片分文件夹下载到本地并保存下来。

实验过程

1、网页分析

(1)打开今日头条首页https://www.toutiao.com/,搜索框里输入“街拍”

 

(2)得到如下搜索结果

 

 

(3)转到图片,这时打开开发者工具,选择网络——>XHR,查看URL的构成:

https://so.toutiao.com/search?keyword=%E8%A1%97%E6%8B%8D&pd=atlas&source=search_subtab_switch&dvpf=pc&aid=4916&page_num=1&rawJSON=1&search_id=202204271935490101501350305E56B428

通过分析,我们发现,翻页时这里变化的只有page_num。

我们从这里也能找到user-agent,cookies等信息,可以构成请求头。

 

(4)打开预览选项卡查看,我们要爬取的图片信息就在该json文件的rawData下的data列表中,而其中每一项的img_url就是我们要爬取的图片链接。

 

2、爬取代码

(1)导入需要的包

import requests,os #os用于创建文件

from urllib.parse import urlencode #解决编码问题

import urllib.parse

from hashlib import md5 #检测是否有重复文件

 

(2)设置请求头

# HTTP request headers used for the search requests (values copied from the
# browser's developer tools).
#
# Long values are assembled with implicit string-literal concatenation inside
# parentheses.  A backslash continuation *inside* a quoted string keeps the
# next line's leading indentation as part of the value, which corrupts the
# Referer URL, the User-Agent and the Cookie pairs.
#
# NOTE(review): the Cookie is a captured browser session; it presumably
# expires and is account/session specific -- confirm before reusing it, and
# avoid committing real session cookies to shared code.
headers = {
    'host': 'so.toutiao.com',
    'Referer': (
        'https://so.toutiao.com/search?keyword=%E8%A1%97%E6%8B%8D'
        '&pd=atlas&source=search_subtab_switch&dvpf=pc&aid=4916&page_num=0'
    ),
    'User-Agent': (
        'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/100.0.4896.127 Mobile Safari/537.36 Edg/100.0.1185.50'
    ),
    # Set by hand so the request is marked as an XMLHttpRequest (Ajax) call.
    'X-Requested-With': 'XMLHttpRequest',
    'Cookie': (
        'msToken=-tbIPWGnxn9IkPce9TkVKQOOCbS996FTCPCv4ZyEEMq31aG6VVx9v7'
        'uhqUjdE9VmR_7OJSKeN8M-Mk4bLG4BPdr8T-WNwYnzJoM7A8ucM0Ko; '
        'tt_webid=7089706220605261320; '
        '_S_IPAD=0; '
        'MONITOR_WEB_ID=7089706220605261320; '
        'PIXIEL_RATIO=2.0000000298023224; '
        'FRM=new; '
        'ttcid=4c3ed44c1a63414dba951d23262dc98040; '
        'WIN_WH=320_658; '
        'tt_scid=-HSOw6HU9KYC0P6N6Hty6tR6HH6XDaCuaXSnNvT29cl65Q'
        'iFXXhHZdUDwp1aKBH77ccb; '
        'ttwid=1%7CUvC_j34tZR9J0iHJqed1wxypB7iXOAO3MBUcKQ1guHs'
        '%7C1651059027%7C'
        'abb8d559145a8f3e00a3b83a5e8e2150db22aa339e83af34baaef712abb04117; '
        '_S_WIN_WH=1536_746; '
        '_S_DPR=1.25'
    ),
}

 

(3)获取网页

def get_page(page_num, timeout=10):
    """Fetch one page of search results and return the decoded JSON.

    Args:
        page_num: Value sent as the ``page_num`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body on HTTP 200, otherwise ``None`` (request
        failure, non-200 status, or a body that is not valid JSON).
    """
    # The URL is long, so its query string is built from a dict.
    params = {
        'keyword': urllib.parse.unquote('%E8%A1%97%E6%8B%8D'),
        'pd': 'atlas',
        'source': 'search_subtab_switch',
        'dvpf': 'pc',
        'aid': 4916,
        'page_num': page_num,
        'rawJSON': 1,
        # NOTE(review): captured from one browser session; presumably it can
        # go stale -- confirm whether the server requires a fresh value.
        'search_id': '202204271935490101501350305E56B428',
    }
    url = 'https://so.toutiao.com/search?' + urlencode(params)

    try:
        resp = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        # RequestException is the base class of ConnectionError and of the
        # timeout errors, so a stalled server is reported too.
        print('error:', e)
        return None

    if resp.status_code != 200:
        print('error: unexpected status code', resp.status_code)
        return None

    try:
        return resp.json()
    except ValueError as e:
        # Raised when the body is not valid JSON (e.g. an HTML block page).
        print('error:', e)
        return None

(4)获取图片信息

def get_images(json):
    """Yield one record per image entry found in a search response.

    Args:
        json: Decoded response dict; entries are read from
            ``json['rawData']['data']``.  ``None`` or a dict lacking those
            keys yields nothing instead of raising.

    Yields:
        ``{'image': <entry dict>, 'link': <entry's 'img_url'>}`` for every
        entry that has a non-empty ``img_url``.
    """
    if not json:
        return
    raw_data = json.get('rawData') or {}
    for image in raw_data.get('data') or []:
        link = image.get('img_url')
        if not link:
            # Nothing can be downloaded without a URL; skip the entry.
            continue
        yield {
            'image': image,
            'link': link,
        }

 

(5)处理文件目录

def Mulu(image):
    """Build a directory name from the letters and digits of an item's link.

    The ``return`` used to sit inside the loop, so only the first character
    of the link was ever examined; the whole link is processed now.

    Args:
        image: Dict whose ``'link'`` value is the image URL.

    Returns:
        The link with every character that is neither a letter nor a digit
        removed; an empty string when the link is missing or empty.
    """
    text = image.get('link') or ''
    return ''.join(ch for ch in text if ch.isdigit() or ch.isalpha())

 

(6)保存图片

def save_image(im, item, timeout=10):
    """Download one image and store it as ``<im>/<md5 of content>.jpg``.

    Args:
        im: Directory to save into; created if it does not exist.  An empty
            value falls back to the current directory.
        item: Dict whose ``'link'`` value is the image URL.
        timeout: Seconds to wait for the server before giving up.
    """
    link = item.get('link')
    if not link:
        print('Failed to Save Image')
        return

    try:
        resp = requests.get(link, timeout=timeout)
    except requests.RequestException:
        print('Failed to Save Image')
        return
    if resp.status_code != 200:
        # Do not store an error page under a .jpg name.
        print('Failed to Save Image')
        return

    content = resp.content
    directory = im or '.'
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(directory, exist_ok=True)

    # The file name is the MD5 of the bytes, so identical images map to the
    # same path and an existing file does not need to be rewritten.
    file_path = os.path.join(directory, md5(content).hexdigest() + '.jpg')
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)

 

(7)主函数

def main(page):
    """Download every image listed on one search-result page.

    Args:
        page: Page number handed to ``get_page``.
    """
    data = get_page(page)
    if not data:
        # get_page returns None when the request fails; passing that on to
        # get_images would raise AttributeError.
        print('Failed to fetch page', page)
        return

    for item in get_images(data):
        im = Mulu(item)
        save_image(im, item)
    print('图片保存完毕')  # progress message shown after each page

 

if __name__ == "__main__":
    # Script entry point: process page numbers 0 and 1 in order.
    for page_index in (0, 1):
        main(page_index)

 

(8)完整代码

 

import requests,os #os用于创建文件

from urllib.parse import urlencode #解决编码问题

import urllib.parse

from hashlib import md5 #检测是否有重复文件

 

 

# HTTP request headers used for the search requests (values copied from the
# browser's developer tools).
#
# Long values are assembled with implicit string-literal concatenation inside
# parentheses.  A backslash continuation *inside* a quoted string keeps the
# next line's leading indentation as part of the value, which corrupts the
# Referer URL, the User-Agent and the Cookie pairs.
#
# NOTE(review): the Cookie is a captured browser session; it presumably
# expires and is account/session specific -- confirm before reusing it, and
# avoid committing real session cookies to shared code.
headers = {
    'host': 'so.toutiao.com',
    'Referer': (
        'https://so.toutiao.com/search?keyword=%E8%A1%97%E6%8B%8D'
        '&pd=atlas&source=search_subtab_switch&dvpf=pc&aid=4916&page_num=0'
    ),
    'User-Agent': (
        'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/100.0.4896.127 Mobile Safari/537.36 Edg/100.0.1185.50'
    ),
    # Set by hand so the request is marked as an XMLHttpRequest (Ajax) call.
    'X-Requested-With': 'XMLHttpRequest',
    'Cookie': (
        'msToken=-tbIPWGnxn9IkPce9TkVKQOOCbS996FTCPCv4ZyEEMq31aG6VVx9v7'
        'uhqUjdE9VmR_7OJSKeN8M-Mk4bLG4BPdr8T-WNwYnzJoM7A8ucM0Ko; '
        'tt_webid=7089706220605261320; '
        '_S_IPAD=0; '
        'MONITOR_WEB_ID=7089706220605261320; '
        'PIXIEL_RATIO=2.0000000298023224; '
        'FRM=new; '
        'ttcid=4c3ed44c1a63414dba951d23262dc98040; '
        'WIN_WH=320_658; '
        'tt_scid=-HSOw6HU9KYC0P6N6Hty6tR6HH6XDaCuaXSnNvT29cl65Q'
        'iFXXhHZdUDwp1aKBH77ccb; '
        'ttwid=1%7CUvC_j34tZR9J0iHJqed1wxypB7iXOAO3MBUcKQ1guHs'
        '%7C1651059027%7C'
        'abb8d559145a8f3e00a3b83a5e8e2150db22aa339e83af34baaef712abb04117; '
        '_S_WIN_WH=1536_746; '
        '_S_DPR=1.25'
    ),
}

 

 

def get_page(page_num, timeout=10):
    """Fetch one page of search results and return the decoded JSON.

    Args:
        page_num: Value sent as the ``page_num`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body on HTTP 200, otherwise ``None`` (request
        failure, non-200 status, or a body that is not valid JSON).
    """
    # The URL is long, so its query string is built from a dict.
    params = {
        'keyword': urllib.parse.unquote('%E8%A1%97%E6%8B%8D'),
        'pd': 'atlas',
        'source': 'search_subtab_switch',
        'dvpf': 'pc',
        'aid': 4916,
        'page_num': page_num,
        'rawJSON': 1,
        # NOTE(review): captured from one browser session; presumably it can
        # go stale -- confirm whether the server requires a fresh value.
        'search_id': '202204271935490101501350305E56B428',
    }
    url = 'https://so.toutiao.com/search?' + urlencode(params)

    try:
        resp = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        # RequestException is the base class of ConnectionError and of the
        # timeout errors, so a stalled server is reported too.
        print('error:', e)
        return None

    if resp.status_code != 200:
        print('error: unexpected status code', resp.status_code)
        return None

    try:
        return resp.json()
    except ValueError as e:
        # Raised when the body is not valid JSON (e.g. an HTML block page).
        print('error:', e)
        return None

 

def get_images(json):
    """Yield one record per image entry found in a search response.

    Args:
        json: Decoded response dict; entries are read from
            ``json['rawData']['data']``.  ``None`` or a dict lacking those
            keys yields nothing instead of raising.

    Yields:
        ``{'image': <entry dict>, 'link': <entry's 'img_url'>}`` for every
        entry that has a non-empty ``img_url``.
    """
    if not json:
        return
    raw_data = json.get('rawData') or {}
    for image in raw_data.get('data') or []:
        link = image.get('img_url')
        if not link:
            # Nothing can be downloaded without a URL; skip the entry.
            continue
        yield {
            'image': image,
            'link': link,
        }

                

def Mulu(image):
    """Build a directory name from the letters and digits of an item's link.

    The ``return`` used to sit inside the loop, so only the first character
    of the link was ever examined; the whole link is processed now.

    Args:
        image: Dict whose ``'link'`` value is the image URL.

    Returns:
        The link with every character that is neither a letter nor a digit
        removed; an empty string when the link is missing or empty.
    """
    text = image.get('link') or ''
    return ''.join(ch for ch in text if ch.isdigit() or ch.isalpha())

    

 

def save_image(im, item, timeout=10):
    """Download one image and store it as ``<im>/<md5 of content>.jpg``.

    Args:
        im: Directory to save into; created if it does not exist.  An empty
            value falls back to the current directory.
        item: Dict whose ``'link'`` value is the image URL.
        timeout: Seconds to wait for the server before giving up.
    """
    link = item.get('link')
    if not link:
        print('Failed to Save Image')
        return

    try:
        resp = requests.get(link, timeout=timeout)
    except requests.RequestException:
        print('Failed to Save Image')
        return
    if resp.status_code != 200:
        # Do not store an error page under a .jpg name.
        print('Failed to Save Image')
        return

    content = resp.content
    directory = im or '.'
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(directory, exist_ok=True)

    # The file name is the MD5 of the bytes, so identical images map to the
    # same path and an existing file does not need to be rewritten.
    file_path = os.path.join(directory, md5(content).hexdigest() + '.jpg')
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)

 

 

def main(page):
    """Download every image listed on one search-result page.

    Args:
        page: Page number handed to ``get_page``.
    """
    data = get_page(page)
    if not data:
        # get_page returns None when the request fails; passing that on to
        # get_images would raise AttributeError.
        print('Failed to fetch page', page)
        return

    for item in get_images(data):
        im = Mulu(item)
        save_image(im, item)
    print('图片保存完毕')  # progress message shown after each page

 

 

if __name__ == "__main__":
    # Script entry point: process page numbers 0 and 1 in order.
    for page_index in (0, 1):
        main(page_index)

 

 

 

标签:get,image,item,json,im,美图,今日,page,头条
来源: https://www.cnblogs.com/ffxqc/p/16222594.html