
【Python】Baidu Tieba Crawler


# -*- coding: utf-8 -*-
"""
Crawl all the images in a specified Baidu Tieba post (requests + bs4 + re route)
1.0,2.0,2.5,2.6,3.0,3.2
3.5
"""
import requests, os, re, time, random
from bs4 import BeautifulSoup
import threading


def getHTTPtext(url):
    """Fetch a page as UTF-8 text, retrying once before giving up."""
    for attempt in range(2):
        try:
            r = requests.get(url, timeout=30)
            r.raise_for_status()
            r.encoding = 'utf-8'
            return r.text
        except requests.RequestException:
            if attempt == 1:
                raise


def get_information(html, alist):
    """Extract the post title, total page count and forum (ba) name; return the parsed soup."""
    first_soup = BeautifulSoup(html, "html.parser")
    r2 = re.findall(r'<title>.*?_百度贴吧', html[:5000])  # FIXME: scanning html[:5000] is crude; see the sketch after this function
    r3 = r2[0].split("_")
    title = r2[0].split("_")[0][7:]  # drop the leading "<title>"
    if len(r3) == 3:
        ba_name = r3[1]
    else:
        # Title is not in the plain "_"-separated form; fall back to the last 【...】 group
        r_1 = re.findall(r'【.*?】', r3[0])[-1]
        ba_name = r_1[1:][:-1]  # strip the surrounding 【 and 】
    max_page = re.findall(r'\d+', re.findall(r'共.*?页', html)[0])[0]  # total page count, from "共x页"
    alist.append(title)
    alist.append(max_page)
    alist.append(ba_name)
    return first_soup
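

# A hedged sketch for the FIXME above (not wired into the script, function name is
# illustrative): the page has already been parsed, so the <title> text could be read
# from the soup instead of regex-scanning html[:5000]. The split mirrors the logic
# above and is untested against every title format.
def get_title_from_soup(soup):
    raw = soup.title.string  # e.g. "<post title>_<forum name>_百度贴吧"
    return raw.split("_")[0]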


def download_pic(link, count, path_now, pn, max_page):
    path = path_now + str(count) + ".png"
    try:
        if not os.path.exists(path):
            r = requests.get(link, timeout=30)
            with open(path, 'wb') as f:
                f.write(r.content)
            print("\rSaved image {}, now on page {}/{}".format(count, pn, max_page), end='')
        else:
            print("File already exists")
    except Exception:
        print("Save failed")


def make_path(ba_name, title):
    a = input("Use the default save path? If not, enter which level to start changing from (counting from 1): ")
    path_list = ['D', 'tieba_pics', ba_name, title[:15]]  # FIXME: a "/" in the forum name breaks this; see the sanitize sketch after this function
    while a:
        a = int(a)
        b = input("Editing path level {}. Enter a folder name, or press Enter to finish: ".format(a))
        if not b:  # empty input ends the editing loop
            break
        if a <= 4:
            path_list[a - 1] = b
        else:
            path_list.append(b)
        a += 1
    # Build the path level by level, e.g. "D://tieba_pics//<ba_name>//<title>//", creating any missing folders
    c = 1
    path_now = path_list[0] + "://"
    while c + 1 <= len(path_list):
        path_now = path_now + path_list[c] + "//"
        if not os.path.exists(path_now):
            os.mkdir(path_now)
        c += 1
    return path_now
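

# A hedged sketch for the FIXME above (not wired into the script, function name is
# illustrative): characters such as "/" or ":" in the forum name or post title would
# break the folder creation above, so they could be replaced before make_path() uses
# them. Purely illustrative.
def sanitize_name(name):
    return re.sub(r'[\\/:*?"<>|]', '_', name)  # characters Windows forbids in file names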


def get_http_text(url):
    """Thread worker: fetch one page and parse it into the global next_soup."""
    global next_soup
    with threadLock:  # next_soup is only touched while the lock is held
        try:
            r = requests.get(url, timeout=30)
            r.raise_for_status()
            r.encoding = 'utf-8'
        except requests.RequestException:
            # one retry on failure, mirroring getHTTPtext()
            r = requests.get(url, timeout=30)
            r.raise_for_status()
            r.encoding = 'utf-8'
        next_soup = BeautifulSoup(r.text, "html.parser")


def get_hrefs(max_page, seelz, first_soup):
    """Generator that yields, in order, the link of every image in the post."""
    global threadLock
    p_num = 0  # number of image links produced so far
    threadLock = threading.Lock()

    if max_page == 1:
        first_list = first_soup("img", "BDE_Image")
        print("\rSaved image {}, now on page 1/1".format(p_num), end='')  # refresh the status line for the page
        for i in first_list:
            href = i.attrs['src']
            pic_id = href.split("/")[-1]
            real_href = "http://tiebapic.baidu.com/forum/pic/item/" + pic_id
            p_num += 1
            yield [p_num, 1, real_href]

    else:
        get_next_html = None
        for pn in range(2, max_page + 2):
            # The loop structure is a little unusual: page 1 has already been fetched, so the loop
            # starts at 2. Each iteration yields the links of the *previous* page while a worker
            # thread fetches the current one, hence the range 2 .. max_page + 1.

            # On the first iteration there is no prefetched page to pick up yet
            if not pn == 2:
                get_next_html.join()  # wait for the worker before reading next_soup
                first_soup = next_soup
            # ID is the module-level post id read in the main loop
            if not seelz:
                url = "https://tieba.baidu.com/p/" + str(ID) + "?see_lz=1&pn=" + str(pn)
            else:
                url = "https://tieba.baidu.com/p/" + str(ID) + "?pn=" + str(pn)

            # On the last iteration there is no further page to prefetch
            if not pn == max_page + 1:
                get_next_html = threading.Thread(target=get_http_text, args=(url,))
                get_next_html.start()

            first_list = first_soup("img", "BDE_Image")
            # Refresh the status line when moving to a new page
            print("\rSaved image {}, now on page {}/{}".format(p_num, pn - 1, max_page), end='')

            for i in first_list:
                href = i.attrs['src']
                pic_id = href.split("/")[-1]
                # rebuild a direct link to the image from its id
                real_href = "http://tiebapic.baidu.com/forum/pic/item/" + pic_id
                p_num += 1
                yield [p_num, pn - 1, real_href]


def main(ID):
    # Gather the post's basic information
    seelz = input("Only the original poster's floors? Press Enter for yes: ")
    if not seelz:
        url = "https://tieba.baidu.com/p/" + str(ID) + "?see_lz=1"
    else:
        url = "https://tieba.baidu.com/p/" + str(ID)
    first_html = getHTTPtext(url)
    alist = []
    first_soup = get_information(first_html, alist)
    title, max_page, ba_name = alist[0], int(alist[1]), alist[2]
    # Create the folder the images will be saved into
    path_now = make_path(ba_name, title)
    # Start crawling
    for p_num, pn, real_href in get_hrefs(max_page, seelz, first_soup):
        # Progress is printed with every download
        download_pic(real_href, p_num, path_now, pn, max_page)


real_count = count = 0
luck = random.randint(0, 20)
while True:
    if count == 0:
        ID = int(input("Enter the id of the post to crawl: "))
        main(ID)
        real_count += 1
        choose = input('''Quit?
1. Yes
2. No
Choice: ''')
        if choose == "1":
            count += 1
        elif choose == "2":
            continue
        else:
            print("?")
            time.sleep(1)
            print("emmmm")
            time.sleep(1)
            print("Fine then")
            time.sleep(1)
            break
    if count == 1:
        print("Thanks for using this; {} post(s) crawled this session".format(real_count))
        time.sleep(1)
        if luck == 0:
            print("Remember to take care of yourself")
            time.sleep(1)
        time.sleep(1)
        break
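
A quick usage note: when run, the script asks for the numeric post id, i.e. the digits after /p/ in the thread's URL. Pressing Enter at the "only the original poster" prompt limits the crawl to the OP's floors, and each image is saved as a numbered .png under the folder chosen in make_path().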

Source: https://www.cnblogs.com/HGNET/p/16244996.html