Python + Selenium 爬取B站用户信息(IP池 + pymysql 存储)
作者:互联网
import bs4
import json
import time
import pymysql
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
def getPage(mid, n, href):
    """Fetch one page of a user's follower list from the Bilibili relation API.

    The request is attempted through each proxy of a small hard-coded pool in
    turn until one of them answers with HTTP 200.

    Args:
        mid: UID of the user whose followers are requested (sent as ``vmid``).
        n: Page number (sent as ``pn``); each page asks for 50 entries.
        href: URL of the user's space page; ``/fans/fans`` is appended to it to
            build the ``Referer`` header.

    Returns:
        The ``requests.Response`` of the first proxy that answered with
        HTTP 200, or the last non-200 response when no proxy succeeded.

    Raises:
        requests.RequestException: if every proxy failed at the transport
            level, so that no response was received at all.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0',
        'Cookie': "",
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Connection': 'keep-alive',
        'Referer': href+'/fans/fans',
    }
    params = (
        ('vmid', str(mid)),
        ('pn', str(n)),
        ('ps', '50'),
        ('order', 'desc'),
    )
    proxy = ["116.117.134.134", "112.80.248.73", "47.99.209.194", "1.181.48.68", "60.255.151.81", "202.108.22.5", "223.104.38.117"]

    response = None
    last_error = None
    for host in proxy:
        # requests picks a proxy by URL scheme, so the key has to be "https"
        # (a key of "https://" never matches and the proxy is ignored).
        # NOTE(review): the pool lists bare IPs without ports, so the proxy
        # URL falls back to the default HTTP port - confirm the real ports.
        proxies = {'https': 'http://' + host}
        try:
            response = requests.get(
                'https://api.bilibili.com/x/relation/followers',
                proxies=proxies,
                headers=headers,
                params=params,
                timeout=10,  # never hang forever on a dead proxy
            )
        except requests.RequestException as err:
            # A dead proxy should move on to the next one, not abort the run.
            last_error = err
            continue
        if response.status_code == 200:
            return response

    print("IP 全部失效")
    if response is None and last_error is not None:
        # Nothing was received at all: surface the transport error instead of
        # handing the caller a None it would trip over.
        raise last_error
    return response
def getUserDetails(mid):
    """Fetch a user's profile from the Bilibili ``x/space/acc/info`` API.

    The request is attempted through each proxy of a small hard-coded pool in
    turn until one of them answers with HTTP 200.

    Args:
        mid: UID of the user to look up (sent as the ``mid`` query parameter).

    Returns:
        The ``requests.Response`` of the first proxy that answered with
        HTTP 200, or the last non-200 response when no proxy succeeded.

    Raises:
        requests.RequestException: if every proxy failed at the transport
            level, so that no response was received at all.
    """
    # NOTE(review): this dict looks like a browser cookie descriptor (with the
    # 'domain' and 'path' values apparently swapped). Passed to requests as-is,
    # each key is sent as its own cookie; presumably only buvid3=<value> was
    # intended - confirm before changing what is sent to the server.
    cookies = {'domain': '/',
               'expires': 'false',
               'httpOnly': 'false',
               'name': 'buvid3',
               'path': 'Fri, 29 Jan 2021 08:50:10 GMT',
               'value': '7A29BBDE-VA94D-4F66-QC63-D9CB8568D84331045infoc,bilibili.com'}
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0',
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Origin': 'https://space.bilibili.com',
        'Connection': 'keep-alive',
        'Referer': 'https://space.bilibili.com/546195/fans/fans',
        'Cache-Control': 'max-age=0',
    }
    params = (
        ('mid', str(mid)),
        ('jsonp', 'jsonp'),
    )
    proxy = ["112.95.18.193", "112.80.248.73", "47.99.209.194", "1.181.48.68", "60.255.151.81", "202.108.22.5", "223.104.38.117"]

    response = None
    last_error = None
    for host in proxy:
        # requests picks a proxy by URL scheme, so the key has to be "https"
        # (a key of "https://" never matches and the proxy is ignored).
        # NOTE(review): the pool lists bare IPs without ports, so the proxy
        # URL falls back to the default HTTP port - confirm the real ports.
        proxies = {'https': 'http://' + host}
        try:
            response = requests.get(
                'https://api.bilibili.com/x/space/acc/info',
                proxies=proxies,
                headers=headers,
                cookies=cookies,
                params=params,
                timeout=10,  # never hang forever on a dead proxy
            )
        except requests.RequestException as err:
            # A dead proxy should move on to the next one, not abort the run.
            last_error = err
            continue
        if response.status_code == 200:
            return response

    print("IP 全部失效")
    if response is None and last_error is not None:
        # Nothing was received at all: surface the transport error instead of
        # handing the caller a None it would trip over.
        raise last_error
    return response
def getUpInfoBySelenium(href, mid):
    """Open a space page in headless Chrome and print its follow/fan counters.

    Args:
        href: URL of the page to load.
        mid: UID of the page owner; accepted for interface compatibility but
            not used by this function.

    Raises:
        AttributeError: if either counter element is absent from the page.
    """
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    # The options object must be handed to the driver, otherwise the
    # --headless flag is never applied. The raw string keeps the Windows
    # path free of accidental escape sequences.
    browser = webdriver.Chrome(
        executable_path=r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe",
        options=chrome_options,
    )
    try:
        # Loading the page inside the try guarantees the browser is shut down
        # even when navigation fails.
        browser.get(href)
        soup = BeautifulSoup(browser.page_source, 'html.parser')
        focus = soup.find('p', 'n-data-v space-attention').text  # number of accounts followed
        fans = soup.find('p', 'n-data-v space-fans').text  # number of fans
        print("关注数" + str(focus), "粉丝数" + str(fans))
    finally:
        # quit() ends the whole driver session; close() would only close the
        # window and leave the chromedriver process running.
        browser.quit()
def viplevel(vip):
    """Translate a numeric VIP type into its (Chinese) display label.

    A value equal to 0 yields the "non-member" label, a value equal to 1
    yields the "member" label, and anything else yields the "premium member"
    label.
    """
    if vip == 0:
        return '非会员'
    if vip == 1:
        return '会员'
    return '大会员'
def createDb():
    """Create the ``bilibili`` database and its ``up`` and ``fans`` tables.

    Every statement uses ``IF NOT EXISTS``, so the function can be run again
    against an already-initialised server without failing. A single connection
    is used for all three statements and is always closed, even on error.

    Raises:
        pymysql.MySQLError: if connecting or any of the statements fails.
    """
    create_up_sql = (
        'CREATE TABLE IF NOT EXISTS up (id int(11) NOT NULL AUTO_INCREMENT, '
        'up_id VARCHAR(255) NOT NULL,up_name VARCHAR(255) NOT NULL, '
        'sex VARCHAR(10) NOT NULL, birthday VARCHAR(255),'
        'focus VARCHAR(255),fans VARCHAR(255),area VARCHAR(255),'
        'praise VARCHAR(255),view VARCHAR(255),'
        'sign VARCHAR(255) NOT NULL,title VARCHAR(255) NOT NULL,'
        'PRIMARY KEY (id,up_id))'
    )
    create_fans_sql = (
        'CREATE TABLE IF NOT EXISTS fans (id int(11) NOT NULL AUTO_INCREMENT,'
        'up_id VARCHAR(255) NOT NULL,fans_id VARCHAR(255) NOT NULL,'
        'fans_name VARCHAR(255) NOT NULL, sex VARCHAR(10) NOT NULL,'
        'fans_level VARCHAR(10) NOT NULL,viplevel VARCHAR(255) NOT NULL,'
        'time VARCHAR(255) NOT NULL,'
        'PRIMARY KEY (id))'
    )

    # Connect without selecting a schema: the database may not exist yet.
    db = pymysql.connect(host='localhost', user='root', password='admin', port=3306)
    try:
        cursor = db.cursor()
        # IF NOT EXISTS keeps a second run from failing on the existing schema.
        cursor.execute('CREATE DATABASE IF NOT EXISTS bilibili')
        cursor.execute('USE bilibili')
        cursor.execute(create_up_sql)
        cursor.execute(create_fans_sql)
        cursor.close()
    finally:
        db.close()
def insertUp(mid, name, sex, sign, birthday, title):
    """Insert one uploader row into the ``up`` table of the ``bilibili`` database.

    The insert is best-effort: a database error rolls the transaction back and
    is reported on stdout instead of being raised.

    Args:
        mid: Uploader UID, stored in ``up_id``.
        name: Uploader name, stored in ``up_name``.
        sex: Stored in ``sex``.
        sign: Stored in ``sign``.
        birthday: Stored in ``birthday``.
        title: Stored in ``title``.
    """
    db = pymysql.connect(host='localhost', user='root', password='admin', port=3306, db='bilibili')
    sql = 'INSERT INTO up(up_id,up_name,sex,sign,birthday,title) values(%s,%s,%s,%s,%s,%s)'
    val = (mid, name, sex, sign, birthday, title)
    try:
        cursor = db.cursor()
        cursor.execute(sql, val)
        db.commit()
    except pymysql.MySQLError as err:
        # Keep the original best-effort behaviour, but make the failure visible
        # and stop swallowing unrelated errors such as KeyboardInterrupt.
        db.rollback()
        print("insertUp failed:", err)
    finally:
        # Close the connection even when the rollback itself raises.
        db.close()
def insertFans(up_mid, fans_mid, time, uname, viplevel, sex, level):
    """Insert one follower row into the ``fans`` table of the ``bilibili`` database.

    The insert is best-effort: a database error rolls the transaction back and
    is reported on stdout instead of being raised.

    Note that the ``time`` and ``viplevel`` parameters shadow the ``time``
    module and the ``viplevel`` function inside this body; the names are kept
    so existing keyword callers keep working.

    Args:
        up_mid: UID of the followed uploader, stored in ``up_id``.
        fans_mid: UID of the follower, stored in ``fans_id``.
        time: Stored in ``time``.
        uname: Follower name, stored in ``fans_name``.
        viplevel: Stored in ``viplevel``.
        sex: Stored in ``sex``.
        level: Stored in ``fans_level``.
    """
    db = pymysql.connect(host='localhost', user='root', password='admin', port=3306, db='bilibili')
    sql = 'INSERT INTO fans(up_id,fans_id,fans_name,sex,fans_level,viplevel,time) values(%s,%s,%s,%s,%s,%s,%s)'
    val = (up_mid, fans_mid, uname, sex, level, viplevel, time)
    try:
        cursor = db.cursor()
        cursor.execute(sql, val)
        db.commit()
    except pymysql.MySQLError as err:
        # Keep the original best-effort behaviour, but make the failure visible
        # and stop swallowing unrelated errors such as KeyboardInterrupt.
        db.rollback()
        print("insertFans failed:", err)
    finally:
        # Close the connection even when the rollback itself raises.
        db.close()
if __name__ == '__main__':
    # Script entry point: for each hard-coded uploader, print the profile,
    # print the follow/fan counters via Selenium, then walk follower pages 1-4
    # and print every follower's details.
    # NOTE(review): createDb / insertUp / insertFans are defined in this file
    # but never called here, so nothing is persisted - confirm whether the
    # storage calls were meant to be wired in.
    up_id = ["546195", "9824766", "777536", "321173469", "517327498", "122879", "20165629", "14110780", "62540916", "19577966"]
    for uid in up_id:
        href = "https://space.bilibili.com/" + str(uid) + "/video"
        up = getUserDetails(uid)  # uploader profile (JSON)
        up_data = json.loads(up.text)['data']
        up_mid = up_data['mid']
        name = up_data['name']
        sex = up_data['sex']
        sign = up_data['sign']
        birthday = up_data['birthday']
        title = up_data['official']['title']
        print("up主uid:"+str(up_mid), "用户名:"+name, "性别:"+sex, "留言:"+sign, "生日:"+birthday, "称号:"+title)
        # ------------------------------------------------ #
        print("开始 selenium")
        getUpInfoBySelenium(href, str(up_mid))  # prints the fan count
        print("结束 selenium")
        # ------------------------------------------------ #
        print("粉丝数据:", end='')
        for j in range(1, 5):
            # j is an int, so it has to be converted before concatenation.
            print("j:" + str(j))
            r = getPage(uid, j, href)
            page = json.loads(r.text)  # parsed JSON body
            # A refused page carries no usable 'data' payload; stop paging
            # instead of crashing on the missing key.
            fans_list = (page.get('data') or {}).get('list')
            if not fans_list:
                print("第" + str(j) + "页没有粉丝数据")
                break
            for entry in fans_list:
                fans_mid = entry['mid']
                mtime = entry['mtime']
                uname = entry['uname']
                vip = entry['vip']['vipType']
                fansDetails = getUserDetails(fans_mid)
                # Use a separate name so the page being iterated is not
                # rebound in the middle of the loop.
                fan_data = json.loads(fansDetails.text).get('data')
                if fan_data:
                    fan_sex = fan_data['sex']
                    fan_level = fan_data['level']
                    print("uid:" + str(fans_mid), "关注时间:"+ time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(mtime)), "用户名:" + uname, "vip等级:" + viplevel(vip), "性别:"+fan_sex, "账户等级:"+str(fan_level))
                else:
                    print("uid:" + str(fans_mid), "详细信息获取失败")
                # Throttle requests to avoid an IP ban.
                # NOTE(review): the original nesting of this sleep was lost in
                # extraction; it is placed after each follower lookup - confirm.
                time.sleep(5)
本文初始定义了几个UP主的id号,因为这几个UP主的粉丝量较大,故易获取粉丝信息
本文通过构造UP主的空间信息,获取粉丝量,并访问粉丝信息,通过Python的pymysql库连接本地MySQL进行数据存储,其中的SQL代码已经内嵌进去。对于B站高频访问会有IP限制,所以本文也采用了代理IP池的方法,不过本文并没有进一步通过构建实时IP池来刷新IP,这一点受制于有效IP过少,所以仅使用了几个IP进行替换。
另,B站的UP主粉丝信息并不能持续翻页,这点受制于B站网站的用户信息限制或本博主技术不到位,无法突破。故每个UP主的粉丝信息只能爬取几十页的粉丝目录。
标签:VARCHAR,Python,iP,json,db,up,mid,fans,selenium 来源: https://blog.csdn.net/YiXiao1997/article/details/118883707