编程语言
首页 > 编程语言> > python爬取b站所有动漫简介和电影天堂下载链接

python爬取b站所有动漫简介和电影天堂下载链接

作者:互联网

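# Locate the "综艺&动漫" (variety shows & anime) section on the home page
# Extract the detail-page links listed in that section
# From each detail page, pull the poster image, translated title and magnet link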
import re
import requests
import csv


# Browser-like User-Agent sent with every request to dytt89.com.
DYTT_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36"
}

DYTT_URL = "https://dytt89.com/"
# The site is decoded as gb2312; decoding it as utf-8 yields mojibake.
DYTT_ENCODING = "gb2312"
# Seconds to wait for the server before giving up instead of hanging forever.
DYTT_TIMEOUT = 10
DYTT_CSV_PATH = "电影天堂.csv"

# Captures the <ul> body of the "综艺&动漫" section of the home page.
DYTT_SECTION_RE = re.compile(r'<span style="float:left;">综艺&动漫.*?<ul>(?P<ul>.*?)</ul>', re.S)
# Captures the relative link and the name (text after "2022年") of one list item.
DYTT_ITEM_RE = re.compile(r"<li><a href='(?P<link>.*?)' title=.*?2022年(?P<name>.*?)</a><span>", re.S)
# Captures the poster image URL, the translated title and the magnet href of a detail page.
DYTT_DETAIL_RE = re.compile(r'<img alt="" src="(?P<image>.*?)" style=.*?译  名 (?P<tit>.*?)<br />.*?<td style="WORD-WRAP:.*?<a href="(?P<link2>.*?)">magnet', re.S)


def dytt_fetch(page_url):
    """Download *page_url* and return its body decoded with the site encoding."""
    response = requests.get(page_url, headers=DYTT_HEADERS, timeout=DYTT_TIMEOUT)
    response.encoding = DYTT_ENCODING
    return response.text


def dytt_extract_detail_links(index_html, base_url=DYTT_URL):
    """Return ``(absolute_url, name)`` pairs for every item in the target section.

    Every matched section is parsed, and an empty list is returned when the
    section is missing (the page layout changed, the request was blocked, ...).
    """
    from urllib.parse import urljoin

    pairs = []
    for section in DYTT_SECTION_RE.finditer(index_html):
        for item in DYTT_ITEM_RE.finditer(section.group("ul")):
            # urljoin copes with both site-relative and absolute hrefs.
            pairs.append((urljoin(base_url, item.group("link")), item.group("name")))
    return pairs


def dytt_extract_rows(detail_html):
    """Return one ``[image, title, magnet_link]`` row per match in a detail page."""
    return [
        [found.group("image"), found.group("tit"), found.group("link2")]
        for found in DYTT_DETAIL_RE.finditer(detail_html)
    ]


def dytt_main():
    """Scrape the home page section and append every detail row to the CSV file."""
    index_html = dytt_fetch(DYTT_URL)
    print(index_html)

    detail_links = dytt_extract_detail_links(index_html)
    for _, name in detail_links:
        print(name)

    # The context manager guarantees the CSV is flushed and closed, even if a
    # request fails halfway through the crawl.
    with open(DYTT_CSV_PATH, mode="a", encoding="utf-8", newline="") as csv_file:
        writer = csv.writer(csv_file)
        for detail_url, _ in detail_links:
            detail_html = dytt_fetch(detail_url)
            print(detail_html)
            for row in dytt_extract_rows(detail_html):
                print(row[1])
                writer.writerow(row)

    print("over")


if __name__ == "__main__":
    dytt_main()

使用 Python 先爬取首页列表中的子页面链接,再根据这些子页面链接,逐个爬取电影天堂中每部电影的下载链接和图片,并保存到 CSV 文件中。

 

 输出结果中出现繁体字,是因为这些子页面本身使用的就是繁体写法。

根据这一思路,我爬取了b站所有动漫的子页面的简介,下面附上代码

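# Query the bilibili anime index API page by page
# Extract the detail-page link of every listed anime
# Pull the title and synopsis from each detail page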

import requests
import re
import csv
# Browser-like User-Agent sent with every request to bilibili.
BILI_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36"
}

BILI_INDEX_URL = "https://api.bilibili.com/pgc/season/index/result/#"
# Seconds to wait for the server before giving up instead of hanging forever.
BILI_TIMEOUT = 10
BILI_CSV_PATH = "动漫简介.csv"

# Compiled once at import time instead of once per index page.
# Captures the detail-page link of each entry in the index API response.
BILI_LINK_RE = re.compile(r'"link":"(?P<link>.*?)","media_id"', re.S)
# Captures the og:title and the description meta contents of a detail page.
BILI_META_RE = re.compile(r'<meta property="og:title" content="(?P<title>.*?)"><meta property.*?:image" content=".*?"><meta name=".*?itemprop="description" content="(?P<jianjie>.*?)"><meta it', re.S)


def bili_index_params(page):
    """Return the query parameters requesting index page number *page*."""
    return {
        "season_version": "-1",
        "spoken_language_type": "-1",
        "area": "-1",
        "is_finish": "-1",
        "copyright": "-1",
        "season_status": "-1",
        "season_month": "-1",
        "year": "-1",
        "style_id": "-1",
        "order": "4",
        "st": "1",
        "sort": "0",
        "page": str(page),
        "season_type": "1",
        "pagesize": "20",
        "type": "1",
    }


def bili_extract_links(index_text):
    """Return every detail-page link found in one index API response body."""
    return [found.group("link") for found in BILI_LINK_RE.finditer(index_text)]


def bili_extract_intro(page_html):
    """Return one ``[title, synopsis]`` row per match in a detail page."""
    return [
        [found.group("title"), found.group("jianjie")]
        for found in BILI_META_RE.finditer(page_html)
    ]


def bili_main(first_page=1, last_page=162):
    """Crawl index pages *first_page*..*last_page* (inclusive) into the CSV file.

    The defaults reproduce the original hard-coded range of pages 1 to 162.
    """
    # Open the output once for the whole crawl; the context manager closes it
    # even when a request raises, instead of leaking one handle per page.
    with open(BILI_CSV_PATH, mode="a", encoding="utf-8", newline="") as csv_file:
        writer = csv.writer(csv_file)
        for page in range(first_page, last_page + 1):
            index_response = requests.get(
                BILI_INDEX_URL,
                headers=BILI_HEADERS,
                params=bili_index_params(page),
                timeout=BILI_TIMEOUT,
            )
            print(index_response.text)

            for link in bili_extract_links(index_response.text):
                print(link)
                # The index filter parameters belong to the API call only, so
                # they are not forwarded to the detail page request.
                detail_response = requests.get(link, headers=BILI_HEADERS, timeout=BILI_TIMEOUT)
                for title, synopsis in bili_extract_intro(detail_response.text):
                    print(title)
                    print(synopsis)
                    writer.writerow([title, synopsis])


if __name__ == "__main__":
    bili_main()

 

 结果中的动画是按照评分顺序排列的。

标签:group,动漫,python,herf,爬取,re,print,import,页面
来源: https://www.cnblogs.com/520520520zl/p/16182736.html