每日爬取广西空中课堂五年级教学视频(scrapy 爬虫)
作者:互联网
这几天由于特殊原因,闲在家中无事可干。恰逢老妹要在家上课,而家里没有广西广电机顶盒,所以只能去网上把教学视频下载下来,放到电视上看。前段时间又学了点爬虫,正好拿来练练手(已查阅,该网站无 robots 协议限制)。
网站链接:广西空中课堂(http://zt.gxtv.cn/zt/default.html)
# -*- coding: utf-8 -*-
import scrapy
from bs4 import BeautifulSoup
import re
import datetime
class MycoursespiderSpider(scrapy.Spider):
    """Download today's lesson videos from the Guangxi "Air Classroom" site.

    Flow:
      1. ``parse`` scans the index page for today's grade-5 lessons (plus
         today's grade-4 English lesson) and requests each lesson page.
      2. ``parseinside`` extracts the direct ``.mp4`` URL from a lesson page,
         records it in a ``<month>-<day>.txt`` file and requests the video.
      3. ``parsevideo`` saves the downloaded video as ``<title>.mp4``.

    The title -> URL mapping lives on the spider instance (``self.links``)
    instead of in module-level globals, so importing this module no longer
    shadows the ``list`` builtin or leaks mutable state.
    """

    name = 'mycoursespider'
    start_urls = ['http://zt.gxtv.cn/zt/default.html']

    # Direct video URL embedded in a lesson page. Dots are escaped and the
    # match is non-greedy and stops at quotes/whitespace/angle brackets, so a
    # line holding several URLs cannot be swallowed as one match.
    _VIDEO_URL_RE = re.compile(r'http://video\.cdn\.liangtv\.cn[^\s"\'<>]*?\.mp4')

    # Characters that are not safe to use in a file name.
    _UNSAFE_FILENAME_RE = re.compile(r'[\\/:*?"<>|\r\n\t]')

    def __init__(self, *args, **kwargs):
        """Initialise the spider and its per-run title -> URL mapping."""
        super().__init__(*args, **kwargs)
        # title -> lesson page URL; replaced by the video URL once resolved.
        self.links = {}

    def parse(self, response):
        """Find today's lessons on the index page and request each of them.

        Yields one ``scrapy.Request`` per matching lesson page, handled by
        ``parseinside``. The link title is passed along in ``meta['title']``.
        """
        today = datetime.datetime.now()
        date_label = '{}月{}日'.format(today.month, today.day)
        # The lookbehind keeps e.g. "2月5日" from also matching "12月5日".
        any_lesson = re.compile(r'(?<!\d)' + date_label)
        english_lesson = re.compile(r'(?<!\d)' + date_label + '-英语')

        # Grade 4 is scanned first and only for English: rural grade-5 pupils
        # follow the grade-4 English course. Grade 5 contributes every lesson
        # published today.
        columns = (
            ('ul#g4 a[target=_blank]', english_lesson),
            ('ul#g5 a[target=_blank]', any_lesson),
        )
        for selector, wanted in columns:
            for anchor in response.css(selector):
                title = anchor.xpath('@title').extract_first()
                href = anchor.xpath('@href').extract_first()
                if not title or not href:
                    # Anchors without a title or target cannot be matched.
                    continue
                if wanted.search(title) is not None:
                    # urljoin handles relative and absolute hrefs alike.
                    self.links[title] = response.urljoin(href)

        # Iterate over a snapshot: parseinside may add keys to self.links
        # while this generator is still being consumed.
        for title, page_url in tuple(self.links.items()):
            yield scrapy.Request(
                page_url,
                callback=self.parseinside,
                meta={'title': title},
            )

    def parseinside(self, response):
        """Extract the video URL from a lesson page and request the video.

        Also rewrites today's ``<month>-<day>.txt`` record file with the
        current title -> URL mapping. Yields nothing when the page contains
        no video link.
        """
        found = self._VIDEO_URL_RE.search(response.text)
        if found is None:
            self.logger.warning('No video link found on %s', response.url)
            return
        video_url = found.group(0)

        # Prefer the page heading; fall back to the link title handed over by
        # parse, then to the last URL segment, so the title is never None.
        heading = response.css('h3#title::text').extract_first()
        title = (
            (heading or '').strip()
            or response.meta.get('title')
            or response.url.rstrip('/').rsplit('/', 1)[-1]
        )
        self.links[title] = video_url

        today = datetime.datetime.now()
        record_name = '{}-{}.txt'.format(today.month, today.day)
        # Explicit UTF-8: titles are Chinese and must not depend on the
        # platform's default encoding.
        with open(record_name, 'w', encoding='utf-8') as record:
            record.writelines(
                '{}:{}\n'.format(key, url) for key, url in self.links.items()
            )

        # meta carries the title through to the download callback.
        yield scrapy.Request(
            video_url,
            callback=self.parsevideo,
            meta={'title': title},
        )

    def parsevideo(self, response):
        """Save the downloaded video body to ``<title>.mp4``."""
        title = response.meta.get('title') or 'video'
        # Replace characters that would make the title an invalid file name.
        target = self._UNSAFE_FILENAME_RE.sub('_', title) + '.mp4'
        with open(target, 'wb') as out:
            out.write(response.body)
# def parsecoursetable(self, response): # Attempt to scrape the course timetable; Splash never returned the dynamically rendered page
# resp = response.text
# resp = '' + resp
# resp.encode('utf-8')
# print(resp)
# with open('download.html', 'w+') as f:
# f.write(response.text)
目前还差"爬取动态渲染出来的课程表"这一功能没有实现,Splash 的用法我还没有弄明白。
这不是一个名字 发布了9 篇原创文章 · 获赞 1 · 访问量 1万+ 私信 关注标签:空中课堂,title,resp,self,爬取,scrapy,str,mydict,response 来源: https://blog.csdn.net/qq_41968029/article/details/104463434