Movie Heaven (dytt8.net) Crawler Example, Basics 2
Author: Internet
Code Part 1: a Movie Heaven (dytt8.net) spider that walks the new-release list pages, follows each movie's detail page to extract the download link, and stores the results in MySQL through pymysql.
from urllib import request
import re
import time
import random
from useragents import *  # provides ua_list, a list of User-Agent strings
import pymysql

class FilmSky(object):
    def __init__(self):
        self.url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'
        # Set up the two database objects: connection and cursor
        self.db = pymysql.connect(
            host='127.0.0.1', user='root', password='123456',
            database='maoyandb', charset='utf8'
        )
        self.cursor = self.db.cursor()

    # Fetch HTML (both the list page and the detail page need requests)
    def get_page(self, url):
        req = request.Request(
            url=url,
            headers={'User-Agent': random.choice(ua_list)}
        )
        res = request.urlopen(req)
        # The page source declares gb2312, not utf-8; the 'ignore'
        # argument skips any bytes that cannot be decoded
        html = res.read().decode('gb2312', 'ignore')
        return html

    # Parse and extract the data (grab the name and the download link in one pass)
    # html is the response body of the first-level (list) page
    def parse_page(self, html):
        # 1. Parse the list page first (movie name and detail-page link)
        pattern = re.compile(
            '<table width="100%".*?<td height="26">.*?<a href="(.*?)".*?>(.*?)</a>',
            re.S
        )
        # film_list: [('detail link', 'name'), ...]
        film_list = pattern.findall(html)
        ins = 'insert into filmsky values(%s,%s)'
        for film in film_list:
            film_name = film[1]
            film_link = 'https://www.dytt8.net' + film[0]
            # 2. With the detail link in hand, fetch the detail page
            #    and extract the download link from it
            download_link = self.parse_two_html(film_link)
            self.cursor.execute(ins, [film_name, download_link])
            self.db.commit()
            # Print for testing
            d = {
                'movie name': film_name,
                'download link': download_link
            }
            print(d)

    # Parse the second-level (detail) page and return the download link
    def parse_two_html(self, film_link):
        two_html = self.get_page(film_link)
        pattern = re.compile('<td style="WORD-WRAP.*?>.*?>(.*?)</a>', re.S)
        links = pattern.findall(two_html)
        # Guard against detail pages the pattern fails to match
        return links[0] if links else ''

    # Main entry point
    def main(self):
        for page in range(1, 3):
            url = self.url.format(page)
            html = self.get_page(url)
            self.parse_page(html)
            time.sleep(random.randint(1, 2))
            print('Page %d done' % page)
        # Close the database connection (after all pages are crawled)
        self.cursor.close()
        self.db.close()

if __name__ == '__main__':
    start = time.time()
    spider = FilmSky()
    spider.main()
    end = time.time()
    print('Elapsed time: %.2f' % (end - start))
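Code Part 1 depends on two things the post never shows: the useragents module it star-imports ua_list from, and the filmsky table it inserts into. Below is a minimal sketch of both; the module contents and the column names are assumptions for illustration, not the original author's files.

# useragents.py, a hypothetical stand-in for the module imported above;
# all the spider needs from it is a ua_list of User-Agent strings
ua_list = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
]

# One-time setup for the filmsky table; the column names are assumed,
# since the post only shows 'insert into filmsky values(%s,%s)'
import pymysql
db = pymysql.connect(host='127.0.0.1', user='root', password='123456',
                     database='maoyandb', charset='utf8')
cur = db.cursor()
cur.execute("""
    create table if not exists filmsky (
        film_name varchar(200),
        download_link varchar(1000)
    ) character set utf8
""")
db.commit()
cur.close()
db.close()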
Code Part 2: a Maoyan Top-100 board spider built on requests and lxml xpath; it prints each movie's name, stars, and release time as a dict.
# 1. Print the program's running time
# 2. Random User-Agent (make sure every request uses a random one)
# 3. Process the scraped data (strings) and build it into a dict
# 4. One pipeline: fetch -> call the parser -> process the data
import requests
from lxml import etree
import time
import random

class MaoyanSpider(object):
    def __init__(self):
        self.url = 'https://maoyan.com/board/4?offset={}'
        self.ua_list = [
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
            'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)'
        ]
        # Page counter
        self.page = 1

    # Fetch
    def get_page(self, url):
        # Use a random User-Agent for every request
        headers = {'User-Agent': random.choice(self.ua_list)}
        html = requests.get(
            url=url,
            headers=headers
        ).content.decode('utf-8')
        # Call the parser directly
        self.parse_page(html)

    # Parse
    def parse_page(self, html):
        # Build the parse tree
        parse_html = etree.HTML(html)
        # 1. Base xpath: match the node of each movie entry
        dd_list = parse_html.xpath('//dl[@class="board-wrapper"]/dd')
        # 2. Loop over the nodes and extract the fields
        for dd in dd_list:
            movie_dict = {}
            # Name
            movie_dict['name'] = dd.xpath('./a/@title')[0].strip()
            # Stars
            movie_dict['star'] = dd.xpath('.//p[@class="star"]/text()')[0].strip()
            # Release time
            movie_dict['time'] = dd.xpath('.//p[@class="releasetime"]/text()')[0].strip()
            print(movie_dict)

    def main(self):
        for offset in range(0, 31, 10):
            url = self.url.format(offset)
            self.get_page(url)
            time.sleep(random.randint(1, 3))
            print('Page %d crawled' % self.page)
            self.page += 1

if __name__ == '__main__':
    start = time.time()
    spider = MaoyanSpider()
    spider.main()
    end = time.time()
    print('Elapsed time: %.2f' % (end - start))
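One fragile spot in parse_page above: each xpath() call is indexed with [0], which raises IndexError as soon as a field is missing or Maoyan serves an anti-crawler page instead of the board. A small defensive helper, sketched here assuming the node structure otherwise stays the same (the name first_or_empty is mine, not from the post):

def first_or_empty(node, path):
    # xpath() always returns a list; take the first hit or fall back to ''
    result = node.xpath(path)
    return result[0].strip() if result else ''

# Usage inside parse_page's loop:
#   movie_dict['name'] = first_or_empty(dd, './a/@title')
#   movie_dict['star'] = first_or_empty(dd, './/p[@class="star"]/text()')
#   movie_dict['time'] = first_or_empty(dd, './/p[@class="releasetime"]/text()')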