Python 爬虫:爬取搜狐视频电影并存储到 MySQL 数据库
作者:互联网
代码:
"""Scrape the Sohu film ranking pages and store the results in MySQL.

Running the module as a script fetches the "best rated" list page by page,
extracts name / score / link for every movie and inserts the rows into the
``moviesohu`` table of the local ``movierankings`` database.
"""
import json  # noqa: F401  (kept from the original listing; unused here)
import re  # noqa: F401  (kept from the original listing; unused here)
import time
import traceback

import pymysql
import requests
from bs4 import BeautifulSoup
from lxml import etree  # noqa: F401  (kept from the original listing; unused here)
from lxml.html.diff import end_tag  # noqa: F401  (kept from the original listing; unused here)

# Listing URL: ``{sort}`` selects the ranking, ``{page}`` is the 1-based page.
_LIST_URL_TEMPLATE = (
    'https://film.sohu.com/list_0_0_0_2_{sort}_{page}_60.html'
    '?channeled=1200100000'
)

# Sort codes as labelled in the original listing.  Only the best-rated list
# is scraped; the other two are kept so that they can be passed to
# ``_scrape_ranking`` if those lists are wanted as well.
_SORT_BEST_RATED = 2   # 最受好评
_SORT_NEWEST = 1       # 最新上架
_SORT_WEEKLY_HOT = 0   # 本周热播

_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
}

_MAX_PAGES = 30
_REQUEST_TIMEOUT = 10  # seconds; without it a stalled server blocks forever


def get_conn():
    """Open the database connection and create a cursor.

    :return: ``(conn, cursor)`` tuple.
    :raises pymysql.MySQLError: if the connection cannot be established.
    """
    # NOTE(review): credentials are hard-coded as in the original listing;
    # move them to configuration / environment variables for real use.
    conn = pymysql.connect(host="127.0.0.1",
                           user="root",
                           password="000429",
                           db="movierankings",
                           charset="utf8")
    # Default cursor: result rows come back as tuples.
    cursor = conn.cursor()
    if conn is not None and cursor is not None:
        print("数据库连接成功!游标创建成功!")
    else:
        print("数据库连接失败!")
    return conn, cursor


def close_conn(conn, cursor):
    """Close the cursor and the connection, skipping whichever is falsy.

    :return: always ``1``.
    """
    if cursor:
        cursor.close()
    if conn:
        conn.close()
    return 1


def _parse_movie(li):
    """Turn one ``<li>`` of the movie list into ``[name, score, path, state]``.

    :raises AttributeError, TypeError, KeyError: if an expected tag or the
        ``href`` attribute is missing from the entry.
    """
    name = li.find('div', class_="v_name_info").text
    score_text = li.find('span', class_='v_score').text
    # NOTE(review): the slice assumes the score is the three characters
    # before the last one of the span text -- confirm against the live page.
    score = score_text[-4:-1]
    path = li.find('a', target="_blank")['href']
    # Every entry is stored with the fixed playback state "VIP".
    return [name, score, path, "VIP"]


def _scrape_ranking(sort_code, label, max_pages=_MAX_PAGES):
    """Scrape one ranking list page by page.

    :param sort_code: value substituted for ``{sort}`` in the listing URL.
    :param label: list name used in the progress message.
    :param max_pages: highest page number to request.
    :return: list of ``[name, score, path, state]`` rows.
    """
    results = []
    for page in range(1, max_pages + 1):
        url = _LIST_URL_TEMPLATE.format(sort=sort_code, page=page)
        # ``headers`` must be passed by keyword: the second positional
        # argument of ``requests.get`` is ``params`` (the query string).
        response = requests.get(url, headers=_HEADERS,
                                timeout=_REQUEST_TIMEOUT)
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'lxml')
        li_list = soup.select('.movie-list>li')
        print(len(li_list))
        if not li_list:
            print(f"{label}爬取结束!")
            if results:
                # An empty page after real data means the list is exhausted.
                break
            continue
        for li in li_list:
            try:
                row = _parse_movie(li)
            except (AttributeError, TypeError, KeyError):
                # One malformed entry must not abort the whole scrape.
                print("条目解析失败,跳过!")
                continue
            print(row)
            results.append(row)
        print("-------------------------------------------")
    return results


def get_souhu():
    """Scrape the Sohu "best rated" (最受好评) movie list.

    :return: list of ``[name, score, path, state]`` rows, one per movie.
    """
    return _scrape_ranking(_SORT_BEST_RATED, "最受好评")


def insert_souhu():
    """Scrape the movie list and insert every row into ``moviesohu``.

    Rows that violate a unique/primary key are reported and skipped.  Any
    other error is printed with its traceback; the connection is always
    closed.
    """
    cursor = None
    conn = None
    try:
        movies = get_souhu()
        print(f"{time.asctime()}开始插入搜狐电影数据")
        conn, cursor = get_conn()
        sql = "insert into moviesohu (id,name,score,path,state) values(%s,%s,%s,%s,%s)"
        for item in movies:
            print(item)
            try:
                # id is sent as 0 so that the AUTO_INCREMENT column picks the
                # next value (MySQL default; not under NO_AUTO_VALUE_ON_ZERO).
                cursor.execute(sql, [0, item[0], item[1], item[2], item[3]])
            except pymysql.err.IntegrityError:
                # Key conflict: this movie is already stored.
                print("重复!跳过!")
        # Commit once after all rows have been attempted.
        conn.commit()
        print(f"{time.asctime()}插入搜狐电影数据完毕")
    except Exception:
        # Top-level boundary of the script: report and fall through to
        # cleanup instead of crashing.
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)


if __name__ == '__main__':
    insert_souhu()
运行截图
数据库截图
建表语句
-- Table written by insert_souhu(): one row per scraped movie.
-- `name` is the primary key, so inserting a title that is already stored
-- fails with a duplicate-key error; `id` is an auto-increment column and
-- therefore carries its own (non-unique) index.
CREATE TABLE `moviesohu` (
  `id` INT(11) NOT NULL AUTO_INCREMENT,
  `name` VARCHAR(45) COLLATE utf8_bin NOT NULL,
  `score` VARCHAR(45) COLLATE utf8_bin NOT NULL,
  `path` VARCHAR(100) COLLATE utf8_bin NOT NULL,
  `state` VARCHAR(10) COLLATE utf8_bin NOT NULL,
  PRIMARY KEY (`name`),
  KEY `id` (`id`)
) ENGINE=INNODB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8 COLLATE=utf8_bin;
标签:Python,text,list,li,爬取,url,mysql,print,templist 来源: https://www.cnblogs.com/rainbow-1/p/14772320.html