编程语言
首页 > 编程语言 > python爬取豆瓣250

Python 爬取豆瓣电影 Top 250

作者:互联网

import urllib.request
import ssl
import re
import xlwt
import DBUtils
import xlrd
from xlutils.copy import copy
def getContent(ye):
    """Download one page of the Douban Top 250 list and return its HTML.

    Args:
        ye: Value substituted into the ``start`` query parameter of the
            list URL.

    Returns:
        The response body decoded as UTF-8 text.

    Raises:
        urllib.error.URLError: If ``urlopen`` cannot complete the request.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # Browser-like request headers sent with every page request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
        'Connection': 'keep-alive',
    }
    url = "https://movie.douban.com/top250?start=%s&filter=" % ye
    # Request object = URL + request headers.
    req = urllib.request.Request(url, headers=headers)
    # Use the response as a context manager so the underlying HTTP
    # connection is closed deterministically, even if read() raises.
    with urllib.request.urlopen(req) as response:
        page = response.read()
    return page.decode("utf-8")
 
# print(getContent(ye))
 
def getItem(content):
    """Extract the ``alt`` attribute values from an HTML string.

    Every ``alt="..."`` occurrence is captured in document order and the
    final match is discarded.

    Args:
        content: HTML text to scan.

    Returns:
        A list of the captured ``alt`` values without the last one; an empty
        list when the text contains zero or one match.
    """
    matches = re.findall(r'alt="(.*?)"', content)
    # NOTE(review): the discarded last match is presumably a non-title image
    # near the end of the page -- confirm against the live markup.
    # Slicing (instead of pop()) drops the last match without raising
    # IndexError when nothing matched at all.
    return matches[:-1]
#
# content = getContent(ye)
# print(getItem(content))
 
def saveExcel():
    """Create ``豆瓣.xls`` with a single sheet that holds only a header row.

    The workbook contains one sheet named ``豆瓣250``; its first row receives
    the column titles. The file path is relative, so the file is written to
    the current working directory.
    """
    workbook = xlwt.Workbook()
    worksheet = workbook.add_sheet("豆瓣250")
    column_titles = ["书名"]
    # Row 0 is the header row: one cell per column title.
    for column_index, title in enumerate(column_titles):
        worksheet.write(0, column_index, title)
    workbook.save("豆瓣.xls")
# content = getContent()
# list = getItem(content)
# saveExcel(list)
def wb(list, x):
    """Write values into column 0 of ``豆瓣.xls``, one per row, starting at row *x*.

    Args:
        list: Iterable of cell values; each value goes on its own row.
        x: Index of the first row to write.
    """
    # Open the existing workbook, then make a copy of it that can be
    # written to.
    source_book = xlrd.open_workbook("豆瓣.xls")
    writable_book = copy(source_book)
    # Fetch the sheet by index.
    first_sheet = writable_book.get_sheet(0)
    for offset, value in enumerate(list):
        first_sheet.write(x + offset, 0, value)
    # Save: an existing file with the same name is overwritten, otherwise a
    # new file is created.
    writable_book.save("豆瓣.xls")
def ye():
    """Crawl every page of the Douban Top 250 list and store the titles.

    Creates the spreadsheet with ``saveExcel``, then for each page offset
    (0, 25, ..., 225) downloads the page, extracts the titles, writes them to
    the spreadsheet with ``wb`` and inserts each one into table ``tb_use``
    through ``DBUtils.insertData``.

    Returns:
        The string ``"完成"`` once all pages have been processed.
    """
    next_row = 1  # row 0 holds the header written by saveExcel()
    saveExcel()
    for start in range(0, 250, 25):
        content = getContent(start)
        titles = getItem(content)
        wb(titles, next_row)
        for title in titles:
            # NOTE(review): the scraped title is interpolated straight into
            # the SQL text, so a title containing a quote breaks (or alters)
            # the statement. Prefer a parameterized query if the DBUtils
            # helper can accept one.
            sql = "insert into tb_use(name) values ('%s');" % title
            DBUtils.insertData(sql)
        # Advance by the number of rows actually written so a page that
        # yields more or fewer than 25 titles neither overwrites earlier
        # rows nor leaves gaps in the sheet.
        next_row += len(titles)
    return "完成"
# Script entry point: run the whole crawl and print the status string that
# ye() returns.
print(ye())

import pymysql.cursors
 
#获取连接
def getConnect():
    """Open and return a new pymysql connection using the fixed settings below."""
    settings = {
        "host": "",
        "user": "root",
        "password": "123",
        "database": "pymysql",
        "charset": "utf8",
    }
    return pymysql.connect(**settings)
#关闭连接
def closeConnect(cursor, conn):
    """Close a cursor and its connection, skipping whichever one is falsy.

    The connection is closed even when closing the cursor raises, so a
    failing cursor cannot leak the connection. An exception raised by
    ``cursor.close()`` still propagates to the caller afterwards.

    Args:
        cursor: Object with a ``close()`` method, or a falsy value.
        conn: Object with a ``close()`` method, or a falsy value.
    """
    try:
        if cursor:
            cursor.close()
    finally:
        if conn:
            conn.close()
 
#插入数据
def insertData(sql, params=None):
    """Execute one data-modifying SQL statement and commit it.

    A fresh connection is obtained from ``getConnect`` for the call and is
    always released through ``closeConnect``, including when executing or
    committing raises.

    Args:
        sql: SQL statement text. With ``params`` it may contain ``%s``
            placeholders that the driver fills in with escaped values.
        params: Optional sequence or mapping of values for the placeholders
            in ``sql``. The default ``None`` executes ``sql`` exactly as
            given.

    Returns:
        ``True`` if the cursor reports at least one affected row, else
        ``False``.
    """
    conn = getConnect()
    cursor = None
    try:
        cursor = conn.cursor()
        cursor.execute(sql, params)
        conn.commit()
        # Read the affected-row count while the cursor is still open.
        count = cursor.rowcount
    finally:
        # Release the cursor and connection on both success and failure.
        closeConnect(cursor, conn)
    return count > 0

标签:content,sheet,wb,python,ye,爬取,import,250,def
来源: https://blog.csdn.net/csh2388827741/article/details/118439498