〈2022-02-15〉 Scraping the ISO standards site with BeautifulSoup: basic ingestion (collecting the next level of data into the database)
Author: Internet
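This script crawls the ISO standards catalogue's "browse by TC" page, stores every technical committee (its number, title, and page URL) into MySQL, and then revisits each stored page to pull in the next level of committees. It assumes a database named ISO already contains a table tc_basic with four columns matching the script's INSERT statement. Here is a minimal sketch of that table; the column names and order come from the script, but the types and lengths are assumptions, not taken from the original post:

    CREATE TABLE tc_basic (
        ID      INT          NOT NULL PRIMARY KEY,  -- assigned by the script as MAX(ID) + 1
        tc      VARCHAR(64)  NOT NULL,              -- committee number, e.g. "ISO/TC 1"
        title   VARCHAR(255) NOT NULL,              -- committee title
        address VARCHAR(512) NOT NULL               -- absolute URL of the committee page
    ) DEFAULT CHARSET = utf8;

With the table in place, the full script: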
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pymysql

# Connect to the local MySQL database "ISO"
conn = pymysql.connect(host='127.0.0.1', user='root', passwd='password',
                       port=3306, db='ISO', charset='utf8')
cursor = conn.cursor()
if conn:
    print("database[ISO] connected!")


def catch(url):
    address_url = 'https://www.iso.org/'
    text = urlopen(url).read()
    soup = BeautifulSoup(text, 'html.parser')

    # The top-level catalogue page uses id="datatable-committees";
    # each committee's own page uses id="datatable-committee-children".
    table = (soup.find('table', id='datatable-committees')
             or soup.find('table', id='datatable-committee-children'))
    if table is None:
        return
    tbody = table.find('tbody')

    i = 0  # new rows inserted
    j = 0  # duplicate rows skipped
    k = 0  # rows scanned
    for tr in tbody.find_all('tr'):
        k += 1
        td = tr.findAll('td')
        tc = tr.a.string.strip()
        title = td[1].string.strip()
        # href starts with "/", so drop the base URL's trailing slash
        address = address_url[:-1] + tr.a['href']

        # Insert only if this (tc, address) pair is not already stored
        sql_select = 'select count(ID) from tc_basic where tc = %s and address = %s'
        cursor.execute(sql_select, (tc, address))
        select_count = cursor.fetchall()
        if select_count[0][0] == 0:
            i += 1
            # New data: assign the next ID by hand as MAX(ID) + 1
            cursor.execute('SELECT MAX(ID) FROM tc_basic')
            row = cursor.fetchall()
            if row[0][0] is None:
                new_id = 1  # empty table, this is the first row
            else:
                new_id = int(row[0][0]) + 1
            sql = 'insert into tc_basic values (%s,%s,%s,%s)'
            cursor.execute(sql, (new_id, tc, title, address))
            conn.commit()
        else:
            # This row already exists in the database; skip it
            j += 1

    print('Collection finished! Scanned', k, 'rows:', i, 'new,', j, 'duplicates.')


if __name__ == '__main__':
    # First pass: the top-level "browse by TC" catalogue page
    url = 'https://www.iso.org/standards-catalogue/browse-by-tc.html'
    print('Collecting:', url)
    catch(url)

    # Second pass: revisit every stored committee page to collect
    # the next level of data (the committee's children)
    cursor.execute('SELECT MAX(ID) FROM tc_basic')
    row = cursor.fetchall()
    x = 1
    while x <= row[0][0]:
        cursor.execute('SELECT address FROM tc_basic WHERE id = %s', (x,))
        addr = cursor.fetchall()
        address_url = addr[0][0].strip()
        print('Second pass, row', x, 'addr:', address_url)
        catch(address_url)
        x += 1
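Two points worth noting about the design. IDs are assigned by the script itself as MAX(ID) + 1 rather than by an AUTO_INCREMENT column, which is simple but unsafe if two collectors ever write to the table at once. And because every row is checked against its (tc, address) pair before insertion, re-running the script is idempotent: pages already in the table are counted as duplicates instead of being inserted again, which is what lets the second pass safely re-crawl every stored address.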
Source: https://www.cnblogs.com/MMD-Ali/p/15897291.html