〈2022-2-14〉使用<beautifulsoup>爬取ISO标准网站:基础爬取
作者:互联网
"""List the ISO technical committees shown on the "browse by TC" page.

Downloads the catalogue page, reads the ``datatable-committees`` table and
prints one line per committee in the form
``<reference>(title:<title>)(address:<absolute URL>)``, sorted
case-insensitively, followed by a completion message.
"""
from urllib.parse import urljoin
from urllib.request import urlopen

from bs4 import BeautifulSoup

# Page that holds the committee table, and the base URL that each row's
# link is resolved against.
url = 'https://www.iso.org/standards-catalogue/browse-by-tc.html'
addrss_url = 'https://www.iso.org/'


def collect_committees(html):
    """Return the set of formatted committee entries found in *html*.

    Args:
        html: Markup of the catalogue page (``bytes`` or ``str``).

    Returns:
        A set of strings, one per usable table row, each formatted as
        ``<reference>(title:<title>)(address:<absolute URL>)``.

    Raises:
        RuntimeError: If the ``datatable-committees`` table or its
            ``<tbody>`` is not present in the markup.
    """
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table', id='datatable-committees')
    tbody = table.find('tbody') if table is not None else None
    if tbody is None:
        raise RuntimeError(
            "table 'datatable-committees' (with a <tbody>) was not found; "
            "the page layout may have changed"
        )

    entries = set()
    # Ask for <tr> elements explicitly: iterating the tbody directly also
    # yields the whitespace text nodes between rows.
    for row in tbody.find_all('tr'):
        cells = row.find_all('td')
        link = row.find('a')
        # Skip rows that lack a title cell or a link instead of crashing.
        if len(cells) < 2 or link is None or not link.get('href'):
            continue
        # get_text() copes with cells containing nested markup, where
        # ``.string`` would be None.
        reference = link.get_text(strip=True)
        title = cells[1].get_text(strip=True)
        # urljoin yields a single '/' between host and path whether the
        # href is root-relative, relative, or already absolute.
        address = urljoin(addrss_url, link['href'])
        entries.add('{}(title:{})(address:{})'.format(reference, title, address))
    return entries


def main():
    """Fetch the catalogue page and print the sorted committee list."""
    # The context manager closes the HTTP response; the timeout keeps the
    # script from hanging forever on a stalled connection.
    with urlopen(url, timeout=30) as response:
        html = response.read()

    TC = collect_committees(html)
    print('\n'.join(sorted(TC, key=str.lower)))
    print('采集完毕!')


if __name__ == '__main__':
    main()
标签:14,title,tr,tbody,爬取,ISO,print,table,TC 来源: https://www.cnblogs.com/MMD-Ali/p/15893114.html