Scraping Baidu Xueshu (Baidu Scholar)
Author: Internet (repost)
import requests
from bs4 import BeautifulSoup  # imported in the original but not used below
import re                      # likewise unused
from lxml import etree
import time
import csv

requests.packages.urllib3.disable_warnings()

# Name of the CSV file to generate
csv_name = "123.csv"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
}


"""Step 1"""
# Build the paginated search URLs: start page and total number of pages to crawl
def get_page_total(start, total):
    for x in range(start, total):
        pn = str(x * 10)  # Baidu Xueshu paginates in steps of 10
        url = ("https://xueshu.baidu.com/s?wd=journaluri%3A%2820bd239813882ced%29"
               "%20applied%20energy&pn=" + pn
               + "&tn=SE_baiduxueshu_c1gjeupa&ie=utf-8&sc_hit=1")
        # print(url)
        print("Page " + str(x) + " of " + str(total))
        get_url(url)
        time.sleep(1)


"""Step 2"""
# Extract the article links from a results page -> follow them to the detail pages
"""
content = ['https://xueshu.baidu.com/usercenter/paper/show?paperid=e04cdee2122f75b0011cc9e7b452d72b&site=xueshu_se',
           'https://xueshu.baidu.com/usercenter/paper/show?paperid=9ccc121c6260e006c41c32f04ddf2e85&site=xueshu_se']
...
"""
def get_url(url):
    r = requests.get(url, headers=headers, verify=False)
    selector = etree.HTML(r.text)
    content = selector.xpath('//h3[@class="t c_font"]//a/@href')
    # Fetch the details; each Baidu Xueshu results page holds ten entries
    get_page_content(content)


"""Step 3"""
# Fetch each detail page and pull out the authors and the abstract
def get_page_content(detail_url):
    for link in detail_url:
        data = []
        # print(link)
        rr = requests.get(link, headers=headers, verify=False)
        selector = etree.HTML(rr.text)
        # Authors
        zuozhe = selector.xpath('//p[@class="author_text"]//span//text()')
        # Abstract
        zhaiyao = selector.xpath('//p[@class="abstract"]//text()')
        data.append(','.join(zuozhe))
        data.append(','.join(zhaiyao))
        # print(data)
        print("Writing row to CSV")
        f_csv(data)
        time.sleep(1)


"""Step 4"""
# Append one row, e.g. ["111", "222"], to the CSV file
def f_csv(data):
    f = open(csv_name, 'a+', newline='', encoding='utf-8')
    # Build a CSV writer on top of the file object
    csv_writer = csv.writer(f)
    # csv_writer.writerow(["作者", '摘要'])  # header row, written once at startup below
    csv_writer.writerow(data)
    f.close()


"""Run the crawler"""
# Write the CSV header row ("作者" = author, "摘要" = abstract)
csv_head = ["作者", "摘要"]
f_csv(csv_head)

# Crawl the result pages: start at page 0, one page in total
get_page_total(0, 1)

# Ad-hoc test data left over from development:
# datas = [['M Poeschl', 'S Ward', 'P Owende'],
#          ['The energy efficiency of, different, biogas systems']]
# print(','.join(datas[0]))
# print(','.join(datas[1]))
# f_csv(datas)
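BeautifulSoup is imported above but never used. For comparison, the same author/abstract extraction could be written with bs4 instead of lxml XPath. This is only a sketch: it assumes Baidu Xueshu keeps the author_text and abstract class names that the XPath expressions above rely on, and get_page_content_bs4 is a hypothetical helper, not part of the original script.

from bs4 import BeautifulSoup
import requests

def get_page_content_bs4(link):
    # Same extraction as get_page_content, but via BeautifulSoup;
    # reuses the global headers dict defined above.
    rr = requests.get(link, headers=headers, verify=False)
    soup = BeautifulSoup(rr.text, "html.parser")
    author_p = soup.find("p", class_="author_text")
    abstract_p = soup.find("p", class_="abstract")
    authors = [s.get_text(strip=True) for s in author_p.find_all("span")] if author_p else []
    abstract = abstract_p.get_text(strip=True) if abstract_p else ""
    return [",".join(authors), abstract]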
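The script issues a bare requests.get per page and relies on time.sleep(1) for pacing. If the crawl needs to survive flaky connections or transient 5xx responses, a shared Session with retries is one option. A minimal sketch, assuming urllib3's Retry API; the retry count and backoff values are illustrative, not from the original:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session():
    # Retry transient failures up to 3 times with exponential backoff.
    session = requests.Session()
    retry = Retry(total=3, backoff_factor=1,
                  status_forcelist=[500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    session.headers.update(headers)  # reuse the User-Agent defined above
    return session

# Usage: call session.get(url, verify=False) wherever the script
# currently calls requests.get(url, headers=headers, verify=False).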
Tags: get, url, crawler, print, import, csv, data, Baidu, scholar  Source: https://www.cnblogs.com/wtcl/p/15831406.html