首页 > 编程语言 > python获取论文的Bibtex格式

python获取论文的Bibtex格式

2021-01-01 20:02:11 作者：互联网

1. 简介

批量在百度学术中查询论文，获取相应论文的 BibTeX 格式引用，并保存到文件。

from bs4 import BeautifulSoup
from selenium import webdriver
import time

def isElementExist(driver, element):
    """Return whether an element matching an XPath can be found.

    Args:
        driver: Object exposing ``find_element_by_xpath`` (in this script,
            the Selenium Chrome driver).
        element: XPath expression to look up.

    Returns:
        True if the lookup completes without raising, False if it raises
        an ordinary exception.
    """
    try:
        driver.find_element_by_xpath(element)
    except Exception:
        # Any ordinary lookup failure is treated as "not present".  Unlike a
        # bare ``except:``, this lets KeyboardInterrupt and SystemExit
        # propagate, so the scraper can still be stopped with Ctrl+C.
        return False
    return True


# Scrape BibTeX entries from a Baidu Scholar result list: for each result,
# open its "cite" popup, follow the BibTeX link (which opens in a second
# window), store that window's text in ``refs``, then move to the next page.
url = 'https://xueshu.baidu.com/s?wd=study&tn=SE_baiduxueshu_c1gjeupa&sc_f_para=sc_tasktype%3D%7BfirstSimpleSearch%7D&sc_hit=1&ie=utf-8&sort=sc_time'  # Baidu Scholar page to scrape
i = 1  # page number
j = 1  # reference counter for the current page; one page holds ten references
refs = []  # collected reference texts

# The chromedriver executable path is passed positionally to webdriver.Chrome.
driver = webdriver.Chrome("C:/Users/AppData/Local/Google/Chrome/Application/chromedriver.exe")
driver.get(url)
time.sleep(10)  # fixed pause after opening the page (presumably to let it finish loading)
while i <= 20:  # process at most 20 result pages
    while j <= 400:  # arbitrary upper bound on the reference count that is not expected to be reached
        print(j)
        h1 = driver.window_handles
        print('h1=', h1)
        driver.switch_to.window(driver.window_handles[0])  # switch to the first window
        if j % 10 != 1:
            driver.find_element_by_xpath('//*[@id="sc_quote_wr"]/div[1]/a').click()  # not the first reference on the page: close the citation popup
            time.sleep(2)
        # NOTE(review): this breaks before reference j is processed whenever j
        # is a multiple of 10, so that entry appears to be skipped -- confirm
        # this is intended.
        if j % 10 == 0:  # check whether the current reference is the last one on this page  //*[@id="sc_quote_wr"]/div[2]/div[2]/a[1]
            break
        else:
            xpath = '//*[@id="' + str(j) + '"]/div[2]/div/a[2]'  # XPath of the "cite" link; only str(j) varies: it starts at 1, increases by 1 per reference, and keeps counting on from the previous page's last reference
            time.sleep(2)
            if isElementExist(driver, xpath):
                driver.find_element_by_xpath(xpath).click()  # click "cite"
                time.sleep(5)
                if isElementExist(driver, '//*[@id="sc_quote_wr"]/div[2]/div[2]/a[1]'):
                    driver.find_element_by_xpath('//*[@id="sc_quote_wr"]/div[2]/div[2]/a[1]').click()  # click the BibTeX link
                else:
                    driver.find_element_by_xpath('//*[@id="sc_quote_wr"]/div[1]/a').click()  # close the citation popup
                    # No BibTeX link: jump the counter to this page's last id
                    # so the ``j += 1`` after the inner loop lands on the next
                    # page's first id.
                    j = i * 10
                    break
                h2 = driver.window_handles
                print('h2=', h2)
                driver.switch_to.window(driver.window_handles[1])  # switch to the second window and scrape its content
                content = driver.page_source.encode('utf-8')
                soup = BeautifulSoup(content, 'lxml')
                ref = soup.get_text()
                refs.append(ref)
                j += 1
                driver.close()  # close the currently focused (second) window
            else:
                # No "cite" link for this id: give up on the rest of this page.
                j = i * 10
                break
    print(j)
    if i == 1:
        driver.find_element_by_xpath('//*[@id="page"]/a[8]').click()  # click "next page"; the first page's next-page XPath differs from that of the other pages
        time.sleep(5)
    else:
        driver.find_element_by_xpath('//*[@id="page"]/a[9]').click()
        time.sleep(5)
    i += 1
    j += 1  # advance the counter by one before starting the next page

# Shut the browser down before persisting the collected entries.
driver.quit()

# Append every collected entry to the output file, one write per entry,
# each followed by a newline.
output_path = 'F:/Experiment/BaiduXueshu/Englishoutput/study.txt'
with open(output_path, mode='a', encoding='utf-8') as out_file:
    out_file.writelines(str(entry) + '\n' for entry in refs)

参考:

用 Python + Beautiful Soup + Selenium 抓取百度学术文献的参考文献（关键词：使用、Python、BeautifulSoup、Selenium、爬取、引用）

标签：python,time,driver,element,flag,Bibtex,import,sc,格式
来源： https://blog.csdn.net/rosefun96/article/details/112064886