
Bioinformatics: scraping entry information from the PDB database by PDB ID and saving it to Excel


For my graduation project I needed to collect statistics on proteins, so I wrote this small script.
Write the PDB IDs into pdbselected. I originally wanted to look the IDs up directly from a protein-name search, but that page turned out to have anti-scraping measures, so for now the script just takes a list of IDs.
As for which parts of the page get scraped, I suggest opening the page in your browser's developer tools and finding the elements yourself.
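If you do want to go from a protein name to PDB IDs without scraping the search page, RCSB also exposes an official search API. The sketch below is only my assumption about how that endpoint is queried (a full_text query posted to https://search.rcsb.org/rcsbsearch/v2/query); the function name is made up, so check the current API documentation before relying on it.

import requests

def search_pdb_ids(protein_name):
    """Query the RCSB Search API for entry IDs matching a free-text protein name."""
    query = {
        "query": {
            "type": "terminal",
            "service": "full_text",
            "parameters": {"value": protein_name},
        },
        "return_type": "entry",
    }
    # assumed endpoint and payload format for the RCSB Search API v2
    resp = requests.post("https://search.rcsb.org/rcsbsearch/v2/query", json=query)
    resp.raise_for_status()
    return [hit["identifier"] for hit in resp.json().get("result_set", [])]

# e.g. pdbselected = search_pdb_ids("spike glycoprotein")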

from selenium import webdriver
import xlwt
import random
import time
import re
driver = webdriver.Chrome(r'E:\Chrome\Application\chromedriver.exe')
# edit the chromedriver path above and the list of PDB IDs below to suit your setup
pdbselected = \
["6VCA",
"6VC9",
"6TVG",
"6TVX",
"6TW0",
"6TWA",
"6TWF",
"6XUQ",
"6XUE",
"6XUG",
"7JV8",
"7JV9",
"6YE2",
"6YE1",
"4H1S",
"6Z9B",
"6Z9D",
"3ZU0",
"6TVE",
"6HXW",
"6S7F",
"6S7H",
"4H2I",
"4H2B",
"3ZTV",
"4H2F",
"4H2G",
"4H1Y",
"4CD1",
"4CD3"]


work_book=xlwt.Workbook()
work_sheet = work_book.add_sheet("data")

for i in range(len(pdbselected)):
    time.sleep(random.uniform(1, 3))
    url = 'http://www1.rcsb.org/structure/' + pdbselected[i]
    driver.get(url)

    # adjust these XPaths to whichever page elements you need
    contain = driver.find_element_by_xpath("//*[@id='exp_header_0_snapshot']")
    contain_2 = driver.find_element_by_xpath("//*[@class='list-unstyled']")
    # macromolecule content
    contain_3 = driver.find_element_by_xpath("//*[@id='macromoleculeContent']")
    # other information about the molecule
    contain_4 = driver.find_element_by_xpath("//*[@id='macromolecule-entityId-1-rowDescription']")


    # split the text of each scraped block on newlines and colons
    ListOfContain = re.split("[\n:]", contain.text)
    ListOfContain_2 = re.split("[\n:]", contain_2.text)
    ListOfContain_3 = re.split("[\n:]", contain_3.text)
    ListOfContain_4 = re.split("[\n: ]", contain_4.text)


    # merge the lists
    ListOfContain = ListOfContain + ListOfContain_2 + ListOfContain_3 + ListOfContain_4

    # write the PDB ID first, then the scraped fields in the following columns
    work_sheet.write(i, 0, pdbselected[i])
    for j in range(len(ListOfContain)):
        work_sheet.write(i, j + 1, ListOfContain[j])

# xlwt writes the legacy binary .xls format, so save with an .xls extension rather than .csv
work_book.save('test.xls')
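One fragile spot in the loop above is that find_element_by_xpath raises NoSuchElementException as soon as a block is missing (not every entry exposes every section), which aborts the whole run. Below is a minimal sketch of a more defensive per-entry fetch, assuming the same Selenium 3-style find_element_by_xpath API and the same XPaths as the script; the function name and the 'N/A' placeholder are my own.

import re
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

XPATHS = [
    "//*[@id='exp_header_0_snapshot']",
    "//*[@class='list-unstyled']",
    "//*[@id='macromoleculeContent']",
    "//*[@id='macromolecule-entityId-1-rowDescription']",
]

def fetch_entry_fields(driver, pdb_id, timeout=10):
    """Return the split text fields for one PDB entry, skipping blocks that are absent."""
    driver.get('http://www1.rcsb.org/structure/' + pdb_id)
    # wait until the header block has rendered before touching the page
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.ID, 'exp_header_0_snapshot')))
    fields = []
    for xpath in XPATHS:
        try:
            fields += re.split("[\n:]", driver.find_element_by_xpath(xpath).text)
        except NoSuchElementException:
            # record a placeholder instead of aborting the whole run
            fields.append('N/A')
    return fields

With this in place, the body of the loop reduces to ListOfContain = fetch_entry_fields(driver, pdbselected[i]) followed by the same work_sheet.write calls.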


Source: https://blog.csdn.net/weixin_45564533/article/details/115332776