# python MoJaVe
# Author: Internet
import requests
import pandas as pd
import numpy as np
import os
from lxml import etree
class Mojave():
    """Scraper for the MOJAVE pages hosted at physics.purdue.edu.

    Instantiating the class has a side effect: it creates a ``MOJAVE``
    directory under the current working directory (if missing) and changes
    the process's working directory into it.
    """

    # Prefix joined with the relative hrefs found on the all-sources page.
    _BASE_URL = "http://www.physics.purdue.edu/astro/MOJAVE/"

    def __init__(self):
        """Store the all-sources URL and request headers, then enter the work directory."""
        workDir = "MOJAVE"
        self.url = "http://www.physics.purdue.edu/astro/MOJAVE/allsources.html"
        self.headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0"}
        self.makeDir(workDir)

    def makeDir(self, dir):
        """Ensure directory *dir* exists, then make it the working directory.

        :param dir: path of the directory to create (if absent) and enter.
        :raises OSError: if the directory cannot be created or entered.
        """
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(dir, exist_ok=True)
        os.chdir(dir)

    def mainPage(self, url):
        """Download *url* and parse the response body as HTML.

        :param url: address of the page to fetch.
        :return: the parsed lxml tree, or ``None`` when the request or the
            parse fails (a failure message is printed in that case).
        """
        try:
            response = requests.get(url=url, headers=self.headers, timeout=20)
            # Treat HTTP error statuses as failures instead of parsing an error page.
            response.raise_for_status()
            return etree.HTML(response.text)
        except (requests.RequestException, etree.LxmlError, ValueError):
            print("爬取失败")
            return None

    def subPage(self):
        """Fetch one hard-coded source page and print how many nodes match a table-row XPath."""
        html = self.mainPage(url="http://www.physics.purdue.edu/astro/MOJAVE/sourcepages/0011+189.shtml")
        if html is None:
            # mainPage already reported the failure; nothing to inspect.
            return
        # NOTE(review): the path contains ``tbody``; if the raw markup has no
        # explicit <tbody> element this matches nothing -- confirm against the
        # page source.
        elements = html.xpath('/html/body/center/table[3]/tbody/tr[6]')
        print(len(elements))

    def _sourceLinks(self, cellXpath):
        """Return ``(names, urls)`` for the links inside cells matched by *cellXpath*.

        Names have spaces removed; urls are the hrefs prefixed with the base
        URL. Both lists are empty when the all-sources page cannot be fetched.
        """
        html = self.mainPage(url=self.url)
        if html is None:
            return [], []
        urls = [self._BASE_URL + href for href in html.xpath(cellXpath + '/small/a/@href')]
        names = [text.replace(" ", "") for text in html.xpath(cellXpath + '/small/a/text()')]
        return names, urls

    def currentlySourceList(self):
        """List the links found in cells styled ``background-color:rgb(204, 255, 255);``.

        :return: list of ``(name, url)`` tuples; empty if the page fetch fails.
        """
        names, urls = self._sourceLinks('//td[@style="background-color:rgb(204, 255, 255);"]')
        return list(zip(names, urls))

    def noLongerSourceNameList(self):
        """List the links found in cells with ``bgcolor="#FFFFCC"``.

        :return: list of ``(url, name)`` tuples; empty if the page fetch fails.

        NOTE(review): the tuple order is the reverse of currentlySourceList
        (which yields ``(name, url)``). It is kept as-is so existing callers
        are not broken -- confirm whether the difference is intentional.
        """
        names, urls = self._sourceLinks('//td[@bgcolor="#FFFFCC"]')
        return list(zip(urls, names))

    def _download(self, url, name):
        """Download *url* and write the response body to the file *name*.

        Best effort: network, HTTP-status and file errors are reported with a
        printed message rather than raised.

        :param url: address of the resource to download.
        :param name: path of the output file (opened in binary write mode).
        """
        try:
            # A timeout keeps a stalled server from hanging the run forever.
            response = requests.get(url=url, headers=self.headers, timeout=20)
            # Do not save an HTTP error page as if it were the requested file.
            response.raise_for_status()
            with open(name, "wb") as file:
                file.write(response.content)
        except (requests.RequestException, OSError):
            print(name+"++++++++++++++++++++++++++下载失败")
        else:
            print(name+"下载成功")
if __name__ == "__main__":
    # Run the single-page check only when executed as a script, not on import.
    Mojave().subPage()
# Tags: __, python, self, url, html, MoJaVe, MOJAVE, def  Source: https://blog.csdn.net/weixin_51772689/article/details/121110288