MPI可靠性设计
作者:互联网
导入库函数
import os
import subprocess

import numpy as np
同步每个节点暂停状态
n = 100

# Read the local checkpoint: only the first two entries of the last row of
# zong.npy are used as progress flags.
array = np.load('zong.npy')
flag = array[-1][0:2]
flag1 = int(flag[0])  # presumably the target loop count (restart loop runs while flag2 < flag1)
flag2 = int(flag[1])  # current loop count reached by this node


def bijiao():
    """Find the reachable node whose checkpoint has progressed furthest.

    Every host listed in ``cpu_ip.txt`` is pinged. For each host that
    answers, its ``zong.npy`` is copied to ``/home/mpiuser/chaxun.npy`` and
    the second entry of the last row (the current loop count) is compared
    against the best value seen so far.

    Hosts that do not answer the ping are recorded in the module-level
    ``rmip`` list (which the caller must have created), without adding
    duplicates. The module-level ``IP`` list is no longer modified.

    Returns:
        A ``(maxflag, maxnode)`` tuple: the largest loop count found and the
        host that holds it. ``(-100, -1)`` is returned when no host could be
        queried.
    """
    maxflag = -100
    maxnode = -1

    # Use a local host list: appending to the global IP list here would
    # duplicate every host and make each node be pinged and copied twice.
    with open("cpu_ip.txt") as ip_file:
        hosts = [line.strip() for line in ip_file if line.strip()]

    for ip in hosts:
        # ping exits with 0 on success, so a truthy result means "unreachable".
        if subprocess.call(['timeout', '5s', 'ping', ip, '-c3']):
            if ip not in rmip:
                rmip.append(ip)
            continue

        copy_status = subprocess.call([
            'scp',
            'mpiuser@' + str(ip) + ':/home/mpiuser/mpii/zong.npy',
            '/home/mpiuser/chaxun.npy',
        ])
        if copy_status != 0:
            # Without a fresh copy, chaxun.npy would still hold the previous
            # node's data and be wrongly attributed to this host.
            continue

        a = np.load('/home/mpiuser/chaxun.npy')
        f = a[-1][0:2]
        b = f[1]  # current loop count on this node
        if b > maxflag:
            maxflag = b
            maxnode = ip

    return maxflag, maxnode
确定节点当前状态并继续运算
# Restart loop: while the checkpoint says the job is unfinished, rebuild the
# MPI host file from the nodes that still answer, synchronise the most
# advanced checkpoint to all of them, and relaunch the MPI program.
while flag2 < flag1:
    IP = []    # every host listed in cpu_ip.txt
    rmip = []  # hosts that failed the ping and must be left out

    with open("cpu_ip.txt") as ip_file:
        for line in ip_file:
            host = line.strip()
            if host:
                IP.append(host)

    for ip in IP:
        # ping exits with 0 on success, so a truthy result means "unreachable".
        if subprocess.call(['timeout', '5s', 'ping', ip, '-c3']):
            rmip.append(ip)

    # Spread the total process count over the reachable nodes.
    pro_sum = flag1 ** 2 + 1  # total number of processes
    new_IP = [[ip, 0] for ip in IP if ip not in rmip]
    if not new_IP:
        # With no reachable node the distribution below has nothing to
        # assign to (the original subtraction loop never terminated here).
        raise SystemExit("no reachable node listed in cpu_ip.txt; cannot restart the MPI job")

    # Every node gets the same share; the first `extra` nodes take one more.
    share, extra = divmod(pro_sum, len(new_IP))
    for index, entry in enumerate(new_IP):
        entry[1] = share + 1 if index < extra else share

    with open("mpi_config.txt", "w") as f:
        print("open")
        for host, count in new_IP:
            f.write("{}:{}\n".format(host, count))

    maxflag, maxnode = bijiao()
    print('maxflag', maxflag)
    print('maxnode', maxnode)

    # bijiao() returns -1 as the node when it could not query any host;
    # there is nothing to copy from in that case.
    if maxnode != -1:
        # Fetch the most advanced checkpoint ...
        subprocess.call([
            'scp',
            'mpiuser@' + str(maxnode) + ':/home/mpiuser/mpii/zong.npy',
            '/home/mpiuser/mpii/',
        ])
        # ... and push it to every other reachable node. Iterating new_IP
        # (not IP) keeps each host to a single copy even if IP was extended
        # in the meantime; rmip is re-checked because bijiao() may have
        # added hosts that went down since the first ping.
        for host, _ in new_IP:
            if host in rmip or host == maxnode:
                continue
            subprocess.call([
                'scp',
                '/home/mpiuser/mpii/zong.npy',
                'mpiuser@' + str(host) + ':/home/mpiuser/mpii/zong.npy',
            ])

    # NOTE(review): the host file distributes pro_sum processes, but the
    # launch count is hard-coded to 17, which matches only flag1 == 4.
    # Confirm whether this should be pro_sum.
    if subprocess.call(['mpiexec', '-n', '17', '-f', 'mpi_config.txt',
                        'python3', 'mpi_helloworld.py']):
        print("unsuccess!restart!")
        # Reload the progress flags so the loop condition reflects the
        # checkpoint left behind by the failed run.
        array = np.load('zong.npy')
        flag = array[-1][0:2]
        flag1 = int(flag[0])
        flag2 = int(flag[1])
    else:
        print("success!")
        break
标签:可靠性,ip,line,new,MPI,IP,设计,mpiuser,maxnode 来源: https://www.cnblogs.com/shi-yi/p/16247414.html