Slurm Cluster Setup
1. Environment preparation
# Disable SELinux: vi /etc/sysconfig/selinux and set
# SELINUX=disabled
systemctl stop firewalld
systemctl disable firewalld
yum -y install epel-release
yum repolist
yum install axel yum-axelget
yum install ntp -y
systemctl enable ntpd
ntpdate pool.ntp.org
systemctl start ntpd
# change hostname
hostnamectl --static set-hostname newname   # run on each node with its own name (master, client01, client02)
#vi /etc/hosts
1.1.1.100 master
1.1.1.101 client01
1.1.1.102 client02
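After editing /etc/hosts on every node, a quick sanity check of name resolution (hostnames as in the table above) might look like:
ping -c 1 master
ping -c 1 client01
ping -c 1 client02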
2. NIS installation
server
yum -y install ypserv rpcbind
nisdomainname simcloud.com
echo "nisdomainname simcloud.com" >>/etc/rc.local
echo "NISDOMAIN=simcloud.com" >> /etc/sysconfig/network
# vi /etc/sysconfig/ypserv  (pin ypserv to a fixed port)
YPSERV_ARGS="-p 1011"
#/etc/sysconfig/yppasswdd
YPPASSWDD_ARGS="--port 1012"
#cat /etc/ypserv.conf
dns: no
files: 30
xfr_check_port: yes
* : * : shadow.byname : port
* : * : passwd.adjunct.byname : port
systemctl restart rpcbind
systemctl restart ypserv
systemctl restart yppasswdd
systemctl enable rpcbind
systemctl enable ypserv
systemctl enable yppasswdd
rpcinfo -p localhost
rpcinfo -u localhost ypserv
/usr/lib64/yp/ypinit -m
make -C /var/yp
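To confirm that account propagation works end to end, one possible test on the NIS master is to create a throwaway account (testuser is a hypothetical name, not from the original post) and push the maps:
useradd testuser
passwd testuser
make -C /var/yp    # rebuild the NIS maps so clients can see the new account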
client
yum install -y rpcbind yp-tools ypbind
nisdomainname simcloud.com
echo "nisdomainname simcloud.com" >>/etc/rc.local
echo "NISDOMAIN=simcloud.com" >> /etc/sysconfig/network
#cat /etc/nsswitch.conf
passwd: files nis
shadow: files nis
group: files nis
hosts: files nis dns
#cat /etc/sysconfig/authconfig
USENIS=yes
#cat /etc/pam.d/system-auth
password sufficient pam_unix.so sha512 shadow nis nullok try_first_pass use_authtok
#cat /etc/yp.conf
# point "server" at the NIS master's actual IP address or hostname
domain simcloud.com server 192.168.18.128
systemctl restart rpcbind
systemctl restart ypbind
systemctl enable rpcbind
systemctl enable ypbind
yptest
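Besides yptest, the binding can be checked from the client, for example (testuser being the hypothetical account created on the server above):
ypwhich              # should print the NIS server
ypcat passwd | head  # dump the NIS passwd map
id testuser          # resolve an NIS-only account through nsswitch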
3. NFS installation
server
yum -y install nfs-utils
systemctl enable rpcbind
systemctl enable nfs
systemctl start rpcbind
systemctl start nfs
rpcinfo -p localhost | grep nfs
100003 3 tcp 2049 nfs
100003 4 tcp 2049 nfs
100227 3 tcp 2049 nfs_acl
100003 3 udp 2049 nfs
100003 4 udp 2049 nfs
100227 3 udp 2049 nfs_acl
chmod 755 /home
chmod 755 /opt
# vi /etc/exports
# Format: shareDir ip(rw,no_root_squash,no_all_squash,sync)
# ip 192.168.0.0/24: client IP range; * means all clients, i.e. no restriction.
# rw: read-write access.
# sync: write changes to the shared directory synchronously.
# no_root_squash: remote root keeps root privileges.
# no_all_squash: ordinary users keep their own identity.
/home *(rw,no_root_squash,sync)
/opt *(rw,no_root_squash,sync)
systemctl restart nfs
[root@mom01 home]# showmount -e localhost
Export list for localhost:
/opt *
/home *
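Exporting to * is fine for a trusted test setup; to restrict access to the cluster network instead, the entries could use the subnet of the hosts table above (1.1.1.0/24 is an assumption about the netmask):
/home 1.1.1.0/24(rw,no_root_squash,sync)
/opt  1.1.1.0/24(rw,no_root_squash,sync)
exportfs -ra    # re-export after editing /etc/exports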
# To make the mounts permanent, add them to /etc/fstab (see the client section below)
client
yum -y install nfs-utils
systemctl enable rpcbind
systemctl start rpcbind
[root@boy01 ~]# showmount -e mom01
Export list for mom01:
/opt *
/home *
[root@boy01 ~]# mount mom01:/opt /opt
[root@boy01 ~]# mount mom01:/home /home
[root@boy01 ~]# ls /home/
c1 cndaqang test
#vi /etc/fstab
mom01:/home /home nfs defaults 0 0
mom01:/opt /opt nfs defaults 0 0
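A quick way to verify the fstab entries before relying on them at boot (assuming nothing is currently using the mounts):
umount /home /opt
mount -a                # mounts everything listed in /etc/fstab
df -h | grep mom01      # both NFS mounts should show up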
4. Munge installation
yum -y install python
yum -y install python3
yum -y install epel-release
yum -y install gtk2
yum -y install gtk2-devel
yum -y install munge
yum -y install munge-devel
yum -y install perl
yum -y install gcc
yum -y install gcc-c++
yum -y install polkit
systemctl start polkit
mkdir /usr/local/etc
echo "
#slurm
USRLOCAL=/usr/local
export LD_LIBRARY_PATH=\${USRLOCAL}/lib:\$LD_LIBRARY_PATH
export LIBRARY_PATH=\${USRLOCAL}/lib:\$LIBRARY_PATH
export LIBRARY_PATH=\${USRLOCAL}/lib64:\$LIBRARY_PATH
export C_INCLUDE_PATH=\${USRLOCAL}/include:\$C_INCLUDE_PATH
export PATH=\${USRLOCAL}/bin:\$PATH
export PATH=\${USRLOCAL}/sbin:\$PATH
" >> /etc/profile
[root@master source]# /usr/sbin/create-munge-key
Generating a pseudo-random key using /dev/urandom completed.
[root@master source]# scp /etc/munge/munge.key node8:/etc/munge
chown munge:munge /etc/munge
chown munge:munge /var/run/munge
chown munge:munge /var/lib/munge
chown munge:munge /var/log/munge
chown munge:munge /etc/munge/munge.key
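munged is strict about ownership and permissions; the modes below are the commonly recommended layout (not taken from the original post):
chmod 700 /etc/munge
chmod 400 /etc/munge/munge.key
chmod 700 /var/lib/munge /var/log/munge
chmod 755 /var/run/munge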
vi /usr/lib/systemd/system/munge.service
[Unit]
Description=MUNGE authentication service
Documentation=man:munged(8)
After=network.target
After=syslog.target
After=time-sync.target
[Service]
Type=forking
ExecStart=/usr/sbin/munged --syslog
PIDFile=/var/run/munge/munged.pid
User=munge
Group=munge
Restart=on-abort
ExecStartPre=-/usr/bin/mkdir -m 0755 -p /var/log/munge
ExecStartPre=-/usr/bin/chown -R munge:munge /var/log/munge
ExecStartPre=-/usr/bin/mkdir -m 0755 -p /var/run/munge
ExecStartPre=-/usr/bin/chown -R munge:munge /var/run/munge
[Install]
WantedBy=multi-user.target
systemctl daemon-reload
[root@master ~]# systemctl start munge
[root@master ~]# systemctl status munge
[root@client01 ~]# systemctl status munge
● munge.service - MUNGE authentication service
Loaded: loaded (/usr/lib/systemd/system/munge.service; enabled; vendor preset: disabled)
Active: active (running) since Sat 2020-11-07 11:58:51 CST; 10min ago
Docs: man:munged(8)
Process: 4684 ExecStart=/usr/sbin/munged --syslog (code=exited, status=0/SUCCESS)
Process: 4660 ExecStartPre=/usr/bin/chown -R munge:munge /var/run/munge (code=exited, status=0/SUCCESS)
Process: 4609 ExecStartPre=/usr/bin/mkdir -m 0755 -p /var/run/munge (code=exited, status=0/SUCCESS)
Process: 4588 ExecStartPre=/usr/bin/chown -R munge:munge /var/log/munge (code=exited, status=1/FAILURE)
Process: 4546 ExecStartPre=/usr/bin/mkdir -m 0755 -p /var/log/munge (code=exited, status=0/SUCCESS)
Main PID: 4724 (munged)
Tasks: 4
Memory: 672.0K
CGroup: /system.slice/munge.service
└─4724 /usr/sbin/munged --syslog
Nov 07 11:58:51 client01 munged[4724]: Found 3 users with supplementary groups in 0.003 seconds
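Once munge is running on all nodes, credentials can be tested locally and across the network (client01 follows the hosts table; adjust to your node names):
munge -n | unmunge                 # encode and decode a credential locally
munge -n | ssh client01 unmunge    # decode on a remote node; only works if munge.key is identical
remunge                            # simple round-trip benchmark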
5. Slurm installation
[root@master source]# useradd slurm
[root@master source]# passwd slurm
# Push the new account into the NIS maps
[root@master source]# make -C /var/yp
# Not needed on the clients; once NIS is bound, the slurm account appears there automatically
rm -rf /var/spool/slurm-llnl
mkdir /var/spool/slurm-llnl
chown -R slurm.slurm /var/spool/slurm-llnl
rm -rf /var/run/slurm-llnl/
mkdir /var/run/slurm-llnl/
chown -R slurm.slurm /var/run/slurm-llnl/
cd /opt/source/
# Download the latest release from https://download.schedmd.com/slurm/
wget https://download.schedmd.com/slurm/slurm-20.11.0-0rc1.tar.bz2
tar -jxvf slurm-20.11.0-0rc1.tar.bz2
cd slurm-20.11.0-0rc1/
./configure    # installs to /usr/local by default
make -j90      # note: python3 is required
make install
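After make install, the binaries can be checked; because /opt is NFS-shared, the same build tree under /opt/source can likely be reused to run make install on the other nodes as well:
/usr/local/sbin/slurmd -V
/usr/local/bin/sinfo -V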
cp etc/{slurmctld.service,slurmdbd.service,slurmd.service} /usr/lib/systemd/system
[root@master slurm-20.11.0-0rc1]# cat /usr/lib/systemd/system/slurmctld.service
[Unit]
Description=Slurm controller daemon
After=network.target munge.service
ConditionPathExists=/usr/local/etc/slurm.conf
[Service]
Type=simple
EnvironmentFile=-/etc/sysconfig/slurmctld
ExecStart=/usr/local/sbin/slurmctld -D $SLURMCTLD_OPTIONS
ExecReload=/bin/kill -HUP $MAINPID
LimitNOFILE=65536
[Install]
WantedBy=multi-user.target
[root@master slurm-20.11.0-0rc1]# cat /usr/lib/systemd/system/slurmd.service
[Unit]
Description=Slurm node daemon
After=munge.service network.target remote-fs.target
#ConditionPathExists=/usr/local/etc/slurm.conf
[Service]
Type=simple
EnvironmentFile=-/etc/sysconfig/slurmd
ExecStart=/usr/local/sbin/slurmd -D $SLURMD_OPTIONS
ExecReload=/bin/kill -HUP $MAINPID
KillMode=process
LimitNOFILE=131072
LimitMEMLOCK=infinity
LimitSTACK=infinity
Delegate=yes
[Install]
WantedBy=multi-user.target
cat << EOF > /usr/local/etc/slurm.conf
# slurm.conf file generated by configurator.html.
# Put this file on all nodes of your cluster.
# See the slurm.conf man page for more information.
#
SlurmctldHost=master
#SlurmctldHost=
#
#DisableRootJobs=NO
#EnforcePartLimits=NO
#Epilog=
#EpilogSlurmctld=
#FirstJobId=1
#MaxJobId=999999
#GresTypes=
#GroupUpdateForce=0
#GroupUpdateTime=600
#JobFileAppend=0
#JobRequeue=1
#JobSubmitPlugins=1
#KillOnBadExit=0
#LaunchType=launch/slurm
#Licenses=foo*4,bar
#MailProg=/bin/mail
#MaxJobCount=5000
#MaxStepCount=40000
#MaxTasksPerNode=128
MpiDefault=none
#MpiParams=ports=#-#
#PluginDir=
#PlugStackConfig=
#PrivateData=jobs
ProctrackType=proctrack/pgid
#Prolog=
#PrologFlags=
#PrologSlurmctld=
#PropagatePrioProcess=0
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#RebootProgram=
ReturnToService=1
SlurmctldPidFile=/var/spool/slurm-llnl/slurmctld.pid
SlurmctldPort=6817
SlurmdPidFile=/var/spool/slurm-llnl/slurmd.pid
SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurm-llnl
SlurmUser=slurm
#SlurmdUser=root
#SrunEpilog=
#SrunProlog=
StateSaveLocation=/var/spool/slurm-llnl
SwitchType=switch/none
#TaskEpilog=
TaskPlugin=task/affinity
#TaskProlog=
#TopologyPlugin=topology/tree
#TmpFS=/tmp
#TrackWCKey=no
#TreeWidth=
#UnkillableStepProgram=
#UsePAM=0
#
#
# TIMERS
#BatchStartTimeout=10
#CompleteWait=0
#EpilogMsgTime=2000
#GetEnvTimeout=2
#HealthCheckInterval=0
#HealthCheckProgram=
InactiveLimit=0
KillWait=30
#MessageTimeout=10
#ResvOverRun=0
MinJobAge=300
#OverTimeLimit=0
SlurmctldTimeout=120
SlurmdTimeout=300
#UnkillableStepTimeout=60
#VSizeFactor=0
Waittime=0
#
#
# SCHEDULING
#DefMemPerCPU=0
#MaxMemPerCPU=0
#SchedulerTimeSlice=30
SchedulerType=sched/backfill
SelectType=select/cons_tres
SelectTypeParameters=CR_Core
#
#
# JOB PRIORITY
#PriorityFlags=
#PriorityType=priority/basic
#PriorityDecayHalfLife=
#PriorityCalcPeriod=
#PriorityFavorSmall=
#PriorityMaxAge=
#PriorityUsageResetPeriod=
#PriorityWeightAge=
#PriorityWeightFairshare=
#PriorityWeightJobSize=
#PriorityWeightPartition=
#PriorityWeightQOS=
#
#
# LOGGING AND ACCOUNTING
#AccountingStorageEnforce=0
#AccountingStorageHost=
#AccountingStoragePass=
#AccountingStoragePort=
AccountingStorageType=accounting_storage/none
#AccountingStorageUser=
AccountingStoreJobComment=YES
ClusterName=cluster
#DebugFlags=
#JobCompHost=
#JobCompLoc=
#JobCompPass=
#JobCompPort=
JobCompType=jobcomp/none
#JobCompUser=
#JobContainerType=job_container/none
JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/none
SlurmctldDebug=info
#SlurmctldLogFile=
SlurmdDebug=info
#SlurmdLogFile=
#SlurmSchedLogFile=
#SlurmSchedLogLevel=
#
#
# POWER SAVE SUPPORT FOR IDLE NODES (optional)
#SuspendProgram=
#ResumeProgram=
#SuspendTimeout=
#ResumeTimeout=
#ResumeRate=
#SuspendExcNodes=
#SuspendExcParts=
#SuspendRate=
#SuspendTime=
#
#
# COMPUTE NODES
NodeName=master,client01 CPUs=96 State=UNKNOWN
PartitionName=long Nodes=master,client01 Default=YES MaxTime=INFINITE State=UP
EOF
chown slurm:slurm /usr/local/etc/slurm.conf
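Since /usr/local is not NFS-shared in this setup, the same slurm.conf (and the same chown) has to be applied on every node, for example:
scp /usr/local/etc/slurm.conf client01:/usr/local/etc/
scp /usr/local/etc/slurm.conf client02:/usr/local/etc/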
systemctl start slurmd
systemctl enable slurmd
systemctl start slurmctld
systemctl enable slurmctld
[root@master slurm-20.11.0-0rc1]# systemctl status slurmctld
● slurmctld.service - Slurm controller daemon
Loaded: loaded (/usr/lib/systemd/system/slurmctld.service; enabled; vendor preset: disabled)
Active: active (running) since Fri 2020-11-06 22:41:46 CST; 8s ago
Main PID: 101889 (slurmctld)
CGroup: /system.slice/slurmctld.service
└─101889 /usr/local/sbin/slurmctld -D
Nov 06 22:41:46 master systemd[1]: Started Slurm controller daemon.
[root@master slurm-20.11.0-0rc1]# systemctl status slurmd
● slurmd.service - Slurm node daemon
Loaded: loaded (/usr/lib/systemd/system/slurmd.service; enabled; vendor preset: disabled)
Active: active (running) since Fri 2020-11-06 22:41:36 CST; 20s ago
Main PID: 101848 (slurmd)
CGroup: /system.slice/slurmd.service
└─101848 /usr/local/sbin/slurmd -D
Nov 06 22:41:36 master systemd[1]: Started Slurm node daemon.
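With both daemons up, the cluster can be exercised with a trivial test (output will differ on your system):
sinfo                       # nodes should appear as idle in the "long" partition
srun -N 1 hostname          # run a single task through Slurm
srun -N 2 hostname          # one task per node; should print both hostnames
scontrol show node master   # detailed node state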
6. Miscellaneous
# Configure the firewall (only relevant if firewalld is kept enabled instead of being disabled as in section 1)
systemctl start firewalld
firewall-cmd --list-all
firewall-cmd --permanent --add-service=mountd --add-service=nfs --add-service=rpc-bind
firewall-cmd --permanent --add-port=177/udp
firewall-cmd --permanent --add-rich-rule='rule family="ipv4" source address="10.10.10.103" port port="6818" protocol="tcp" accept'
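If firewalld stays enabled, slurmctld (6817/tcp) and slurmd (6818/tcp) must be reachable between all nodes; instead of one rich rule per host, a subnet-wide rule could be used (1.1.1.0/24 is assumed from the hosts table above), and permanent rules only take effect after a reload:
firewall-cmd --permanent --add-rich-rule='rule family="ipv4" source address="1.1.1.0/24" port port="6817-6818" protocol="tcp" accept'
firewall-cmd --reload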
References:
https://cndaqiang.github.io/2020/11/06/slurm-Centos7/
https://cndaqiang.github.io/2019/09/19/Centos7-CC19/
https://www.cnblogs.com/liuyongqian/articles/10789946.html
https://qizhanming.com/blog/2018/08/08/how-to-install-nfs-on-centos-7