SLURM

From The Power of Many

Controller Node

Prerequisites: 1. NTP time synchronization 2. a correctly set hostname
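A minimal sketch of those prerequisites using standard systemd tooling (the hostname is a placeholder; munge requires clocks to agree across the cluster):

timedatectl set-ntp true                        # or: systemctl enable --now chronyd
hostnamectl set-hostname SLURMCTLD.DOMAIN.TLD   # must resolve the same way on every node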

dnf install -y libaec
dnf install -y slurm-slurmctld slurm-slurmdbd slurm-perlapi slurm-slurmrestd slurm-gui slurm-contribs
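Munge needs its shared key in place before the services below will start cleanly. A minimal sketch, assuming the mungekey helper from the munge package is available; the resulting /etc/munge/munge.key must be copied to every node:

/usr/sbin/mungekey --create --keyfile=/etc/munge/munge.key
chown munge:munge /etc/munge/munge.key
chmod 0400 /etc/munge/munge.key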

systemctl enable --now munge.service
systemctl enable --now slurmdbd.service
systemctl enable --now slurmctld.service
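Quick sanity checks once the daemons are up (output is site specific, and sacctmgr will only show the cluster after it has been registered with "sacctmgr add cluster"):

munge -n | unmunge        # verify munge authentication works locally
sinfo                     # slurmctld answers and lists partitions/nodes
sacctmgr show cluster     # slurmctld is registered with slurmdbd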

/etc/slurm/cgroup.conf 

###
#
# Slurm cgroup support configuration file
#
# See man slurm.conf and man cgroup.conf for further
# information on cgroup configuration parameters
#--
CgroupAutomount=yes
ConstrainKmemSpace=no	#avoid known Kernel issues
ConstrainCores=yes
ConstrainRAMSpace=yes
ConstrainSwapSpace=yes
AllowedRAMSpace=500
AllowedSwapSpace=0
MaxRAMPercent=100
MaxSwapPercent=25
TaskAffinity=no		#use task/affinity plugin instead
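A rough reading of the limits above (check man cgroup.conf for your Slurm version): AllowedRAMSpace=500 caps each job's cgroup at 500% of the memory it was allocated, so a job that requested 4 GB is not constrained until it reaches roughly 20 GB of resident memory, while AllowedSwapSpace=0 with ConstrainSwapSpace=yes keeps it from spilling into swap beyond that limit.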

/etc/slurm/slurm.conf

### slurm.conf controls the configuration for Slurm, both for the slurmctld
### daemon and for slurmd on the clients. It also holds the node and
### partition definitions.

############################## Global #####################################
### This section sets global definitions

### Name of the cluster.
ClusterName=CLUSTERNAME
MailDomain=DOMAIN.TLD

### User that slurm will run as.
SlurmUser=slurm

### Sets munge to be our authentication method
AuthType=auth/munge
CryptoType=crypto/munge

### Width of the communications hierarchy.  Recommended to be either N^(1/2)
### or N^(1/3) where N is the number of nodes.
### Currently we set TreeWidth to N^(1/3) because we span multiple
### datacenters with a significant latency penalty.
#TreeWidth=12
#MessageTimeout=100
#RoutePlugin=route/topology
#TopologyPlugin=topology/tree

### We don't set a default version of MPI.
MpiDefault=none

SwitchType=switch/none

### scron settings
### Enable the use of scrontab to submit and manage periodic repeating jobs
ScronParameters=enable
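### A user-side illustration of what scrontab enables (edit with "scrontab -e";
### partition and script are placeholders):
###   #SCRON -p test -t 00:05:00
###   0 * * * * /usr/bin/date >> $HOME/scron.log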

########################### slurmctld #####################################
### This section sets the definitions specific to the slurmctld.

### Network info for slurmctld
SlurmctldHost=SLURMCTLD 
SlurmctldPort=6817
SlurmctldPidFile=/run/slurm/slurmctld.pid
StateSaveLocation=/tools/slurm/var/spool/slurm/ctld
SlurmctldTimeout=120
SlurmctldParameters=preempt_send_user_signal,reboot_from_controller,user_resv_delete

RebootProgram="/usr/sbin/reboot"

### Logging
#SlurmctldDebug=verbose
SlurmctldDebug=debug
SlurmctldLogFile=/var/log/slurm/slurmctld.log
#SlurmctldLogFile=/var/log/messages
#SlurmctldSyslogDebug=verbose
#DebugFlags=Route
DebugFlags=Gres

### Prolog that the slurmctld runs when it schedules jobs.
#PrologSlurmctld=/usr/local/sbin/slurmctld_prolog

### Where we are logging data about job completions.
#JobCompHost=holy-slurm02
JobCompType=jobcomp/filetxt
JobCompLoc=/tools/slurm/var/JobComp.log

########################## slurmd #########################################
### This section sets the definitions specific to the slurmd and cgroups.

SlurmdPort=6818
#SrunPortRange=7845-11845
SlurmdPidFile=/run/slurm/slurmd.pid
SlurmdSpoolDir=/tools/slurm/var/spool/slurm/d
SlurmdTimeout=300
#SlurmdDebug=verbose
SlurmdDebug=debug
SlurmdLogFile=/var/log/slurm/slurmd.log

### Turn on config_overrides, otherwise the logs fill up with reports of bad nodes.
#SlurmdParameters=config_overrides

### Logging
#SlurmdSyslogDebug=verbose

### We are using cgroups to track processes and manage tasks.
ProctrackType=proctrack/cgroup
TaskPlugin=task/affinity,task/cgroup

### We have MCS turned on so that we can run isolated jobs.
#MCSPlugin=mcs/account
MCSPlugin=mcs/group
MCSParameters=ondemand,ondemandselect,privatedata

### What gres types we are using.
#GresTypes=gpu
GresTypes=mem_free
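### A custom gres like this also has to be declared for each node, either in
### gres.conf or on the NodeName lines below. Illustrative gres.conf entry
### (node name and count are placeholders):
###   NodeName=holy7c18301 Name=mem_free Count=184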

### Where scratch space is located on the node.
#TmpFs=/scratch

### Nodes will stay down until reopened by the admin.
### This prevents flapping by nodes.
#ReturnToService=0
ReturnToService=1

### This sets how the cgroup will be set up by the prolog script.
### We need the X11 flag in order to use the Slurm PAM libraries and X11 forwarding.
PrologFlags=Contain,X11
#UsePAM=1
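### With Contain set, pam_slurm_adopt can place incoming ssh sessions into the
### owner's job cgroup. A typical /etc/pam.d/sshd addition looks like this
### (illustrative; site PAM stacks vary):
###   account    required    pam_slurm_adopt.so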

### Location of Epilog Script
#Epilog=/usr/local/bin/slurm_epilog

### Node Health Check
#HealthCheckInterval=500
#HealthCheckNodeState=CYCLE
#HealthCheckProgram=/usr/local/bin/node_monitor

############################ slurmdbd #####################################
### This section sets the definitions for slurmdbd and accounting.
AccountingStorageHost=SLURMDBDHOST
AccountingStoragePort=6819
#AccountingStorageUser=slurm
AccountingStorageType=accounting_storage/slurmdbd

### Determines what we track for accounting.
### AccountingStorageEnforce being set to safe means that if we enforce a
### GrpCPUMins limit, jobs will only launch if they will complete before
### that limit is exhausted.  Otherwise jobs would simply be terminated when
### the limit is hit, regardless of whether they still had time left.
AccountingStorageEnforce=safe
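### The limits being enforced live in the accounting database and are set with
### sacctmgr, e.g. (illustrative account name; GrpTRESMins=cpu is the TRES
### form of GrpCPUMins):
###   sacctmgr modify account examplelab set GrpTRESMins=cpu=10000000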
# DISABLING IB + LUSTRE ACCOUNTING FOR NOW UNTIL ISSUES RESOLVED
#AccountingStorageTRES=Billing,CPU,Energy,Mem,Node,FS/Disk,FS/Lustre,Pages,VMem,IC/OFED,gres/gpu
AccountingStorageTRES=Billing,CPU,Energy,Mem,Node,FS/Disk,Pages,VMem,gres/mem_free
AccountingStoreJobComment=YES
# DISABLING IB + LUSTRE ACCOUNTING FOR NOW UNTIL ISSUES RESOLVED
#AcctGatherInfinibandType=acct_gather_infiniband/ofed
#AcctGatherFilesystemType=acct_gather_filesystem/lustre

### Determines how frequently we will poll for job data.
### We use jobacct_gather/linux rather than cgroups as that is what is
### recommended by Slurm in their documentation, even though we have
### cgroups enabled.
JobAcctGatherType=jobacct_gather/linux
JobAcctGatherFrequency=30
#JobAcctGatherFrequency=task=30,network=30,filesystem=30


########################## Scheduling #####################################
### This section is specific to scheduling

### Tells the scheduler to enforce limits for all partitions
### that a job submits to.
EnforcePartLimits=ALL

### Lets Slurm know that we have a job_submit.lua script
#JobSubmitPlugins=lua

### When a job is launched, these options lock the slurmstepd memory and
### sort NUMA memory.  In addition, for srun invocations we test that the
### executable is actually available and has the correct permissions before
### launching.
LaunchParameters=mem_sort,slurmstepd_memlock_all,test_exec

### Sets the licenses we are running.
### These do not tie into FlexLM or our license server.
#Licenses=lumerical:10,MATLAB_Distrib_Comp_Engine:256,renderman:29
Licenses=pcie_svt_free:195,low_priority:200
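### Jobs request these counted licenses at submit time, e.g. (illustrative):
###   sbatch --licenses=pcie_svt_free:1 job.sh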

### Maximum sizes for Jobs.
#MaxJobCount=300000
#MaxArraySize=10000
#DefMemPerCPU=100
DefCpuPerGPU=1
#DefMemPerGPU=100
GpuFreqDef=low

DefMemPerNode=2600

### Job Timers
CompleteWait=0

### We set EpilogMsgTime high so that epilog messages don't all pile up at
### one time due to a forced exit, which can cause problems for the master.
#EpilogMsgTime=3000000
InactiveLimit=0
KillWait=30

### This only applies to the reservation time limit, the job must still obey
### the partition time limit.
ResvOverRun=UNLIMITED
MinJobAge=300
Waittime=0

### Scheduling parameters
SchedulerType=sched/backfill
SelectType=select/cons_tres
SelectTypeParameters=CR_Core_Memory

### Governs the default preemption behavior
PreemptType=preempt/partition_prio
PreemptMode=REQUEUE

### default_queue_depth should be some multiple of the partition_job_depth,
### ideally number_of_partitions * partition_job_depth, but typically the main
### loop exits prematurely if you go over about 400. A partition_job_depth of
### 10 seems to work well.
SchedulerParameters=\
default_queue_depth=1500,\
partition_job_depth=10,\
bf_continue,\
bf_interval=30,\
bf_resolution=600,\
bf_window=11520,\
bf_max_job_part=0,\
bf_max_job_user=10,\
bf_max_job_test=100000,\
bf_max_job_start=1000,\
bf_ignore_newly_avail_nodes,\
enable_user_top,\
pack_serial_at_end,\
nohold_on_prolog_fail,\
permit_job_expansion,\
preempt_strict_order,\
preempt_youngest_first,\
reduce_completing_frag,\
requeue_setup_env_fail,\
max_rpc_cnt=16
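### For reference: bf_window=11520 minutes is 8 days and should cover the
### longest allowed job time limit, bf_resolution=600 seconds coarsens the
### backfill time map to 10-minute slots, and bf_interval=30 re-runs the
### backfill pass every 30 seconds.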

### This controls how we handle dependencies. If we have invalid dependencies
### we terminate the job.
DependencyParameters=kill_invalid_depend

################################ Fairshare ################################
### This section sets the fairshare calculations

PriorityType=priority/multifactor
PriorityFlags=NO_FAIR_TREE

### Settings for fairshare calculation frequency and shape.
FairShareDampeningFactor=1
PriorityDecayHalfLife=28-0
PriorityCalcPeriod=5

### Settings for fairshare weighting.
PriorityMaxAge=7-0
PriorityWeightAge=1000
PriorityWeightFairshare=2000
PriorityWeightJobSize=0
PriorityWeightPartition=0
PriorityWeightQOS=10000
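### With the JobSize and Partition weights at zero, the multifactor priority
### reduces to roughly:
###   priority ~ 1000*age_factor + 2000*fairshare_factor + 10000*qos_factor
### where each factor is normalized to 0..1, so QOS dominates, fairshare
### breaks ties, and age adds a slow upward drift that tops out at
### PriorityMaxAge.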


###########################################################################
############################## Node Definitions ###########################
### These are ordered alphabetically by node name.
### The AMD nodes do not match the actual hardware specs
### but should get us more job slots.
### Header names are the names of those who bought the hardware.

###########################################################################
########################## Partition Definitions ##########################
### Organized in alphabetical order by group name.
### Then partitions are ordered by partition name.
### Nodes in the partition and groups should be ordered alphabetically.
### TRES Billing formula for CPU: AMD = 0.1, Sandy = 0.2, Broadwell = 0.4
### Skylake = 0.5, Cascade Lake = 1.0
### TRES Billing formula for GPU: TitanX = 2.2, K80 = 15.4, V100 = 75
### TRES Billing formula for Memory: NumCPU*CPUTRES/TotalMem
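### Worked example of the memory formula for a hypothetical 48-core Cascade
### Lake node (CPU TRES 1.0) with 192 GB of RAM: 48*1.0/192 = 0.25 per GB,
### i.e. TRESBillingWeights="CPU=1.0,Mem=0.25G" as in the partition example
### below.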

PartitionName=test State=UP PreemptMode=OFF PriorityTier=1 \
    Nodes=all

### Arguelles Delgado, Physics, FAS ###
### holy7c183[01-11] is on loan from shared ###
###PartitionName=arguelles_delgado State=UP PreemptMode=OFF PriorityTier=4 DefaultTime=0-00:10:00 \
###    TRESBillingWeights="CPU=1.0,Mem=0.25G" \
###    AllowGroups=arguelles_delgado_lab,slurm-admin \
###    Nodes=holy7c183[01-11]

/etc/slurm/slurmdbd.conf

#
# See the slurmdbd.conf man page for more information.
#
# Archive info
#ArchiveJobs=yes
#ArchiveDir="/tmp"
#ArchiveSteps=yes
#ArchiveScript=
#JobPurge=12
#StepPurge=1
#
# Authentication info
AuthType=auth/munge
#AuthInfo=/var/run/munge/munge.socket.2
#
# slurmdbd info
#DebugLevel=4
DebugLevel=verbose
#DefaultQOS=normal,standby
DbdAddr=localhost
DbdHost=SLURMDBDHOST
#DbdPort=6819
DbdPort=6819
#LogFile=
LogFile=/var/log/slurm/slurmdbd.log
#MessageTimeout=300
PidFile=/var/run/slurm/slurmdbd.pid
#PluginDir=
#PrivateData=accounts,users,usage,jobs
PurgeEventAfter=999month
PurgeJobAfter=999month
PurgeResvAfter=999month
PurgeStepAfter=999month
PurgeSuspendAfter=999month
PurgeTXNAfter=999month
PurgeUsageAfter=999month
#SlurmUser=root
SlurmUser=slurm
#TrackWCKey=yes
#
# Database info
StorageType=accounting_storage/mysql
StorageHost=localhost
StoragePort=3306
StoragePass=DBPASSWORD
StorageUser=slurm
StorageLoc=slurm_acct_db
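
Once slurmdbd is running and can reach the database, register the cluster in the accounting database (typically before starting slurmctld), using the placeholder name from slurm.conf:

sacctmgr add cluster CLUSTERNAME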