SLURM
Controller Node
Before installing Slurm, make sure the controller has:
1. NTP time synchronization configured
2. The correct hostname set
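A minimal sketch of those two prerequisites, assuming chrony provides NTP and using SLURMCTLD.DOMAIN.TLD as a placeholder for the controller's real FQDN:

# set the hostname (placeholder FQDN)
hostnamectl set-hostname SLURMCTLD.DOMAIN.TLD
# install and start chrony for time synchronization
dnf install -y chrony
systemctl enable --now chronyd.service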
dnf install -y libaec
dnf install -y slurm-slurmctld slurm-slurmdbd slurm-perlapi slurm-slurmrestd slurm-gui slurm-contribs
systemctl enable --now munge.service
systemctl enable --now slurmdbd.service
systemctl enable --now slurmctld.service
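Once the daemons are enabled, a quick sanity check (assuming the munge key has already been distributed and the config files below are in place):

# munge should encode and decode a credential locally
munge -n | unmunge
# all three daemons should be active
systemctl status munge slurmdbd slurmctld
# the controller should answer queries
sinfo
scontrol ping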
/etc/slurm/cgroup.conf
###
#
# Slurm cgroup support configuration file
#
# See man slurm.conf and man cgroup.conf for further
# information on cgroup configuration parameters
#--
CgroupAutomount=yes
ConstrainKmemSpace=no #avoid known Kernel issues
ConstrainCores=yes
ConstrainRAMSpace=yes
ConstrainSwapSpace=yes
AllowedRAMSpace=500
AllowedSwapSpace=0
MaxRAMPercent=100
MaxSwapPercent=25
TaskAffinity=no #use task/affinity plugin instead
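With ConstrainCores and ConstrainRAMSpace enabled, each job is placed in a cgroup sized from its allocation. A quick way to check this on a compute node, as a sketch assuming cgroup v1 mounted under /sys/fs/cgroup and a placeholder job ID of 12345 (with cgroup v2 the paths differ):

# submit a small job that requests 1 GB of memory
sbatch --mem=1G --wrap="sleep 300"
# on the node running the job, inspect the memory limit Slurm applied
cat /sys/fs/cgroup/memory/slurm/uid_$(id -u)/job_12345/memory.limit_in_bytes

Note that with AllowedRAMSpace=500 above, the enforced limit is 500% of the requested memory, so the cgroup limit will be larger than the 1 GB request.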
/etc/slurm/slurm.conf
### slurm.conf controls the configuration for Slurm, both for the slurmctld
### daemon and for slurmd on the clients. It also controls node and
### partition definitions.
############################## Global #####################################
### This section sets global definitions
### Name of the cluster.
ClusterName=CLUSTERNAME
MailDomain=DOMAIN.TLD
### User that slurm will run as.
SlurmUser=slurm
### Sets munge to be our authentication method
AuthType=auth/munge
CryptoType=crypto/munge
### Width of the communications hierarchy. Recommended to be either N^(1/2)
### or N^(1/3), where N is the number of nodes.
### We currently set TreeWidth to N^(1/3) because we span multiple
### datacenters with a significant latency penalty between them.
#TreeWidth=12
#MessageTimeout=100
#RoutePlugin=route/topology
#TopologyPlugin=topology/tree
### We don't set a default version of MPI.
MpiDefault=none
SwitchType=switch/none
### scron settings
### Enable the use of scrontab to submit and manage periodic repeating jobs
ScronParameters=enable
########################### slurmctld #####################################
### This section sets the definitions specific to the slurmctld.
### Network info for slurmctld
SlurmctldHost=SLURMCTLD
SlurmctldPort=6817
SlurmctldPidFile=/run/slurm/slurmctld.pid
StateSaveLocation=/tools/slurm/var/spool/slurm/ctld
SlurmctldTimeout=120
SlurmctldParameters=preempt_send_user_signal,reboot_from_controller,user_resv_delete
RebootProgram="/usr/sbin/reboot"
### Logging
#SlurmctldDebug=verbose
SlurmctldDebug=debug
SlurmctldLogFile=/var/log/slurm/slurmctld.log
#SlurmctldLogFile=/var/log/messages
#SlurmctldSyslogDebug=verbose
#DebugFlags=Route
DebugFlags=Gres
### Prolog that the slurmctld runs when it schedules jobs.
#PrologSlurmctld=/usr/local/sbin/slurmctld_prolog
### Where we are logging data about job completions.
#JobCompHost=holy-slurm02
JobCompType=jobcomp/filetxt
JobCompLoc=/tools/slurm/var/JobComp.log
########################## slurmd #########################################
### This section sets the definitions specific to the slurmd and cgroups.
SlurmdPort=6818
#SrunPortRange=7845-11845
SlurmdPidFile=/run/slurm/slurmd.pid
SlurmdSpoolDir=/tools/slurm/var/spool/slurm/d
SlurmdTimeout=300
#SlurmdDebug=verbose
SlurmdDebug=debug
SlurmdLogFile=/var/log/slurm/slurmd.log
### Turning on config_overrides keeps the logs from filling up with reports of bad nodes.
#SlurmdParameters=config_overrides
### Logging
#SlurmdSyslogDebug=verbose
### We are using cgroups to track processes and manage tasks.
ProctrackType=proctrack/cgroup
TaskPlugin=task/affinity,task/cgroup
### We have MCS turned on so that we can run isolated jobs.
#MCSPlugin=mcs/account
MCSPlugin=mcs/group
MCSParameters=ondemand,ondemandselect,privatedata
### What gres types we are using.
#GresTypes=gpu
GresTypes=mem_free
### Where scratch space is located on the node.
#TmpFs=/scratch
### ReturnToService=0 keeps nodes down until reopened by the admin, which
### prevents flapping; with 1, nodes set DOWN for being non-responsive
### return to service once they register with a valid configuration.
#ReturnToService=0
ReturnToService=1
### This sets how the cgroup will be set up by the prolog script.
### Contain is needed for the Slurm PAM libraries; X11 enables Slurm's
### built-in X11 forwarding.
PrologFlags=Contain,X11
#UsePAM=1
### Location of Epilog Script
#Epilog=/usr/local/bin/slurm_epilog
### Node Health Check
#HealthCheckInterval=500
#HealthCheckNodeState=CYCLE
#HealthCheckProgram=/usr/local/bin/node_monitor
############################ slurmdbd #####################################
### This section sets the definitions for slurmdbd and accounting.
AccountingStorageHost=SLURMDBDHOST
AccountingStoragePort=6819
#AccountingStorageUser=slurm
AccountingStorageType=accounting_storage/slurmdbd
### Determines what we track for accounting.
### Setting AccountingStorageEnforce to safe means that if we enforce a
### GrpCPUMins limit, jobs will only launch if they can complete before
### that limit is exhausted. Otherwise jobs would simply be terminated when
### the limit is hit, regardless of how much time they had left.
AccountingStorageEnforce=safe
# DISABLING IB + LUSTRE ACCOUNTING FOR NOW UNTIL ISSUES RESOLVED
#AccountingStorageTRES=Billing,CPU,Energy,Mem,Node,FS/Disk,FS/Lustre,Pages,VMem,IC/OFED,gres/gpu
AccountingStorageTRES=Billing,CPU,Energy,Mem,Node,FS/Disk,Pages,VMem,gres/mem_free
AccountingStoreJobComment=YES
# DISABLING IB + LUSTRE ACCOUNTING FOR NOW UNTIL ISSUES RESOLVED
#AcctGatherInfinibandType=acct_gather_infiniband/ofed
#AcctGatherFilesystemType=acct_gather_filesystem/lustre
### Determines how frequently we will ping for job data.
### We use jobacct_gather/linux rather than jobacct_gather/cgroup, as that
### is what the Slurm documentation recommends, even though we have cgroups
### enabled.
JobAcctGatherType=jobacct_gather/linux
JobAcctGatherFrequency=30
#JobAcctGatherFrequency=task=30,network=30,filesystem=30
########################## Scheduling #####################################
### This section is specific to scheduling
### Tells the scheduler to enforce limits for all partitions
### that a job submits to.
EnforcePartLimits=ALL
### Lets Slurm know that we have a job_submit.lua script
#JobSubmitPlugins=lua
### When a job is launched, this locks slurmstepd into memory and sorts
### NUMA memory at step start. In addition, for srun invocations we test
### that the executable is actually available and has the correct
### permissions before launching.
LaunchParameters=mem_sort,slurmstepd_memlock_all,test_exec
### Sets the licenses we are running.
### These do not tie into FlexLM or our license server.
#Licenses=lumerical:10,MATLAB_Distrib_Comp_Engine:256,renderman:29
Licenses=pcie_svt_free:195,low_priority:200
### Maximum sizes for Jobs.
#MaxJobCount=300000
#MaxArraySize=10000
#DefMemPerCPU=100
DefCpuPerGPU=1
#DefMemPerGPU=100
GpuFreqDef=low
DefMemPerNode=2600
### Job Timers
CompleteWait=0
### We set EpilogMsgTime high so that epilog messages don't all pile up at
### one time after a forced exit, which can cause problems for the controller.
#EpilogMsgTime=3000000
InactiveLimit=0
KillWait=30
### This only applies to the reservation time limit; the job must still obey
### the partition time limit.
ResvOverRun=UNLIMITED
MinJobAge=300
Waittime=0
### Scheduling parameters
SchedulerType=sched/backfill
SelectType=select/cons_tres
SelectTypeParameters=CR_Core_Memory
### Governs default preemption behavior
PreemptType=preempt/partition_prio
PreemptMode=REQUEUE
### default_queue_depth should be some multiple of the partition_job_depth,
### ideally number_of_partitions * partition_job_depth, but typically the main
### loop exits prematurely if you go over about 400. A partition_job_depth of
### 10 seems to work well.
SchedulerParameters=\
default_queue_depth=1500,\
partition_job_depth=10,\
bf_continue,\
bf_interval=30,\
bf_resolution=600,\
bf_window=11520,\
bf_max_job_part=0,\
bf_max_job_user=10,\
bf_max_job_test=100000,\
bf_max_job_start=1000,\
bf_ignore_newly_avail_nodes,\
enable_user_top,\
pack_serial_at_end,\
nohold_on_prolog_fail,\
permit_job_expansion,\
preempt_strict_order,\
preempt_youngest_first,\
reduce_completing_frag,\
requeue_setup_env_fail,\
max_rpc_cnt=16
### This controls how we handle dependencies. If we have invalid dependencies
### we terminate the job.
DependencyParameters=kill_invalid_depend
################################ Fairshare ################################
### This section sets the fairshare calculations
PriorityType=priority/multifactor
PriorityFlags=NO_FAIR_TREE
### Settings for fairshare calculation frequency and shape.
FairShareDampeningFactor=1
PriorityDecayHalfLife=28-0
PriorityCalcPeriod=5
### Settings for fairshare weighting.
PriorityMaxAge=7-0
PriorityWeightAge=1000
PriorityWeightFairshare=2000
PriorityWeightJobSize=0
PriorityWeightPartition=0
PriorityWeightQOS=10000
###########################################################################
############################## Node Definitions ###########################
### These are listed in alphabetical order by node name.
### The AMD node definitions do not match the actual hardware specs
### but should get us more job slots.
### Section headers name the groups that bought the hardware.
###########################################################################
######################### Partition Definitions ###########################
### Organized in alphabetical order by group name.
### Then partitions are ordered by partition name.
### Nodes in the partition and groups should be ordered alphabetically.
### TRES Billing formula for CPU: AMD = 0.1, Sandy = 0.2, Broadwell = 0.4
### Skylake = 0.5, Cascade Lake = 1.0
### TRES Billing formula for GPU: TitanX = 2.2, K80 = 15.4, V100 = 75
### TRES Billing formula for Memory: NumCPU*CPUTRES/TotalMem
PartitionName=test State=UP PreemptMode=OFF PriorityTier=1 \
Nodes=all
### Arguelles Delgado, Physics, FAS ###
### holy7c183[01-11] is on loan from shared ###
###PartitionName=arguelles_delgado State=UP PreemptMode=OFF PriorityTier=4 DefaultTime=0-00:10:00 \
### TRESBillingWeights="CPU=1.0,Mem=0.25G" \
### AllowGroups=arguelles_delgado_lab,slurm-admin \
### Nodes=holy7c183[01-11]
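The slurm.conf above sets ScronParameters=enable, which turns on scrontab for periodic repeating jobs. A minimal usage sketch (the script path is a placeholder): run scrontab -e as the user and add an entry such as:

# run nightly at 03:00 on the test partition with a 10 minute limit
#SCRON -p test
#SCRON -t 0-00:10:00
0 3 * * * /home/USERNAME/bin/nightly_report.sh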
/etc/slurm/slurmdbd.conf
#
# See the slurmdbd.conf man page for more information.
#
# Archive info
#ArchiveJobs=yes
#ArchiveDir="/tmp"
#ArchiveSteps=yes
#ArchiveScript=
#JobPurge=12
#StepPurge=1
#
# Authentication info
AuthType=auth/munge
#AuthInfo=/var/run/munge/munge.socket.2
#
# slurmdbd info
#DebugLevel=4
DebugLevel=verbose
#DefaultQOS=normal,standby
DbdAddr=localhost
DbdHost=SLURMDBDHOST
#DbdPort=6819
DbdPort=6819
#LogFile=
LogFile=/var/log/slurm/slurmdbd.log
#MessageTimeout=300
PidFile=/var/run/slurm/slurmdbd.pid
#PluginDir=
#PrivateData=accounts,users,usage,jobs
PurgeEventAfter=999month
PurgeJobAfter=999month
PurgeResvAfter=999month
PurgeStepAfter=999month
PurgeSuspendAfter=999month
PurgeTXNAfter=999month
PurgeUsageAfter=999month
#SlurmUser=root
SlurmUser=slurm
#TrackWCKey=yes
#
# Database info
StorageType=accounting_storage/mysql
StorageHost=localhost
StoragePort=3306
StoragePass=DBPASSWORD
StorageUser=slurm
StorageLoc=slurm_acct_db
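With slurmdbd up and pointed at the database, the cluster still has to be registered in the accounting database before associations can be enforced. A minimal sketch using the cluster name from slurm.conf; the account and user names are placeholders:

# register the cluster defined in slurm.conf
sacctmgr add cluster CLUSTERNAME
# create an account and add a user to it
sacctmgr add account research_group Description="example group" Organization="DOMAIN.TLD"
sacctmgr add user exampleuser Account=research_group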