#!/bin/bash
# $HeadURL$
# $LastChangedRevision$
#
# shepherd: run a job's command inside a dedicated cgroup (v2), applying the
# CPU-affinity, memory and time limits passed in via JS_* environment
# variables (set by jsd), then tear the cgroup down again.  Must run as root.

# Includes
. "$(miniade)" || { echo "${0##*/}: ERROR: miniade failed (hint: run 'miniade' to see error)" >&2; exit 1; }
. <(js-config --format=shell) || { echo "${0##*/}: INTERNAL ERROR: failed to run js-config" >&2; exit 3; }

# Other globals
JSQ_CMD=jsq

# Entry point.  Parses options (-c <command>), sanity-checks the environment,
# creates and restricts the cgroup, runs the command under timeout+sudo, then
# cleans up.  Sets globals CGROUP_VERSION and CGROUP_NAME, which the
# *_cgroup_v$CGROUP_VERSION functions below all read.
main()
{
    local MY_ARGS
    local WARNINGS_ENABLED COMMAND PROGNAME RC

    # Defaults for options
    WARNINGS_ENABLED=${JS_SHEPHERD_WARNINGS_ENABLED:-true}
    COMMAND=

    # Process options.  (special_opts_handler is presumably picked up by name
    # by miniade_process_options -- NOTE(review): confirm against miniade docs.)
    special_opts_handler()
    {
        case $1 in
            -c)
                COMMAND=$2; shift
                ;;
            *)
                return 1
                ;;
        esac
    }
    miniade_process_options --help-handler=help MY_ARGS "$@" && set -- "${MY_ARGS[@]}"

    # Process arguments
    [ $# = 0 ] || miniade_bad_usage
    [ "X$COMMAND" != X ] || miniade_bad_usage

    # Sanity checks and derivations
    miniade_debug 10 "main: performing sanity checks and derivations ..."
    [ "$(id -u)" = 0 ] || miniade_error "this program can only be run as root"
    [ "X$JS_JOB_ID" != X ] || miniade_error "JS_JOB_ID: not set"
    [ "X$JS_USER" != X ] || miniade_error "JS_USER: not set"
    if fgrep -qw cgroup2 /proc/filesystems; then
        CGROUP_VERSION=2
    else
        miniade_error "no cgroups support detected or cgroup version is not 2"
    fi
    CGROUP_NAME=$JS_JOB_ID
    miniade_debug 10 "main: CGROUP_NAME=$CGROUP_NAME"
    if $WARNINGS_ENABLED; then
        miniade_get_progname PROGNAME
        [ "X$JS_CPU_AFFINITY" != X ] || miniade_warning "JS_CPU_AFFINITY: not set (hint: does jsd manage the 'cpu' resource? is it affinous? is $PROGNAME being called by jsd?)"
        [ "X$JS_MEM_LIMIT" != X ] || miniade_warning "JS_MEM_LIMIT: not set (hint: does jsd manage the 'mem' resource? is $PROGNAME being called by jsd?)"
        [ "X$JS_TIME_LIMIT" != X ] || miniade_warning "JS_TIME_LIMIT: not set (hint: does jsd manage the 'time' resource? is $PROGNAME being called by jsd?)"
    fi

    # When making cpuset restrictions, we'll also need
    # to restrict the locations of memory "nodes" (NUMA
    # terminology) that each CPU in the cpuset can access.
    # I've only used systems that don't have NUMA, in
    # which case they have only node '0'. If I ever have
    # more than that then I need to recode a bit, but
    # I won't know how until I'm in that situation. (I'm
    # guessing it would involve making a map of cores to
    # nodes, which is probably simply "which socket is
    # this core in? then that's the memory node it's
    # allowed to access".)
    if [ "X$(ls /sys/devices/system/node | sed -nr 's/node//p')" != X0 ]; then
        miniade_internal "support for multiple memory nodes is not implemented yet (hint: see comments near this message)"
    fi

    # sudo needs to work for root (see below for explanation).
    if ! sudo -u root true; then
        miniade_error "sudo doesn't work (hint: shepherd requires that root be in sudo group; is it?)"
        # Actually it also requires that /etc/sudoers contains '%sudo ALL=(ALL:ALL) ALL',
        # but it usually does by default.
    fi

    create_cgroup_v$CGROUP_VERSION
    restrict_cgroup_v$CGROUP_VERSION
    add_this_process_to_cgroup_v$CGROUP_VERSION

    # Time limits are not managed using cgroups, but just using the timeout
    # command, which we use now to run the actual commands in the job.
    if [ "X$JS_TIME_LIMIT" != X ]; then
        miniade_debug 10 "main: timeout is ${JS_TIME_LIMIT}s"
    fi

    # We used to use 'su' to run a command as another user, but since
    # Debian 11 that no longer works, because su communicates with systemd
    # and systemd runs the command as the specified user and those processes
    # are then outside of this system group! sudo works correctly, but
    # is obviously more painful. Put more succinctly: with 'timeout sudo sleep'
    # we see four processes in the cgroup (the shell running this script,
    # timeout, sudo and sleep) but with 'timeout su sleep' we see only two
    # (the shell running this script and timeout).
    # (A timeout duration of 0 disables the timeout, hence the :-0 default.)
    timeout "${JS_TIME_LIMIT:-0}" sudo -u "$JS_USER" bash -c "$COMMAND" > /dev/null 2>&1
    RC=$?
    # timeout(1) exits 124 when the time limit was hit.
    if [ $RC = 124 ]; then
        miniade_warning "timeout after ${JS_TIME_LIMIT}s!"
    fi

    remove_this_process_from_cgroup_v$CGROUP_VERSION
    kill_all_processes_in_cgroup_v$CGROUP_VERSION
    delete_cgroup_v$CGROUP_VERSION
}

# Print usage and exit 0 (called by miniade's --help handling).
help()
{
    local PROGNAME

    miniade_get_progname PROGNAME
    echo "Usage: $PROGNAME [ <options> ] -c <command>"
    exit 0
}

#create_cgroup_v1()
#{
#    miniade_debug 10 "create_cgroup_v1: shepherd is setting up '$CGROUP_NAME' cgroup ..."
#    # (In principle, the cgroup-name directories should not exist already but during
#    # testing, when the tests keep failing, it's possible that they do, which means
#    # a normal 'mkdir' can fail. However, it is *essential* that this script does
#    # not fail (otherwise there'll be not even an attempt to clean up). For this
#    # reason we first call rmdir before mkdir.)
#    rmdir /sys/fs/cgroup/freezer/$CGROUP_NAME 2>/dev/null
#    mkdir /sys/fs/cgroup/freezer/$CGROUP_NAME
#    if [ "X$JS_CPU_AFFINITY" != X ]; then
#        rmdir /sys/fs/cgroup/cpuset/$CGROUP_NAME 2>/dev/null
#        mkdir /sys/fs/cgroup/cpuset/$CGROUP_NAME
#    fi
#    if [ "X$JS_MEM_LIMIT" != X ]; then
#        # See https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1348688 for more info on this error/check.
#        [ -f /sys/fs/cgroup/memory/memory.memsw.limit_in_bytes ] || miniade_error "memory.memsw.limit_in_bytes: not found (hint: boot with 'swapaccount=1')"
#        rmdir /sys/fs/cgroup/memory/$CGROUP_NAME 2>/dev/null
#        mkdir /sys/fs/cgroup/memory/$CGROUP_NAME
#        # Wait till synchronised.
#        while true; do
#            [ ! -f /sys/fs/cgroup/memory/$CGROUP_NAME/memory.limit_in_bytes ] || break
#            sleep 0.1
#        done
#    fi
#}

# Create /sys/fs/cgroup/$CGROUP_NAME, first making sure the controllers we
# need are delegated from the top-level cgroup.
create_cgroup_v2()
{
    # In order for a facility to be a turn-on-able restriction in a cgroup,
    # it needs to be turned on in the parent cgroup.
    miniade_debug 10 "create_cgroup_v2: checking/enabling facilities in top-level cgroup ..."
    if [ "X$JS_CPU_AFFINITY" != X ]; then
        miniade_debug 10 "create_cgroup_v2: checking cpuset restricting is enabled ..."
        if ! fgrep -qw cpuset /sys/fs/cgroup/cgroup.subtree_control; then
            miniade_debug 10 "create_cgroup_v2: enabling cpuset restricting ..."
            echo +cpuset > /sys/fs/cgroup/cgroup.subtree_control
            # Re-read to confirm: the kernel silently ignores requests for
            # controllers it cannot delegate.
            if ! fgrep -qw cpuset /sys/fs/cgroup/cgroup.subtree_control; then
                miniade_error "failed to enable cpuset restricting in top-level cgroup"
            fi
        fi
    fi
    miniade_debug 10 "create_cgroup_v2: $CGROUP_NAME: creating cgroup ..."
    # If the shepherd didn't clean up before then mkdir will fail. We
    # could discard mkdir's stderr, but safer is to rmdir first and
    # discard rmdir's stderr.
    rmdir "/sys/fs/cgroup/$CGROUP_NAME" 2>/dev/null || true
    mkdir "/sys/fs/cgroup/$CGROUP_NAME"
    if [ "X$JS_CPU_AFFINITY" != X ]; then
        echo +cpuset > "/sys/fs/cgroup/$CGROUP_NAME/cgroup.subtree_control"
        if ! fgrep -qw cpuset "/sys/fs/cgroup/$CGROUP_NAME/cgroup.subtree_control"; then
            miniade_error "failed to enable cpuset restricting in $CGROUP_NAME cgroup"
        fi
    fi
}

#restrict_cgroup_v1()
#{
#    if [ "X$JS_CPU_AFFINITY" != X ]; then
#        echo $JS_CPU_AFFINITY > /sys/fs/cgroup/cpuset/$CGROUP_NAME/cpuset.cpus
#        # For an explanation of this search this script for 'NUMA'.
#        echo 0 > /sys/fs/cgroup/cpuset/$CGROUP_NAME/cpuset.mems
#    fi
#    if [ "X$JS_MEM_LIMIT" != X ]; then
#        TRIED=0
#        while [ $TRIED != 3 ]; do
#            echo ${JS_MEM_LIMIT}k > /sys/fs/cgroup/memory/$CGROUP_NAME/memory.limit_in_bytes && break
#            ((TRIED++))
#            sleep 3
#        done
#        [ $TRIED != 3 ] || miniade_error "memory.limit_in_bytes: failed to set even after $TRIED tries!"
#        TRIED=0
#        while [ $TRIED != 3 ]; do
#            echo ${JS_MEM_LIMIT}k > /sys/fs/cgroup/memory/$CGROUP_NAME/memory.memsw.limit_in_bytes && break
#            ((TRIED++))
#            sleep 3
#        done
#        [ $TRIED != 3 ] || miniade_error "memory.memsw.limit_in_bytes: failed to set even after $TRIED tries!"
#    fi
#}

# Apply the CPU-affinity (JS_CPU_AFFINITY) and memory (JS_MEM_LIMIT, in KiB)
# restrictions, where set, to the job's cgroup.
restrict_cgroup_v2()
{
    if [ "X$JS_CPU_AFFINITY" != X ]; then
        echo "$JS_CPU_AFFINITY" > "/sys/fs/cgroup/$CGROUP_NAME/cpuset.cpus"
        # For an explanation of this search this script for 'NUMA'.
        echo 0 > "/sys/fs/cgroup/$CGROUP_NAME/cpuset.mems"
    fi
    if [ "X$JS_MEM_LIMIT" != X ]; then
        echo "${JS_MEM_LIMIT}k" > "/sys/fs/cgroup/$CGROUP_NAME/memory.max"
        # Under cgroup_v1, the *sum* of memory and swap usage can be
        # restricted, but under cgroup_v2, they are separate. The link
        # https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html
        # says:
        #
        #     The combined memory+swap accounting and limiting [of cgroup_v1]
        #     is replaced by real control over swap space.
        echo 0 > "/sys/fs/cgroup/$CGROUP_NAME/memory.swap.max"
    fi
}

#add_this_process_to_cgroup_v1()
#{
#    # Add ourselves to the cgroup so that our children will be in the cgroup.
#    miniade_debug 10 "add_this_process_to_cgroup_v1: shepherd is putting itself in '$CGROUP_NAME' cgroup ..."
#    echo $$ >> /sys/fs/cgroup/freezer/$CGROUP_NAME/cgroup.procs
#    if [ "X$JS_CPU_AFFINITY" != X ]; then
#        echo $$ >> /sys/fs/cgroup/cpuset/$CGROUP_NAME/cgroup.procs
#    fi
#    if [ "X$JS_MEM_LIMIT" != X ]; then
#        echo $$ >> /sys/fs/cgroup/memory/$CGROUP_NAME/cgroup.procs
#    fi
#}

# Add ourselves to the cgroup so that our children will be in the cgroup.
add_this_process_to_cgroup_v2()
{
    miniade_debug 10 "add_this_process_to_cgroup_v2: shepherd is putting itself in '$CGROUP_NAME' cgroup ..."
    echo $$ >> "/sys/fs/cgroup/$CGROUP_NAME/cgroup.procs"
}

#remove_this_process_from_cgroup_v1()
#{
#    miniade_debug 10 "remove_this_process_from_cgroup_v1: shepherd is taking itself out of '$CGROUP_NAME' cgroup ..."
#    if [ "X$JS_CPU_AFFINITY" != X ]; then
#        echo $$ >> /sys/fs/cgroup/cpuset/cgroup.procs
#    fi
#    if [ "X$JS_MEM_LIMIT" != X ]; then
#        echo $$ >> /sys/fs/cgroup/memory/cgroup.procs
#    fi
#    echo $$ >> /sys/fs/cgroup/freezer/cgroup.procs
#    miniade_debug 10 "remove_this_process_from_cgroup_v1: waiting until last step is synchronised ..."
#    if [ "X$JS_CPU_AFFINITY" != X ]; then
#        while true; do
#            fgrep -qx $$ /sys/fs/cgroup/cpuset/$CGROUP_NAME/cgroup.procs || break
#            sleep 0.1
#        done
#    fi
#    if [ "X$JS_MEM_LIMIT" != X ]; then
#        while true; do
#            fgrep -qx $$ /sys/fs/cgroup/memory/$CGROUP_NAME/cgroup.procs || break
#            sleep 0.1
#        done
#    fi
#    while true; do
#        fgrep -qx $$ /sys/fs/cgroup/freezer/$CGROUP_NAME/cgroup.procs || break
#        sleep 0.1
#    done
#}

# Move ourselves back to the root cgroup so the job's cgroup can be emptied
# and deleted.
remove_this_process_from_cgroup_v2()
{
    miniade_debug 10 "remove_this_process_from_cgroup_v2: shepherd is taking itself out of '$CGROUP_NAME' cgroup ..."
    echo $$ >> /sys/fs/cgroup/cgroup.procs
}

#kill_all_processes_in_cgroup_v1()
#{
#    # At this point *and* if $COMMAND is well-behaved (meaning it made sure that
#    # its child processes all exited before it itself did, regardless of whether
#    # that was from normal exit or timeout) then there should be no processes
#    # left in the cgroup (for cgroups v1 that means more specifically in *any*
#    # facility - freezer, cpu or memory). To check that it's enough to check one
#    # facility.
#    IN_CGROUP_PIDS=$(paste -s -d' ' /sys/fs/cgroup/freezer/$CGROUP_NAME/cgroup.procs)
#    if [ "X$IN_CGROUP_PIDS" != X ]; then
#        miniade_warning "$JS_JOB_ID: child processes (${IN_CGROUP_PIDS// /, }) are still running; killing ..."
#        # Suspend the child processes so more can't be started.
#        echo FROZEN > /sys/fs/cgroup/freezer/$CGROUP_NAME/freezer.state
#        # Wait until synchronised.
#        while true; do
#            { ! fgrep -qx FROZEN /sys/fs/cgroup/freezer/$CGROUP_NAME/freezer.state; } || break
#            sleep 0.1
#        done
#        # Slight race condition: the processes can still be listed in cgroup.procs
#        # in the test above, but be out of it by the time we get the PID list here.
#        # To work around that we discard stderr and kill's exit code.
#        kill -9 $IN_CGROUP_PIDS 2>/dev/null || true
#        # Resume them to allow them to exit.
#        echo THAWED > /sys/fs/cgroup/freezer/$CGROUP_NAME/freezer.state
#        # Wait until synchronised.
#        while true; do
#            [ "X$(cat /sys/fs/cgroup/freezer/$CGROUP_NAME/cgroup.procs)" != X ] || break
#            sleep 0.1
#        done
#    fi
#}

# Kill any processes the job left behind in the cgroup (freeze, SIGKILL,
# thaw), then wait (bounded) for the cgroup to empty.
kill_all_processes_in_cgroup_v2()
{
    local IN_CGROUP_PIDS ATTEMPT

    # At this point *and* if $COMMAND is well-behaved (meaning it made sure that
    # its child processes all exited before it itself did, regardless of whether
    # that was from normal exit or timeout) then there should be no processes
    # left in the cgroup (for cgroups v1 that means more specifically in *any*
    # facility - freezer, cpu or memory). To check that it's enough to check one
    # facility.
    IN_CGROUP_PIDS=$(paste -s -d' ' "/sys/fs/cgroup/$CGROUP_NAME/cgroup.procs")
    miniade_debug 10 "kill_all_processes_in_cgroup_v2: IN_CGROUP_PIDS=$IN_CGROUP_PIDS"
    if [ "X$IN_CGROUP_PIDS" = X ]; then
        miniade_debug 10 "kill_all_processes_in_cgroup_v2: no pids in cgroup; returning early ..."
        return
    fi
    miniade_warning "$JS_JOB_ID: child processes (${IN_CGROUP_PIDS// /, }) are still running; killing ..."
    # Suspend the child processes so more can't be started.
    echo 1 > "/sys/fs/cgroup/$CGROUP_NAME/cgroup.freeze"
    # Slight race condition: the processes can still be listed in cgroup.procs
    # in the test above, but be out of it by the time we get the PID list here.
    # To work around that we discard stderr and kill's exit code.
    kill -9 $IN_CGROUP_PIDS 2>/dev/null || true
    # Resume them to allow them to exit.
    echo 0 > "/sys/fs/cgroup/$CGROUP_NAME/cgroup.freeze"
    # Wait until synchronised (bounded at 100 * 0.1s = 10s).
    for ATTEMPT in {1..100}; do
        IN_CGROUP_PIDS=$(paste -s -d' ' "/sys/fs/cgroup/$CGROUP_NAME/cgroup.procs")
        [ "X$IN_CGROUP_PIDS" != X ] || break
        sleep 0.1
    done
    # Don't fail silently: if pids remain then the rmdir in delete_cgroup_v2
    # will fail, so say why.
    if [ "X$IN_CGROUP_PIDS" != X ]; then
        miniade_warning "$JS_JOB_ID: child processes (${IN_CGROUP_PIDS// /, }) still in cgroup after kill; cgroup deletion will probably fail"
    fi
}

#delete_cgroup_v1()
#{
#    if [ "X$JS_MEM_LIMIT" != X ]; then
#        rmdir /sys/fs/cgroup/memory/$CGROUP_NAME
#    fi
#    if [ "X$JS_CPU_AFFINITY" != X ]; then
#        rmdir /sys/fs/cgroup/cpuset/$CGROUP_NAME
#    fi
#    rmdir /sys/fs/cgroup/freezer/$CGROUP_NAME
#}

# Remove the (now empty) cgroup.
delete_cgroup_v2()
{
    rmdir "/sys/fs/cgroup/$CGROUP_NAME"
}

main "$@"