#!/bin/bash
# $HeadURL$
# $LastChangedRevision$
#
# shepherd: run a job's command inside a dedicated cgroup (v2), applying the
# CPU-affinity, memory and time limits passed in via JS_* environment
# variables (set by jsd), then tear the cgroup down again.  Must run as root.

# Includes
. "$(miniade)" || { echo "${0##*/}: ERROR: miniade failed (hint: run 'miniade' to see error)" >&2; exit 1; }
. <(js-config --format=shell) || { echo "${0##*/}: INTERNAL ERROR: failed to run js-config" >&2; exit 3; }

# Other globals
JSQ_CMD=jsq

# Entry point.  Parses options (-c <command>), sanity-checks the environment,
# creates and restricts the cgroup, runs the command under timeout+sudo, then
# cleans up.  Sets globals CGROUP_VERSION and CGROUP_NAME, which the
# *_cgroup_v$CGROUP_VERSION functions below all read.
main()
{
    local MY_ARGS
    local WARNINGS_ENABLED COMMAND PROGNAME RC

    # Defaults for options
    WARNINGS_ENABLED=${JS_SHEPHERD_WARNINGS_ENABLED:-true}
    COMMAND=

    # Process options.  (special_opts_handler is presumably picked up by name
    # by miniade_process_options -- NOTE(review): confirm against miniade docs.)
    special_opts_handler()
    {
        case $1 in
            -c)
                COMMAND=$2; shift
                ;;
            *)
                return 1
                ;;
        esac
    }
    miniade_process_options --help-handler=help MY_ARGS "$@" && set -- "${MY_ARGS[@]}"

    # Process arguments
    [ $# = 0 ] || miniade_bad_usage
    [ "X$COMMAND" != X ] || miniade_bad_usage

    # Sanity checks and derivations
    miniade_debug 10 "main: performing sanity checks and derivations ..."
    [ "$(id -u)" = 0 ] || miniade_error "this program can only be run as root"
    [ "X$JS_JOB_ID" != X ] || miniade_error "JS_JOB_ID: not set"
    [ "X$JS_USER" != X ] || miniade_error "JS_USER: not set"
    if fgrep -qw cgroup2 /proc/filesystems; then
        CGROUP_VERSION=2
    else
        miniade_error "no cgroups support detected or cgroup version is not 2"
    fi
    CGROUP_NAME=$JS_JOB_ID
    miniade_debug 10 "main: CGROUP_NAME=$CGROUP_NAME"
    if $WARNINGS_ENABLED; then
        miniade_get_progname PROGNAME
        [ "X$JS_CPU_AFFINITY" != X ] || miniade_warning "JS_CPU_AFFINITY: not set (hint: does jsd manage the 'cpu' resource? is it affinous? is $PROGNAME being called by jsd?)"
        [ "X$JS_MEM_LIMIT" != X ] || miniade_warning "JS_MEM_LIMIT: not set (hint: does jsd manage the 'mem' resource? is $PROGNAME being called by jsd?)"
        [ "X$JS_TIME_LIMIT" != X ] || miniade_warning "JS_TIME_LIMIT: not set (hint: does jsd manage the 'time' resource? is $PROGNAME being called by jsd?)"
    fi

    # When making cpuset restrictions, we'll also need
    # to restrict the locations of memory "nodes" (NUMA
    # terminology) that each CPU in the cpuset can access.
    # I've only used systems that don't have NUMA, in
    # which case they have only node '0'. If I ever have
    # more than that then I need to recode a bit, but
    # I won't know how until I'm in that situation. (I'm
    # guessing it would involve making a map of cores to
    # nodes, which is probably simply "which socket is
    # this core in? then that's the memory node it's
    # allowed to access".)
    if [ "X$(ls /sys/devices/system/node | sed -nr 's/node//p')" != X0 ]; then
        miniade_internal "support for multiple memory nodes is not implemented yet (hint: see comments near this message)"
    fi

    # sudo needs to work for root (see below for explanation).
    if ! sudo -u root true; then
        miniade_error "sudo doesn't work (hint: shepherd requires that root be in sudo group; is it?)"
        # Actually it also requires that /etc/sudoers contains '%sudo ALL=(ALL:ALL) ALL',
        # but it usually does by default.
    fi

    create_cgroup_v$CGROUP_VERSION
    restrict_cgroup_v$CGROUP_VERSION
    add_this_process_to_cgroup_v$CGROUP_VERSION

    # Time limits are not managed using cgroups, but just using the timeout
    # command, which we use now to run the actual commands in the job.
    if [ "X$JS_TIME_LIMIT" != X ]; then
        miniade_debug 10 "main: timeout is ${JS_TIME_LIMIT}s"
    fi

    # We used to use 'su' to run a command as another user, but since
    # Debian 11 that no longer works, because su communicates with systemd
    # and systemd runs the command as the specified user and those processes
    # are then outside of this system group! sudo works correctly, but
    # is obviously more painful. Put more succinctly: with 'timeout sudo sleep'
    # we see four processes in the cgroup (the shell running this script,
    # timeout, sudo and sleep) but with 'timeout su sleep' we see only two
    # (the shell running this script and timeout).
    # (A timeout duration of 0 disables the timeout, hence the :-0 default.)
    timeout "${JS_TIME_LIMIT:-0}" sudo -u "$JS_USER" bash -c "$COMMAND" > /dev/null 2>&1
    RC=$?
    # timeout(1) exits 124 when the time limit was hit.
    if [ $RC = 124 ]; then
        miniade_warning "timeout after ${JS_TIME_LIMIT}s!"
    fi

    remove_this_process_from_cgroup_v$CGROUP_VERSION
    kill_all_processes_in_cgroup_v$CGROUP_VERSION
    delete_cgroup_v$CGROUP_VERSION
}

# Print usage and exit 0 (called by miniade's --help handling).
help()
{
    local PROGNAME

    miniade_get_progname PROGNAME
    echo "Usage: $PROGNAME [ <options> ] -c <command>"
    exit 0
}

#create_cgroup_v1()
#{
#    miniade_debug 10 "create_cgroup_v1: shepherd is setting up '$CGROUP_NAME' cgroup ..."
#    # (In principle, the cgroup-name directories should not exist already but during
#    # testing, when the tests keep failing, it's possible that they do, which means
#    # a normal 'mkdir' can fail. However, it is *essential* that this script does
#    # not fail (otherwise there'll be not even an attempt to clean up). For this
#    # reason we first call rmdir before mkdir.)
#    rmdir /sys/fs/cgroup/freezer/$CGROUP_NAME 2>/dev/null
#    mkdir /sys/fs/cgroup/freezer/$CGROUP_NAME
#    if [ "X$JS_CPU_AFFINITY" != X ]; then
#        rmdir /sys/fs/cgroup/cpuset/$CGROUP_NAME 2>/dev/null
#        mkdir /sys/fs/cgroup/cpuset/$CGROUP_NAME
#    fi
#    if [ "X$JS_MEM_LIMIT" != X ]; then
#        # See https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1348688 for more info on this error/check.
#        [ -f /sys/fs/cgroup/memory/memory.memsw.limit_in_bytes ] || miniade_error "memory.memsw.limit_in_bytes: not found (hint: boot with 'swapaccount=1')"
#        rmdir /sys/fs/cgroup/memory/$CGROUP_NAME 2>/dev/null
#        mkdir /sys/fs/cgroup/memory/$CGROUP_NAME
#        # Wait till synchronised.
#        while true; do
#            [ ! -f /sys/fs/cgroup/memory/$CGROUP_NAME/memory.limit_in_bytes ] || break
#            sleep 0.1
#        done
#    fi
#}

# Create /sys/fs/cgroup/$CGROUP_NAME, first making sure the controllers we
# need are delegated from the top-level cgroup.
create_cgroup_v2()
{
    # In order for a facility to be a turn-on-able restriction in a cgroup,
    # it needs to be turned on in the parent cgroup.
    miniade_debug 10 "create_cgroup_v2: checking/enabling facilities in top-level cgroup ..."
    if [ "X$JS_CPU_AFFINITY" != X ]; then
        miniade_debug 10 "create_cgroup_v2: checking cpuset restricting is enabled ..."
        if ! fgrep -qw cpuset /sys/fs/cgroup/cgroup.subtree_control; then
            miniade_debug 10 "create_cgroup_v2: enabling cpuset restricting ..."
            echo +cpuset > /sys/fs/cgroup/cgroup.subtree_control
            # Re-read to confirm: the kernel silently ignores requests for
            # controllers it cannot delegate.
            if ! fgrep -qw cpuset /sys/fs/cgroup/cgroup.subtree_control; then
                miniade_error "failed to enable cpuset restricting in top-level cgroup"
            fi
        fi
    fi
    miniade_debug 10 "create_cgroup_v2: $CGROUP_NAME: creating cgroup ..."
    # If the shepherd didn't clean up before then mkdir will fail. We
    # could discard mkdir's stderr, but safer is to rmdir first and
    # discard rmdir's stderr.
    rmdir "/sys/fs/cgroup/$CGROUP_NAME" 2>/dev/null || true
    mkdir "/sys/fs/cgroup/$CGROUP_NAME"
    if [ "X$JS_CPU_AFFINITY" != X ]; then
        echo +cpuset > "/sys/fs/cgroup/$CGROUP_NAME/cgroup.subtree_control"
        if ! fgrep -qw cpuset "/sys/fs/cgroup/$CGROUP_NAME/cgroup.subtree_control"; then
            miniade_error "failed to enable cpuset restricting in $CGROUP_NAME cgroup"
        fi
    fi
}

#restrict_cgroup_v1()
#{
#    if [ "X$JS_CPU_AFFINITY" != X ]; then
#        echo $JS_CPU_AFFINITY > /sys/fs/cgroup/cpuset/$CGROUP_NAME/cpuset.cpus
#        # For an explanation of this search this script for 'NUMA'.
#        echo 0 > /sys/fs/cgroup/cpuset/$CGROUP_NAME/cpuset.mems
#    fi
#    if [ "X$JS_MEM_LIMIT" != X ]; then
#        TRIED=0
#        while [ $TRIED != 3 ]; do
#            echo ${JS_MEM_LIMIT}k > /sys/fs/cgroup/memory/$CGROUP_NAME/memory.limit_in_bytes && break
#            ((TRIED++))
#            sleep 3
#        done
#        [ $TRIED != 3 ] || miniade_error "memory.limit_in_bytes: failed to set even after $TRIED tries!"
#        TRIED=0
#        while [ $TRIED != 3 ]; do
#            echo ${JS_MEM_LIMIT}k > /sys/fs/cgroup/memory/$CGROUP_NAME/memory.memsw.limit_in_bytes && break
#            ((TRIED++))
#            sleep 3
#        done
#        [ $TRIED != 3 ] || miniade_error "memory.memsw.limit_in_bytes: failed to set even after $TRIED tries!"
#    fi
#}

# Apply the CPU-affinity (JS_CPU_AFFINITY) and memory (JS_MEM_LIMIT, in KiB)
# restrictions, where set, to the job's cgroup.
restrict_cgroup_v2()
{
    if [ "X$JS_CPU_AFFINITY" != X ]; then
        echo "$JS_CPU_AFFINITY" > "/sys/fs/cgroup/$CGROUP_NAME/cpuset.cpus"
        # For an explanation of this search this script for 'NUMA'.
        echo 0 > "/sys/fs/cgroup/$CGROUP_NAME/cpuset.mems"
    fi
    if [ "X$JS_MEM_LIMIT" != X ]; then
        echo "${JS_MEM_LIMIT}k" > "/sys/fs/cgroup/$CGROUP_NAME/memory.max"
        # Under cgroup_v1, the *sum* of memory and swap usage can be
        # restricted, but under cgroup_v2, they are separate. The link
        # https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html
        # says:
        #
        #     The combined memory+swap accounting and limiting [of cgroup_v1]
        #     is replaced by real control over swap space.
        echo 0 > "/sys/fs/cgroup/$CGROUP_NAME/memory.swap.max"
    fi
}

#add_this_process_to_cgroup_v1()
#{
#    # Add ourselves to the cgroup so that our children will be in the cgroup.
#    miniade_debug 10 "add_this_process_to_cgroup_v1: shepherd is putting itself in '$CGROUP_NAME' cgroup ..."
#    echo $$ >> /sys/fs/cgroup/freezer/$CGROUP_NAME/cgroup.procs
#    if [ "X$JS_CPU_AFFINITY" != X ]; then
#        echo $$ >> /sys/fs/cgroup/cpuset/$CGROUP_NAME/cgroup.procs
#    fi
#    if [ "X$JS_MEM_LIMIT" != X ]; then
#        echo $$ >> /sys/fs/cgroup/memory/$CGROUP_NAME/cgroup.procs
#    fi
#}

# Add ourselves to the cgroup so that our children will be in the cgroup.
add_this_process_to_cgroup_v2()
{
    miniade_debug 10 "add_this_process_to_cgroup_v2: shepherd is putting itself in '$CGROUP_NAME' cgroup ..."
    echo $$ >> "/sys/fs/cgroup/$CGROUP_NAME/cgroup.procs"
}

#remove_this_process_from_cgroup_v1()
#{
#    miniade_debug 10 "remove_this_process_from_cgroup_v1: shepherd is taking itself out of '$CGROUP_NAME' cgroup ..."
#    if [ "X$JS_CPU_AFFINITY" != X ]; then
#        echo $$ >> /sys/fs/cgroup/cpuset/cgroup.procs
#    fi
#    if [ "X$JS_MEM_LIMIT" != X ]; then
#        echo $$ >> /sys/fs/cgroup/memory/cgroup.procs
#    fi
#    echo $$ >> /sys/fs/cgroup/freezer/cgroup.procs
#    miniade_debug 10 "remove_this_process_from_cgroup_v1: waiting until last step is synchronised ..."
#    if [ "X$JS_CPU_AFFINITY" != X ]; then
#        while true; do
#            fgrep -qx $$ /sys/fs/cgroup/cpuset/$CGROUP_NAME/cgroup.procs || break
#            sleep 0.1
#        done
#    fi
#    if [ "X$JS_MEM_LIMIT" != X ]; then
#        while true; do
#            fgrep -qx $$ /sys/fs/cgroup/memory/$CGROUP_NAME/cgroup.procs || break
#            sleep 0.1
#        done
#    fi
#    while true; do
#        fgrep -qx $$ /sys/fs/cgroup/freezer/$CGROUP_NAME/cgroup.procs || break
#        sleep 0.1
#    done
#}

# Move ourselves back to the root cgroup so the job's cgroup can be emptied
# and deleted.
remove_this_process_from_cgroup_v2()
{
    miniade_debug 10 "remove_this_process_from_cgroup_v2: shepherd is taking itself out of '$CGROUP_NAME' cgroup ..."
    echo $$ >> /sys/fs/cgroup/cgroup.procs
}

#kill_all_processes_in_cgroup_v1()
#{
#    # At this point *and* if $COMMAND is well-behaved (meaning it made sure that
#    # its child processes all exited before it itself did, regardless of whether
#    # that was from normal exit or timeout) then there should be no processes
#    # left in the cgroup (for cgroups v1 that means more specifically in *any*
#    # facility - freezer, cpu or memory). To check that it's enough to check one
#    # facility.
#    IN_CGROUP_PIDS=$(paste -s -d' ' /sys/fs/cgroup/freezer/$CGROUP_NAME/cgroup.procs)
#    if [ "X$IN_CGROUP_PIDS" != X ]; then
#        miniade_warning "$JS_JOB_ID: child processes (${IN_CGROUP_PIDS// /, }) are still running; killing ..."
#        # Suspend the child processes so more can't be started.
#        echo FROZEN > /sys/fs/cgroup/freezer/$CGROUP_NAME/freezer.state
#        # Wait until synchronised.
#        while true; do
#            { ! fgrep -qx FROZEN /sys/fs/cgroup/freezer/$CGROUP_NAME/freezer.state; } || break
#            sleep 0.1
#        done
#        # Slight race condition: the processes can still be listed in cgroup.procs
#        # in the test above, but be out of it by the time we get the PID list here.
#        # To work around that we discard stderr and kill's exit code.
#        kill -9 $IN_CGROUP_PIDS 2>/dev/null || true
#        # Resume them to allow them to exit.
#        echo THAWED > /sys/fs/cgroup/freezer/$CGROUP_NAME/freezer.state
#        # Wait until synchronised.
#        while true; do
#            [ "X$(cat /sys/fs/cgroup/freezer/$CGROUP_NAME/cgroup.procs)" != X ] || break
#            sleep 0.1
#        done
#    fi
#}

# Kill any processes the job left behind in the cgroup (freeze, SIGKILL,
# thaw), then wait (bounded) for the cgroup to empty.
kill_all_processes_in_cgroup_v2()
{
    local IN_CGROUP_PIDS ATTEMPT

    # At this point *and* if $COMMAND is well-behaved (meaning it made sure that
    # its child processes all exited before it itself did, regardless of whether
    # that was from normal exit or timeout) then there should be no processes
    # left in the cgroup (for cgroups v1 that means more specifically in *any*
    # facility - freezer, cpu or memory). To check that it's enough to check one
    # facility.
    IN_CGROUP_PIDS=$(paste -s -d' ' "/sys/fs/cgroup/$CGROUP_NAME/cgroup.procs")
    miniade_debug 10 "kill_all_processes_in_cgroup_v2: IN_CGROUP_PIDS=$IN_CGROUP_PIDS"
    if [ "X$IN_CGROUP_PIDS" = X ]; then
        miniade_debug 10 "kill_all_processes_in_cgroup_v2: no pids in cgroup; returning early ..."
        return
    fi
    miniade_warning "$JS_JOB_ID: child processes (${IN_CGROUP_PIDS// /, }) are still running; killing ..."
    # Suspend the child processes so more can't be started.
    echo 1 > "/sys/fs/cgroup/$CGROUP_NAME/cgroup.freeze"
    # Slight race condition: the processes can still be listed in cgroup.procs
    # in the test above, but be out of it by the time we get the PID list here.
    # To work around that we discard stderr and kill's exit code.
    kill -9 $IN_CGROUP_PIDS 2>/dev/null || true
    # Resume them to allow them to exit.
    echo 0 > "/sys/fs/cgroup/$CGROUP_NAME/cgroup.freeze"
    # Wait until synchronised (bounded at 100 * 0.1s = 10s).
    for ATTEMPT in {1..100}; do
        IN_CGROUP_PIDS=$(paste -s -d' ' "/sys/fs/cgroup/$CGROUP_NAME/cgroup.procs")
        [ "X$IN_CGROUP_PIDS" != X ] || break
        sleep 0.1
    done
    # Don't fail silently: if pids remain then the rmdir in delete_cgroup_v2
    # will fail, so say why.
    if [ "X$IN_CGROUP_PIDS" != X ]; then
        miniade_warning "$JS_JOB_ID: child processes (${IN_CGROUP_PIDS// /, }) still in cgroup after kill; cgroup deletion will probably fail"
    fi
}

#delete_cgroup_v1()
#{
#    if [ "X$JS_MEM_LIMIT" != X ]; then
#        rmdir /sys/fs/cgroup/memory/$CGROUP_NAME
#    fi
#    if [ "X$JS_CPU_AFFINITY" != X ]; then
#        rmdir /sys/fs/cgroup/cpuset/$CGROUP_NAME
#    fi
#    rmdir /sys/fs/cgroup/freezer/$CGROUP_NAME
#}

# Remove the (now empty) cgroup.
delete_cgroup_v2()
{
    rmdir "/sys/fs/cgroup/$CGROUP_NAME"
}

main "$@"