#!/usr/bin/python3
"""Launch a batch of 'sleep' child processes and supervise them.

The parent blocks SIGCHLD, SIGALRM and SIGUSR1, starts CHILDREN children,
then loops: it sends SIGTERM to any child that has run for TIMEOUT seconds,
arms an alarm for the next child due to time out, and waits in sigsuspend()
for a signal.  SIGCHLD reaps finished children, SIGUSR1 dumps the table of
running children to stderr.  The program exits when no children remain.
"""

#
#  Modules
#

import inspect
import signal
import time
import os
import sys
import pysigset

#
#  Macros
#

MAX_CHILDREN = 10000          # number of slots in the children table
CHILDREN = 1000               # number of children actually started
TIMEOUT = 300                 # seconds a child may run before SIGTERM
A_LONG_TIME = 3600
CHILD_RUN_TIME = lambda n: 1  # seconds the n-th child sleeps for

#
#  Global variables
#

# One dict per slot.  'pid' is 0 for a free slot.  'start' is the launch
# time (integer seconds) and is reset to 0 once the child has been sent
# SIGTERM or has been reaped.
children = []

#
#  Functions
#


def main():
    """Start the children, supervise them until all have exited.

    Returns 0 once every child has been reaped and the original signal
    dispositions and signal mask have been restored.
    """

    #
    #  Initialise.
    #

    infomsg('parent initialising children status table ...')
    for _ in range(MAX_CHILDREN):
        children.append({'pid': 0})

    infomsg('parent setting up signal handlers ...')

    #
    #  Define signal set for two purposes:
    #  (1) for sigprocmask() call,
    #  (2) for sigaction() we need list of *additional* signals to block
    #      while executing the handler (we specify all three signals, which
    #      is slightly more than *just* the additional signals, but it does
    #      no harm)
    #

    sigset = pysigset.SIGSET()
    pysigset.sigaddset(sigset, signal.SIGCHLD)
    pysigset.sigaddset(sigset, signal.SIGALRM)
    pysigset.sigaddset(sigset, signal.SIGUSR1)

    #  cause delivery of SIGCHLD, etc to be delayed until we call sigsuspend()
    old_sigset = pysigset.SIGSET()
    pysigset.sigprocmask(signal.SIG_BLOCK, sigset, old_sigset)

    #  Later we need the list of signals blocked before that sigprocmask()
    #  call, but *excluding* SIGCHLD, etc.  This must be a separate object:
    #  binding a second name to old_sigset would let the sigdelset() calls
    #  below modify the mask we restore on exit.  Blocking an empty set
    #  changes nothing and hands back the current mask (old mask plus our
    #  three signals); removing the three gives the mask we want.  Like the
    #  set built above, this relies on a fresh SIGSET() being empty.
    suspend_sigset = pysigset.SIGSET()
    pysigset.sigprocmask(signal.SIG_BLOCK, pysigset.SIGSET(), suspend_sigset)
    pysigset.sigdelset(suspend_sigset, signal.SIGCHLD)
    pysigset.sigdelset(suspend_sigset, signal.SIGALRM)
    pysigset.sigdelset(suspend_sigset, signal.SIGUSR1)

    #  establish signal handlers
    signal.signal(signal.SIGCHLD, handler)
    signal.signal(signal.SIGALRM, handler)
    signal.signal(signal.SIGUSR1, handler)

    #
    #  Start children.
    #

    infomsg('parent starting %d children ...', CHILDREN)
    for i in range(CHILDREN):
        start_child_sleep(CHILD_RUN_TIME(i))

    #
    #  Main monitoring loop
    #

    infomsg('parent entering monitoring loop ...')
    while True:
        now = int(time.time())

        #
        #  Exit if no running children.
        #

        if not any(child['pid'] != 0 for child in children):
            break

        #
        #  Kill any children that have reached their timeout time and not
        #  been killed already.
        #

        killed_something = False
        for child in children:
            if child['pid'] != 0 and child['start'] != 0 and \
               now >= child['start'] + TIMEOUT:
                os.kill(child['pid'], signal.SIGTERM)
                # start == 0 marks "already sent SIGTERM, awaiting reaping"
                child['start'] = 0
                killed_something = True

        #
        #  Slight optimisation: if something did reach its timeout and got
        #  killed then skip to reassessing if this program can exit.
        #

        if killed_something:
            continue

        #
        #  Schedule timeout alarm of next-to-timeout child.  Every value
        #  here is >= 1 because anything already due was killed above;
        #  0 means there is no child left that still needs a timeout.
        #

        next_timeout = min((child['start'] + TIMEOUT - now
                            for child in children
                            if child['pid'] != 0 and child['start'] != 0),
                           default=0)
        if next_timeout >= 1:
            signal.alarm(next_timeout)

        #
        #  If there are dispatched-but-not-yet-delivered signals then
        #  handle them.  If there are none then wait for one.
        #
        #  This wait is unconditional: our signals are only ever delivered
        #  inside sigsuspend(), so when the only remaining children are
        #  ones already sent SIGTERM (no alarm to schedule) skipping the
        #  wait would spin forever without their SIGCHLD being delivered.
        #

        pysigset.sigsuspend(suspend_sigset)

        #
        #  If SIGCHLD arrived before SIGALRM then the alarm is still
        #  pending.  Cancel it.
        #

        signal.alarm(0)

    #
    #  Clean up and exit.
    #

    infomsg('parent cleaning up and exiting ...')
    signal.signal(signal.SIGUSR1, signal.SIG_DFL)
    signal.signal(signal.SIGALRM, signal.SIG_DFL)
    signal.signal(signal.SIGCHLD, signal.SIG_DFL)
    pysigset.sigprocmask(signal.SIG_SETMASK, old_sigset, None)
    return 0


def start_child_sleep(period):
    """Fork a child that runs 'sleep <period>' and record it in the table.

    period -- number of seconds the child should sleep (formatted with %d).

    Returns the child's pid (in the parent).  Exits the program via
    errormsg() if the table has no free slot or if fork() fails.
    """

    #
    #  Find an empty slot to store info about the process we're
    #  about to launch.  The else-branch runs only when the loop finishes
    #  without finding one; the loop index alone cannot tell us that
    #  because it stops at MAX_CHILDREN-1.
    #

    for i in range(MAX_CHILDREN):
        if children[i]['pid'] == 0:
            break
    else:
        errormsg('start_child_sleep: unable to find a free slot')

    #
    #  Launch a child process and note its pid and start time
    #  in the empty slot.
    #

    buf = 'sleep %d' % (period)
    try:
        pid = os.fork()
    except OSError:
        errormsg('fork failed')
    if pid > 0:
        children[i]['pid'] = pid
        children[i]['start'] = int(time.time())
        return pid

    #
    #  Only the child gets here.  execlp() does not return on success and
    #  raises OSError on failure.  On failure leave with os._exit() so the
    #  child neither runs the parent's code nor flushes the copy of the
    #  parent's stdio buffers it inherited across fork().
    #

    try:
        os.execlp('/bin/sh', 'sh', '-c', buf)
    except OSError:
        finfomsg(sys.stderr, 'exec() failed')
        sys.stderr.flush()
        os._exit(1)


def handler(sig, frame):
    """Signal handler for SIGCHLD, SIGALRM and SIGUSR1.

    SIGCHLD -- reap every child that has exited and free its table slot.
    SIGALRM -- log the event; the main loop does the actual timeout work.
    SIGUSR1 -- dump the table of running children to stderr.
    Any other signal is reported via errormsg(), which exits.
    """

    if sig == signal.SIGCHLD:
        while True:
            #  waitpid() either raises (e.g. no children left) or returns
            #  pid 0 (children exist but none has exited yet).
            try:
                pid = os.waitpid(-1, os.WNOHANG)[0]
            except OSError:
                break
            if pid <= 0:
                break
            for child in children:
                if child['pid'] == pid:
                    child['pid'] = 0
                    child['start'] = 0
                    break

    elif sig == signal.SIGALRM:
        infomsg('parent received SIGALRM')

    elif sig == signal.SIGUSR1:
        finfomsg(sys.stderr, 'parent received SIGUSR1')
        for i, child in enumerate(children):
            if child['pid'] != 0:
                finfomsg(sys.stderr, 'slot:%04d; pid:%05d, start=%ld', i,
                         child['pid'], child['start'])

    else:
        errormsg('parent received unexpected signal %s', sig)


def doubletime():
    """Return the seconds elapsed since the first call to this function.

    The first call records the reference time and therefore returns 0.0.
    """
    now = time.time()
    if not hasattr(doubletime, "start"):
        doubletime.start = now
    return now - doubletime.start


def infomsg(fmt, *args):
    """Write a timestamped message, tagged with the caller's name, to stdout."""
    real_fmessage(inspect.stack()[1].function, sys.stdout, fmt, *args)


def errormsg(fmt, *args):
    """Write a timestamped message to stdout, then exit with status 1."""
    real_fmessage(inspect.stack()[1].function, sys.stdout, fmt, *args)
    sys.exit(1)


def finfomsg(fp, fmt, *args):
    """Write a timestamped message, tagged with the caller's name, to fp."""
    real_fmessage(inspect.stack()[1].function, fp, fmt, *args)


def ferrormsg(fp, fmt, *args):
    """Write a timestamped message to fp, then exit with status 1."""
    real_fmessage(inspect.stack()[1].function, fp, fmt, *args)
    sys.exit(1)


def real_fmessage(func, fp, fmt, *args):
    """Write '<elapsed>: <func>: <fmt % args>' plus a newline to fp.

    func -- name to show as the message's origin.
    fp   -- file object to write to.
    fmt  -- %-style format string, filled in from args.
    """
    fp.write('%.06lf: %s: ' % (doubletime(), func))
    fp.write(fmt % args)
    fp.write('\n')


if __name__ == '__main__':
    sys.exit(main())