Calls child program:
vc-restart.setl (Section A.37 [vc-restart.setl])
Textually #includes:
vc-admin.setl (Section A.1 [vc-admin.setl])
Source code: *
const yhwh = `vc-cron.setl';
-- (yhwh holds this program's own file name; it is not referenced in the
-- code visible here - presumably the routines textually included from
-- vc-admin.setl use it. TODO confirm.)
--
-- This is supposed to be run every minute from the user's crontab,
-- using an entry like this:
--
-- * * * * * cd fred; setl vc-cron.setl
--
-- This program basically tries to detect and correct any problems with
-- the Box that is supposed to be permanently running. It tries to do
-- so without going crazy (like the Sorcerer's Apprentice) in the
-- attempt.
--
-- If and when it decides that the Box is down or malfunctioning, and
-- that it is time to try to correct the problem, it will try to clean
-- up (by removing apparently stale locks and processes) and restart
-- the Box.
--
-- All of the names below are paths relative to the current directory
-- (hence the "cd" in the crontab entry), except vc_prefix and
-- vc_restart_cmd.
const my_lock = `vc-cronlock'; -- this program's own lock (mutex), read back with readlink
const vc_cronlog = `vc-cronlog'; -- log of how I exited (appended to by cron_log)
const vc_lock = `vc-lock'; -- Box's lock, a symbolic link whose text embeds the Box's pid
const vc_link = `vc-link.html'; -- link to pseudo- or other document
const vc_prefix = `CGI|'; -- pseudo-document convention: content of vc_link must start with this
const vc_health = `vc-tcp/health'; -- file holding the host:port of the Box's health check service
const vc_camera = `vc-tcp/camera'; -- file holding the host:port of the Box's "command" service
const vc_going = `vc-going'; -- vc-go's lock file
const vc_quitting = `vc-quitting'; -- vc-quit's lock file
const vc_restarting = `vc-restarting'; -- vc-restart's lock file
const vc_rescount = `vc-rescount'; -- file holding the number of restart attempts since the last "success"
const vc_restart_cmd = `exec setl vc-restart.setl'; -- restart command, handed to system()
-- Text of the Box's lock link; declared with var so that it is shared
-- between the main program (which sets it) and proc box (which reads it).
var box_locktext;
-- Try to acquire the lock named by my_lock, if appropriate.
--
-- If another recently started instance of this program is already
-- running, exit quietly and immediately. Otherwise, try to correct
-- the problem with the lock and exit more noisily, in the hope that
-- the cleanup effort (and any action the administrator subsequently
-- takes) will make the lock easy to acquire when the program is run
-- again soon (like in another minute).
--
-- NOTE(review): make_symlink is not defined in this file; presumably it
-- comes from vc-admin.setl and attempts to create my_lock as a symbolic
-- link, leaving its outcome in last_error - confirm.  Likewise, the
-- "Removing ..." messages below rely on cron_exit (via finis) actually
-- removing my_lock - confirm against vc-admin.setl.
--
-- The inequality tests in this section are written with /=.
--
make_symlink;
if (make_symlink_result := last_error) /= no_error then
  -- Could not create the lock.  See whether an existing lock explains why.
  if (locktext := readlink my_lock) /= om then
    -- A missing timestamp counts as 0, i.e. as infinitely old (bogus).
    stamp := extract_timestamp locktext ? 0;
    if stamp < tod - 15*60*1000 then
      -- The lock is at least 15 minutes old, or bogus.
      msg (`Lock file ' + render_lock locktext +
           ` is more than ' + str ((tod - stamp) div 60000) +
           ` minutes old.');
      -- Try to find and blow away the old instantiation of this
      -- program, remove the lock, and exit.
      if (oldpid := extract_pid locktext) /= om then
        if pexists oldpid then
          kill_process (oldpid);
        else
          msg (`However, process ' + str oldpid +
               ` appears to be no longer active.');
        end if;
      else
        msg (`Cannot find pid in ' + str locktext + `.');
      end if;
      msg (`Removing ' + my_lock + ` and exiting with status = 1.');
      cron_exit (1, render_lock locktext + ` old or bogus');
    else
      -- Active instance, if any, is not considered too old.
      if (oldpid := extract_pid locktext) /= om then
        if pexists oldpid then
          -- Bow out gracefully.  Only log and stop here (no cron_exit),
          -- because the lock belongs to the other, still-running instance.
          cron_log (0, `instance ' + str oldpid + ` still active.');
          stop;
        else
          msg (`Lock file ' + render_lock locktext +
               ` indicates a young and active process, but ' +
               str oldpid + ` appears to be no longer active.');
          msg (`Removing ' + my_lock + ` and exiting with status = 1.');
          cron_exit (1, `instance ' + str oldpid +
                        ` disappeared mysteriously');
        end if;
      else
        msg (`Cannot find pid in ' + str locktext + `.');
        msg (`Removing ' + my_lock + ` and exiting with status = 1.');
        cron_exit (1, `no pid found in ' + render_lock locktext);
      end if;
    end if;
  else
    -- The lock could be neither created nor read as a symbolic link.
    -- Report both reasons, then try to get rid of whatever is there.
    msg (`Cannot create lock file ' + my_lock + ` for reason ' +
         str make_symlink_result + `, but cannot read it as a ' +
         `symlink either, for reason ' + str last_error + `.');
    msg (`Attempting unlink of ' + my_lock + `.');
    clear_error;  -- so that last_error below reflects the unlink only
    unlink (my_lock);
    if last_error /= no_error then
      msg (`Unlink attempt failed for reason ' + str last_error + `.');
    elseif lexists my_lock then
      msg (`Unlink appeared to succeed, but ' + my_lock +
           ` still exists.');
    else
      msg (`Unlink apparently successful.');
    end if;
    msg (`Exiting with status = 2.');
    cron_exit (2, `problem with access to ' + my_lock);
  end if;
end if;
-- We have now acquired the mutex lock named in my_lock.
-- Now see if the Box seems to be up and healthy.  Every check below
-- hands control to failure() when it does not pass; failure() ends in
-- cron_exit on every path rather than returning a result.
-- The inequality tests in this section are written with /=.

-- The Box lock file is supposed to be a symbolic link to a string of
-- information (not a real file). Try to read that information:
if (box_locktext := readlink vc_lock) = om then
  failure (`Cannot read ' + vc_lock + ` as a symbolic link');
end if;

-- Look for the Box's process id embedded in the link
if (boxpid := extract_pid box_locktext) = om then
  failure (`Could not find pid in ' + str box_locktext);
end if;

-- Check whether the process thus identified really exists
if not pexists boxpid then
  failure (`Process ' + str boxpid + ` indicated in lock file ' +
           render_link (vc_lock, box_locktext) + ` has disappeared');
end if;

-- Wait to see pseudo-document indicating that Box is up and available.
-- The document is polled once per iteration, limit/interval times at
-- most, pausing interval ms between polls; the loop ends early as soon
-- as the content of vc_link starts with vc_prefix.
--- magic-constants file? vc-limits.setl?
interval := 100;  -- ms
limit := 100000;  -- ms
loop for ms in {interval,2*interval..limit} doing
  flag := (content := getfile vc_link) /= om and
          match (content, vc_prefix) = vc_prefix;
while not flag do
  select (om, interval);  -- sleep for interval ms
end loop;
if not flag then
  failure (box() + ` failed to reach "running" state');
end if;

-- Try to open the Box's health-check service
fd := open_box_service (vc_health, `health check');

-- Exercise the health check: the first line received must be `ok'
-- (an end-of-file, which yields om, therefore also counts as failure)
if (line := getline fd) /= `ok' then
  failure (box() + ` failed health check, reason = ' + str line);
end if;
close (fd); fd := om;

-- Try to open Box's camera-control command service
fd := open_box_service (vc_camera, `camera control command');

-- Absorb its opening niceties: read lines until one consists of a lone
-- `.'; end-of-file is mapped to `.' too, so the loop cannot spin forever
while split (getline fd ? `.') /= [`.'] loop
  pass;
end loop;

-- Tell it to ensure consistency between hardware and software state
printa (fd, `check');
close (fd); fd := om;

-- Zero the restart counter, unlink lock, and exit ``successfully''
putfile (vc_rescount, `0');
cron_exit (0, `OK');
-- Descriptive name for the Box, used in failure messages: the text
-- `Box {...}' wrapped around the rendering of the Box's lock link
-- (vc_lock together with the global box_locktext).
proc box();
  link_text := render_link (vc_lock, box_locktext);
  return `Box {' + link_text + `}';
end proc;
-- Connect to one of the Box's services and return the open stream.
--   vc_hpfile - name of the file in which the Box publishes the
--               host:port of the service
--   what      - human-readable service name, used only in messages
-- Calls failure() if the file cannot be read or the connection
-- cannot be opened.
proc open_box_service (vc_hpfile, what);
  -- The Box is expected to have written the service address to a file
  hostport := getfile vc_hpfile;
  if hostport = om then
    failure (box() + ` failed to create file ' + str vc_hpfile);
  end if;
  -- Connect to that host:port as a client
  sock := open (hostport, `socket');
  if sock = om then
    failure (box() + ` not listening on ' + what + ` port ' + hostport);
  end if;
  return sock;
end proc;
-- This routine is called when the Box appears to be down or
-- malfunctioning, and attempts a restart if conditions conduce.
--   message - description of the problem; it is written to the cron log
--             and, when a restart is attempted, also passed to msg.
-- Every path through this routine ends in cron_exit with status 1.
-- The inequality tests in this routine are written with /=.
proc failure (message);
  -- Update count of restart attempts made since the last time a
  -- properly functioning Box was detected.  A missing counter file is
  -- read as `0'.  A file with no word in it at all leaves raw = om and
  -- is treated like any other corrupted content, rather than being
  -- pattern-matched (which om does not support).
  [raw] := split (getfile vc_rescount ? `0');
  if raw = om or (raw(`[1-9][0-9]*') /= raw and raw /= `0') then
    msg (vc_rescount + ` file corrupted - contains ' + str raw +
         ` instead of a number - treating as 0');
    n := 0;
  else
    n := val raw;
  end if;
  n +:= 1;
  putfile (vc_rescount, str n);

  -- In case of recurring failures, this ratchets the restart attempts
  -- back in powers of 2 up to a maximum interval of 64 units (which is
  -- 64 minutes if this program is run every minute by cron):
  -- attempts happen at n = 1, 2, 4, 8, 16, 32 and then every 64th n.
  if n mod 64 = 0 or exists i in [0..5] | 2**i = n then
    -- Try to restart the Box.
    -- First try to make sure there aren't any active instances of
    -- vc-go, vc-quit, or vc-restart:
    should_run_restart := true;
    loop for lockfile in [vc_going, vc_quitting, vc_restarting]
    while should_run_restart do
      if (locktext := readlink lockfile) /= om then
        -- A missing timestamp counts as 0, i.e. as infinitely old.
        stamp := extract_timestamp locktext ? 0;
        if stamp < tod - 10*60*1000 then
          -- Lock file more than 10 minutes old, or bogus
          msg (`Removing lock file ' + render_lock locktext);
          unlink (lockfile);
          if (stale_pid := extract_pid locktext) /= om then
            if pexists stale_pid then
              kill_process (stale_pid);
            end if;
          else
            msg (`Could not find pid in ' + str locktext);
          end if;
        else
          -- Lock file still quite young. Skip this restart
          -- opportunity - another will come along presently:
          should_run_restart := false;
        end if;
      end if;
    end loop;
    if should_run_restart then
      -- Attempt restart, let the world know about it, and exit:
      msg (message + ` - attempting to restart Box');
      system (vc_restart_cmd);
      cron_exit (1, message + ` - performed ' + str vc_restart_cmd);
    end if;
  end if;

  -- Log the failure, unlink the lock, and exit without spamming
  -- stderr or attempting a restart
  cron_exit (1, message);
end proc;
-- Announce and then terminate the process with the given id: first a
-- kill with the default signal (TERM, per the original author's note),
-- then, a third of a second later, an unconditional KILL.
proc kill_process (process_id);
  announcement := `Killing ' + str process_id + ` ...';
  msg (announcement);
  kill (process_id);          -- polite request first
  select (om, 333);           -- wait 333 ms so the request can take effect
  kill (process_id, `KILL');  -- then make sure
end proc;
-- Append one line to the log file named in vc_cronlog, recording the
-- current date and time, this process's pid, the exit status rc, and
-- the explanatory message.
--
-- The log file named in vc_cronlog should only be used for this
-- purpose; other spam will be spewed on stderr and mailed to
-- the user by cron.
--
-- If the log cannot be opened, the entry is reported through msg
-- instead and the routine returns normally, so that a caller such as
-- cron_exit can still go on to finish up rather than aborting on an
-- om file handle.
proc cron_log (rc, message);
  fd := open (vc_cronlog, `a');
  if fd = om then
    msg (`Cannot open ' + vc_cronlog + ` for appending - exit ' +
         str rc + ` not logged: ' + message);
    return;
  end if;
  printa (fd, fdate(tod), `: pid', pid, `: exit', rc, `:', message);
  close (fd);
end proc;
-- Record how this run ended, then finish.
--   rc      - exit status; written to the cron log and passed to finis
--   message - explanatory text for the cron log entry
-- NOTE(review): finis is not defined in this file; presumably it comes
-- from the included vc-admin.setl, releases my_lock and terminates the
-- program with status rc - confirm.
proc cron_exit (rc, message);
cron_log (rc, message); -- log before finishing
finis (rc);
end proc;
#include ``vc-admin.setl''