#!/opt/perl-5.12.2-emg/bin/perl
#
# EMG WATCHDOG
# $Change: 43049 $
#
# This watchdog monitors the EMG process, checks disk space and more.
# If a problem arises it tries to remedy the problem and sends an e-mail notification.
# It also features an integrated web server which provides a JSON API for EMG server restarts etc.
#
# You can create a file "emg_watchdog.cfg" in the same directory and override conf variables in it:
# @notify_recipients = ('my_name@my_domain.com');
#
# Available api calls:
# /api/ping              - Check that watchdog responds
# /api/emg_start         - Start emgd
# /api/emg_stop          - Stop emgd (it may be restarted by the watchdog on its next check)
# /api/emg_status        - Get status for emgd process
# /api/get_log_file_list - Get list of EMG log files
# /api/get_log_file      - Get contents (tail) of an EMG log file (params: file, maxrows, search_string)
#

use strict;
use warnings;

# Log levels written to the watchdog log and used in notification subjects
use constant {
    LOG_CRIT => 'CRIT',
    LOG_ERR  => 'ERR',
    LOG_WARN => 'WARN',
    LOG_INFO => 'INFO',
};

# Request paths served by the integrated web server
use constant {
    URI_EMG_START         => '/api/emg_start',
    URI_EMG_STOP          => '/api/emg_stop',
    URI_EMG_STATUS        => '/api/emg_status',
    URI_GET_LOG_FILE_LIST => '/api/get_log_file_list',
    URI_GET_LOG_FILE      => '/api/get_log_file',
    URI_PING              => '/api/ping',
};

# You need to install these cpan modules if they are not already present in your system.
# For example, run "cpan Email::Simple" to install that module (and its dependencies).
use Email::Sender::Simple qw(sendmail);
use Email::Sender::Transport::SMTP;
use Email::Simple;
use Email::Simple::Creator;
use File::Basename;
use File::Copy;
use Filesys::Df;
use HTTP::Daemon;
use HTTP::Status;
use IPC::Shareable (':lock');
use JSON;
use Net::Subnet;
use POSIX qw(strftime);
use URI::QueryParam;

#
# Configuration variables - modify to match your environment.
# They are package variables ("our") so that emg_watchdog.cfg can override them.
#

# EMG config dir (also exported to child processes as EMGDIR)
our $emg_dir = '/home/emg/etc';

# Watchdog log file. Note: computed here, so overriding $emg_dir later does
# not move the log file; override $logfile as well if needed.
our $logfile = "$emg_dir/log/emg_watchdog.log";

# Stop emgd quickly (signal emgd to stop, wait a few seconds and then kill it)
our $emg_quickstop = 1;

# How often (in seconds) checks should be run
our $check_interval = 60;

# How often (in seconds) notifications should be sent for a persistent error
# (21600 secs = 6 hours)
our $notify_interval = 21600;

# File system to check for used space
our $fs_to_check = '/';

# File system usage limit (in %); a notification is sent when the limit is exceeded
our $fs_limit = 80;

# Port on which the integrated web server listens for incoming requests
our $listen_port = 3000;

# Recipients for e-mail notifications. Empty-string entries are skipped,
# so the default below means "no recipients".
our @notify_recipients = ('');
# Example with two recipients
#our @notify_recipients = ('john1@example.com', 'john2@example.com');

# E-mail notification "From" address
our $mail_from = 'changeme_from@example.com';

# E-mail notification subject prefix (consider adding hostname for easier identification)
our $mail_subject_prefix = 'EMG watchdog';

# IP address of SMTP server to use for sending e-mails
our $smtp_server = '127.0.0.1';

# Port of SMTP server to use for sending e-mails
our $smtp_port = '25';

# Username for SMTP server authentication (undef = no auth)
our $smtp_username = undef;

# Password for SMTP server authentication (undef = no auth)
our $smtp_password = undef;

# Allowed client ip addresses / subnets for web server
our $allowed_client_ips = subnet_matcher(qw(
    127.0.0.1/32
    192.168.0.0/24
));

# You shouldn't need to modify anything below...
my $dirname = dirname(__FILE__);

# Optional configuration file where the "our" variables above can be overridden
my $cfg_file = "$dirname/emg_watchdog.cfg";
if (-f $cfg_file) {
    require $cfg_file;
}

# Auto-flush output
$| = 1;

my $mail_transport;                 # SMTP transport, created by init_smtp_client
my $emglock;                        # IPC::Shareable object used as lock between watchdog and http server
my $http_server_pid;                # pid of the forked http server (set in the parent only)
my $last_notify_diskspace   = 0;    # time of last disk space notification (0 = none pending)
my $last_notify_emg_startup = 0;    # time of last failed-startup notification (0 = none pending)

# Set up environment variables
$ENV{EMGDIR} = $emg_dir;

# Return $text with undef mapped to '' and control characters replaced by '?',
# so that request-supplied values cannot inject extra lines into the log file.
sub _printable {
    my ($text) = @_;
    $text = '' unless defined $text;
    $text =~ s/[[:cntrl:]]/?/g;
    return $text;
}

# Run $code while holding the EMG lock; the lock is released even if $code dies,
# and the exception is then re-thrown.
sub _with_emg_lock {
    my ($code) = @_;
    $emglock->shlock;
    my $ok  = eval { $code->(); 1 };
    my $err = $@;
    $emglock->shunlock;
    die $err unless $ok;
    return;
}

# Build the notification body used when starting emgd failed:
# the output of 'emgd' followed by the tail of the general log.
sub _emg_startup_failure_details {
    my ($output) = @_;
    my $general_log = `tail -100 $emg_dir/log/general`;
    return "Output from 'emgd':\n" . ($output // '')
         . "\n--\nGeneral log file (last 100 lines):\n" . ($general_log // '');
}

# Start emgd and send a notification about the outcome.
sub emg_start {
    _with_emg_lock(sub {
        notify(LOG_WARN, "EMG startup requested");
        my $output = `emgd`;
        if ($? != 0) {
            sleep 1;
            notify(LOG_CRIT, "EMG startup failed", _emg_startup_failure_details($output));
        }
        else {
            notify(LOG_WARN, "EMG started");
        }
    });
    return;
}

# Stop emgd. With $emg_quickstop set, emgd is asked to stop in the background
# and any remaining emgd processes are killed a few seconds later.
sub emg_stop {
    _with_emg_lock(sub {
        notify(LOG_WARN, "EMG stop requested");
        if ($emg_quickstop) {
            if (system('emgd --stop >/dev/null 2>&1 &') == 0) {
                sleep 3;
            }
            # List form + single quotes: inside backticks "\b" would be
            # interpolated to a backspace character and never match.
            system('pkill', '-9', '-f', 'emgd\b');
            sleep 1;
        }
        else {
            system('emgd --stop >/dev/null');
        }
        notify(LOG_WARN, "EMG stopped");
    });
    return;
}

# Return a list of hashrefs (filename, size, mtime) for the regular,
# non-hidden files in the EMG log directory, sorted by file name.
# Returns an empty list if the directory cannot be read.
sub get_log_file_list {
    my $logdir = "$emg_dir/log";

    my $dh;
    unless (opendir($dh, $logdir)) {
        do_log(LOG_ERR, "Could not open log directory $logdir: $!");
        return ();
    }
    my @names = readdir($dh);
    closedir($dh);

    my @fileinfos;
    foreach my $name (sort @names) {
        # Skip file names that start with "."
        next if $name =~ /^\./;
        my $path = "$logdir/$name";
        next unless -f $path;
        my @stat = stat($path);
        push @fileinfos, {
            filename => $name,
            size     => $stat[7],
            mtime    => strftime("%Y-%m-%d %H:%M:%S", localtime($stat[9])),
        };
    }
    return @fileinfos;
}

# Return the last $maxrows lines (default 100) of a file in the EMG log directory.
#   $file          - plain file name inside the log directory (no path components)
#   $maxrows       - maximum number of lines to return; invalid values fall back to 100
#   $search_string - optional whitespace-separated terms; only lines containing
#                    every term (literal, case-insensitive) are returned
# Returns the lines (with line endings) in list context, or one joined string in
# scalar context. Returns nothing for an invalid or missing file.
# All three arguments come straight from the HTTP query string, so none of them
# is ever passed through a shell.
sub get_log_file {
    my ($file, $maxrows, $search_string) = @_;
    my $logdir = "$emg_dir/log";

    if (!defined $maxrows || $maxrows !~ /\A[0-9]{1,9}\z/ || $maxrows == 0) {
        $maxrows = 100;
    }
    $maxrows += 0;

    # Only plain, non-hidden file names are accepted; this also rejects "..",
    # absolute paths and anything else that could escape the log directory.
    my $valid_name = defined $file
        && $file =~ m{\A[^/[:cntrl:]]+\z}
        && $file !~ /\A\./;
    unless ($valid_name && -f "$logdir/$file") {
        do_log(LOG_WARN, "Rejected request for log file '" . _printable($file) . "'");
        return wantarray ? () : '';
    }

    do_log(LOG_INFO, "Get log file $file, rows $maxrows, search_string " . _printable($search_string));

    my $path  = "$logdir/$file";
    my @terms = grep { length } split(/\s+/, $search_string // '');
    my @rows;

    if (!@terms) {
        # No filter: let tail do the work (list form, so no shell is involved)
        if (open(my $tail, '-|', 'tail', '-n', $maxrows, $path)) {
            @rows = <$tail>;
            close($tail);
        }
        else {
            do_log(LOG_ERR, "Could not run tail on $path: $!");
        }
    }
    elsif (open(my $in, '<', $path)) {
        my @needles = map { lc } @terms;
        LINE: while (my $line = <$in>) {
            my $haystack = lc $line;
            foreach my $needle (@needles) {
                next LINE if index($haystack, $needle) < 0;
            }
            # Keep only the most recent $maxrows matches in memory
            push @rows, $line;
            shift @rows if @rows > $maxrows;
        }
        close($in);
    }
    else {
        do_log(LOG_ERR, "Could not open $path: $!");
    }

    return wantarray ? @rows : join('', @rows);
}

# Return true if $ip matches one of the configured allowed subnets.
sub is_allowed_client_ip {
    my ($ip) = @_;
    return $allowed_client_ips->($ip);
}

# Build a JSON HTTP::Response.
#   $status - HTTP status code (default 200); 200 yields status "ok", anything else "error"
#   $data   - optional payload stored under the "data" key
sub create_response {
    my ($status, $data) = @_;
    $status ||= 200;

    my $content = { status => ($status == 200 ? 'ok' : 'error') };
    $content->{data} = $data if defined $data;

    my $response = HTTP::Response->new($status);
    $response->header('Content-Type' => 'application/json');
    $response->content(encode_json($content));
    return $response;
}

# Handle one request $r on connection $c.
# Returns 1 if a response was sent, 0 if the request is not a known GET api call.
sub process_request {
    my ($c, $r) = @_;

    return 0 if $r->method ne 'GET';

    my $path = $r->uri->path;

    if ($path eq URI_PING) {
        $c->send_response(create_response());
        return 1;
    }
    elsif ($path eq URI_EMG_START) {
        emg_start();
        $c->send_response(create_response());
        return 1;
    }
    elsif ($path eq URI_EMG_STOP) {
        emg_stop();
        $c->send_response(create_response());
        return 1;
    }
    elsif ($path eq URI_GET_LOG_FILE_LIST) {
        my @fileinfos = get_log_file_list();
        $c->send_response(create_response(RC_OK, \@fileinfos));
        return 1;
    }
    elsif ($path eq URI_GET_LOG_FILE) {
        my %query = $r->uri->query_form;
        # "rows" and "grep" are accepted as aliases for "maxrows" and "search_string"
        my $maxrows       = $query{maxrows}       // $query{rows};
        my $search_string = $query{search_string} // $query{grep};
        my @rows = get_log_file($query{file}, $maxrows, $search_string);
        $c->send_response(create_response(RC_OK, \@rows));
        return 1;
    }
    elsif ($path eq URI_EMG_STATUS) {
        my $data = 'unknown';
        if (system('emgstat >/dev/null') == 0) {
            $data = 'running';
        }
        $c->send_response(create_response(RC_OK, $data));
        return 1;
    }

    return 0;
}

# Open the listening socket and fork a child that serves api requests forever.
# The parent remembers the child's pid (so it can be stopped on shutdown) and returns.
# Dies if the socket cannot be opened or the fork fails.
sub start_http_server {
    my $d = HTTP::Daemon->new(LocalPort => $listen_port)
        || die "Could not start http server";

    my $pid = fork();
    die "Could not fork http server: $!" unless defined $pid;
    if ($pid != 0) {
        $http_server_pid = $pid;
        return;
    }

    # Child process from here on
    print "Server started at ", $d->url, "\n";
    while (my $c = $d->accept) {
        my $client_ip = $c->peerhost();
        while (my $r = $c->get_request) {
            if (!is_allowed_client_ip($client_ip)) {
                do_log(LOG_WARN, "Rejected request from ip $client_ip, uri=" . $r->uri->path);
                $c->send_response(create_response(RC_FORBIDDEN, 'Client not allowed'));
                next;
            }
            do_log(LOG_INFO, "Request from ip $client_ip, uri=" . $r->uri->path);
            unless (process_request($c, $r)) {
                $c->send_response(create_response(RC_FORBIDDEN, 'Invalid request'));
            }
        }
        $c->close;
        undef($c);
    }
    exit;
}

# Create the SMTP transport from the $smtp_* configuration variables.
# Credentials are only passed on when a username is configured.
sub init_smtp_client {
    my %params = (
        host => $smtp_server,
        port => $smtp_port,
    );
    if (defined $smtp_username) {
        $params{sasl_username} = $smtp_username;
        $params{sasl_password} = $smtp_password;
    }
    $mail_transport = Email::Sender::Transport::SMTP->new(%params);
    return;
}

# Create the shared lock object. Must run before the http server is forked
# so that both processes use the same lock.
sub init_lock {
    $emglock = tie my $_emglock, 'IPC::Shareable', { key => 'emgl', create => 1, mode => 0600 };
    die "Could not create EMG lock" unless $emglock;
    return;
}

# Run all one-time initialisation.
sub init_all {
    init_smtp_client();
    init_lock();
    return;
}

# Send a single e-mail through the configured transport.
# Propagates any exception thrown by sendmail.
sub notify_send {
    my ($to, $from, $subject, $message) = @_;
    my $email = Email::Simple->create(
        header => [
            To      => $to,
            From    => $from,
            Subject => $subject,
        ],
        body => $message,
    );
    sendmail($email, { transport => $mail_transport });
    return;
}

# Append a timestamped line to the watchdog log file.
# Logging is best effort: if the log file cannot be opened the line goes to STDERR.
sub do_log {
    my ($loglevel, $subject) = @_;
    my $ts = strftime("%Y-%m-%d %H:%M:%S", localtime);

    my $fh;
    unless (open($fh, '>>', $logfile)) {
        print STDERR "$ts $loglevel: $subject (could not open $logfile: $!)\n";
        return;
    }
    print {$fh} "$ts $loglevel: $subject\n";
    close($fh);
    return;
}

# Log $subject and e-mail it to every configured recipient.
#   $loglevel - one of the LOG_* constants
#   $subject  - short description, used for the log line and the mail subject
#   $message  - optional mail body (defaults to $subject)
# A failing mail delivery is logged and does not stop the watchdog.
sub notify {
    my ($loglevel, $subject, $message) = @_;
    $message ||= $subject;
    do_log($loglevel, $subject);

    foreach my $recipient (@notify_recipients) {
        next if !defined $recipient || $recipient eq '';
        my $sent = eval {
            notify_send($recipient, $mail_from, "$mail_subject_prefix - $loglevel: $subject", $message);
            1;
        };
        next if $sent;

        my $err = $@;
        # Exception objects that provide a message method are reported via it
        $err = $err->message if ref $err && eval { $err->can('message') };
        $err = 'unknown error' unless defined $err && length $err;
        do_log(LOG_ERR, "Could not send notification to $recipient: " . _printable($err));
    }
    return;
}

# Verify that emg commands can be run; dies with a hint otherwise.
sub check_env {
    if (system('emgd -v >/dev/null') != 0) {
        print "Could not run \"emgd -v\", environment does not seem to be set up correctly.\n";
        print "EMGDIR is set to $ENV{EMGDIR}, does that seem correct?\n";
        die "Environment check failed\n";
    }
    return;
}

# Check that emgd is running and try to start it if not.
# Failure notifications are sent at most once per $notify_interval.
sub check_emg {
    _with_emg_lock(sub {
        if (system('emgstat >/dev/null') == 0) {
            $last_notify_emg_startup = 0;
            return;
        }

        my $output = `emgd`;
        if ($? != 0) {
            if (time > ($last_notify_emg_startup + $notify_interval)) {
                notify(LOG_CRIT, "EMG not running and startup failed",
                    _emg_startup_failure_details($output));
                $last_notify_emg_startup = time;
            }
            return;
        }

        # emgd reported success; give it a moment and verify that it stays up
        sleep 2;
        if (system('emgstat >/dev/null') != 0) {
            if (time > ($last_notify_emg_startup + $notify_interval)) {
                notify(LOG_CRIT, "EMG not running and startup seems to have failed");
                $last_notify_emg_startup = time;
            }
            return;
        }

        notify(LOG_WARN, "EMG was not running but should now be up");
        $last_notify_emg_startup = 0;
    });
    return;
}

# Check used space on $fs_to_check and notify when it exceeds $fs_limit.
# Notifications are sent at most once per $notify_interval.
sub check_diskspace {
    my $ref = df($fs_to_check);
    unless (defined $ref && defined $ref->{per}) {
        do_log(LOG_ERR, "Could not get file system usage for $fs_to_check");
        return;
    }

    my $pc = $ref->{per};
    if ($pc > $fs_limit) {
        if (time > ($last_notify_diskspace + $notify_interval)) {
            notify(LOG_WARN, "File system $fs_to_check, $pc% full (limit $fs_limit%)");
            $last_notify_diskspace = time;
        }
    }
    else {
        $last_notify_diskspace = 0;
    }
    return;
}

# Stop the http server child, notify and exit when a termination signal is received
sub caught_signal {
    kill('TERM', $http_server_pid) if $http_server_pid;
    notify(LOG_INFO, "Stopped");
    exit;
}

# Verify that we can run emg commands
check_env();

# Init stuff
init_all();

# Spawn http server for serving actions
start_http_server();

notify(LOG_INFO, "Started");
sleep 1;

# Set up signal handling (after the fork, so only the monitor process gets these handlers)
$SIG{INT}  = \&caught_signal;
$SIG{TERM} = \&caught_signal;

# Monitor loop
while (1) {
    check_diskspace();
    check_emg();
    print "Sleeping for $check_interval seconds ... \n";
    sleep $check_interval;
}