Version 2 (modified by jonmills, 8 years ago)



The challenge in monitoring an environment like a Eucalyptus cluster is that it is always changing. Virtual machines are created and destroyed all the time. When virtual machines are running, we want to monitor them. When they no longer exist, we want to stop monitoring them. And most of all, we don't want to constantly alter the configuration of our monitoring system by hand to add and remove these hosts and their affiliated checks. This is where OMD shines, because we can combine the utility of Nagios eventhandlers with the ability of Check_MK to (re-)inventory hosts, rebuild Nagios object configuration, and reload Nagios. The result is a dynamic system that always knows what to monitor, and what not to monitor.

Check_MK inventory Eventhandler

The first step is to set up an eventhandler that can respond to a situation in which the service check "Check_MK inventory" discovers a new service.

extra_nagios_conf += r"""

# Defines an eventhandler (a Nagios command) that will run when service_description "Check_MK inventory" discovers 
# new things on a host that it can monitor.  The purpose is to automatically reconfigure Check_MK
# to monitor those newly-discovered services.
define command {
    command_name    cmk_reinventory

# Map the eventhandler command to the service definition in Nagios
extra_service_conf["event_handler"] = [
	( "cmk_reinventory", ALL_HOSTS, ["Check_MK inventory"]),
# Enable eventhandlers in Nagios for the service definition
extra_service_conf["event_handler_enabled"] = [
	( "1", ALL_HOSTS, ["Check_MK inventory"]),

Host Check Eventhandler

  • In Nagios, a Host Check is always a ping check, and the responses are UP or DOWN depending on whether the host could be reached.
  • We want to define an eventhandler that is triggered by the DOWN state of a host, but only for hosts with the Check_MK tag 'vm'
  • If the host has a 'vm' tag, is in a DOWN state, and is no longer listed as 'running' or 'pending' by euca-describe-instances, then we want to remove it from Check_MK's configuration files (under etc/check_mk/conf.d/), and reload Check_MK & Nagios
extra_nagios_conf += r"""
define command {
    command_name    del_vm
    command_line    $USER4$/local/bin/ $HOSTNAME$ $HOSTSTATE$
extra_host_conf["event_handler"] = [
	( "del_vm", [ "vm" ], ALL_HOSTS ),
extra_host_conf["event_handler_enabled"] = [
	( "1", [ "vm" ], ALL_HOSTS ),

'del_vm' eventhandler script

# Event handler script for removing a virtual machine from Check_MK
# once euca-describe-instances no longer lists it as running or
# pending: the host's lines are deleted from the Check_MK configuration
# and Check_MK is then re-inventoried and rebuilt.
#
# NOTE(review): this listing appears truncated -- no shebang, no
# variable assignments, no case arms and no closing `fi`/`esac` are
# visible. Restore them from the original script before using it.

export PATH="/omd/sites/nagios/lib/perl5/bin:/omd/sites/nagios/local/bin:/omd/sites/nagios/bin:/omd/sites/nagios/local/lib/perl5/bin:/usr/kerberos/bin:/usr/local/bin:/bin:/usr/bin:/opt/local/bin:/opt/local/sbin"

# These are bash args brought in from the command line
# NOTE(review): the assignments themselves are not shown. The code below
# reads HOSTNAME, HOSTSTATE, LOG and OMD_ROOT; presumably HOSTNAME and
# HOSTSTATE come from the first two positional arguments -- confirm.

case "$HOSTSTATE" in

	# Do nothing on ok
	# NOTE(review): the case patterns (and their `;;` terminators) are
	# missing from this listing.


	# We need to verify the $INSTANCE is gone, using euca-describe-instances
	# RESULT holds the exit status of the final grep: 0 when a
	# running/pending line mentions $HOSTNAME, 1 when none does.
       	RESULT=$(euca-describe-instances | egrep '(running|pending)' | grep ${HOSTNAME} >/dev/null; echo $?)
       	if [ $RESULT = 1 ]; then

	# Start a fresh log: the first `>` truncates $LOG, every later
	# write appends to it.
	touch $LOG
	echo $0 > $LOG
	echo `date` >> $LOG
	echo " " >> $LOG

	# Clean up cmk
	# Only stdout of cmk is captured in the log; stderr is not redirected.
	echo "Running cmk --flush $HOSTNAME" >> $LOG
	${OMD_ROOT}/bin/cmk --flush $HOSTNAME >> $LOG

	# Remove the VM from Check_MK
	# sed -i deletes, in place, every line matching $HOSTNAME.
	# NOTE(review): the target path stops at the conf.d/ directory and
	# the log message ends at "from" -- the file names appear to have
	# been lost from this listing, leaving two identical commands.
	echo "Removing $HOSTNAME from" >> $LOG
	/bin/sed -i '/'$HOSTNAME'/ d' ${OMD_ROOT}/etc/check_mk/conf.d/ >> $LOG
	echo "Removing $HOSTNAME from" >> $LOG
	/bin/sed -i '/'$HOSTNAME'/ d' ${OMD_ROOT}/etc/check_mk/conf.d/ >> $LOG

	# Now re-inventory && reload
	echo "Running cmk -IIu" >> $LOG
	${OMD_ROOT}/bin/cmk -IIu >> $LOG
	echo "Running cmk -O" >> $LOG
	${OMD_ROOT}/bin/cmk -O >> $LOG



# Exit status is 0 whenever control reaches this point, regardless of
# how the commands above fared.
exit 0