Version 6 (modified by jonmills, 8 years ago)

--

Overview

The challenge in monitoring an environment like a Eucalyptus cluster is that it is always changing. Virtual machines are created and destroyed all the time. When virtual machines are running, we want to monitor them. When they no longer exist, we want to stop monitoring them. And most of all, we don't want to constantly alter the configuration of our monitoring system by hand to add and remove these hosts and their affiliated checks. This is where OMD shines, because we can combine the utility of Nagios eventhandlers with the ability of Check_MK to (re-)inventory hosts, rebuild Nagios object configuration, and reload Nagios. The result is a dynamic system that always knows what to monitor, and what not to monitor.

'Check_MK inventory' Eventhandler: Used to add new services discovered by Check_MK

  • The first step is to set up an eventhandler that can respond to a situation in which the service check "Check_MK inventory" discovers a new service.
    • ( $USER4$ is a Nagios custom macro defined in $OMD_ROOT/etc/nagios/resource.cfg -- it corresponds to the value of $OMD_ROOT itself )
# Check_MK main.mk fragment.
# Appends a raw Nagios object definition to extra_nagios_conf. The text between
# the triple quotes is a raw string and is passed on exactly as written, so the
# '#' lines inside it are Nagios comments, not Python comments.
# The command it defines, cmk_reinventory, runs
# $USER4$/local/bin/cmk_reinventory.sh with four Nagios macros as arguments:
# host name, service state, service state type and current check attempt.
extra_nagios_conf += r"""

# Defines an eventhandler (a Nagios command) that will run when service_description "Check_MK inventory" discovers 
# new things on a host that it can monitor.  The purpose is to automatically reconfigure Check_MK
# to monitor those newly-discovered services.
define command {
    command_name    cmk_reinventory
    command_line    $USER4$/local/bin/cmk_reinventory.sh $HOSTNAME$ $SERVICESTATE$ $SERVICESTATETYPE$ $SERVICEATTEMPT$
}
"""

# Map the eventhandler command to the service definition in Nagios.
# Each entry is a tuple of (value, hosts, service descriptions): here the
# Nagios "event_handler" attribute is set to the cmk_reinventory command for
# the "Check_MK inventory" service on ALL_HOSTS.
# Note: this is a plain assignment, so it replaces any "event_handler" rules
# assigned to extra_service_conf earlier in the configuration.
extra_service_conf["event_handler"] = [
	( "cmk_reinventory", ALL_HOSTS, ["Check_MK inventory"]),
]
# Enable eventhandlers in Nagios for the service definition
# (sets "event_handler_enabled" to "1" for the same service on ALL_HOSTS).
extra_service_conf["event_handler_enabled"] = [
	( "1", ALL_HOSTS, ["Check_MK inventory"]),
]

cmk_reinventory Eventhandler script

  • This script rewrites Check_MK's configuration files, then reloads Check_MK, which in turn recompiles the Nagios configuration and reloads the Nagios daemon.
#!/bin/bash
#
# cmk_reinventory.sh -- Nagios event handler for the "Check_MK inventory"
# service check.
#
# Invoked by the Nagios command "cmk_reinventory" as:
#   cmk_reinventory.sh $HOSTNAME$ $SERVICESTATE$ $SERVICESTATETYPE$ $SERVICEATTEMPT$
#
# What happens for each service state:
#   OK                        run add_vm.sh for the host (no re-inventory)
#   WARNING / SOFT, attempt 3 re-inventory the host, reload, run add_vm.sh
#   WARNING / HARD            re-inventory the host, reload, run add_vm.sh
#   UNKNOWN, CRITICAL         nothing
#
# Environment:
#   OMD_ROOT  used to locate ${OMD_ROOT}/local/bin/add_vm.sh
#
# Output:
#   /tmp/cmk_reinventory is overwritten on every run and holds a trace of the
#   most recent invocation, including stdout and stderr of the commands run.
#
# Exit status is always 0; problems are only reported in the log.

export PATH="/omd/sites/nagios/lib/perl5/bin:/omd/sites/nagios/local/bin:/omd/sites/nagios/bin:/omd/sites/nagios/local/lib/perl5/bin:/usr/kerberos/bin:/usr/local/bin:/bin:/usr/bin:/opt/local/bin:/opt/local/sbin"

# Note: The "Check_MK inventory" check is enabled by having
# something like the following in your main.mk file:
#
#	inventory_max_cachefile_age = 120 # seconds (Default: 120 seconds)
#	inventory_check_interval = 120 # minutes
#	inventory_check_severity = 1

# These are bash args brought in from the command line
LOG=/tmp/cmk_reinventory
HOSTNAME=$1
SERVICESTATE=$2
SERVICESTATETYPE=$3
SERVICEATTEMPT=$4

# Append one line to the run log.
log() {
  printf '%s\n' "$*" >> "$LOG"
}

# Run add_vm.sh against $HOSTNAME, logging the command and its output.
# add_vm.sh is called from here rather than from its own event handler.
run_add_vm() {
  if [[ -z "$HOSTNAME" ]]; then
    log "No host name given; skipping add_vm.sh"
    return 1
  fi
  if [[ -z "${OMD_ROOT:-}" ]]; then
    # Without OMD_ROOT the path would silently become /local/bin/add_vm.sh.
    log "OMD_ROOT is not set; cannot locate add_vm.sh"
    return 1
  fi
  log "Run the add_vm.sh script against $HOSTNAME"
  log "${OMD_ROOT}/local/bin/add_vm.sh $HOSTNAME"
  "${OMD_ROOT}/local/bin/add_vm.sh" "$HOSTNAME" >> "$LOG" 2>&1
}

# Re-inventory $HOSTNAME, regenerate/reload the monitoring configuration
# (cmk -O), then give add_vm.sh a chance to pick up new VMs.
reinventory_host() {
  if [[ -z "$HOSTNAME" ]]; then
    # An empty host name would leave "cmk -Iu" without a host argument.
    log "No host name given; skipping re-inventory"
    return 1
  fi
  log "Re-inventorying the host $HOSTNAME"
  cmk -Iu "$HOSTNAME" >> "$LOG" 2>&1
  log "Reloading nagios..."
  cmk -O >> "$LOG" 2>&1
  run_add_vm
}

# Start a fresh log for this run.
{
  echo "$0"
  date
} > "$LOG"

case "$SERVICESTATE" in

  OK)
    # No unchecked services, so no re-inventory is needed. add_vm.sh is still
    # run because this handler is the only place it gets called from.
    log "SERVICESTATE is $SERVICESTATE"
    run_add_vm
    ;;

  WARNING)
    # Because we defined 'inventory_check_severity = 1' in main.mk,
    # unchecked services will result in a warning. Fix it!
    log "SERVICESTATE is $SERVICESTATE"

    case "$SERVICESTATETYPE" in

      SOFT)
        # Nagios is still retrying the check. Act only on the third attempt
        # so that a one-off result does not trigger a reload.
        log "SERVICESTATETYPE is $SERVICESTATETYPE"
        if [[ "$SERVICEATTEMPT" == "3" ]]; then
          log "SERVICEATTEMPT is $SERVICEATTEMPT"
          reinventory_host
        fi
        ;;

      HARD)
        # If somehow we missed the 3rd try of the SOFT state, act now.
        log "SERVICESTATETYPE is $SERVICESTATETYPE"
        reinventory_host
        ;;

    esac
    ;;

  UNKNOWN | CRITICAL)
    # Nothing to do; unchecked services are reported as WARNING.
    ;;

esac

exit 0

add_vm.sh script

  • The purpose of the add_vm.sh script is to figure out if all the KVM Virtual Machines running on a Node Controller are being monitored, and if not, to add the VM and its IP address into Check_MK's configuration.
  • The script is called directly by cmk_reinventory.sh, rather than triggered by a Nagios Event. The reason is that no such event exists to trigger this script! When "Check_MK inventory" discovers a new type of KVM check, it brings that up as a new service in Nagios. But because its initial state is OK, it never triggers an event. (Events are only triggered by a state change, and going from non-existent to OK is not a state change in Nagios' view.)
#!/bin/bash
#
# add_vm.sh -- add the VMs found on a KVM node to Check_MK.
#
# Usage:
#   add_vm.sh <hostname>
# Called by cmk_reinventory.sh with the node's host name.
#
# For every VM named in a 'qemu' entry of the node's autochecks file that is
# not yet mentioned in hosts.mk, this script:
#   1. looks up the VM's IP address with euca-describe-addresses,
#   2. appends '"<vm>" : "<ip>",' after the line matching "ipaddresses" in
#      ipaddresses.mk,
#   3. appends "'<vm>|ping|vm|<node>'," after the line matching the node's
#      host name in hosts.mk,
#   4. inventories the new host (cmk -Iu).
# If at least one VM was added, the configuration is reloaded (cmk -O).
#
# Environment:
#   OMD_ROOT  root of the OMD site; should be set by OMD itself.
#
# Output:
#   /tmp/add_vm.sh is overwritten on every run with a trace of the run.
#
# Exit status is always 0; problems are only reported in the log.
#
# Requires GNU sed (-i and the one-line form of the 'a' command).

export PATH="/omd/sites/nagios/lib/perl5/bin:/omd/sites/nagios/local/bin:/omd/sites/nagios/bin:/omd/sites/nagios/local/lib/perl5/bin:/usr/kerberos/bin:/usr/local/bin:/bin:/usr/bin:/opt/local/bin:/opt/local/sbin"

HOSTNAME=$1
# Short node name: everything before the first dot of the host name.
NODE=${HOSTNAME%%.*}
LOG=/tmp/add_vm.sh

# Append one line to the run log.
log() {
  printf '%s\n' "$*" >> "$LOG"
}

# Start a fresh log for this run.
: > "$LOG"

if [[ -z "$HOSTNAME" || -z "${OMD_ROOT:-}" ]]; then
  log "Usage: $0 <hostname> (OMD_ROOT must be set); nothing done"
  exit 0
fi

AUTOCHECKS="${OMD_ROOT}/var/check_mk/autochecks/${HOSTNAME}.mk"
HOSTS_MK="${OMD_ROOT}/etc/check_mk/conf.d/hosts.mk"
IPADDRESSES_MK="${OMD_ROOT}/etc/check_mk/conf.d/ipaddresses.mk"

# Host name with dots escaped, for use as a sed address.
# NOTE(review): this still matches any line that contains the host name. If
# nodes are configured with short names, previously added VM lines (which end
# in "|<node>") would match too -- confirm against the real hosts.mk.
HOST_RE="${HOSTNAME//./\\.}"

# This yields the hostnames of VMs running on $HOSTNAME: the third field of
# every 'qemu' line, with the trailing "'," and the leading "'" removed.
if [[ -r "$AUTOCHECKS" ]]; then
  VMLIST=$(awk '/qemu/ {print $3}' "$AUTOCHECKS" | sed -e "s/',//" -e "s/'//")
else
  log "Cannot read $AUTOCHECKS"
  VMLIST=
fi
log "VMs running on $HOSTNAME are:"
# Word splitting is intentional: puts all VM names on one log line.
# shellcheck disable=SC2086
log $VMLIST

ADDED=0

# Word splitting is intentional: one VM name per word.
for INSTANCE in $VMLIST; do

  # First, see if the INSTANCE exists in the hosts.mk file
  grep -qF -- "$INSTANCE" "$HOSTS_MK" 2>> "$LOG"
  case $? in
    0) continue ;; # already monitored
    1) ;;          # not there yet: add it below
    *)
      log "Cannot search $HOSTS_MK; skipping $INSTANCE"
      continue
      ;;
  esac

  # Take the address from the first matching line only, so IP is always a
  # single word.
  IP=$(euca-describe-addresses 2>> "$LOG" \
    | awk -v id="$INSTANCE" 'index($0, id) { print $2; exit }')
  log "Instance $INSTANCE on $HOSTNAME has IP $IP"

  if [[ -z "$IP" ]]; then
    # Without an address nothing is added to the configuration, so there
    # is no new host to inventory either.
    log "No IP address found for $INSTANCE; skipping it"
    continue
  fi

  # We need to add it to the ipaddresses.mk file for fakedns
  sed -i "/ipaddresses/a\"${INSTANCE}\" : \"${IP}\"," "$IPADDRESSES_MK" >> "$LOG" 2>&1

  # Add this hostname to the hosts.mk file, sorting it right below its KVM
  # node's name
  sed -i "/${HOST_RE}/a'${INSTANCE}|ping|vm|${NODE}'," "$HOSTS_MK" >> "$LOG" 2>&1

  # Now inventory the new host
  log "cmk -Iu $INSTANCE"
  "${OMD_ROOT}/bin/cmk" -Iu "$INSTANCE" >> "$LOG" 2>&1

  ADDED=1
done

# Now reload Nagios, but only if the configuration actually changed
if [[ "$ADDED" -eq 1 ]]; then
  "${OMD_ROOT}/bin/cmk" -O >> "$LOG" 2>&1
fi

exit 0

Removing VMs: Host Check Eventhandler

  • In Nagios, a Host Check is always a ping check, and the responses are UP or DOWN depending on whether the host could be reached.
  • We want to define an eventhandler that is triggered by the DOWN state of a host, but only for hosts with the Check_MK tag 'vm'
  • If the host has a 'vm' tag, and is in a DOWN state, and is no longer listed as 'running' or 'pending' by euca-describe-instances, then we want to remove it from Check_MK's hosts.mk & ipaddresses.mk files, and reload Check_MK & Nagios
# Check_MK main.mk fragment.
# Appends a raw Nagios command definition named del_vm, which runs
# $USER4$/local/bin/del_vm.sh with the host name and host state macros.
extra_nagios_conf += r"""
define command {
    command_name    del_vm
    command_line    $USER4$/local/bin/del_vm.sh $HOSTNAME$ $HOSTSTATE$
}
"""
# Use del_vm as the host event handler. Each entry is a tuple of
# (value, host tags, hosts): only hosts carrying the "vm" tag are affected.
# Note: this is a plain assignment, so it replaces any "event_handler" rules
# assigned to extra_host_conf earlier in the configuration.
extra_host_conf["event_handler"] = [
	( "del_vm", [ "vm" ], ALL_HOSTS ),
]	
# Enable event handlers for the same set of hosts.
extra_host_conf["event_handler_enabled"] = [
	( "1", [ "vm" ], ALL_HOSTS ),
]

'del_vm' eventhandler script

#!/bin/bash
#
# del_vm.sh -- Nagios host event handler that removes a vanished VM from
# Check_MK.
#
# Invoked by the Nagios command "del_vm" as:
#   del_vm.sh $HOSTNAME$ $HOSTSTATE$
#
# What happens for each host state:
#   UP    nothing
#   DOWN  if euca-describe-instances no longer lists the host as running or
#         pending: flush it from Check_MK, delete its lines from hosts.mk
#         and ipaddresses.mk, then re-inventory and reload.
#         If the instance list cannot be obtained, nothing is removed.
#
# Environment:
#   OMD_ROOT  root of the OMD site (location of bin/cmk and etc/check_mk).
#
# Output:
#   /tmp/del_vm.sh is overwritten whenever a removal is attempted or an
#   error is detected.
#
# Exit status is always 0; problems are only reported in the log.
#
# Requires GNU sed (-i).

export PATH="/omd/sites/nagios/lib/perl5/bin:/omd/sites/nagios/local/bin:/omd/sites/nagios/bin:/omd/sites/nagios/local/lib/perl5/bin:/usr/kerberos/bin:/usr/local/bin:/bin:/usr/bin:/opt/local/bin:/opt/local/sbin"

# These are bash args brought in from the command line
LOG=/tmp/del_vm.sh
HOSTNAME=$1
HOSTSTATE=$2

# Append one line to the run log.
log() {
  printf '%s\n' "$*" >> "$LOG"
}

# Start a fresh log with the common header lines.
start_log() {
  {
    echo "$0"
    date
    echo "HOSTNAME is $HOSTNAME"
    echo "HOSTSTATE is $HOSTSTATE"
    echo " "
  } > "$LOG"
}

# Is instance $1 listed as running or pending?
# Returns: 0 = yes, 1 = no, 2 = euca-describe-instances itself failed.
# The failure case must be kept apart from "not listed": otherwise an API or
# credentials problem would look like a terminated instance.
instance_is_active() {
  local listing
  listing=$(euca-describe-instances) || return 2
  printf '%s\n' "$listing" | grep -E '(running|pending)' | grep -qF -- "$1"
}

# Remove $HOSTNAME from Check_MK and reload the monitoring configuration.
remove_vm() {
  # Host name with dots escaped, for use as a sed address. Every line that
  # contains the host name is deleted.
  local host_re="${HOSTNAME//./\\.}"

  start_log

  # Clean up cmk
  log "Running cmk --flush $HOSTNAME"
  "${OMD_ROOT}/bin/cmk" --flush "$HOSTNAME" >> "$LOG" 2>&1

  # Remove the VM from Check_MK
  log "Removing $HOSTNAME from hosts.mk"
  /bin/sed -i "/${host_re}/ d" "${OMD_ROOT}/etc/check_mk/conf.d/hosts.mk" >> "$LOG" 2>&1
  log "Removing $HOSTNAME from ipaddresses.mk"
  /bin/sed -i "/${host_re}/ d" "${OMD_ROOT}/etc/check_mk/conf.d/ipaddresses.mk" >> "$LOG" 2>&1

  # Now re-inventory && reload
  # NOTE(review): -IIu is given no host name, so it is not limited to one
  # host -- confirm that a re-inventory of everything is intended.
  log "Running cmk -IIu"
  "${OMD_ROOT}/bin/cmk" -IIu >> "$LOG" 2>&1
  log "Running cmk -O"
  "${OMD_ROOT}/bin/cmk" -O >> "$LOG" 2>&1
}

case "$HOSTSTATE" in

  UP)
    # Do nothing on ok
    ;;

  DOWN)
    if [[ -z "$HOSTNAME" || -z "${OMD_ROOT:-}" ]]; then
      # An empty host name gives sed an empty pattern, and an empty OMD_ROOT
      # would point sed -i at /etc/check_mk/... instead of the site.
      start_log
      log "Host name or OMD_ROOT missing; nothing done"
      exit 0
    fi

    # We need to verify the instance is gone, using euca-describe-instances
    instance_is_active "$HOSTNAME"
    case $? in
      0) ;; # still running or pending: keep monitoring it
      1) remove_vm ;;
      *)
        start_log
        log "euca-describe-instances failed; not removing $HOSTNAME"
        ;;
    esac
    ;;

esac

exit 0