Introduction
The following script was developed for health monitoring of a set of three proxies (Linux Squid) in an HA configuration managed by a pair of Linux IPVS load balancers. A proxy is automatically taken out of the cluster service if the failure conditions are met. The main task of a proxy is to retrieve websites on behalf of users, so the test is to retrieve multiple websites that are expected to be up. To prevent the whole cluster from going "down" because a single external website happens to be unreachable, a proxy is declared "down" only when it fails to retrieve a configurable number of sites. If all cluster members are having trouble, the problem is most likely the Internet access itself. In that case we redirect users to a splash page, avoiding the non-descriptive browser timeout message.
Network Design
In our case three proxies (Squid) run in parallel, using an identical system setup. A pair of IPVS load balancers in active/standby mode round-robins over the proxies by rewriting the Ethernet MAC address. The proxy cluster-control script runs on both load balancers but executes its actions only on the active one.
Code
#!/bin/sh
##########################################################
# proxy-clustercontrol.sh v1.0 by frank4dd, 20051220
#
# This script checks external web sites to determine
# if a cluster member is "healthy". If not, the member
# will be disabled. Disabled cluster members are
# re-enabled when they start to work again.
#
# Runs via cron in five minute intervals on the IPVS LB's
##########################################################
# Debug verbosity: 0=off 1=normal 2=verbose 3=very verbose
DEBUG=0
# Absolute paths of the external programs used below
##########################################################
IFCONFIG='/sbin/ifconfig'
HTTPING='/usr/bin/httping'
IPVSADM='/sbin/ipvsadm'
LOGGER='/usr/bin/logger'
GREP='/bin/grep'
# Proxies (cluster members) to monitor, space separated
##########################################################
CLUSTERMBRS='192.168.20.72 192.168.20.73 192.168.20.74'
# TCP port the proxies listen on
##########################################################
PROXYPORT='74'
# Service IP of the cluster (held by the active loadbalancer)
##########################################################
CLUSTER_IP='192.168.20.20'
# Space separated list of sites to fetch through each proxy.
# The number of sites must be equal to or larger than
# WEBSITEFAIL, otherwise a member can never be declared failed.
##########################################################
CHECKSITES='http://www.heise.de http://www.msn.de'
CHECKSITES="$CHECKSITES http://www.google.com http://www.yahoo.de http://www.web.de"
# Comma separated list of HTTP status codes that count as
# success, e.g. 200 = OK, 301 = moved
##########################################################
SUCCESS_CODES='200,301,304'
# Per-site timeout in seconds
##########################################################
SITE_TIMEOUT='2'
# Number of failed sites at which a member counts as failed
##########################################################
WEBSITEFAIL='3'
##########################################################
################# FUNCTION DEFS ##########################
##########################################################
# define the httping command line configuration here:
# httping -x proxy:port -g url -m machine_readable_output -o
# html_statuscodes_which_are_ok -q(uiet)
# -o works only in combination with -m!
# machine readable output is -1 for fail and response time for success
# we can't use -c > 1 because output is written sequentially.
#
# Reads:  DEBUG, HTTPING, MEMBER, PROXYPORT, SITE, SITE_TIMEOUT,
#         SUCCESS_CODES
# Sets:   HTTPING_RETURN (stdout of httping)
#         HTTPING_STATUS (exit status of httping)
##########################################################
HTTPING_CMDLINE() {
  if [ "$DEBUG" -eq 3 ]; then
    echo "$HTTPING -x $MEMBER:$PROXYPORT -g $SITE -t $SITE_TIMEOUT -c 1 -m -o $SUCCESS_CODES -q"
  fi
  # Every expansion is quoted so that a URL containing glob characters
  # (?, *, [) or whitespace reaches httping as exactly one argument.
  HTTPING_RETURN=$("$HTTPING" -x "$MEMBER:$PROXYPORT" -g "$SITE" \
    -t "$SITE_TIMEOUT" -c 1 -m -o "$SUCCESS_CODES" -q)
  # Must directly follow the assignment: $? is the status of the
  # command substitution, i.e. of httping itself.
  HTTPING_STATUS=$?
}
# Command to add a system to the cluster
#
# Adds $MEMBER as a real server of the $CLUSTER_IP:0 TCP service unless
# the IPVS table already lists it. The addition is logged via $LOGGER
# and echoed; the "already in cluster" case is only echoed when
# DEBUG >= 1.
#
# Reads: MEMBER, CLUSTER_IP, IPVSADM, GREP, LOGGER, DEBUG
# Sets:  MSG1, MSG2 (message scratch variables)
##########################################################
CLUSTER_ADD() {
  MSG1="Add system $MEMBER to cluster."
  MSG2="$MEMBER is already in cluster."
  # check if system is already enabled
  # -F: the address is a literal string, not a regex ("." must not match
  #     any character); -w: whole-word match, so 192.168.20.7 does not
  #     match a listed 192.168.20.72.
  if ! "$IPVSADM" --list -n | "$GREP" -q -F -w -- "$MEMBER"; then
    "$LOGGER" -p user.info -t proxy-clustercontrol "$MSG1"
    echo "$MSG1"
    "$IPVSADM" --add-server --tcp-service "$CLUSTER_IP:0" --real-server "$MEMBER" -g -w 1
    # -a -t 192.168.20.20:0 -r 192.168.20.72:0 -g -w 1
  else
    if [ "$DEBUG" -ge 1 ]; then echo "$MSG2"; fi
  fi
}
# Command to delete a system from the cluster
#
# Removes $MEMBER from the real servers of the $CLUSTER_IP:0 TCP service
# if the IPVS table currently lists it. The removal is logged via
# $LOGGER and echoed; the "not in cluster" case is only echoed when
# DEBUG >= 1.
#
# Reads: MEMBER, CLUSTER_IP, IPVSADM, GREP, LOGGER, DEBUG
# Sets:  MSG1, MSG2 (message scratch variables)
##########################################################
CLUSTER_DEL() {
  MSG1="Remove system $MEMBER from cluster."
  MSG2="No action, $MEMBER isn't in cluster."
  # check if system is still enabled
  # -F: literal address, not a regex; -w: whole-word match, so
  #     192.168.20.7 does not match a listed 192.168.20.72.
  if "$IPVSADM" --list -n | "$GREP" -q -F -w -- "$MEMBER"; then
    "$LOGGER" -p user.info -t proxy-clustercontrol "$MSG1"
    echo "$MSG1"
    "$IPVSADM" --delete-server --tcp-service "$CLUSTER_IP:0" --real-server "$MEMBER"
  else
    if [ "$DEBUG" -ge 1 ]; then echo "$MSG2"; fi
  fi
}
# check if we are the active loadbalancer, otherwise exit
#
# This host counts as active when the output of "$IFCONFIG -a" contains
# the cluster service IP. If it does, the fact is echoed and logged via
# $LOGGER and the function returns; if not, the whole script exits with
# status 255.
#
# Reads: CLUSTER_IP, IFCONFIG, GREP, LOGGER, DEBUG
# Sets:  MSG1, MSG2 (message scratch variables)
##########################################################
CHECK_ACTIVE() {
  MSG1="OK Running on a active loadbalancer."
  MSG2="Not an active loadbalancer, exiting."
  # check if we got the cluster service IP
  # -F: literal address, not a regex; -w: whole-word match, so an
  #     interface holding 192.168.20.200 is not mistaken for 192.168.20.20.
  if "$IFCONFIG" -a | "$GREP" -q -F -w -- "$CLUSTER_IP"; then
    echo "$MSG1"
    "$LOGGER" -p user.info -t proxy-clustercontrol "$MSG1"
  else
    if [ "$DEBUG" -ge 1 ]; then echo "$MSG2"; fi
    # 255 is what "exit -1" produces in bash; a negative exit status is
    # not valid in POSIX sh, so the value is spelled out.
    exit 255
  fi
}
##########################################################
################# MAIN ###################################
##########################################################
# Main flow: verify the required binaries, make sure this host holds
# the cluster service IP, then fetch every check site through every
# proxy and add/remove each proxy from the IPVS service depending on
# how many sites failed.
#
# count the failures per member, 0=OK
SITEFAIL=0
# check if all cluster members fail, 0=OK, 1=fail
# (stays 1 unless at least one member is listed by ipvsadm afterwards)
CLUSTERFAIL=1

# check if the binaries are there
# An empty path fails the -x test as well. Exit status 255 is what
# "exit -1" yields in bash; a negative status is not valid in POSIX sh.
for BINARY in "$HTTPING" "$LOGGER" "$IFCONFIG" "$GREP" "$IPVSADM"; do
  if [ ! -x "$BINARY" ]; then
    echo "$BINARY not found, exiting."
    exit 255
  fi
done

CHECK_ACTIVE

# cycle through all members and check for all sites
# ($CLUSTERMBRS and $CHECKSITES are deliberately unquoted: they are
# space separated lists that must be split into words)
for MEMBER in $CLUSTERMBRS; do
  # reset failure counter
  SITEFAIL=0
  if [ "$DEBUG" -ge 1 ]; then echo "Checking: ---> $MEMBER:"; fi

  for SITE in $CHECKSITES; do
    if [ "$DEBUG" -ge 2 ]; then echo "Checking: $SITE on $MEMBER."; fi
    # do the httping test here
    HTTPING_CMDLINE
    if [ "$DEBUG" -eq 3 ]; then
      echo "Returned: $HTTPING_RETURN."
      echo "Statcode: $HTTPING_STATUS."
    fi

    if [ "$HTTPING_STATUS" -ne 0 ]; then
      # httping exited non-zero (per the original author e.g. 127 when
      # the connection is actively refused)
      if [ "$DEBUG" -ge 2 ]; then echo "Failures: Connect refused ** NOT OK **."; fi
      SITEFAIL=$((SITEFAIL + 1))
    elif [ -z "$HTTPING_RETURN" ]; then
      # httping exited 0 but printed nothing
      if [ "$DEBUG" -ge 2 ]; then echo "Failures: Proxy timeout ** NOT OK **."; fi
      SITEFAIL=$((SITEFAIL + 1))
    elif [ "$HTTPING_RETURN" = "-1" ]; then
      # machine readable output -1: status code not in SUCCESS_CODES
      if [ "$DEBUG" -ge 2 ]; then echo "Failures: HTTP status ** NOT OK **."; fi
      SITEFAIL=$((SITEFAIL + 1))
    fi
  done

  if [ "$DEBUG" -ge 1 ]; then
    echo "Site chk: $MEMBER failed $SITEFAIL times."
    echo "-----------------------------------------"
  fi

  if [ "$SITEFAIL" -ge "$WEBSITEFAIL" ]; then
    # proxy is broken.
    # check if proxy is already disabled, otherwise disable
    CLUSTER_DEL
  else
    # proxy works fine.
    # check if proxy is already enabled, otherwise enable
    CLUSTER_ADD
  fi

  # check if we still have one cluster member
  # (-F -w: literal whole-word match, so one member's address cannot
  # match as a prefix of another's)
  if "$IPVSADM" --list -n | "$GREP" -q -F -w -- "$MEMBER"; then
    CLUSTERFAIL=0
  fi
done

# check if there is at least 1 cluster member left
# or if the whole cluster fails
if [ "$CLUSTERFAIL" -eq 1 ]; then
  MSG="All cluster members failed."
  "$LOGGER" -p user.info -t proxy-clustercontrol "$MSG"
  echo "$MSG"
fi
exit 0
##########################################################
#################### END #################################
##########################################################
Credits and Links
- Linux web clustering with IPVS home is here