#!/bin/bash
#
# Host health check.
#
# Runs a series of independent probes (free disk space, memory, load average,
# MegaRAID volume and disk state, Open vSwitch, DNS, connectivity) and prints
# one line on stdout for every problem found. When a numeric metric cannot be
# read, that fact is reported on stderr instead of tripping a shell error
# inside the comparison.

readonly STORCLI=/opt/MegaRAID/storcli/storcli64
readonly OVS_LOG=/var/log/openvswitch/ovs-vswitchd.log

# Succeeds when $1 is a non-empty string of decimal digits.
is_uint() {
  [[ $1 =~ ^[0-9]+$ ]]
}

# Reports on stderr that the metric described by $1 could not be read.
report_unreadable() {
  printf 'Cannot determine %s\n' "$1" >&2
}

#######################################
# Warns when the available space on a file system is at or below a limit.
# Arguments:
#   $1 - path on the file system to inspect
#   $2 - threshold in 1M blocks (as reported by `df -m`)
#   $3 - label used in the warning ("root", "data")
# Outputs: warning on stdout; diagnostics on stderr if df output is unusable
#######################################
check_free_space() {
  local mount=$1 min_mb=$2 label=$3 avail
  # -P keeps each file system on a single row even for long device names;
  # taking row 2 avoids depending on the (possibly localized) header text.
  avail=$(df -lPm -- "$mount" | awk 'NR == 2 { print $4 }')
  if ! is_uint "$avail"; then
    report_unreadable "free space on $mount"
    return 0
  fi
  if (( avail <= min_mb )); then
    echo "Only $(( avail / 1024 )) GB left in the $label file system"
  fi
}

# Warns when the 7th column of the "Mem:" row of `free -g` is 16 or less.
check_memory() {
  local mem
  # LC_ALL=C keeps the row label untranslated. Column 7 is presumably the
  # "available" column of current procps-ng -- confirm on the target hosts.
  mem=$(LC_ALL=C free -g | awk '/^Mem/ { print $7 }')
  if ! is_uint "$mem"; then
    report_unreadable "free memory"
    return 0
  fi
  if (( mem <= 16 )); then
    echo "The system is starving on memory, $mem GB left free"
  fi
}

# Warns when the last field of `uptime` (15 min load average) is 20 or more.
check_load() {
  local load
  # LC_ALL=C guarantees a '.' decimal separator regardless of system locale.
  load=$(LC_ALL=C uptime | awk '{ print $NF }')
  if [[ ! $load =~ ^[0-9]+([.][0-9]+)?$ ]]; then
    report_unreadable "load average"
    return 0
  fi
  # awk handles the fractional comparison, so bc is not required.
  if awk -v load="$load" 'BEGIN { exit !(load + 0 >= 20) }'; then
    echo "The system is starving on cpu, $load is load average for the last 15 min"
  fi
}

# Warns unless a "0/0" row of the controller 0 / volume 0 report has "Optl".
check_raid_volume() {
  local rows
  # Query once so the rows printed are the same rows that were tested.
  rows=$("$STORCLI" /c0/v0 show | grep -E '^0/0')
  if ! grep -q 'Optl' <<<"$rows"; then
    echo "Raid volume is not optimal:"
    if [[ -n $rows ]]; then
      printf '%s\n' "$rows"
    fi
  fi
}

# Warns about HDD/SSD rows of the controller 0 report that lack "Onln".
check_raid_disks() {
  local offline
  offline=$("$STORCLI" /c0 show | grep -E 'HDD|SSD' | grep -v 'Onln')
  if [[ -n $offline ]]; then
    echo "Not all disks are online:"
    printf '%s\n' "$offline"
  fi
}

# Warns when ovs-vsctl reports more than 50 veth devices it could not open.
check_ovs_interfaces() {
  local count
  # -o with wc -l counts matches rather than lines; \K needs PCRE (-P).
  count=$(ovs-vsctl show \
    | grep -oP 'could not open network device\s*\Kveth[a-f0-9]+' \
    | wc -l)
  if (( count > 50 )); then
    echo "Too many rogue interfaces ($count) is registered on OpenVSwitch bridge. It could introduce large delays in pod scheduling..."
  fi
}

# Warns when the Open vSwitch daemon log is larger than 128 MB.
check_ovs_log() {
  local size_mb
  size_mb=$(du -sm -- "$OVS_LOG" | cut -f 1)
  # du has already explained the failure on stderr; nothing to compare.
  is_uint "$size_mb" || return 0
  if (( size_mb > 128 )); then
    echo "Current OpenVSwitch log is over $size_mb MB. It could indicate some severe problems in pod networking..."
  fi
}

# Warns when google.com cannot be resolved.
check_dns() {
  if ! host google.com &>/dev/null; then
    echo "DNS problems, can't resolve google.com"
  fi
}

# Warns when a single ping (2 s timeout) to 8.8.8.8 gets no reply.
check_ping() {
  if ! ping -c 1 -W 2 8.8.8.8 &>/dev/null; then
    echo "Networking problems, can't ping Google's public DNS server"
  fi
}

main() {
  check_free_space / 8192 root
  check_free_space /mnt/ands 1048576 data
  check_memory
  check_load
  check_raid_volume
  check_raid_disks
  check_ovs_interfaces

  # Check various known problems
  check_ovs_log
  check_dns
  check_ping
}

main "$@"