summaryrefslogtreecommitdiffstats
path: root/roles/ands_monitor/templates/scripts/check_server_status.sh.j2
blob: 0bef13cef084fd24a9969a7221c953ff3df9bd92 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#!/bin/bash

# Basic resource checks: free disk space, available memory and CPU load.
# Every check prints a single line on stdout when its limit is crossed (or
# when the value can't be read at all) and stays silent otherwise.

# Warn when a file system is running out of space.
# Arguments:
#   $1 - path located on the file system to check
#   $2 - name of the file system used in the messages ("root", "data")
#   $3 - warn when the available space (in MB) is at or below this value
check_fs_space() {
    local path=$1 label=$2 min_mb=$3
    local avail_mb

    # -P (POSIX format) keeps every file system on exactly one line, so the
    # 4th column of the 2nd line is the available space even when the
    # device name is long.
    avail_mb=$(df -lPm "$path" | awk 'NR == 2 { print $4 }')

    # df yields nothing usable when the path is missing or does not live on
    # a local file system (-l). Report that instead of passing an empty
    # value to the numeric comparison below.
    if [[ ! $avail_mb =~ ^[0-9]+$ ]]; then
        echo "Can't determine free space in the $label file system ($path)"
        return 0
    fi

    if (( avail_mb <= min_mb )); then
        echo "Only $(( avail_mb / 1024 )) GB left in the $label file system"
    fi
}

# Warn when the memory figure reported by `free -g` is too low.
# Arguments:
#   $1 - optional limit in GB (default 16); warn at or below it
check_memory() {
    local min_gb=${1:-16}
    local avail_gb

    # LC_ALL=C pins the "Mem:" row label. The value is the 7th field of that
    # row, as in the original check.
    # NOTE(review): that is the "available" column in the procps-ng layout;
    # older `free` versions print "cached" there - confirm on target hosts.
    avail_gb=$(LC_ALL=C free -g | awk '$1 == "Mem:" { print $7 }')

    if [[ ! $avail_gb =~ ^[0-9]+$ ]]; then
        echo "Can't determine the amount of available memory"
        return 0
    fi

    if (( avail_gb <= min_gb )); then
        echo "The system is starving on memory, $avail_gb GB left free"
    fi
}

# Warn when the 15 minute load average is too high.
# Arguments:
#   $1 - optional limit (default 20); warn at or above it
check_load() {
    local max_load=${1:-20}
    local load15

    # The last field of the uptime line is the 15 minute load average.
    # LC_ALL=C guarantees a decimal point rather than a decimal comma.
    load15=$(LC_ALL=C uptime | awk '{ print $NF }')

    if [[ ! $load15 =~ ^[0-9]+([.][0-9]+)?$ ]]; then
        echo "Can't determine the system load average"
        return 0
    fi

    # Bash arithmetic is integer only; awk does the fractional comparison,
    # so the check no longer depends on bc being installed.
    if awk -v load="$load15" -v max="$max_load" \
        'BEGIN { exit !(load + 0 >= max + 0) }'; then
        echo "The system is starving on cpu, $load15 is load average for the last 15 min"
    fi
}

check_fs_space / "root" 8192
check_fs_space /mnt/ands "data" 1048576
check_memory
check_load

# Warn unless the "0/0" row printed by `storcli64 /c0/v0 show` contains the
# "Optl" state (presumably controller 0 / virtual drive 0 - confirm against
# the hardware layout).
# Arguments:
#   $1 - optional storcli command to run (default: the MegaRAID binary)
check_raid_volume() {
    local storcli=${1:-/opt/MegaRAID/storcli/storcli64}
    local vd_rows

    # Query the controller once and reuse the output for both the verdict
    # and the report, so the printed rows always match what was judged.
    vd_rows=$("$storcli" /c0/v0 show | grep '^0/0')

    if ! grep -q 'Optl' <<<"$vd_rows"; then
        echo "Raid volume is not optimal:"
        if [[ -n $vd_rows ]]; then
            printf '%s\n' "$vd_rows"
        fi
    fi
}

check_raid_volume

# Warn when `storcli64 /c0 show` lists an HDD/SSD row that does not carry the
# "Onln" state, and print the offending rows.
# Arguments:
#   $1 - optional storcli command to run (default: the MegaRAID binary)
check_raid_disks() {
    local storcli=${1:-/opt/MegaRAID/storcli/storcli64}
    local not_online

    # Query the controller once; the same rows drive both the verdict and
    # the report, so they can't disagree.
    not_online=$("$storcli" /c0 show | grep -E 'HDD|SSD' | grep -v 'Onln')

    if [[ -n $not_online ]]; then
        echo "Not all disks are online:"
        printf '%s\n' "$not_online"
    fi
}

check_raid_disks

# Count the veth devices that `ovs-vsctl show` reports with
# "could not open network device" and complain once there are more than 50.
ifaces=$(
    ovs-vsctl show \
        | grep -oP "could not open network device\s*\Kveth[a-f0-9]+" \
        | wc -l
)
if (( ifaces > 50 )); then
    printf '%s\n' "Too many rogue interfaces ($ifaces) is registered on OpenVSwitch bridge. It could introduce large delays in pod scheduling..."
fi

#Check various known problems

# Warn when the Open vSwitch daemon log has grown beyond the limit.
# Arguments:
#   $1 - optional path of the log file (default: the ovs-vswitchd log)
#   $2 - optional limit in MB (default 128); warn above it
check_ovs_log_size() {
    local log=${1:-/var/log/openvswitch/ovs-vswitchd.log}
    local max_mb=${2:-128}
    local size_mb

    size_mb=$(du -sm "$log" | cut -f 1)

    # du prints nothing on stdout when the log can't be read (e.g. it does
    # not exist). du has already complained on stderr in that case, so just
    # skip the comparison instead of tripping over an empty value.
    [[ $size_mb =~ ^[0-9]+$ ]] || return 0

    if (( size_mb > max_mb )); then
        echo "Current OpenVSwitch log is over $size_mb MB. It could indicate some severe problems in pod networking..."
    fi
}

check_ovs_log_size

# Name resolution check: a failing lookup of google.com is reported as a
# DNS problem. The lookup's own output is discarded.
if ! host google.com > /dev/null 2>&1; then
    printf '%s\n' "DNS problems, can't resolve google.com"
fi

# Outbound connectivity check: send a single echo request to 8.8.8.8 with a
# 2 second reply timeout and report a failure. ping's own output is discarded.
check_ping_google_dns() {
    if ! ping -c 1 -W 2 8.8.8.8 &> /dev/null; then
        echo "Networking problems, can't ping Google's public DNS server"
    fi
}

check_ping_google_dns