#!/bin/bash

# Watchdog start-up: installs signal handling, waits for the other containers,
# honours USE_WATCHDOG / WATCHDOG_VERBOSE, creates the communication pipe and
# blocks until SQL and Redis answer.

# Turn INT/TERM into a normal exit; on exit, signal the whole process group
trap "exit" INT TERM
trap "kill 0" EXIT

# Prepare
BACKGROUND_TASKS=()
echo "Waiting for containers to settle..."
for i in {30..1}; do
  echo "${i}"
  sleep 1
done

if [[ "${USE_WATCHDOG}" =~ ^([nN][oO]|[nN])+$ ]]; then
  echo -e "$(date) - USE_WATCHDOG=n, skipping watchdog..."
  sleep 365d
  # Quoted so a script path containing whitespace is re-executed intact
  exec "$(readlink -f "$0")"
fi

if [[ "${WATCHDOG_VERBOSE}" =~ ^([yY][eE][sS]|[yY])+$ ]]; then
  SMTP_VERBOSE="--verbose"
  set -xv
else
  SMTP_VERBOSE=""
  # Non-verbose mode discards everything written to stderr from here on
  exec 2>/dev/null
fi

# Checks write the name of their container into this pipe
if [[ ! -p /tmp/com_pipe ]]; then
  mkfifo /tmp/com_pipe
fi

# Wait for containers
# Credentials are quoted so special characters in them are passed verbatim
while ! mysqladmin status --socket=/var/run/mysqld/mysqld.sock -u"${DBUSER}" -p"${DBPASS}" --silent; do
  echo "Waiting for SQL..."
  sleep 2
done

# Do not attempt to write to slave
# REDIS_CMDLINE stays a plain string because it is expanded unquoted as a
# command line throughout the script.
if [[ -n "${REDIS_SLAVEOF_IP}" ]]; then
  REDIS_CMDLINE="redis-cli -h ${REDIS_SLAVEOF_IP} -p ${REDIS_SLAVEOF_PORT}"
else
  REDIS_CMDLINE="redis-cli -h redis -p 6379"
fi

until [[ $(${REDIS_CMDLINE} PING) == "PONG" ]]; do
  echo "Waiting for Redis..."
  sleep 2
done

${REDIS_CMDLINE} DEL F2B_RES > /dev/null
| 53 | # Common functions |
# Determine the public IPv6 address by asking one of two lookup hosts.
# Makes up to 10 attempts, pausing one second before each retry, and picks the
# lookup host at random on every attempt.
# Outputs: the address on stdout, or an empty line when no attempt produced
#          output matching the IPv6 pattern.
get_ipv6() {
  local IPV6=
  local -a IPV6_SRCS=("ip6.mailcow.email" "ip6.nevondo.com")
  local TRY=0
  until [[ -n "${IPV6}" ]] || [[ ${TRY} -ge 10 ]]; do
    # Pause only before a retry, never after a successful lookup
    [[ ${TRY} -gt 0 ]] && sleep 1
    IPV6=$(curl --connect-timeout 3 -m 10 -L6s "${IPV6_SRCS[RANDOM % ${#IPV6_SRCS[@]}]}" \
      | grep '^\([0-9a-fA-F]\{0,4\}:\)\{1,7\}[0-9a-fA-F]\{0,4\}$')
    TRY=$((TRY+1))
  done
  echo "${IPV6}"
}
| 67 | |
# Store in the array named by $1 every element of the array named by $2 that
# does not occur in the array named by $3 (result is sorted).
# Arguments: $1 - name of the result array (overwritten)
#            $2 - name of the array to take elements from
#            $3 - name of the array of elements to exclude
# Elements are compared line-wise, so they must not contain newlines.
array_diff() {
  # https://stackoverflow.com/questions/2312762, Alex Offshore
  # Indirect expansion replaces eval; prefixed names avoid clashing with the
  # caller's array names.
  local _ad_lhs_ref="${2}[@]"
  local _ad_rhs_ref="${3}[@]"
  local -a _ad_lhs=("${!_ad_lhs_ref}")
  local -a _ad_rhs=("${!_ad_rhs_ref}")
  # An empty array must produce no line at all, otherwise a bogus empty
  # element would show up in the result.
  mapfile -t "${1}" < <(
    comm -23 \
      <([[ ${#_ad_lhs[@]} -gt 0 ]] && printf '%s\n' "${_ad_lhs[@]}" | sort) \
      <([[ ${#_ad_rhs[@]} -gt 0 ]] && printf '%s\n' "${_ad_rhs[@]}" | sort)
  )
}
| 75 | |
# Record the health level of a service in Redis (WATCHDOG_LOG) and print it.
# Arguments: $1 - service name
#            $2 - total health points
#            $3 - current health points (negative values are treated as 0)
#            $4 - health trend, defaults to 0
# Globals:   SERVICE, TOTAL, CURRENT, DIFF, PERCENT (written); REDIS_CMDLINE (read)
# Returns:   10 when no health points are left (dead service), 0 otherwise.
#            Returns 0 without logging when $2 or $3 is missing or $3 > $2.
progress() {
  SERVICE=${1}
  TOTAL=${2}
  CURRENT=${3}
  DIFF=${4}
  [[ -z ${DIFF} ]] && DIFF=0
  [[ -z ${TOTAL} || -z ${CURRENT} ]] && return 0
  [[ ${CURRENT} -gt ${TOTAL} ]] && return 0
  [[ ${CURRENT} -lt 0 ]] && CURRENT=0
  if [[ ${TOTAL} -eq 0 ]]; then
    # Nothing to divide by; a service without any health points is at 0%
    PERCENT=0
  else
    # Integer percentage rounded to the nearest whole number
    PERCENT=$(( 200 * ${CURRENT} / ${TOTAL} % 2 + 100 * ${CURRENT} / ${TOTAL} ))
  fi
  ${REDIS_CMDLINE} LPUSH WATCHDOG_LOG "{\"time\":\"$(date +%s)\",\"service\":\"${SERVICE}\",\"lvl\":\"${PERCENT}\",\"hpnow\":\"${CURRENT}\",\"hptotal\":\"${TOTAL}\",\"hpdiff\":\"${DIFF}\"}" > /dev/null
  log_msg "${SERVICE} health level: ${PERCENT}% (${CURRENT}/${TOTAL}), health trend: ${DIFF}" no_redis
  # Return 10 to indicate a dead service
  [[ ${CURRENT} -le 0 ]] && return 10
  return 0
}
| 91 | |
# Print a timestamped message and, unless told otherwise, push it to Redis.
# Arguments: $1 - message text
#            $2 - pass "no_redis" to skip the WATCHDOG_LOG entry
# Outputs:   "<date> <message>" on stdout, whitespace runs collapsed to single
#            spaces.
log_msg() {
  if [[ ${2} != "no_redis" ]]; then
    local sanitized
    # Blank out characters that could break the hand-built JSON string
    sanitized=$(printf '%s' "${1}" | tr '\r\n%&;$"_[]{}-' ' ')
    ${REDIS_CMDLINE} LPUSH WATCHDOG_LOG "{\"time\":\"$(date +%s)\",\"message\":\"${sanitized}\"}" > /dev/null
  fi
  # Split into words with read so whitespace is collapsed but glob characters
  # in the message (e.g. "*") are printed literally instead of being expanded.
  local IFS=$' \t\n'
  local -a words=()
  read -r -d '' -a words <<< "$(date) ${1}"
  printf '%s\n' "${words[*]}"
}
| 99 | |
# Send a notification email to every address in WATCHDOG_NOTIFY_EMAIL.
# Arguments: $1 - service / topic name (required); used in the subject, as the
#                 throttle key and as the name of an optional body file
#            $2 - optional message; a default text is used when empty
#            $3 - optional throttle time in seconds
# Globals:   THROTTLE, BODY, SUBJECT, TTL_LEFT, MAIL_RCPTS, RCPT_DOMAIN, RCPT_MX,
#            CHECK_FOR_VALID_MX, WATCHDOG_NOTIFY_EMAIL (written);
#            REDIS_CMDLINE, SMTP_VERBOSE, WATCHDOG_SUBJECT, MAILCOW_HOSTNAME (read)
# Returns:   1 when $1 is empty, when the mail is throttled, or when a
#            recipient domain has no MX record (remaining recipients are skipped).
function mail_error() {
  local -a smtp_args=()
  THROTTLE=
  [[ -z ${1} ]] && return 1
  # If exists, body will be the content of "/tmp/${1}", even if ${2} is set
  if [[ -z ${2} ]]; then
    BODY="Service was restarted on $(date), please check your mailcow installation."
  else
    BODY="$(date) - ${2}"
  fi
  # If exists, mail will be throttled by argument in seconds
  [[ -n ${3} ]] && THROTTLE=${3}
  if [[ -n ${THROTTLE} ]]; then
    TTL_LEFT="$(${REDIS_CMDLINE} TTL "THROTTLE_${1}" 2> /dev/null)"
    if [[ "${TTL_LEFT}" == "-2" ]]; then
      # Delay key not found, setting a delay key now
      ${REDIS_CMDLINE} SET "THROTTLE_${1}" 1 EX "${THROTTLE}"
    else
      log_msg "Not sending notification email now, blocked for ${TTL_LEFT} seconds..."
      return 1
    fi
  fi
  # Strip the first double quote and a trailing one from the address list
  WATCHDOG_NOTIFY_EMAIL=$(echo "${WATCHDOG_NOTIFY_EMAIL}" | sed 's/"//;s|"$||')
  # Some exceptions for subject and body formats
  if [[ ${1} == "fail2ban" ]]; then
    SUBJECT="${BODY}"
    BODY="Please see netfilter-mailcow for more details and triggered rules."
  else
    SUBJECT="${WATCHDOG_SUBJECT}: ${1}"
  fi
  IFS=',' read -r -a MAIL_RCPTS <<< "${WATCHDOG_NOTIFY_EMAIL}"
  for rcpt in "${MAIL_RCPTS[@]}"; do
    # Trim whitespace around the address ("a@x, b@y") and skip empty entries
    rcpt="${rcpt#"${rcpt%%[![:space:]]*}"}"
    rcpt="${rcpt%"${rcpt##*[![:space:]]}"}"
    [[ -z ${rcpt} ]] && continue
    RCPT_DOMAIN=
    RCPT_MX=
    RCPT_DOMAIN="${rcpt##*@}"
    CHECK_FOR_VALID_MX=$(dig +short "${RCPT_DOMAIN}" mx)
    if [[ -z ${CHECK_FOR_VALID_MX} ]]; then
      log_msg "Cannot determine MX for ${rcpt}, skipping email notification..."
      return 1
    fi
    [ -f "/tmp/${1}" ] && BODY="/tmp/${1}"
    # Build the argument list as an array so the verbose flag is left out
    # entirely when unset instead of being passed as an empty argument.
    smtp_args=(--missing-modules-ok)
    [[ -n ${SMTP_VERBOSE} ]] && smtp_args+=("${SMTP_VERBOSE}")
    smtp_args+=(
      --charset=UTF-8
      --subject="${SUBJECT}"
      --body-plain="${BODY}"
      --add-header="X-Priority: 1"
      --to="${rcpt}"
      --from="watchdog@${MAILCOW_HOSTNAME}"
      --hello-host="${MAILCOW_HOSTNAME}"
      --ipv4
    )
    timeout 10s ./smtp-cli "${smtp_args[@]}"
    if [[ $? -eq 1 ]]; then # exit code 1 is fine
      log_msg "Sent notification email to ${rcpt}"
    else
      if [[ "${SMTP_VERBOSE}" == "" ]]; then
        log_msg "Error while sending notification email to ${rcpt}. You can enable verbose logging by setting 'WATCHDOG_VERBOSE=y' in mailcow.conf."
      else
        log_msg "Error while sending notification email to ${rcpt}."
      fi
    fi
  done
}
| 157 | |
# Resolve the IPv4 address of a container, trying up to five times.
# Arguments: $1 - container / compose service name
# Globals:   IP_BY_DOCKER_API (0 selects DNS lookup, anything else the docker
#            API), IPV4_NETWORK, COMPOSE_PROJECT_NAME (read);
#            CONTAINER_ID, CONTAINER_IPS, CONTAINER_IP, LOOP_C (written)
# Outputs:   the address on stdout, or 240.0.0.0 when none was found.
get_container_ip() {
  # ${1} is container
  local ipv4_re='^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$'
  CONTAINER_ID=()
  CONTAINER_IPS=()
  CONTAINER_IP=
  LOOP_C=1
  until [[ ${CONTAINER_IP} =~ ${ipv4_re} ]] || [[ ${LOOP_C} -gt 5 ]]; do
    if [ "${IP_BY_DOCKER_API}" -eq 0 ]; then
      CONTAINER_IP=$(dig a "${1}" +short)
    else
      sleep 0.5
      # get long container id for exact match
      CONTAINER_ID=($(curl --silent --insecure https://dockerapi/containers/json | jq -r ".[] | {name: .Config.Labels[\"com.docker.compose.service\"], project: .Config.Labels[\"com.docker.compose.project\"], id: .Id}" | jq -rc "select( .name | tostring == \"${1}\") | select( .project | tostring | contains(\"${COMPOSE_PROJECT_NAME,,}\")) | .id"))
      # returned id can have multiple elements (if scaled), shuffle for random test
      CONTAINER_ID=($(printf "%s\n" "${CONTAINER_ID[@]}" | shuf))
      if [[ -n ${CONTAINER_ID} ]]; then
        for matched_container in "${CONTAINER_ID[@]}"; do
          CONTAINER_IPS=($(curl --silent --insecure "https://dockerapi/containers/${matched_container}/json" | jq -r '.NetworkSettings.Networks[].IPAddress'))
          for ip_match in "${CONTAINER_IPS[@]}"; do
            # grep will do nothing if one of these vars is empty
            [[ -z ${ip_match} ]] && continue
            [[ -z ${IPV4_NETWORK} ]] && continue
            # only return ips that are part of our network
            if ! grep -q ${IPV4_NETWORK} <(echo ${ip_match}); then
              continue
            else
              CONTAINER_IP=${ip_match}
              break
            fi
          done
          [[ -n ${CONTAINER_IP} ]] && break
        done
      fi
    fi
    LOOP_C=$((LOOP_C + 1))
  done
  # Decide by the result, not by the attempt counter: the counter is already
  # past 5 when the address is found on the final attempt.
  if [[ ${CONTAINER_IP} =~ ${ipv4_re} ]]; then
    echo "${CONTAINER_IP}"
  else
    echo 240.0.0.0
  fi
}
| 196 | |
# One-time check
# When an interface address contains the first three groups of IPV6_NETWORK
# but no public IPv6 address can be determined, send a notification mail.
if ip a s | grep -qi "$(echo ${IPV6_NETWORK} | cut -d: -f1-3)" && [[ -z "$(get_ipv6)" ]]; then
  mail_error "ipv6-config" "enable_ipv6 is true in docker-compose.yml, but an IPv6 link could not be established. Please verify your IPv6 connection."
fi
| 203 | |
# Ask checks.mailcow.email (over IPv4 and IPv6) for this installation's status
# until the error budget is used up.
# Globals: EXTERNAL_CHECKS_THRESHOLD, DBUSER, DBPASS, DBNAME (read);
#          err_count, diff_c, THRESHOLD, GUID, err_c_cur, CHECK_REPONSE,
#          CHECK_REPONSE6 (written)
# Files:   /tmp/external_checks receives the ".out" field of a critical reply.
# Returns: 1 once err_count reaches THRESHOLD.
external_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${EXTERNAL_CHECKS_THRESHOLD}
  GUID=$(mysql -u"${DBUSER}" -p"${DBPASS}" "${DBNAME}" -e "SELECT version FROM versions WHERE application = 'GUID'" -BN)
  # Reduce error count by 2 after restarting an unhealthy container.
  # Single quotes defer expansion until the signal arrives, so the handler
  # sees the live counter rather than the value it had when the trap was set.
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    err_c_cur=${err_count}
    CHECK_REPONSE="$(curl --connect-timeout 3 -m 10 -4 -s https://checks.mailcow.email -X POST -dguid="${GUID}" 2> /dev/null)"
    if [[ -n "${CHECK_REPONSE}" ]] && [[ "$(echo "${CHECK_REPONSE}" | jq -r .response)" == "critical" ]]; then
      echo "${CHECK_REPONSE}" | jq -r .out > /tmp/external_checks
      err_count=$(( ${err_count} + 1 ))
    fi
    CHECK_REPONSE6="$(curl --connect-timeout 3 -m 10 -6 -s https://checks.mailcow.email -X POST -dguid="${GUID}" 2> /dev/null)"
    if [[ -n "${CHECK_REPONSE6}" ]] && [[ "$(echo "${CHECK_REPONSE6}" | jq -r .response)" == "critical" ]]; then
      # Report the IPv6 reply that triggered this branch
      echo "${CHECK_REPONSE6}" | jq -r .out > /tmp/external_checks
      err_count=$(( ${err_count} + 1 ))
    fi
    # No new error this round: recover one point (never below zero)
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "External checks" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 60
    else
      diff_c=0
      sleep $(( ( RANDOM % 20 ) + 1800 ))
    fi
  done
  return 1
}
| 236 | |
# Probe nginx-mailcow over HTTP (port 8081) until the error budget is used up.
# Globals: NGINX_THRESHOLD (read); err_count, diff_c, THRESHOLD, host_ip,
#          err_c_cur (written)
# Files:   /tmp/nginx-mailcow collects plugin output (trimmed to 50 lines).
# Returns: 1 once err_count reaches THRESHOLD.
nginx_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${NGINX_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container.
  # Single quotes defer expansion until the signal arrives, so the handler
  # sees the live counter rather than the value it had when the trap was set.
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    touch /tmp/nginx-mailcow; echo "$(tail -50 /tmp/nginx-mailcow)" > /tmp/nginx-mailcow
    host_ip=$(get_container_ip nginx-mailcow)
    err_c_cur=${err_count}
    # The plugin's exit status is added to the error counter
    /usr/lib/nagios/plugins/check_http -4 -H "${host_ip}" -u / -p 8081 2>> /tmp/nginx-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    # No new error this round: recover one point (never below zero)
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "Nginx" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
| 261 | |
# Probe unbound-mailcow with a DNS lookup and a DNSSEC query until the error
# budget is used up.
# Globals: UNBOUND_THRESHOLD (read); err_count, diff_c, THRESHOLD, host_ip,
#          err_c_cur, DNSSEC (written)
# Files:   /tmp/unbound-mailcow collects check output (trimmed to 50 lines).
# Returns: 1 once err_count reaches THRESHOLD.
unbound_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${UNBOUND_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container.
  # Single quotes defer expansion until the signal arrives, so the handler
  # sees the live counter rather than the value it had when the trap was set.
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    touch /tmp/unbound-mailcow; echo "$(tail -50 /tmp/unbound-mailcow)" > /tmp/unbound-mailcow
    host_ip=$(get_container_ip unbound-mailcow)
    err_c_cur=${err_count}
    /usr/bin/nslookup -sil stackoverflow.com "${host_ip}" 2>> /tmp/unbound-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    # A validated answer carries the "ad" flag in dig's flags line
    DNSSEC=$(dig com +dnssec "@${host_ip}" | grep -E 'flags:.+ad')
    if [[ -z ${DNSSEC} ]]; then
      echo "DNSSEC failure" 2>> /tmp/unbound-mailcow 1>&2
      err_count=$(( ${err_count} + 1))
    else
      echo "DNSSEC check succeeded" 2>> /tmp/unbound-mailcow 1>&2
    fi
    # No new error this round: recover one point (never below zero)
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "Unbound" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
| 293 | |
# Probe the local redis container with a PING/PONG exchange until the error
# budget is used up.
# Globals: REDIS_THRESHOLD (read); err_count, diff_c, THRESHOLD, host_ip,
#          err_c_cur (written)
# Files:   /tmp/redis-mailcow collects plugin output (trimmed to 50 lines).
# Returns: 1 once err_count reaches THRESHOLD.
redis_checks() {
  # A check for the local redis container
  err_count=0
  diff_c=0
  THRESHOLD=${REDIS_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container.
  # Single quotes defer expansion until the signal arrives, so the handler
  # sees the live counter rather than the value it had when the trap was set.
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    touch /tmp/redis-mailcow; echo "$(tail -50 /tmp/redis-mailcow)" > /tmp/redis-mailcow
    # NOTE(review): host_ip is resolved but the probe below addresses the
    # container by name; kept as-is because host_ip is a global.
    host_ip=$(get_container_ip redis-mailcow)
    err_c_cur=${err_count}
    /usr/lib/nagios/plugins/check_tcp -4 -H redis-mailcow -p 6379 -E -s "PING\n" -q "QUIT" -e "PONG" 2>> /tmp/redis-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    # No new error this round: recover one point (never below zero)
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "Redis" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
| 319 | |
# Probe MySQL/MariaDB through its socket (connection and a test query) until
# the error budget is used up.
# Globals: MYSQL_THRESHOLD, DBUSER, DBPASS, DBNAME (read); err_count, diff_c,
#          THRESHOLD, err_c_cur (written)
# Files:   /tmp/mysql-mailcow collects plugin output (trimmed to 50 lines).
# Returns: 1 once err_count reaches THRESHOLD.
mysql_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${MYSQL_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container.
  # Single quotes defer expansion until the signal arrives, so the handler
  # sees the live counter rather than the value it had when the trap was set.
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    touch /tmp/mysql-mailcow; echo "$(tail -50 /tmp/mysql-mailcow)" > /tmp/mysql-mailcow
    err_c_cur=${err_count}
    # Each plugin's exit status is added to the error counter
    /usr/lib/nagios/plugins/check_mysql -s /var/run/mysqld/mysqld.sock -u "${DBUSER}" -p "${DBPASS}" -d "${DBNAME}" 2>> /tmp/mysql-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    /usr/lib/nagios/plugins/check_mysql_query -s /var/run/mysqld/mysqld.sock -u "${DBUSER}" -p "${DBPASS}" -d "${DBNAME}" -q "SELECT COUNT(*) FROM information_schema.tables" 2>> /tmp/mysql-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    # No new error this round: recover one point (never below zero)
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "MySQL/MariaDB" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
| 344 | |
# Probe MySQL/MariaDB replication status until the error budget is used up.
# Globals: MYSQL_REPLICATION_THRESHOLD, DBROOT (read); err_count, diff_c,
#          THRESHOLD, err_c_cur (written)
# Files:   /tmp/mysql_repl_checks collects plugin output (trimmed to 50 lines).
# Returns: 1 once err_count reaches THRESHOLD.
mysql_repl_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${MYSQL_REPLICATION_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container.
  # Single quotes defer expansion until the signal arrives, so the handler
  # sees the live counter rather than the value it had when the trap was set.
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    touch /tmp/mysql_repl_checks; echo "$(tail -50 /tmp/mysql_repl_checks)" > /tmp/mysql_repl_checks
    err_c_cur=${err_count}
    # The plugin's exit status is added to the error counter
    /usr/lib/nagios/plugins/check_mysql_slavestatus.sh -S /var/run/mysqld/mysqld.sock -u root -p "${DBROOT}" 2>> /tmp/mysql_repl_checks 1>&2; err_count=$(( ${err_count} + $? ))
    # No new error this round: recover one point (never below zero)
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "MySQL/MariaDB replication" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 60
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
| 368 | |
# Probe sogo-mailcow over HTTP (port 20000) until the error budget is used up.
# Globals: SOGO_THRESHOLD (read); err_count, diff_c, THRESHOLD, host_ip,
#          err_c_cur (written)
# Files:   /tmp/sogo-mailcow collects plugin output (trimmed to 50 lines).
# Returns: 1 once err_count reaches THRESHOLD.
sogo_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${SOGO_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container.
  # Single quotes defer expansion until the signal arrives, so the handler
  # sees the live counter rather than the value it had when the trap was set.
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    touch /tmp/sogo-mailcow; echo "$(tail -50 /tmp/sogo-mailcow)" > /tmp/sogo-mailcow
    host_ip=$(get_container_ip sogo-mailcow)
    err_c_cur=${err_count}
    # The plugin's exit status is added to the error counter
    /usr/lib/nagios/plugins/check_http -4 -H "${host_ip}" -u /SOGo.index/ -p 20000 2>> /tmp/sogo-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    # No new error this round: recover one point (never below zero)
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "SOGo" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
| 393 | |
# Probe postfix-mailcow on port 589 (a test delivery and a STARTTLS check)
# until the error budget is used up.
# Globals: POSTFIX_THRESHOLD (read); err_count, diff_c, THRESHOLD, host_ip,
#          err_c_cur (written)
# Files:   /tmp/postfix-mailcow collects plugin output (trimmed to 50 lines).
# Returns: 1 once err_count reaches THRESHOLD.
postfix_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${POSTFIX_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container.
  # Single quotes defer expansion until the signal arrives, so the handler
  # sees the live counter rather than the value it had when the trap was set.
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    touch /tmp/postfix-mailcow; echo "$(tail -50 /tmp/postfix-mailcow)" > /tmp/postfix-mailcow
    host_ip=$(get_container_ip postfix-mailcow)
    err_c_cur=${err_count}
    # Each plugin's exit status is added to the error counter
    /usr/lib/nagios/plugins/check_smtp -4 -H "${host_ip}" -p 589 -f "watchdog@invalid" -C "RCPT TO:watchdog@localhost" -C DATA -C . -R 250 2>> /tmp/postfix-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    /usr/lib/nagios/plugins/check_smtp -4 -H "${host_ip}" -p 589 -S 2>> /tmp/postfix-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    # No new error this round: recover one point (never below zero)
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "Postfix" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
| 419 | |
# Probe clamd-mailcow with the check_clamd plugin until the error budget is
# used up.
# Globals: CLAMD_THRESHOLD (read); err_count, diff_c, THRESHOLD, host_ip,
#          err_c_cur (written)
# Files:   /tmp/clamd-mailcow collects plugin output (trimmed to 50 lines).
# Returns: 1 once err_count reaches THRESHOLD.
clamd_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${CLAMD_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container.
  # Single quotes defer expansion until the signal arrives, so the handler
  # sees the live counter rather than the value it had when the trap was set.
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    touch /tmp/clamd-mailcow; echo "$(tail -50 /tmp/clamd-mailcow)" > /tmp/clamd-mailcow
    host_ip=$(get_container_ip clamd-mailcow)
    err_c_cur=${err_count}
    # The plugin's exit status is added to the error counter
    /usr/lib/nagios/plugins/check_clamd -4 -H "${host_ip}" 2>> /tmp/clamd-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    # No new error this round: recover one point (never below zero)
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "Clamd" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 120 ) + 20 ))
    fi
  done
  return 1
}
| 444 | |
# Probe dovecot-mailcow on ports 24, 993, 143, 10001 and 4190 until the error
# budget is used up.
# Globals: DOVECOT_THRESHOLD (read); err_count, diff_c, THRESHOLD, host_ip,
#          err_c_cur (written)
# Files:   /tmp/dovecot-mailcow collects plugin output (trimmed to 50 lines).
# Returns: 1 once err_count reaches THRESHOLD.
dovecot_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${DOVECOT_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container.
  # Single quotes defer expansion until the signal arrives, so the handler
  # sees the live counter rather than the value it had when the trap was set.
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    touch /tmp/dovecot-mailcow; echo "$(tail -50 /tmp/dovecot-mailcow)" > /tmp/dovecot-mailcow
    host_ip=$(get_container_ip dovecot-mailcow)
    err_c_cur=${err_count}
    # Each plugin's exit status is added to the error counter
    /usr/lib/nagios/plugins/check_smtp -4 -H "${host_ip}" -p 24 -f "watchdog@invalid" -C "RCPT TO:<watchdog@invalid>" -L -R "User doesn't exist" 2>> /tmp/dovecot-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    /usr/lib/nagios/plugins/check_imap -4 -H "${host_ip}" -p 993 -S -e "OK " 2>> /tmp/dovecot-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    /usr/lib/nagios/plugins/check_imap -4 -H "${host_ip}" -p 143 -e "OK " 2>> /tmp/dovecot-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    /usr/lib/nagios/plugins/check_tcp -4 -H "${host_ip}" -p 10001 -e "VERSION" 2>> /tmp/dovecot-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    /usr/lib/nagios/plugins/check_tcp -4 -H "${host_ip}" -p 4190 -e "Dovecot ready" 2>> /tmp/dovecot-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    # No new error this round: recover one point (never below zero)
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "Dovecot" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
| 473 | |
# Watch the DOVECOT_REPL_HEALTH key in Redis until the error budget is used up.
# Any value other than "1" counts as one error.
# Globals: DOVECOT_REPL_THRESHOLD (read); err_count, diff_c, THRESHOLD,
#          err_c_cur, D_REPL_STATUS (written)
# Returns: 1 once err_count reaches THRESHOLD.
dovecot_repl_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${DOVECOT_REPL_THRESHOLD}
  D_REPL_STATUS=
  # Reduce error count by 2 after restarting an unhealthy container.
  # Single quotes defer expansion until the signal arrives, so the handler
  # sees the live counter rather than the value it had when the trap was set.
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    err_c_cur=${err_count}
    # The status is read fresh on every round
    D_REPL_STATUS=$(redis-cli --raw -h redis GET DOVECOT_REPL_HEALTH)
    if [[ "${D_REPL_STATUS}" != "1" ]]; then
      err_count=$(( ${err_count} + 1 ))
    fi
    # No new error this round: recover one point (never below zero)
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "Dovecot replication" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 60
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
| 500 | |
# Check certificate expiry (7 days ahead) on the SMTP (589) and IMAPS (993)
# endpoints until the error budget is used up.
# Globals: err_count, diff_c, THRESHOLD (fixed at 7), host_ip_postfix,
#          host_ip_dovecot, err_c_cur (written)
# Files:   /tmp/certcheck collects plugin output (trimmed to 50 lines).
# Returns: 1 once err_count reaches THRESHOLD.
cert_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=7
  # Reduce error count by 2 after restarting an unhealthy container.
  # Single quotes defer expansion until the signal arrives, so the handler
  # sees the live counter rather than the value it had when the trap was set.
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    touch /tmp/certcheck; echo "$(tail -50 /tmp/certcheck)" > /tmp/certcheck
    host_ip_postfix=$(get_container_ip postfix)
    host_ip_dovecot=$(get_container_ip dovecot)
    err_c_cur=${err_count}
    # Each plugin's exit status is added to the error counter
    /usr/lib/nagios/plugins/check_smtp -H "${host_ip_postfix}" -p 589 -4 -S -D 7 2>> /tmp/certcheck 1>&2; err_count=$(( ${err_count} + $? ))
    /usr/lib/nagios/plugins/check_imap -H "${host_ip_dovecot}" -p 993 -4 -S -D 7 2>> /tmp/certcheck 1>&2; err_count=$(( ${err_count} + $? ))
    # No new error this round: recover one point (never below zero)
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "Primary certificate expiry check" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    # Always sleep 5 minutes, mail notifications are limited
    sleep 300
  done
  return 1
}
| 522 | |
# Probe php-fpm-mailcow on TCP ports 9001 and 9002 until the error budget is
# used up.
# Globals: PHPFPM_THRESHOLD (read); err_count, diff_c, THRESHOLD, host_ip,
#          err_c_cur (written)
# Files:   /tmp/php-fpm-mailcow collects plugin output (trimmed to 50 lines).
# Returns: 1 once err_count reaches THRESHOLD.
phpfpm_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${PHPFPM_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container.
  # Single quotes defer expansion until the signal arrives, so the handler
  # sees the live counter rather than the value it had when the trap was set.
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    touch /tmp/php-fpm-mailcow; echo "$(tail -50 /tmp/php-fpm-mailcow)" > /tmp/php-fpm-mailcow
    host_ip=$(get_container_ip php-fpm-mailcow)
    err_c_cur=${err_count}
    # Each plugin's exit status is added to the error counter
    /usr/lib/nagios/plugins/check_tcp -H "${host_ip}" -p 9001 2>> /tmp/php-fpm-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    /usr/lib/nagios/plugins/check_tcp -H "${host_ip}" -p 9002 2>> /tmp/php-fpm-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    # No new error this round: recover one point (never below zero)
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "PHP-FPM" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
| 548 | |
# Watch the newest RL_LOG entry in Redis; every change of its qid counts as
# one error, until the error budget is used up.
# Globals: RATELIMIT_THRESHOLD (read); err_count, diff_c, THRESHOLD, err_c_cur,
#          RL_LOG_STATUS, RL_LOG_STATUS_PREV (written)
# Files:   /tmp/ratelimit is rewritten with recent entries on every change.
# Returns: 1 once err_count reaches THRESHOLD.
ratelimit_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${RATELIMIT_THRESHOLD}
  RL_LOG_STATUS=$(redis-cli -h redis LRANGE RL_LOG 0 0 | jq .qid)
  # Reduce error count by 2 after restarting an unhealthy container.
  # Single quotes defer expansion until the signal arrives, so the handler
  # sees the live counter rather than the value it had when the trap was set.
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    err_c_cur=${err_count}
    RL_LOG_STATUS_PREV=${RL_LOG_STATUS}
    RL_LOG_STATUS=$(redis-cli -h redis LRANGE RL_LOG 0 0 | jq .qid)
    # Quoted right-hand side: compare literally, not as a glob pattern
    if [[ "${RL_LOG_STATUS_PREV}" != "${RL_LOG_STATUS}" ]]; then
      err_count=$(( ${err_count} + 1 ))
      echo 'Last 10 applied ratelimits (may overlap with previous reports).' > /tmp/ratelimit
      echo 'Full ratelimit buckets can be emptied by deleting the ratelimit hash from within mailcow UI (see /debug -> Protocols -> Ratelimit):' >> /tmp/ratelimit
      echo >> /tmp/ratelimit
      redis-cli --raw -h redis LRANGE RL_LOG 0 10 | jq . >> /tmp/ratelimit
    fi
    # No new error this round: recover one point (never below zero)
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "Ratelimit" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
| 580 | |
# Count deferred mails in the Postfix spool; a count at or above MAILQ_CRIT is
# one error, until the error budget is used up.
# Globals: MAILQ_THRESHOLD, MAILQ_CRIT (read); err_count, diff_c, THRESHOLD,
#          err_c_cur, MAILQ_LOG_STATUS (written)
# Files:   /tmp/mail_queue_status collects the queue reports (trimmed to 50
#          lines before each round).
# Returns: 1 once err_count reaches THRESHOLD.
mailq_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${MAILQ_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container.
  # Single quotes defer expansion until the signal arrives, so the handler
  # sees the live counter rather than the value it had when the trap was set.
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    touch /tmp/mail_queue_status; echo "$(tail -50 /tmp/mail_queue_status)" > /tmp/mail_queue_status
    MAILQ_LOG_STATUS=$(find /var/spool/postfix/deferred -type f | wc -l)
    echo "Mail queue contains ${MAILQ_LOG_STATUS} items (critical limit is ${MAILQ_CRIT}) at $(date)" >> /tmp/mail_queue_status
    err_c_cur=${err_count}
    if [ ${MAILQ_LOG_STATUS} -ge ${MAILQ_CRIT} ]; then
      err_count=$(( ${err_count} + 1 ))
      echo "Mail queue contains ${MAILQ_LOG_STATUS} items (critical limit is ${MAILQ_CRIT}) at $(date)" >> /tmp/mail_queue_status
    fi
    # No new error this round: recover one point (never below zero)
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "Mail queue" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 60
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
| 609 | |
# Watch the keys of F2B_ACTIVE_BANS in Redis; every round that shows keys not
# seen in the previous round counts as one error, until the error budget is
# used up.
# Globals: FAIL2BAN_THRESHOLD, REDIS_CMDLINE (read); err_count, diff_c,
#          THRESHOLD, err_c_cur, F2B_LOG_STATUS, F2B_LOG_STATUS_PREV, F2B_RES
#          (written)
# Redis:   the new keys are stored in F2B_RES; the key is deleted when storing
#          fails.
# Returns: 1 once err_count reaches THRESHOLD.
fail2ban_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${FAIL2BAN_THRESHOLD}
  F2B_LOG_STATUS=($(${REDIS_CMDLINE} --raw HKEYS F2B_ACTIVE_BANS))
  F2B_RES=
  # Reduce error count by 2 after restarting an unhealthy container.
  # Single quotes defer expansion until the signal arrives, so the handler
  # sees the live counter rather than the value it had when the trap was set.
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    err_c_cur=${err_count}
    F2B_LOG_STATUS_PREV=("${F2B_LOG_STATUS[@]}")
    F2B_LOG_STATUS=($(${REDIS_CMDLINE} --raw HKEYS F2B_ACTIVE_BANS))
    array_diff F2B_RES F2B_LOG_STATUS F2B_LOG_STATUS_PREV
    if [[ -n "${F2B_RES}" ]]; then
      err_count=$(( ${err_count} + 1 ))
      # Only characters from the tr set are kept before the value is stored
      echo -n "${F2B_RES[@]}" | tr -cd "[a-fA-F0-9.:/] " | timeout 3s ${REDIS_CMDLINE} -x SET F2B_RES > /dev/null
      if [ $? -ne 0 ]; then
        # Plain DEL: with -x redis-cli would read a further argument from stdin
        ${REDIS_CMDLINE} DEL F2B_RES
      fi
    fi
    # No new error this round: recover one point (never below zero)
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "Fail2ban" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
| 643 | |
# acme_checks
# Watches the Redis key ACME_FAIL_TIME. Every poll in which the value differs
# from the value seen by the previous poll counts as one error. A poll without
# a new error lowers a positive error count by one.
# Globals read:    ACME_THRESHOLD, REDIS_CMDLINE
# Globals written: err_count, err_c_cur, diff_c, THRESHOLD, ACME_LOG_STATUS,
#                  ACME_LOG_STATUS_PREV, ACME_LC
# Returns:         1 as soon as err_count reaches THRESHOLD (never returns 0)
acme_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${ACME_THRESHOLD}
  ACME_LOG_STATUS=$(redis-cli -h redis GET ACME_FAIL_TIME)
  if [[ -z "${ACME_LOG_STATUS}" ]]; then
    # Key does not exist yet, initialize it
    ${REDIS_CMDLINE} SET ACME_FAIL_TIME 0
    ACME_LOG_STATUS=0
  fi
  # Reduce error count by 2 after restarting an unhealthy container.
  # The handler must be single-quoted: with double quotes err_count is expanded
  # while the trap is installed (always 0 here) and the handler can never fire.
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    err_c_cur=${err_count}
    ACME_LOG_STATUS_PREV=${ACME_LOG_STATUS}
    # Clear the value before polling. Without this the retry loop below is never
    # entered (the old value is non-empty) and the status is never refreshed.
    ACME_LOG_STATUS=
    ACME_LC=0
    # Up to three attempts to read the current value
    until [[ -n "${ACME_LOG_STATUS}" ]] || [ ${ACME_LC} -ge 3 ]; do
      ACME_LOG_STATUS=$(redis-cli -h redis GET ACME_FAIL_TIME 2> /dev/null)
      [[ -n "${ACME_LOG_STATUS}" ]] || sleep 3
      ACME_LC=$((ACME_LC+1))
    done
    # No answer from Redis is not an ACME failure, keep the last known value
    if [[ -z "${ACME_LOG_STATUS}" ]]; then
      ACME_LOG_STATUS=${ACME_LOG_STATUS_PREV}
    fi
    if [[ "${ACME_LOG_STATUS_PREV}" != "${ACME_LOG_STATUS}" ]]; then
      err_count=$(( ${err_count} + 1 ))
    fi
    # No new error in this round: let a positive error count decay by one
    if [ ${err_c_cur} -eq ${err_count} ] && [ ${err_count} -ge 1 ]; then
      err_count=$(( ${err_count} - 1 ))
      diff_c=1
    fi
    if [ ${err_c_cur} -ne ${err_count} ]; then
      diff_c=$(( ${err_c_cur} - ${err_count} ))
    fi
    # progress is defined elsewhere in this file; exit status 10 selects the short delay
    progress "ACME" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
| 680 | |
# rspamd_checks
# Runs two probes per round and counts each failing probe as one error:
#   1. Scan a minimal message through the Rspamd unix socket and expect a
#      required_score of 9999 in the reply.
#   2. Connect to port 9900 of the rspamd-mailcow container; only a curl
#      timeout (exit status 28) counts as a failure.
# The results are appended to /tmp/rspamd-mailcow, which is trimmed to its last
# 50 lines at the start of every round. A round without a new error lowers a
# positive error count by one.
# Globals read:    RSPAMD_THRESHOLD
# Globals written: err_count, err_c_cur, diff_c, THRESHOLD, host_ip, SCORE
# Returns:         1 as soon as err_count reaches THRESHOLD (never returns 0)
rspamd_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${RSPAMD_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container.
  # The handler must be single-quoted: with double quotes err_count is expanded
  # while the trap is installed (always 0 here) and the handler can never fire.
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    touch /tmp/rspamd-mailcow; echo "$(tail -50 /tmp/rspamd-mailcow)" > /tmp/rspamd-mailcow
    host_ip=$(get_container_ip rspamd-mailcow)
    err_c_cur=${err_count}
    # Absolute path to curl: the former relative "usr/bin/curl" only resolved
    # while the working directory was /
    SCORE=$(printf 'To: null@localhost\nFrom: watchdog@localhost\n\nEmpty\n\n' \
      | /usr/bin/curl --max-time 10 -s --data-binary @- --unix-socket /var/lib/rspamd/rspamd.sock http://rspamd/scan \
      | jq -rc .default.required_score)
    if [[ ${SCORE} != "9999" ]]; then
      echo "Rspamd settings check failed, score returned: ${SCORE}" 2>> /tmp/rspamd-mailcow 1>&2
      err_count=$(( ${err_count} + 1))
    else
      echo "Rspamd settings check succeeded, score returned: ${SCORE}" 2>> /tmp/rspamd-mailcow 1>&2
    fi
    # A dirty hack until a PING PONG event is implemented to worker proxy
    # We expect an empty response, not a timeout
    if [ "$(curl -s --max-time 10 ${host_ip}:9900 2> /dev/null ; echo $?)" == "28" ]; then
      echo "Milter check failed" 2>> /tmp/rspamd-mailcow 1>&2
      err_count=$(( ${err_count} + 1 ))
    else
      echo "Milter check succeeded" 2>> /tmp/rspamd-mailcow 1>&2
    fi
    # No new error in this round: let a positive error count decay by one
    if [ ${err_c_cur} -eq ${err_count} ] && [ ${err_count} -ge 1 ]; then
      err_count=$(( ${err_count} - 1 ))
      diff_c=1
    fi
    if [ ${err_c_cur} -ne ${err_count} ]; then
      diff_c=$(( ${err_c_cur} - ${err_count} ))
    fi
    # progress is defined elsewhere in this file; exit status 10 selects the short delay
    progress "Rspamd" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
| 722 | |
# olefy_checks
# Probes TCP port 10055 of the olefy-mailcow container with the nagios
# check_tcp plugin and adds the plugin's exit status to the error count.
# The plugin output is appended to /tmp/olefy-mailcow, which is trimmed to its
# last 50 lines at the start of every round. A round without a new error lowers
# a positive error count by one.
# Globals read:    OLEFY_THRESHOLD
# Globals written: err_count, err_c_cur, diff_c, THRESHOLD, host_ip
# Returns:         1 as soon as err_count reaches THRESHOLD (never returns 0)
olefy_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${OLEFY_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container.
  # The handler must be single-quoted: with double quotes err_count is expanded
  # while the trap is installed (always 0 here) and the handler can never fire.
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    touch /tmp/olefy-mailcow; echo "$(tail -50 /tmp/olefy-mailcow)" > /tmp/olefy-mailcow
    host_ip=$(get_container_ip olefy-mailcow)
    err_c_cur=${err_count}
    # The exit status of the plugin is added as-is, so it must be read right after the call
    /usr/lib/nagios/plugins/check_tcp -4 -H ${host_ip} -p 10055 -s "PING\n" 2>> /tmp/olefy-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    # No new error in this round: let a positive error count decay by one
    if [ ${err_c_cur} -eq ${err_count} ] && [ ${err_count} -ge 1 ]; then
      err_count=$(( ${err_count} - 1 ))
      diff_c=1
    fi
    if [ ${err_c_cur} -ne ${err_count} ]; then
      diff_c=$(( ${err_c_cur} - ${err_count} ))
    fi
    # progress is defined elsewhere in this file; exit status 10 selects the short delay
    progress "Olefy" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
| 747 | |
# Send a start notification, but only when a notification address is configured
[[ -z ${WATCHDOG_NOTIFY_EMAIL} ]] || mail_error "watchdog-mailcow" "Watchdog started monitoring mailcow."
| 752 | |
# Create watchdog agents

# spawn_check CHECK_FN LOG_MESSAGE PIPE_TOKEN
# Starts one background agent. The agent calls CHECK_FN in an endless loop;
# each time CHECK_FN returns non-zero it logs LOG_MESSAGE and writes PIPE_TOKEN
# to /tmp/com_pipe. The PID of the agent is stored in PID and appended to
# BACKGROUND_TASKS.
spawn_check() {
  local agent_fn=$1
  local agent_msg=$2
  local agent_token=$3
  (
    while true; do
      if ! "${agent_fn}"; then
        log_msg "${agent_msg}"
        echo "${agent_token}" > /tmp/com_pipe
      fi
    done
  ) &
  PID=$!
  echo "Spawned ${agent_fn} with PID ${PID}"
  BACKGROUND_TASKS+=(${PID})
}

spawn_check nginx_checks "Nginx hit error limit" nginx-mailcow

if [[ ${WATCHDOG_EXTERNAL_CHECKS} =~ ^([yY][eE][sS]|[yY])+$ ]]; then
  spawn_check external_checks "External checks hit error limit" external_checks
fi

if [[ ${WATCHDOG_MYSQL_REPLICATION_CHECKS} =~ ^([yY][eE][sS]|[yY])+$ ]]; then
  spawn_check mysql_repl_checks "MySQL replication check hit error limit" mysql_repl_checks
fi

spawn_check mysql_checks "MySQL hit error limit" mysql-mailcow
spawn_check redis_checks "Local Redis hit error limit" redis-mailcow
spawn_check phpfpm_checks "PHP-FPM hit error limit" php-fpm-mailcow

if [[ "${SKIP_SOGO}" =~ ^([nN][oO]|[nN])+$ ]]; then
  spawn_check sogo_checks "SOGo hit error limit" sogo-mailcow
fi

# An unset CHECK_UNBOUND is treated as 0 instead of producing a test error
if [ "${CHECK_UNBOUND:-0}" -eq 1 ]; then
  spawn_check unbound_checks "Unbound hit error limit" unbound-mailcow
fi

if [[ "${SKIP_CLAMD}" =~ ^([nN][oO]|[nN])+$ ]]; then
  spawn_check clamd_checks "Clamd hit error limit" clamd-mailcow
fi

spawn_check postfix_checks "Postfix hit error limit" postfix-mailcow
spawn_check mailq_checks "Mail queue hit error limit" mail_queue_status
spawn_check dovecot_checks "Dovecot hit error limit" dovecot-mailcow
# Distinct message, so the log tells replication problems apart from Dovecot itself
spawn_check dovecot_repl_checks "Dovecot replication hit error limit" dovecot_repl_checks
spawn_check rspamd_checks "Rspamd hit error limit" rspamd-mailcow
spawn_check ratelimit_checks "Ratelimit hit error limit" ratelimit
spawn_check fail2ban_checks "Fail2ban hit error limit" fail2ban
spawn_check cert_checks "Cert check hit error limit" certcheck
spawn_check olefy_checks "Olefy hit error limit" olefy-mailcow
spawn_check acme_checks "ACME client hit error limit" acme-mailcow
| 992 | |
# Monitor watchdog agents: when an agent has died, stop the script and wait for the respawn by Docker (restart:always:n)
(
  while :; do
    for worker_pid in "${BACKGROUND_TASKS[@]}"; do
      # kill -0 sends no signal, it only tests whether the process still exists
      kill -0 ${worker_pid} 1>&2 || {
        log_msg "Worker ${worker_pid} died, stopping watchdog and waiting for respawn..."
        kill -TERM 1
      }
      # Pause after every single worker, not once per pass over the list
      sleep 10
    done
  done
) &
| 1005 | |
# Monitor dockerapi: suspend all agents while dockerapi:443 is unreachable
(
  while :; do
    # Idle as long as the port answers
    until ! nc -z dockerapi 443; do
      sleep 3
    done
    log_msg "Cannot find dockerapi-mailcow, waiting to recover..."
    kill -STOP "${BACKGROUND_TASKS[@]}"
    # Poll until the port answers again
    while ! nc -z dockerapi 443; do
      sleep 3
    done
    # Resume the agents, then send them USR1
    kill -CONT "${BACKGROUND_TASKS[@]}"
    kill -USR1 "${BACKGROUND_TASKS[@]}"
  done
) &
| 1021 | |
# Actions when threshold limit is reached.
# Every line read from /tmp/com_pipe is a token written by one of the agents.
# A token either triggers a notification only or, for "*-mailcow" tokens,
# a restart of the matching container through dockerapi.
while true; do
  CONTAINER_ID=
  HAS_INITDB=
  read com_pipe_answer </tmp/com_pipe
  # Print the check's log file when one with the token's name exists and is not empty
  [ -s "/tmp/${com_pipe_answer}" ] && cat "/tmp/${com_pipe_answer}"
  case ${com_pipe_answer} in
    "ratelimit")
      log_msg "At least one ratelimit was applied"
      if [[ -n "${WATCHDOG_NOTIFY_EMAIL}" ]]; then
        mail_error "${com_pipe_answer}"
      fi
      ;;
    "mail_queue_status")
      log_msg "Mail queue status is critical"
      if [[ -n "${WATCHDOG_NOTIFY_EMAIL}" ]]; then
        mail_error "${com_pipe_answer}"
      fi
      ;;
    "external_checks")
      log_msg "Your mailcow is an open relay!"
      # Define $2 to override message text, else print service was restarted at ...
      if [[ -n "${WATCHDOG_NOTIFY_EMAIL}" ]]; then
        mail_error "${com_pipe_answer}" "Please stop mailcow now and check your network configuration!"
      fi
      ;;
    "mysql_repl_checks")
      log_msg "MySQL replication is not working properly"
      # Define $2 to override message text, else print service was restarted at ...
      # One mail per 10 minutes
      if [[ -n "${WATCHDOG_NOTIFY_EMAIL}" ]]; then
        mail_error "${com_pipe_answer}" "Please check the SQL replication status" 600
      fi
      ;;
    "dovecot_repl_checks")
      log_msg "Dovecot replication is not working properly"
      # Define $2 to override message text, else print service was restarted at ...
      # One mail per 10 minutes
      if [[ -n "${WATCHDOG_NOTIFY_EMAIL}" ]]; then
        mail_error "${com_pipe_answer}" "Please check the Dovecot replicator status" 600
      fi
      ;;
    "certcheck")
      log_msg "Certificates are about to expire"
      # Define $2 to override message text, else print service was restarted at ...
      # Only mail once a day
      if [[ -n "${WATCHDOG_NOTIFY_EMAIL}" ]]; then
        mail_error "${com_pipe_answer}" "Please renew your certificate" 86400
      fi
      ;;
    "acme-mailcow")
      # Listed before the generic container pattern below: notify only, no restart
      log_msg "acme-mailcow did not complete successfully"
      # Define $2 to override message text, else print service was restarted at ...
      if [[ -n "${WATCHDOG_NOTIFY_EMAIL}" ]]; then
        mail_error "${com_pipe_answer}" "Please check acme-mailcow for further information."
      fi
      ;;
    "fail2ban")
      # The value is split on whitespace into one array element per entry
      F2B_RES=($(timeout 4s ${REDIS_CMDLINE} --raw GET F2B_RES 2> /dev/null))
      if [[ -n "${F2B_RES}" ]]; then
        ${REDIS_CMDLINE} DEL F2B_RES > /dev/null
        host=
        for host in "${F2B_RES[@]}"; do
          log_msg "Banned ${host}"
          rm /tmp/fail2ban 2> /dev/null
          timeout 2s whois "${host}" > /tmp/fail2ban
          if [[ -n "${WATCHDOG_NOTIFY_EMAIL}" ]] && [[ ${WATCHDOG_NOTIFY_BAN} =~ ^([yY][eE][sS]|[yY])+$ ]]; then
            mail_error "${com_pipe_answer}" "IP ban: ${host}"
          fi
        done
      fi
      ;;
    ?*-mailcow*)
      # Any other token with at least one character in front of "-mailcow" names a container
      kill -STOP ${BACKGROUND_TASKS[*]}
      sleep 10
      CONTAINER_ID=$(curl --silent --insecure https://dockerapi/containers/json \
        | jq -r ".[] | {name: .Config.Labels[\"com.docker.compose.service\"], project: .Config.Labels[\"com.docker.compose.project\"], id: .Id}" \
        | jq -rc "select( .name | tostring | contains(\"${com_pipe_answer}\")) | select( .project | tostring | contains(\"${COMPOSE_PROJECT_NAME,,}\")) | .id")
      if [[ -n "${CONTAINER_ID}" ]]; then
        if [[ "${com_pipe_answer}" == "php-fpm-mailcow" ]]; then
          # Non-empty when the process list of the container contains the init_db run
          HAS_INITDB=$(curl --silent --insecure -XPOST https://dockerapi/containers/${CONTAINER_ID}/top \
            | jq '.msg.Processes[] | contains(["php -c /usr/local/etc/php -f /web/inc/init_db.inc.php"])' \
            | grep true)
        fi
        # Seconds since the container was started
        S_RUNNING=$(($(date +%s) - $(curl --silent --insecure https://dockerapi/containers/${CONTAINER_ID}/json | jq .State.StartedAt | xargs -n1 date +%s -d)))
        if [ ${S_RUNNING} -lt 360 ]; then
          log_msg "Container is running for less than 360 seconds, skipping action..."
        elif [[ -n "${HAS_INITDB}" ]]; then
          log_msg "Database is being initialized by php-fpm-mailcow, not restarting but delaying checks for a minute..."
          sleep 60
        else
          log_msg "Sending restart command to ${CONTAINER_ID}..."
          curl --silent --insecure -XPOST https://dockerapi/containers/${CONTAINER_ID}/restart
          if [[ -n "${WATCHDOG_NOTIFY_EMAIL}" ]]; then
            mail_error "${com_pipe_answer}"
          fi
          log_msg "Wait for restarted container to settle and continue watching..."
          sleep 35
        fi
      fi
      # Resume the agents, then send them USR1
      kill -CONT ${BACKGROUND_TASKS[*]}
      sleep 1
      kill -USR1 ${BACKGROUND_TASKS[*]}
      ;;
  esac
done