#!/bin/bash

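# Terminate cleanly on INT/TERM; the EXIT trap kills the whole process group,
# so every background check agent spawned below dies with the main script.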
trap "exit" INT TERM
trap "kill 0" EXIT

# Prepare
BACKGROUND_TASKS=()
echo "Waiting for containers to settle..."
sleep 30

if [[ "${USE_WATCHDOG}" =~ ^([nN][oO]|[nN])+$ ]]; then
  echo -e "$(date) - USE_WATCHDOG=n, skipping watchdog..."
  sleep 365d
  exec $(readlink -f "$0")
fi

# Checks write their corresponding container name into this pipe
if [[ ! -p /tmp/com_pipe ]]; then
  mkfifo /tmp/com_pipe
fi

# Wait for containers
while ! mysqladmin status --socket=/var/run/mysqld/mysqld.sock -u${DBUSER} -p${DBPASS} --silent; do
  echo "Waiting for SQL..."
  sleep 2
done

# Do not attempt to write to slave
if [[ ! -z ${REDIS_SLAVEOF_IP} ]]; then
  REDIS_CMDLINE="redis-cli -h ${REDIS_SLAVEOF_IP} -p ${REDIS_SLAVEOF_PORT}"
else
  REDIS_CMDLINE="redis-cli -h redis -p 6379"
fi

until [[ $(${REDIS_CMDLINE} PING) == "PONG" ]]; do
  echo "Waiting for Redis..."
  sleep 2
done

${REDIS_CMDLINE} DEL F2B_RES > /dev/null

# Common functions
get_ipv6(){
  local IPV6=
  local IPV6_SRCS=
  local TRY=
  IPV6_SRCS[0]="ip6.mailcow.email"
  IPV6_SRCS[1]="ip6.nevondo.com"
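  # Ask a randomly chosen external echo service for our IPv6 address, up to 10 times;
  # an empty result means no working outbound IPv6 connectivity.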
  until [[ ! -z ${IPV6} ]] || [[ ${TRY} -ge 10 ]]; do
    IPV6=$(curl --connect-timeout 3 -m 10 -L6s ${IPV6_SRCS[$RANDOM % ${#IPV6_SRCS[@]} ]} | grep "^\([0-9a-fA-F]\{0,4\}:\)\{1,7\}[0-9a-fA-F]\{0,4\}$")
    [[ ! -z ${TRY} ]] && sleep 1
    TRY=$((TRY+1))
  done
  echo ${IPV6}
}

array_diff() {
  # https://stackoverflow.com/questions/2312762, Alex Offshore
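  # Usage: array_diff RESULT_NAME ARRAY1_NAME ARRAY2_NAME
  # Fills RESULT_NAME with all elements of ARRAY1 that are not present in ARRAY2.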
  eval local ARR1=\(\"\${$2[@]}\"\)
  eval local ARR2=\(\"\${$3[@]}\"\)
  local IFS=$'\n'
  mapfile -t $1 < <(comm -23 <(echo "${ARR1[*]}" | sort) <(echo "${ARR2[*]}" | sort))
}

progress() {
  SERVICE=${1}
  TOTAL=${2}
  CURRENT=${3}
  DIFF=${4}
  [[ -z ${DIFF} ]] && DIFF=0
  [[ -z ${TOTAL} || -z ${CURRENT} ]] && return
  [[ ${CURRENT} -gt ${TOTAL} ]] && return
  [[ ${CURRENT} -lt 0 ]] && CURRENT=0
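  # Integer-round 100*CURRENT/TOTAL: (200*C/T) % 2 contributes 1 exactly when the
  # fractional part of 100*C/T is >= 0.5, turning floor division into rounding.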
  PERCENT=$(( 200 * ${CURRENT} / ${TOTAL} % 2 + 100 * ${CURRENT} / ${TOTAL} ))
  ${REDIS_CMDLINE} LPUSH WATCHDOG_LOG "{\"time\":\"$(date +%s)\",\"service\":\"${SERVICE}\",\"lvl\":\"${PERCENT}\",\"hpnow\":\"${CURRENT}\",\"hptotal\":\"${TOTAL}\",\"hpdiff\":\"${DIFF}\"}" > /dev/null
  log_msg "${SERVICE} health level: ${PERCENT}% (${CURRENT}/${TOTAL}), health trend: ${DIFF}" no_redis
  # Return 10 to indicate a dead service
  [ ${CURRENT} -le 0 ] && return 10
}

log_msg() {
  if [[ ${2} != "no_redis" ]]; then
    ${REDIS_CMDLINE} LPUSH WATCHDOG_LOG "{\"time\":\"$(date +%s)\",\"message\":\"$(printf '%s' "${1}" | \
      tr '\r\n%&;$"_[]{}-' ' ')\"}" > /dev/null
  fi
  echo $(date) $(printf '%s\n' "${1}")
}

function mail_error() {
  THROTTLE=
  [[ -z ${1} ]] && return 1
  # If a log file /tmp/${1} exists, its content is used as the mail body, even if ${2} is set
  [[ -z ${2} ]] && BODY="Service was restarted on $(date), please check your mailcow installation." || BODY="$(date) - ${2}"
  # If ${3} is set, mail notifications are throttled by that many seconds
  [[ ! -z ${3} ]] && THROTTLE=${3}
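  # Throttling uses a volatile Redis key: while THROTTLE_${1} still exists (its TTL has
  # not expired), further notifications for this service are suppressed.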
  if [[ ! -z ${THROTTLE} ]]; then
    TTL_LEFT="$(${REDIS_CMDLINE} TTL THROTTLE_${1} 2> /dev/null)"
    if [[ "${TTL_LEFT}" == "-2" ]]; then
      # Delay key not found, setting a delay key now
      ${REDIS_CMDLINE} SET THROTTLE_${1} 1 EX ${THROTTLE}
    else
      log_msg "Not sending notification email now, blocked for ${TTL_LEFT} seconds..."
      return 1
    fi
  fi
  WATCHDOG_NOTIFY_EMAIL=$(echo "${WATCHDOG_NOTIFY_EMAIL}" | sed 's/^"//;s/"$//')
  # Some exceptions for subject and body formats
  if [[ ${1} == "fail2ban" ]]; then
    SUBJECT="${BODY}"
    BODY="Please see netfilter-mailcow for more details and triggered rules."
  else
    SUBJECT="${WATCHDOG_SUBJECT}: ${1}"
  fi
  IFS=',' read -r -a MAIL_RCPTS <<< "${WATCHDOG_NOTIFY_EMAIL}"
  for rcpt in "${MAIL_RCPTS[@]}"; do
    RCPT_DOMAIN=
    #RCPT_MX=
    RCPT_DOMAIN=$(echo ${rcpt} | awk -F @ {'print $NF'})
    # Latest smtp-cli looks up mx via dns
    #RCPT_MX=$(dig +short ${RCPT_DOMAIN} mx | sort -n | awk '{print $2; exit}')
    #if [[ -z ${RCPT_MX} ]]; then
    #  log_msg "Cannot determine MX for ${rcpt}, skipping email notification..."
    #  return 1
    #fi
    [ -f "/tmp/${1}" ] && BODY="/tmp/${1}"
    timeout 10s ./smtp-cli --missing-modules-ok \
      --charset=UTF-8 \
      --subject="${SUBJECT}" \
      --body-plain="${BODY}" \
      --add-header="X-Priority: 1" \
      --to=${rcpt} \
      --from="watchdog@${MAILCOW_HOSTNAME}" \
      --hello-host=${MAILCOW_HOSTNAME} \
      --ipv4
    #--server="${RCPT_MX}"
    log_msg "Sent notification email to ${rcpt}"
  done
}

get_container_ip() {
  # ${1} is container
  CONTAINER_ID=()
  CONTAINER_IPS=()
  CONTAINER_IP=
  LOOP_C=1
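  # Resolve the container IP either via DNS or via the dockerapi container, retrying up
  # to 5 times; 240.0.0.0 is returned as a sentinel when resolution fails.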
  until [[ ${CONTAINER_IP} =~ ^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]] || [[ ${LOOP_C} -gt 5 ]]; do
    if [ ${IP_BY_DOCKER_API} -eq 0 ]; then
      CONTAINER_IP=$(dig a "${1}" +short)
    else
      sleep 0.5
      # get long container id for exact match
      CONTAINER_ID=($(curl --silent --insecure https://dockerapi/containers/json | jq -r ".[] | {name: .Config.Labels[\"com.docker.compose.service\"], project: .Config.Labels[\"com.docker.compose.project\"], id: .Id}" | jq -rc "select( .name | tostring == \"${1}\") | select( .project | tostring | contains(\"${COMPOSE_PROJECT_NAME,,}\")) | .id"))
      # returned id can have multiple elements (if scaled), shuffle for random test
      CONTAINER_ID=($(printf "%s\n" "${CONTAINER_ID[@]}" | shuf))
      if [[ ! -z ${CONTAINER_ID} ]]; then
        for matched_container in "${CONTAINER_ID[@]}"; do
          CONTAINER_IPS=($(curl --silent --insecure https://dockerapi/containers/${matched_container}/json | jq -r '.NetworkSettings.Networks[].IPAddress'))
          for ip_match in "${CONTAINER_IPS[@]}"; do
            # grep will do nothing if one of these vars is empty
            [[ -z ${ip_match} ]] && continue
            [[ -z ${IPV4_NETWORK} ]] && continue
            # only return ips that are part of our network
            if ! grep -q ${IPV4_NETWORK} <(echo ${ip_match}); then
              continue
            else
              CONTAINER_IP=${ip_match}
              break
            fi
          done
          [[ ! -z ${CONTAINER_IP} ]] && break
        done
      fi
    fi
    LOOP_C=$((LOOP_C + 1))
  done
  [[ ${LOOP_C} -gt 5 ]] && echo 240.0.0.0 || echo ${CONTAINER_IP}
}

# One-time check
if grep -qi "$(echo ${IPV6_NETWORK} | cut -d: -f1-3)" <<< "$(ip a s)"; then
  if [[ -z "$(get_ipv6)" ]]; then
    mail_error "ipv6-config" "enable_ipv6 is true in docker-compose.yml, but an IPv6 link could not be established. Please verify your IPv6 connection."
  fi
fi

external_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${EXTERNAL_CHECKS_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container
  GUID=$(mysql -u${DBUSER} -p${DBPASS} ${DBNAME} -e "SELECT version FROM versions WHERE application = 'GUID'" -BN)
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    err_c_cur=${err_count}
    CHECK_RESPONSE="$(curl --connect-timeout 3 -m 10 -4 -s https://checks.mailcow.email -X POST -dguid=${GUID} 2> /dev/null)"
    if [[ ! -z "${CHECK_RESPONSE}" ]] && [[ "$(echo ${CHECK_RESPONSE} | jq -r .response)" == "critical" ]]; then
      echo ${CHECK_RESPONSE} | jq -r .out > /tmp/external_checks
      err_count=$(( ${err_count} + 1 ))
    fi
    CHECK_RESPONSE6="$(curl --connect-timeout 3 -m 10 -6 -s https://checks.mailcow.email -X POST -dguid=${GUID} 2> /dev/null)"
    if [[ ! -z "${CHECK_RESPONSE6}" ]] && [[ "$(echo ${CHECK_RESPONSE6} | jq -r .response)" == "critical" ]]; then
      echo ${CHECK_RESPONSE6} | jq -r .out > /tmp/external_checks
      err_count=$(( ${err_count} + 1 ))
    fi
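    # No new error this round: decay err_count by one (never below zero) and report a
    # positive health trend; otherwise report the (negative) change since the last round.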
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "External checks" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 60
    else
      diff_c=0
      sleep $(( ( RANDOM % 20 ) + 1800 ))
    fi
  done
  return 1
}

nginx_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${NGINX_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    touch /tmp/nginx-mailcow; echo "$(tail -50 /tmp/nginx-mailcow)" > /tmp/nginx-mailcow
    host_ip=$(get_container_ip nginx-mailcow)
    err_c_cur=${err_count}
    /usr/lib/nagios/plugins/check_http -4 -H ${host_ip} -u / -p 8081 2>> /tmp/nginx-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "Nginx" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}

unbound_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${UNBOUND_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    touch /tmp/unbound-mailcow; echo "$(tail -50 /tmp/unbound-mailcow)" > /tmp/unbound-mailcow
    host_ip=$(get_container_ip unbound-mailcow)
    err_c_cur=${err_count}
    /usr/bin/nslookup -sil stackoverflow.com "${host_ip}" 2>> /tmp/unbound-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
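    # Besides plain resolution, verify DNSSEC validation: the response for "com" must
    # carry the AD (authenticated data) flag.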
    DNSSEC=$(dig com +dnssec "@${host_ip}" | egrep 'flags:.+ad')
    if [[ -z ${DNSSEC} ]]; then
      echo "DNSSEC failure" 2>> /tmp/unbound-mailcow 1>&2
      err_count=$(( ${err_count} + 1))
    else
      echo "DNSSEC check succeeded" 2>> /tmp/unbound-mailcow 1>&2
    fi
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "Unbound" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}

redis_checks() {
  # A check for the local redis container
  err_count=0
  diff_c=0
  THRESHOLD=${REDIS_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    touch /tmp/redis-mailcow; echo "$(tail -50 /tmp/redis-mailcow)" > /tmp/redis-mailcow
    host_ip=$(get_container_ip redis-mailcow)
    err_c_cur=${err_count}
    /usr/lib/nagios/plugins/check_tcp -4 -H redis-mailcow -p 6379 -E -s "PING\n" -q "QUIT" -e "PONG" 2>> /tmp/redis-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "Redis" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}

mysql_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${MYSQL_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    touch /tmp/mysql-mailcow; echo "$(tail -50 /tmp/mysql-mailcow)" > /tmp/mysql-mailcow
    err_c_cur=${err_count}
    /usr/lib/nagios/plugins/check_mysql -s /var/run/mysqld/mysqld.sock -u ${DBUSER} -p ${DBPASS} -d ${DBNAME} 2>> /tmp/mysql-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    /usr/lib/nagios/plugins/check_mysql_query -s /var/run/mysqld/mysqld.sock -u ${DBUSER} -p ${DBPASS} -d ${DBNAME} -q "SELECT COUNT(*) FROM information_schema.tables" 2>> /tmp/mysql-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "MySQL/MariaDB" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}

mysql_repl_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${MYSQL_REPLICATION_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    touch /tmp/mysql_repl_checks; echo "$(tail -50 /tmp/mysql_repl_checks)" > /tmp/mysql_repl_checks
    err_c_cur=${err_count}
    /usr/lib/nagios/plugins/check_mysql_slavestatus.sh -S /var/run/mysqld/mysqld.sock -u root -p ${DBROOT} 2>> /tmp/mysql_repl_checks 1>&2; err_count=$(( ${err_count} + $? ))
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "MySQL/MariaDB replication" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 60
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}

sogo_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${SOGO_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    touch /tmp/sogo-mailcow; echo "$(tail -50 /tmp/sogo-mailcow)" > /tmp/sogo-mailcow
    host_ip=$(get_container_ip sogo-mailcow)
    err_c_cur=${err_count}
    /usr/lib/nagios/plugins/check_http -4 -H ${host_ip} -u /SOGo.index/ -p 20000 -R "SOGo\.MainUI" 2>> /tmp/sogo-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "SOGo" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}

postfix_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${POSTFIX_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    touch /tmp/postfix-mailcow; echo "$(tail -50 /tmp/postfix-mailcow)" > /tmp/postfix-mailcow
    host_ip=$(get_container_ip postfix-mailcow)
    err_c_cur=${err_count}
    /usr/lib/nagios/plugins/check_smtp -4 -H ${host_ip} -p 589 -f "watchdog@invalid" -C "RCPT TO:watchdog@localhost" -C DATA -C . -R 250 2>> /tmp/postfix-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    /usr/lib/nagios/plugins/check_smtp -4 -H ${host_ip} -p 589 -S 2>> /tmp/postfix-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "Postfix" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}

clamd_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${CLAMD_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    touch /tmp/clamd-mailcow; echo "$(tail -50 /tmp/clamd-mailcow)" > /tmp/clamd-mailcow
    host_ip=$(get_container_ip clamd-mailcow)
    err_c_cur=${err_count}
    /usr/lib/nagios/plugins/check_clamd -4 -H ${host_ip} 2>> /tmp/clamd-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "Clamd" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 120 ) + 20 ))
    fi
  done
  return 1
}

dovecot_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${DOVECOT_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    touch /tmp/dovecot-mailcow; echo "$(tail -50 /tmp/dovecot-mailcow)" > /tmp/dovecot-mailcow
    host_ip=$(get_container_ip dovecot-mailcow)
    err_c_cur=${err_count}
    /usr/lib/nagios/plugins/check_smtp -4 -H ${host_ip} -p 24 -f "watchdog@invalid" -C "RCPT TO:<watchdog@invalid>" -L -R "User doesn't exist" 2>> /tmp/dovecot-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    /usr/lib/nagios/plugins/check_imap -4 -H ${host_ip} -p 993 -S -e "OK " 2>> /tmp/dovecot-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    /usr/lib/nagios/plugins/check_imap -4 -H ${host_ip} -p 143 -e "OK " 2>> /tmp/dovecot-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    /usr/lib/nagios/plugins/check_tcp -4 -H ${host_ip} -p 10001 -e "VERSION" 2>> /tmp/dovecot-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    /usr/lib/nagios/plugins/check_tcp -4 -H ${host_ip} -p 4190 -e "Dovecot ready" 2>> /tmp/dovecot-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "Dovecot" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}

dovecot_repl_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${DOVECOT_REPL_THRESHOLD}
  D_REPL_STATUS=$(redis-cli --raw -h redis GET DOVECOT_REPL_HEALTH)
  # Reduce error count by 2 after restarting an unhealthy container
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    err_c_cur=${err_count}
    D_REPL_STATUS=$(redis-cli --raw -h redis GET DOVECOT_REPL_HEALTH)
    if [[ "${D_REPL_STATUS}" != "1" ]]; then
      err_count=$(( ${err_count} + 1 ))
    fi
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "Dovecot replication" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 60
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}

cert_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=7
  # Reduce error count by 2 after restarting an unhealthy container
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    touch /tmp/certcheck; echo "$(tail -50 /tmp/certcheck)" > /tmp/certcheck
    host_ip_postfix=$(get_container_ip postfix-mailcow)
    host_ip_dovecot=$(get_container_ip dovecot-mailcow)
    err_c_cur=${err_count}
    /usr/lib/nagios/plugins/check_smtp -H ${host_ip_postfix} -p 589 -4 -S -D 7 2>> /tmp/certcheck 1>&2; err_count=$(( ${err_count} + $? ))
    /usr/lib/nagios/plugins/check_imap -H ${host_ip_dovecot} -p 993 -4 -S -D 7 2>> /tmp/certcheck 1>&2; err_count=$(( ${err_count} + $? ))
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "Primary certificate expiry check" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    # Always sleep 5 minutes, mail notifications are limited
    sleep 300
  done
  return 1
}

phpfpm_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${PHPFPM_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    touch /tmp/php-fpm-mailcow; echo "$(tail -50 /tmp/php-fpm-mailcow)" > /tmp/php-fpm-mailcow
    host_ip=$(get_container_ip php-fpm-mailcow)
    err_c_cur=${err_count}
    /usr/lib/nagios/plugins/check_tcp -H ${host_ip} -p 9001 2>> /tmp/php-fpm-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    /usr/lib/nagios/plugins/check_tcp -H ${host_ip} -p 9002 2>> /tmp/php-fpm-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "PHP-FPM" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}

ratelimit_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${RATELIMIT_THRESHOLD}
  RL_LOG_STATUS=$(redis-cli -h redis LRANGE RL_LOG 0 0 | jq .qid)
  # Reduce error count by 2 after restarting an unhealthy container
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    err_c_cur=${err_count}
    RL_LOG_STATUS_PREV=${RL_LOG_STATUS}
    RL_LOG_STATUS=$(redis-cli -h redis LRANGE RL_LOG 0 0 | jq .qid)
    if [[ ${RL_LOG_STATUS_PREV} != ${RL_LOG_STATUS} ]]; then
      err_count=$(( ${err_count} + 1 ))
      echo 'Last 10 applied ratelimits (may overlap with previous reports).' > /tmp/ratelimit
      echo 'Full ratelimit buckets can be emptied by deleting the ratelimit hash from within mailcow UI (see /debug -> Protocols -> Ratelimit):' >> /tmp/ratelimit
      echo >> /tmp/ratelimit
      redis-cli --raw -h redis LRANGE RL_LOG 0 10 | jq . >> /tmp/ratelimit
    fi
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "Ratelimit" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}

mailq_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${MAILQ_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    touch /tmp/mail_queue_status; echo "$(tail -50 /tmp/mail_queue_status)" > /tmp/mail_queue_status
    MAILQ_LOG_STATUS=$(find /var/spool/postfix/deferred -type f | wc -l)
    echo "Mail queue contains ${MAILQ_LOG_STATUS} items (critical limit is ${MAILQ_CRIT}) at $(date)" >> /tmp/mail_queue_status
    err_c_cur=${err_count}
    if [ ${MAILQ_LOG_STATUS} -ge ${MAILQ_CRIT} ]; then
      err_count=$(( ${err_count} + 1 ))
      echo "Mail queue contains ${MAILQ_LOG_STATUS} items (critical limit is ${MAILQ_CRIT}) at $(date)" >> /tmp/mail_queue_status
    fi
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "Mail queue" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 60
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}

fail2ban_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${FAIL2BAN_THRESHOLD}
  F2B_LOG_STATUS=($(${REDIS_CMDLINE} --raw HKEYS F2B_ACTIVE_BANS))
  F2B_RES=
  # Reduce error count by 2 after restarting an unhealthy container
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
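  # Compare the current set of active bans against the previous snapshot; any newly
  # banned address counts as an event and is stored in F2B_RES for the main loop below.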
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    err_c_cur=${err_count}
    F2B_LOG_STATUS_PREV=(${F2B_LOG_STATUS[@]})
    F2B_LOG_STATUS=($(${REDIS_CMDLINE} --raw HKEYS F2B_ACTIVE_BANS))
    array_diff F2B_RES F2B_LOG_STATUS F2B_LOG_STATUS_PREV
    if [[ ! -z "${F2B_RES}" ]]; then
      err_count=$(( ${err_count} + 1 ))
      echo -n "${F2B_RES[@]}" | tr -cd "[a-fA-F0-9.:/] " | timeout 3s ${REDIS_CMDLINE} -x SET F2B_RES > /dev/null
      if [ $? -ne 0 ]; then
        ${REDIS_CMDLINE} -x DEL F2B_RES
      fi
    fi
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "Fail2ban" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}

acme_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${ACME_THRESHOLD}
  ACME_LOG_STATUS=$(redis-cli -h redis GET ACME_FAIL_TIME)
  if [[ -z "${ACME_LOG_STATUS}" ]]; then
    ${REDIS_CMDLINE} SET ACME_FAIL_TIME 0
    ACME_LOG_STATUS=0
  fi
  # Reduce error count by 2 after restarting an unhealthy container
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    err_c_cur=${err_count}
    ACME_LOG_STATUS_PREV=${ACME_LOG_STATUS}
    ACME_LC=0
    until [[ ! -z ${ACME_LOG_STATUS} ]] || [ ${ACME_LC} -ge 3 ]; do
      ACME_LOG_STATUS=$(redis-cli -h redis GET ACME_FAIL_TIME 2> /dev/null)
      sleep 3
      ACME_LC=$((ACME_LC+1))
    done
    if [[ ${ACME_LOG_STATUS_PREV} != ${ACME_LOG_STATUS} ]]; then
      err_count=$(( ${err_count} + 1 ))
    fi
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "ACME" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}

rspamd_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${RSPAMD_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    touch /tmp/rspamd-mailcow; echo "$(tail -50 /tmp/rspamd-mailcow)" > /tmp/rspamd-mailcow
    host_ip=$(get_container_ip rspamd-mailcow)
    err_c_cur=${err_count}
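    # Feed a minimal test message to the rspamd scan socket; the watchdog settings map is
    # expected to force a required_score of 9999 for it, so any other value indicates
    # broken settings.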
    SCORE=$(echo 'To: null@localhost
From: watchdog@localhost

Empty
' | /usr/bin/curl --max-time 10 -s --data-binary @- --unix-socket /var/lib/rspamd/rspamd.sock http://rspamd/scan | jq -rc .default.required_score)
    if [[ ${SCORE} != "9999" ]]; then
      echo "Rspamd settings check failed, score returned: ${SCORE}" 2>> /tmp/rspamd-mailcow 1>&2
      err_count=$(( ${err_count} + 1))
    else
      echo "Rspamd settings check succeeded, score returned: ${SCORE}" 2>> /tmp/rspamd-mailcow 1>&2
    fi
    # A dirty hack until a PING PONG event is implemented in the worker proxy
    # We expect an empty response, not a timeout
    if [ "$(curl -s --max-time 10 ${host_ip}:9900 2> /dev/null ; echo $?)" == "28" ]; then
      echo "Milter check failed" 2>> /tmp/rspamd-mailcow 1>&2; err_count=$(( ${err_count} + 1 ));
    else
      echo "Milter check succeeded" 2>> /tmp/rspamd-mailcow 1>&2
    fi
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "Rspamd" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}

olefy_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${OLEFY_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    touch /tmp/olefy-mailcow; echo "$(tail -50 /tmp/olefy-mailcow)" > /tmp/olefy-mailcow
    host_ip=$(get_container_ip olefy-mailcow)
    err_c_cur=${err_count}
    /usr/lib/nagios/plugins/check_tcp -4 -H ${host_ip} -p 10055 -s "PING\n" 2>> /tmp/olefy-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "Olefy" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}

# Notify about start
if [[ ! -z ${WATCHDOG_NOTIFY_EMAIL} ]]; then
  mail_error "watchdog-mailcow" "Watchdog started monitoring mailcow."
fi

# Create watchdog agents

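# Each check runs in its own backgrounded subshell loop. When a check function returns
# non-zero (its error threshold was reached), the affected service name is written to
# /tmp/com_pipe, where the dispatcher loop at the end of this script picks it up.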
(
while true; do
  if ! nginx_checks; then
    log_msg "Nginx hit error limit"
    echo nginx-mailcow > /tmp/com_pipe
  fi
done
) &
PID=$!
echo "Spawned nginx_checks with PID ${PID}"
BACKGROUND_TASKS+=(${PID})

if [[ ${WATCHDOG_EXTERNAL_CHECKS} =~ ^([yY][eE][sS]|[yY])+$ ]]; then
  (
  while true; do
    if ! external_checks; then
      log_msg "External checks hit error limit"
      echo external_checks > /tmp/com_pipe
    fi
  done
  ) &
  PID=$!
  echo "Spawned external_checks with PID ${PID}"
  BACKGROUND_TASKS+=(${PID})
fi

if [[ ${WATCHDOG_MYSQL_REPLICATION_CHECKS} =~ ^([yY][eE][sS]|[yY])+$ ]]; then
  (
  while true; do
    if ! mysql_repl_checks; then
      log_msg "MySQL replication check hit error limit"
      echo mysql_repl_checks > /tmp/com_pipe
    fi
  done
  ) &
  PID=$!
  echo "Spawned mysql_repl_checks with PID ${PID}"
  BACKGROUND_TASKS+=(${PID})
fi

(
while true; do
  if ! mysql_checks; then
    log_msg "MySQL hit error limit"
    echo mysql-mailcow > /tmp/com_pipe
  fi
done
) &
PID=$!
echo "Spawned mysql_checks with PID ${PID}"
BACKGROUND_TASKS+=(${PID})

(
while true; do
  if ! redis_checks; then
    log_msg "Local Redis hit error limit"
    echo redis-mailcow > /tmp/com_pipe
  fi
done
) &
PID=$!
echo "Spawned redis_checks with PID ${PID}"
BACKGROUND_TASKS+=(${PID})

(
while true; do
  if ! phpfpm_checks; then
    log_msg "PHP-FPM hit error limit"
    echo php-fpm-mailcow > /tmp/com_pipe
  fi
done
) &
PID=$!
echo "Spawned phpfpm_checks with PID ${PID}"
BACKGROUND_TASKS+=(${PID})

if [[ "${SKIP_SOGO}" =~ ^([nN][oO]|[nN])+$ ]]; then
  (
  while true; do
    if ! sogo_checks; then
      log_msg "SOGo hit error limit"
      echo sogo-mailcow > /tmp/com_pipe
    fi
  done
  ) &
  PID=$!
  echo "Spawned sogo_checks with PID ${PID}"
  BACKGROUND_TASKS+=(${PID})
fi

if [ ${CHECK_UNBOUND} -eq 1 ]; then
  (
  while true; do
    if ! unbound_checks; then
      log_msg "Unbound hit error limit"
      echo unbound-mailcow > /tmp/com_pipe
    fi
  done
  ) &
  PID=$!
  echo "Spawned unbound_checks with PID ${PID}"
  BACKGROUND_TASKS+=(${PID})
fi

if [[ "${SKIP_CLAMD}" =~ ^([nN][oO]|[nN])+$ ]]; then
  (
  while true; do
    if ! clamd_checks; then
      log_msg "Clamd hit error limit"
      echo clamd-mailcow > /tmp/com_pipe
    fi
  done
  ) &
  PID=$!
  echo "Spawned clamd_checks with PID ${PID}"
  BACKGROUND_TASKS+=(${PID})
fi

(
while true; do
  if ! postfix_checks; then
    log_msg "Postfix hit error limit"
    echo postfix-mailcow > /tmp/com_pipe
  fi
done
) &
PID=$!
echo "Spawned postfix_checks with PID ${PID}"
BACKGROUND_TASKS+=(${PID})

(
while true; do
  if ! mailq_checks; then
    log_msg "Mail queue hit error limit"
    echo mail_queue_status > /tmp/com_pipe
  fi
done
) &
PID=$!
echo "Spawned mailq_checks with PID ${PID}"
BACKGROUND_TASKS+=(${PID})

(
while true; do
  if ! dovecot_checks; then
    log_msg "Dovecot hit error limit"
    echo dovecot-mailcow > /tmp/com_pipe
  fi
done
) &
PID=$!
echo "Spawned dovecot_checks with PID ${PID}"
BACKGROUND_TASKS+=(${PID})

(
while true; do
  if ! dovecot_repl_checks; then
| 894 | log_msg "Dovecot hit error limit" |
    echo dovecot_repl_checks > /tmp/com_pipe
  fi
done
) &
PID=$!
echo "Spawned dovecot_repl_checks with PID ${PID}"
BACKGROUND_TASKS+=(${PID})

(
while true; do
  if ! rspamd_checks; then
    log_msg "Rspamd hit error limit"
    echo rspamd-mailcow > /tmp/com_pipe
  fi
done
) &
PID=$!
echo "Spawned rspamd_checks with PID ${PID}"
BACKGROUND_TASKS+=(${PID})

(
while true; do
  if ! ratelimit_checks; then
    log_msg "Ratelimit hit error limit"
    echo ratelimit > /tmp/com_pipe
  fi
done
) &
PID=$!
echo "Spawned ratelimit_checks with PID ${PID}"
BACKGROUND_TASKS+=(${PID})

(
while true; do
  if ! fail2ban_checks; then
    log_msg "Fail2ban hit error limit"
    echo fail2ban > /tmp/com_pipe
  fi
done
) &
PID=$!
echo "Spawned fail2ban_checks with PID ${PID}"
BACKGROUND_TASKS+=(${PID})

(
while true; do
  if ! cert_checks; then
    log_msg "Cert check hit error limit"
    echo certcheck > /tmp/com_pipe
  fi
done
) &
PID=$!
echo "Spawned cert_checks with PID ${PID}"
BACKGROUND_TASKS+=(${PID})

(
while true; do
  if ! olefy_checks; then
    log_msg "Olefy hit error limit"
    echo olefy-mailcow > /tmp/com_pipe
  fi
done
) &
PID=$!
echo "Spawned olefy_checks with PID ${PID}"
BACKGROUND_TASKS+=(${PID})

(
while true; do
  if ! acme_checks; then
    log_msg "ACME client hit error limit"
    echo acme-mailcow > /tmp/com_pipe
  fi
done
) &
PID=$!
echo "Spawned acme_checks with PID ${PID}"
BACKGROUND_TASKS+=(${PID})

# Monitor watchdog agents, stop the script when an agent fails and wait for respawn by Docker (restart: always)
(
while true; do
  for bg_task in ${BACKGROUND_TASKS[*]}; do
    if ! kill -0 ${bg_task} 1>&2; then
      log_msg "Worker ${bg_task} died, stopping watchdog and waiting for respawn..."
      kill -TERM 1
    fi
    sleep 10
  done
done
) &

# Monitor dockerapi
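# While dockerapi is unreachable, pause all agents with SIGSTOP; once it is back, resume
# them with SIGCONT and send SIGUSR1 so they relax their error counters.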
(
while true; do
  while nc -z dockerapi 443; do
    sleep 3
  done
  log_msg "Cannot find dockerapi-mailcow, waiting to recover..."
  kill -STOP ${BACKGROUND_TASKS[*]}
  until nc -z dockerapi 443; do
    sleep 3
  done
  kill -CONT ${BACKGROUND_TASKS[*]}
  kill -USR1 ${BACKGROUND_TASKS[*]}
done
) &

# Actions when threshold limit is reached
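# The dispatcher reads service names from the FIFO and mails a notification; for
# *-mailcow services it also restarts the affected container via dockerapi, pausing all
# agents during the restart and sending SIGUSR1 afterwards so they reduce their error
# counters.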
while true; do
  CONTAINER_ID=
  HAS_INITDB=
  read com_pipe_answer </tmp/com_pipe
  if [ -s "/tmp/${com_pipe_answer}" ]; then
    cat "/tmp/${com_pipe_answer}"
  fi
  if [[ ${com_pipe_answer} == "ratelimit" ]]; then
    log_msg "At least one ratelimit was applied"
    [[ ! -z ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}"
  elif [[ ${com_pipe_answer} == "mail_queue_status" ]]; then
    log_msg "Mail queue status is critical"
    [[ ! -z ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}"
  elif [[ ${com_pipe_answer} == "external_checks" ]]; then
    log_msg "Your mailcow is an open relay!"
    # Define $2 to override message text, else print service was restarted at ...
    [[ ! -z ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}" "Please stop mailcow now and check your network configuration!"
  elif [[ ${com_pipe_answer} == "mysql_repl_checks" ]]; then
    log_msg "MySQL replication is not working properly"
    # Define $2 to override message text, else print service was restarted at ...
    # One mail per 10 minutes
    [[ ! -z ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}" "Please check the SQL replication status" 600
  elif [[ ${com_pipe_answer} == "dovecot_repl_checks" ]]; then
    log_msg "Dovecot replication is not working properly"
    # Define $2 to override message text, else print service was restarted at ...
    # One mail per 10 minutes
    [[ ! -z ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}" "Please check the Dovecot replicator status" 600
  elif [[ ${com_pipe_answer} == "certcheck" ]]; then
    log_msg "Certificates are about to expire"
    # Define $2 to override message text, else print service was restarted at ...
    # Only mail once a day
    [[ ! -z ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}" "Please renew your certificate" 86400
  elif [[ ${com_pipe_answer} == "acme-mailcow" ]]; then
    log_msg "acme-mailcow did not complete successfully"
    # Define $2 to override message text, else print service was restarted at ...
    [[ ! -z ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}" "Please check acme-mailcow for further information."
  elif [[ ${com_pipe_answer} == "fail2ban" ]]; then
    F2B_RES=($(timeout 4s ${REDIS_CMDLINE} --raw GET F2B_RES 2> /dev/null))
    if [[ ! -z "${F2B_RES}" ]]; then
      ${REDIS_CMDLINE} DEL F2B_RES > /dev/null
      host=
      for host in "${F2B_RES[@]}"; do
        log_msg "Banned ${host}"
        rm /tmp/fail2ban 2> /dev/null
        timeout 2s whois "${host}" > /tmp/fail2ban
        [[ ! -z ${WATCHDOG_NOTIFY_EMAIL} ]] && [[ ${WATCHDOG_NOTIFY_BAN} =~ ^([yY][eE][sS]|[yY])+$ ]] && mail_error "${com_pipe_answer}" "IP ban: ${host}"
      done
    fi
  elif [[ ${com_pipe_answer} =~ .+-mailcow ]]; then
    kill -STOP ${BACKGROUND_TASKS[*]}
    sleep 10
    CONTAINER_ID=$(curl --silent --insecure https://dockerapi/containers/json | jq -r ".[] | {name: .Config.Labels[\"com.docker.compose.service\"], project: .Config.Labels[\"com.docker.compose.project\"], id: .Id}" | jq -rc "select( .name | tostring | contains(\"${com_pipe_answer}\")) | select( .project | tostring | contains(\"${COMPOSE_PROJECT_NAME,,}\")) | .id")
    if [[ ! -z ${CONTAINER_ID} ]]; then
      if [[ "${com_pipe_answer}" == "php-fpm-mailcow" ]]; then
        HAS_INITDB=$(curl --silent --insecure -XPOST https://dockerapi/containers/${CONTAINER_ID}/top | jq '.msg.Processes[] | contains(["php -c /usr/local/etc/php -f /web/inc/init_db.inc.php"])' | grep true)
      fi
      S_RUNNING=$(($(date +%s) - $(curl --silent --insecure https://dockerapi/containers/${CONTAINER_ID}/json | jq .State.StartedAt | xargs -n1 date +%s -d)))
      if [ ${S_RUNNING} -lt 360 ]; then
        log_msg "Container is running for less than 360 seconds, skipping action..."
      elif [[ ! -z ${HAS_INITDB} ]]; then
        log_msg "Database is being initialized by php-fpm-mailcow, not restarting but delaying checks for a minute..."
        sleep 60
      else
        log_msg "Sending restart command to ${CONTAINER_ID}..."
        curl --silent --insecure -XPOST https://dockerapi/containers/${CONTAINER_ID}/restart
        [[ ! -z ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}"
        log_msg "Wait for restarted container to settle and continue watching..."
        sleep 35
      fi
    fi
    kill -CONT ${BACKGROUND_TASKS[*]}
    sleep 1
    kill -USR1 ${BACKGROUND_TASKS[*]}
  fi
done