#!/bin/bash
2
# Leave via "exit" on INT/TERM so the EXIT trap below always runs; that trap
# signals the whole process group (kill 0), taking background jobs down too.
trap "exit" INT TERM
trap "kill 0" EXIT

# Prepare
BACKGROUND_TASKS=()
echo "Waiting for containers to settle..."
sleep 30

# Watchdog disabled: idle for a year, then re-exec this script to idle again
if [[ "${USE_WATCHDOG}" =~ ^([nN][oO]|[nN])+$ ]]; then
  echo -e "$(date) - USE_WATCHDOG=n, skipping watchdog..."
  sleep 365d
  # Quoted so a script path containing whitespace is still one word
  exec "$(readlink -f "$0")"
fi

# Checks pipe their corresponding container name in this pipe
if [[ ! -p /tmp/com_pipe ]]; then
  mkfifo /tmp/com_pipe
fi

# Wait for containers
# Credentials are quoted so special characters in them survive word splitting
while ! mysqladmin status --socket=/var/run/mysqld/mysqld.sock -u"${DBUSER}" -p"${DBPASS}" --silent; do
  echo "Waiting for SQL..."
  sleep 2
done

# Do not attempt to write to slave
# REDIS_CMDLINE stays a plain string: it is expanded unquoted (word-split)
# wherever it is used as a command line.
if [[ -n "${REDIS_SLAVEOF_IP}" ]]; then
  REDIS_CMDLINE="redis-cli -h ${REDIS_SLAVEOF_IP} -p ${REDIS_SLAVEOF_PORT}"
else
  REDIS_CMDLINE="redis-cli -h redis -p 6379"
fi

until [[ $(${REDIS_CMDLINE} PING) == "PONG" ]]; do
  echo "Waiting for Redis..."
  sleep 2
done

${REDIS_CMDLINE} DEL F2B_RES > /dev/null
41
42# Common functions
# Determine the IPv6 address this host is seen with.
# Queries a randomly chosen lookup service over IPv6, up to 10 times with a
# one-second pause between attempts, and keeps only output that looks like an
# IPv6 address.
# Outputs: the address on stdout, or an empty line when no attempt succeeded.
get_ipv6() {
  local ipv6=
  local -a ipv6_srcs=("ip6.korves.net" "ip6.mailcow.email")
  local try=0
  while [[ -z ${ipv6} ]] && (( try < 10 )); do
    # Pause only between attempts: never before the first, never after a hit
    (( try > 0 )) && sleep 1
    ipv6=$(curl --connect-timeout 3 -m 10 -L6s "${ipv6_srcs[RANDOM % ${#ipv6_srcs[@]}]}" \
      | grep '^\([0-9a-fA-F]\{0,4\}:\)\{1,7\}[0-9a-fA-F]\{0,4\}$')
    try=$(( try + 1 ))
  done
  echo "${ipv6}"
}
56
# Set difference of two arrays, passed by name.
# Stores in the array named by $1 every element of array $2 that does not
# occur in array $3; the result is sorted (C locale).
# Based on https://stackoverflow.com/questions/2312762, Alex Offshore
# Arguments:
#   $1 - name of the result array (overwritten)
#   $2 - name of the array to filter
#   $3 - name of the array holding the elements to remove
array_diff() {
  # Indirect expansion instead of eval: the names are never executed as code
  local __ad_ref1="${2}[@]" __ad_ref2="${3}[@]"
  local -a __ad_arr1=("${!__ad_ref1}") __ad_arr2=("${!__ad_ref2}")
  # An empty array must produce no line at all (printf would emit one empty
  # line, which then shows up as a bogus empty element in the result).
  mapfile -t "$1" < <(
    LC_ALL=C comm -23 \
      <( (( ${#__ad_arr1[@]} )) && printf '%s\n' "${__ad_arr1[@]}" | LC_ALL=C sort ) \
      <( (( ${#__ad_arr2[@]} )) && printf '%s\n' "${__ad_arr2[@]}" | LC_ALL=C sort )
  )
}
64
# Record and log the health level of a monitored service.
# Pushes a JSON entry to the WATCHDOG_LOG list in Redis and prints a log line.
# Globals:   REDIS_CMDLINE (read)
# Arguments:
#   $1 - service name
#   $2 - total health points
#   $3 - current health points (negative values are treated as 0)
#   $4 - health trend, defaults to 0
# Returns:
#   10 when the current health is 0 or less (dead service), 0 otherwise.
#   Nothing is logged when $2 or $3 is missing or $3 exceeds $2.
progress() {
  local service=${1}
  local total=${2}
  local current=${3}
  local diff=${4:-0}
  local percent=0
  [[ -z ${total} || -z ${current} ]] && return 0
  [[ ${current} -gt ${total} ]] && return 0
  [[ ${current} -lt 0 ]] && current=0
  # Percentage rounded to the nearest integer; skipped for a non-positive
  # total so the division cannot fail.
  if [[ ${total} -gt 0 ]]; then
    percent=$(( 200 * current / total % 2 + 100 * current / total ))
  fi
  ${REDIS_CMDLINE} LPUSH WATCHDOG_LOG "{\"time\":\"$(date +%s)\",\"service\":\"${service}\",\"lvl\":\"${percent}\",\"hpnow\":\"${current}\",\"hptotal\":\"${total}\",\"hpdiff\":\"${diff}\"}" > /dev/null
  log_msg "${service} health level: ${percent}% (${current}/${total}), health trend: ${diff}" no_redis
  # Return 10 to indicate a dead service
  [[ ${current} -le 0 ]] && return 10
  return 0
}
80
# Print a timestamped log line and, unless $2 is "no_redis", also push the
# message to the WATCHDOG_LOG list in Redis.
# Globals:   REDIS_CMDLINE (read)
# Arguments:
#   $1 - message text
#   $2 - pass "no_redis" to skip the Redis entry
log_msg() {
  local sanitized
  if [[ ${2} != "no_redis" ]]; then
    # Characters that could break the hand-built JSON are blanked out
    sanitized=$(printf '%s' "${1}" | tr '\r\n%&;$"_[]{}-' ' ')
    ${REDIS_CMDLINE} LPUSH WATCHDOG_LOG "{\"time\":\"$(date +%s)\",\"message\":\"${sanitized}\"}" > /dev/null
  fi
  # Quoted output: the message is not glob-expanded or word-split.
  # Newlines are folded into spaces to keep one log line per message.
  printf '%s %s\n' "$(date)" "${1//$'\n'/ }"
}
88
# Send a watchdog notification mail to every recipient listed in the
# comma-separated WATCHDOG_NOTIFY_EMAIL.
# Globals:
#   REDIS_CMDLINE, MAILCOW_HOSTNAME (read)
#   WATCHDOG_NOTIFY_EMAIL (read; rewritten without its first and its trailing
#   double quote)
# Arguments:
#   $1 - notification name (required); used for the subject and the throttle
#        key. If a file "/tmp/$1" exists, that path is handed to smtp-cli as
#        the body, even if $2 is set.
#   $2 - optional body text; a generic "Service was restarted" text otherwise
#   $3 - optional throttle window in seconds
# Returns:
#   1 when $1 is missing or the notification is currently throttled, else 0.
mail_error() {
  local throttle=${3}
  local body subject ttl_left rcpt
  local -a mail_rcpts=()
  [[ -z ${1} ]] && return 1
  if [[ -z ${2} ]]; then
    body="Service was restarted on $(date), please check your mailcow installation."
  else
    body="$(date) - ${2}"
  fi
  # If set, mail will be throttled by argument in seconds
  if [[ -n ${throttle} ]]; then
    ttl_left="$(${REDIS_CMDLINE} TTL "THROTTLE_${1}" 2> /dev/null)"
    if [[ "${ttl_left}" == "-2" ]]; then
      # Delay key not found, setting a delay key now
      ${REDIS_CMDLINE} SET "THROTTLE_${1}" 1 EX "${throttle}"
    else
      log_msg "Not sending notification email now, blocked for ${ttl_left} seconds..."
      return 1
    fi
  fi
  # Drop the first and a trailing double quote from the configured list
  WATCHDOG_NOTIFY_EMAIL=${WATCHDOG_NOTIFY_EMAIL/\"/}
  WATCHDOG_NOTIFY_EMAIL=${WATCHDOG_NOTIFY_EMAIL%\"}
  # Some exceptions for subject and body formats
  if [[ ${1} == "fail2ban" ]]; then
    subject="${body}"
    body="Please see netfilter-mailcow for more details and triggered rules."
  else
    subject="Watchdog ALERT: ${1}"
  fi
  # A prepared report file takes precedence over any body text
  [ -f "/tmp/${1}" ] && body="/tmp/${1}"
  IFS=',' read -r -a mail_rcpts <<< "${WATCHDOG_NOTIFY_EMAIL}"
  for rcpt in "${mail_rcpts[@]}"; do
    # Tolerate lists written as "a@x, b@y": strip surrounding whitespace
    rcpt="${rcpt#"${rcpt%%[![:space:]]*}"}"
    rcpt="${rcpt%"${rcpt##*[![:space:]]}"}"
    [[ -z ${rcpt} ]] && continue
    # Latest smtp-cli looks up mx via dns, so no --server is passed
    if timeout 10s ./smtp-cli --missing-modules-ok \
      --charset=UTF-8 \
      --subject="${subject}" \
      --body-plain="${body}" \
      --add-header="X-Priority: 1" \
      --to="${rcpt}" \
      --from="watchdog@${MAILCOW_HOSTNAME}" \
      --hello-host="${MAILCOW_HOSTNAME}" \
      --ipv4; then
      log_msg "Sent notification email to ${rcpt}"
    else
      log_msg "Could not send notification email to ${rcpt}"
    fi
  done
  return 0
}
139
# Resolve the IPv4 address of a container, trying up to 5 times.
# With IP_BY_DOCKER_API equal to 0 the name is resolved with dig; otherwise the
# dockerapi endpoint is asked for containers whose compose service label
# equals $1 and an address containing IPV4_NETWORK is picked.
# Globals:   IP_BY_DOCKER_API, IPV4_NETWORK, COMPOSE_PROJECT_NAME (read)
# Arguments: $1 - container (compose service) name
# Outputs:   the address on stdout, or 240.0.0.0 when none could be found
get_container_ip() {
  # ${1} is container
  local -a container_id=() container_ips=()
  local container_ip= loop_c=1
  local matched_container ip_match
  local ipv4_re='^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$'
  until [[ ${container_ip} =~ ${ipv4_re} ]] || [[ ${loop_c} -gt 5 ]]; do
    if [ "${IP_BY_DOCKER_API}" -eq 0 ]; then
      container_ip=$(dig a "${1}" +short)
    else
      sleep 0.5
      # get long container id for exact match
      container_id=($(curl --silent --insecure https://dockerapi/containers/json | jq -r ".[] | {name: .Config.Labels[\"com.docker.compose.service\"], project: .Config.Labels[\"com.docker.compose.project\"], id: .Id}" | jq -rc "select( .name | tostring == \"${1}\") | select( .project | tostring | contains(\"${COMPOSE_PROJECT_NAME,,}\")) | .id"))
      # returned id can have multiple elements (if scaled), shuffle for random test
      container_id=($(printf "%s\n" "${container_id[@]}" | shuf))
      if [[ -n ${container_id[0]} ]]; then
        for matched_container in "${container_id[@]}"; do
          container_ips=($(curl --silent --insecure "https://dockerapi/containers/${matched_container}/json" | jq -r '.NetworkSettings.Networks[].IPAddress'))
          for ip_match in "${container_ips[@]}"; do
            # nothing to compare if one of these vars is empty
            [[ -z ${ip_match} ]] && continue
            [[ -z ${IPV4_NETWORK} ]] && continue
            # only return ips that are part of our network
            # (literal substring match; dots are not treated as wildcards)
            if [[ ${ip_match} == *"${IPV4_NETWORK}"* ]]; then
              container_ip=${ip_match}
              break
            fi
          done
          [[ -n ${container_ip} ]] && break
        done
      fi
    fi
    loop_c=$((loop_c + 1))
  done
  # Decide by the result, not by the attempt counter: an address found on the
  # fifth attempt is valid even though the counter has already passed 5.
  if [[ ${container_ip} =~ ${ipv4_re} ]]; then
    echo "${container_ip}"
  else
    echo 240.0.0.0
  fi
}
178
# One-time check
# If the first three groups of IPV6_NETWORK show up in the local interface
# list, an IPv6 address must be detectable; otherwise a notification is sent.
# An empty prefix is skipped: as a grep pattern it would match everything.
IPV6_PREFIX_CHECK=$(printf '%s\n' "${IPV6_NETWORK}" | cut -d: -f1-3)
if [[ -n "${IPV6_PREFIX_CHECK}" ]] && grep -qi -- "${IPV6_PREFIX_CHECK}" <<< "$(ip a s)"; then
  if [[ -z "$(get_ipv6)" ]]; then
    mail_error "ipv6-config" "enable_ipv6 is true in docker-compose.yml, but an IPv6 link could not be established. Please verify your IPv6 connection."
  fi
fi
185
# Watchdog loop for the external checks service: posts the installation GUID
# to https://checks.mailcow.email over IPv4 and IPv6 and counts "critical"
# responses. The reported details are stored in /tmp/external_checks.
# Returns 1 once err_count reaches EXTERNAL_CHECKS_THRESHOLD.
external_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${EXTERNAL_CHECKS_THRESHOLD}
  GUID=$(mysql -u"${DBUSER}" -p"${DBPASS}" "${DBNAME}" -e "SELECT version FROM versions WHERE application = 'GUID'" -BN)
  # Reduce error count by 2 after restarting an unhealthy container.
  # Single-quoted so err_count is evaluated when USR1 arrives, not when the
  # trap is installed (where it is always 0 and the handler could never fire).
  trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
  while [ "${err_count}" -lt "${THRESHOLD}" ]; do
    err_c_cur=${err_count}
    CHECK_REPONSE="$(curl --connect-timeout 3 -m 10 -4 -s https://checks.mailcow.email -X POST -dguid="${GUID}" 2> /dev/null)"
    if [[ -n "${CHECK_REPONSE}" ]] && [[ "$(printf '%s\n' "${CHECK_REPONSE}" | jq -r .response)" == "critical" ]]; then
      printf '%s\n' "${CHECK_REPONSE}" | jq -r .out > /tmp/external_checks
      err_count=$(( err_count + 1 ))
    fi
    CHECK_REPONSE6="$(curl --connect-timeout 3 -m 10 -6 -s https://checks.mailcow.email -X POST -dguid="${GUID}" 2> /dev/null)"
    if [[ -n "${CHECK_REPONSE6}" ]] && [[ "$(printf '%s\n' "${CHECK_REPONSE6}" | jq -r .response)" == "critical" ]]; then
      # Report the IPv6 result itself (the IPv4 response was written here before)
      printf '%s\n' "${CHECK_REPONSE6}" | jq -r .out > /tmp/external_checks
      err_count=$(( err_count + 1 ))
    fi
    if [ "${err_c_cur}" -eq "${err_count}" ]; then
      # Clean round: recover one point, but never drop below zero
      if [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
    else
      diff_c=$(( err_c_cur - err_count ))
    fi
    progress "External checks" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 60
    else
      diff_c=0
      sleep $(( ( RANDOM % 20 ) + 120 ))
    fi
  done
  return 1
}
218
# Watchdog loop for nginx-mailcow: probes HTTP on port 8081 with check_http
# and appends the plugin output to /tmp/nginx-mailcow (trimmed to 50 lines at
# the start of every round). A failed probe adds the plugin exit code to
# err_count, a clean round removes one point.
# Returns 1 once err_count reaches NGINX_THRESHOLD.
nginx_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${NGINX_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container.
  # Single-quoted so err_count is evaluated when USR1 arrives, not when the
  # trap is installed (where it is always 0 and the handler could never fire).
  trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
  while [ "${err_count}" -lt "${THRESHOLD}" ]; do
    touch /tmp/nginx-mailcow; echo "$(tail -50 /tmp/nginx-mailcow)" > /tmp/nginx-mailcow
    host_ip=$(get_container_ip nginx-mailcow)
    err_c_cur=${err_count}
    /usr/lib/nagios/plugins/check_http -4 -H "${host_ip}" -u / -p 8081 >> /tmp/nginx-mailcow 2>&1; err_count=$(( err_count + $? ))
    if [ "${err_c_cur}" -eq "${err_count}" ]; then
      # Clean round: recover one point, but never drop below zero
      if [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
    else
      diff_c=$(( err_c_cur - err_count ))
    fi
    progress "Nginx" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
243
# Watchdog loop for unbound-mailcow: resolves stackoverflow.com through the
# container with check_dns and verifies that a DNSSEC lookup of "com" carries
# the "ad" flag. Output goes to /tmp/unbound-mailcow (trimmed to 50 lines at
# the start of every round).
# Returns 1 once err_count reaches UNBOUND_THRESHOLD.
unbound_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${UNBOUND_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container.
  # Single-quoted so err_count is evaluated when USR1 arrives, not when the
  # trap is installed (where it is always 0 and the handler could never fire).
  trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
  while [ "${err_count}" -lt "${THRESHOLD}" ]; do
    touch /tmp/unbound-mailcow; echo "$(tail -50 /tmp/unbound-mailcow)" > /tmp/unbound-mailcow
    host_ip=$(get_container_ip unbound-mailcow)
    err_c_cur=${err_count}
    /usr/lib/nagios/plugins/check_dns -s "${host_ip}" -H stackoverflow.com >> /tmp/unbound-mailcow 2>&1; err_count=$(( err_count + $? ))
    DNSSEC=$(dig com +dnssec | grep -E 'flags:.+ad')
    if [[ -z ${DNSSEC} ]]; then
      echo "DNSSEC failure" >> /tmp/unbound-mailcow
      err_count=$(( err_count + 1 ))
    else
      echo "DNSSEC check succeeded" >> /tmp/unbound-mailcow
    fi
    if [ "${err_c_cur}" -eq "${err_count}" ]; then
      # Clean round: recover one point, but never drop below zero
      if [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
    else
      diff_c=$(( err_c_cur - err_count ))
    fi
    progress "Unbound" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
275
# Watchdog loop for the local redis container: sends PING to redis-mailcow on
# port 6379 with check_tcp and expects PONG. Output goes to /tmp/redis-mailcow
# (trimmed to 50 lines at the start of every round).
# Returns 1 once err_count reaches REDIS_THRESHOLD.
redis_checks() {
  # A check for the local redis container
  err_count=0
  diff_c=0
  THRESHOLD=${REDIS_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container.
  # Single-quoted so err_count is evaluated when USR1 arrives, not when the
  # trap is installed (where it is always 0 and the handler could never fire).
  trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
  while [ "${err_count}" -lt "${THRESHOLD}" ]; do
    touch /tmp/redis-mailcow; echo "$(tail -50 /tmp/redis-mailcow)" > /tmp/redis-mailcow
    err_c_cur=${err_count}
    # The probe addresses the container by name, so no IP lookup is needed
    /usr/lib/nagios/plugins/check_tcp -4 -H redis-mailcow -p 6379 -E -s "PING\n" -q "QUIT" -e "PONG" >> /tmp/redis-mailcow 2>&1; err_count=$(( err_count + $? ))
    if [ "${err_c_cur}" -eq "${err_count}" ]; then
      # Clean round: recover one point, but never drop below zero
      if [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
    else
      diff_c=$(( err_c_cur - err_count ))
    fi
    progress "Redis" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
301
# Watchdog loop for MySQL/MariaDB: connects through the local socket with
# check_mysql and runs a simple query with check_mysql_query. Output goes to
# /tmp/mysql-mailcow (trimmed to 50 lines at the start of every round).
# Returns 1 once err_count reaches MYSQL_THRESHOLD.
mysql_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${MYSQL_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container.
  # Single-quoted so err_count is evaluated when USR1 arrives, not when the
  # trap is installed (where it is always 0 and the handler could never fire).
  trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
  while [ "${err_count}" -lt "${THRESHOLD}" ]; do
    touch /tmp/mysql-mailcow; echo "$(tail -50 /tmp/mysql-mailcow)" > /tmp/mysql-mailcow
    err_c_cur=${err_count}
    /usr/lib/nagios/plugins/check_mysql -s /var/run/mysqld/mysqld.sock -u "${DBUSER}" -p "${DBPASS}" -d "${DBNAME}" >> /tmp/mysql-mailcow 2>&1; err_count=$(( err_count + $? ))
    /usr/lib/nagios/plugins/check_mysql_query -s /var/run/mysqld/mysqld.sock -u "${DBUSER}" -p "${DBPASS}" -d "${DBNAME}" -q "SELECT COUNT(*) FROM information_schema.tables" >> /tmp/mysql-mailcow 2>&1; err_count=$(( err_count + $? ))
    if [ "${err_c_cur}" -eq "${err_count}" ]; then
      # Clean round: recover one point, but never drop below zero
      if [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
    else
      diff_c=$(( err_c_cur - err_count ))
    fi
    progress "MySQL/MariaDB" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
326
# Watchdog loop for MySQL/MariaDB replication: runs check_mysql_slavestatus.sh
# as root through the local socket. Output goes to /tmp/mysql_repl_checks
# (trimmed to 50 lines at the start of every round).
# Returns 1 once err_count reaches MYSQL_REPLICATION_THRESHOLD.
mysql_repl_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${MYSQL_REPLICATION_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container.
  # Single-quoted so err_count is evaluated when USR1 arrives, not when the
  # trap is installed (where it is always 0 and the handler could never fire).
  trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
  while [ "${err_count}" -lt "${THRESHOLD}" ]; do
    touch /tmp/mysql_repl_checks; echo "$(tail -50 /tmp/mysql_repl_checks)" > /tmp/mysql_repl_checks
    err_c_cur=${err_count}
    /usr/lib/nagios/plugins/check_mysql_slavestatus.sh -S /var/run/mysqld/mysqld.sock -u root -p "${DBROOT}" >> /tmp/mysql_repl_checks 2>&1; err_count=$(( err_count + $? ))
    if [ "${err_c_cur}" -eq "${err_count}" ]; then
      # Clean round: recover one point, but never drop below zero
      if [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
    else
      diff_c=$(( err_c_cur - err_count ))
    fi
    progress "MySQL/MariaDB replication" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 60
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
350
# Watchdog loop for sogo-mailcow: requests /SOGo.index/ on port 20000 with
# check_http and expects "SOGo.MainUI" in the response. Output goes to
# /tmp/sogo-mailcow (trimmed to 50 lines at the start of every round).
# Returns 1 once err_count reaches SOGO_THRESHOLD.
sogo_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${SOGO_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container.
  # Single-quoted so err_count is evaluated when USR1 arrives, not when the
  # trap is installed (where it is always 0 and the handler could never fire).
  trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
  while [ "${err_count}" -lt "${THRESHOLD}" ]; do
    touch /tmp/sogo-mailcow; echo "$(tail -50 /tmp/sogo-mailcow)" > /tmp/sogo-mailcow
    host_ip=$(get_container_ip sogo-mailcow)
    err_c_cur=${err_count}
    /usr/lib/nagios/plugins/check_http -4 -H "${host_ip}" -u /SOGo.index/ -p 20000 -R "SOGo\.MainUI" >> /tmp/sogo-mailcow 2>&1; err_count=$(( err_count + $? ))
    if [ "${err_c_cur}" -eq "${err_count}" ]; then
      # Clean round: recover one point, but never drop below zero
      if [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
    else
      diff_c=$(( err_c_cur - err_count ))
    fi
    progress "SOGo" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
375
# Watchdog loop for postfix-mailcow: runs an SMTP dialogue and a STARTTLS probe
# against port 589 with check_smtp. Output goes to /tmp/postfix-mailcow
# (trimmed to 50 lines at the start of every round).
# Returns 1 once err_count reaches POSTFIX_THRESHOLD.
postfix_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${POSTFIX_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container.
  # Single-quoted so err_count is evaluated when USR1 arrives, not when the
  # trap is installed (where it is always 0 and the handler could never fire).
  trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
  while [ "${err_count}" -lt "${THRESHOLD}" ]; do
    touch /tmp/postfix-mailcow; echo "$(tail -50 /tmp/postfix-mailcow)" > /tmp/postfix-mailcow
    host_ip=$(get_container_ip postfix-mailcow)
    err_c_cur=${err_count}
    /usr/lib/nagios/plugins/check_smtp -4 -H "${host_ip}" -p 589 -f "watchdog@invalid" -C "RCPT TO:watchdog@localhost" -C DATA -C . -R 250 >> /tmp/postfix-mailcow 2>&1; err_count=$(( err_count + $? ))
    /usr/lib/nagios/plugins/check_smtp -4 -H "${host_ip}" -p 589 -S >> /tmp/postfix-mailcow 2>&1; err_count=$(( err_count + $? ))
    if [ "${err_c_cur}" -eq "${err_count}" ]; then
      # Clean round: recover one point, but never drop below zero
      if [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
    else
      diff_c=$(( err_c_cur - err_count ))
    fi
    progress "Postfix" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
401
# Watchdog loop for clamd-mailcow: probes the daemon with check_clamd. Output
# goes to /tmp/clamd-mailcow (trimmed to 50 lines at the start of every round).
# Returns 1 once err_count reaches CLAMD_THRESHOLD.
clamd_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${CLAMD_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container.
  # Single-quoted so err_count is evaluated when USR1 arrives, not when the
  # trap is installed (where it is always 0 and the handler could never fire).
  trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
  while [ "${err_count}" -lt "${THRESHOLD}" ]; do
    touch /tmp/clamd-mailcow; echo "$(tail -50 /tmp/clamd-mailcow)" > /tmp/clamd-mailcow
    host_ip=$(get_container_ip clamd-mailcow)
    err_c_cur=${err_count}
    /usr/lib/nagios/plugins/check_clamd -4 -H "${host_ip}" >> /tmp/clamd-mailcow 2>&1; err_count=$(( err_count + $? ))
    if [ "${err_c_cur}" -eq "${err_count}" ]; then
      # Clean round: recover one point, but never drop below zero
      if [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
    else
      diff_c=$(( err_c_cur - err_count ))
    fi
    progress "Clamd" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 120 ) + 20 ))
    fi
  done
  return 1
}
426
# Watchdog loop for dovecot-mailcow: probes port 24 with check_smtp, ports 993
# and 143 with check_imap, and ports 10001 and 4190 with check_tcp. Output goes
# to /tmp/dovecot-mailcow (trimmed to 50 lines at the start of every round).
# Returns 1 once err_count reaches DOVECOT_THRESHOLD.
dovecot_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${DOVECOT_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container.
  # Single-quoted so err_count is evaluated when USR1 arrives, not when the
  # trap is installed (where it is always 0 and the handler could never fire).
  trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
  while [ "${err_count}" -lt "${THRESHOLD}" ]; do
    touch /tmp/dovecot-mailcow; echo "$(tail -50 /tmp/dovecot-mailcow)" > /tmp/dovecot-mailcow
    host_ip=$(get_container_ip dovecot-mailcow)
    err_c_cur=${err_count}
    /usr/lib/nagios/plugins/check_smtp -4 -H "${host_ip}" -p 24 -f "watchdog@invalid" -C "RCPT TO:<watchdog@invalid>" -L -R "User doesn't exist" >> /tmp/dovecot-mailcow 2>&1; err_count=$(( err_count + $? ))
    /usr/lib/nagios/plugins/check_imap -4 -H "${host_ip}" -p 993 -S -e "OK " >> /tmp/dovecot-mailcow 2>&1; err_count=$(( err_count + $? ))
    /usr/lib/nagios/plugins/check_imap -4 -H "${host_ip}" -p 143 -e "OK " >> /tmp/dovecot-mailcow 2>&1; err_count=$(( err_count + $? ))
    /usr/lib/nagios/plugins/check_tcp -4 -H "${host_ip}" -p 10001 -e "VERSION" >> /tmp/dovecot-mailcow 2>&1; err_count=$(( err_count + $? ))
    /usr/lib/nagios/plugins/check_tcp -4 -H "${host_ip}" -p 4190 -e "Dovecot ready" >> /tmp/dovecot-mailcow 2>&1; err_count=$(( err_count + $? ))
    if [ "${err_c_cur}" -eq "${err_count}" ]; then
      # Clean round: recover one point, but never drop below zero
      if [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
    else
      diff_c=$(( err_c_cur - err_count ))
    fi
    progress "Dovecot" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
455
# Watchdog loop for Dovecot replication: reads the Redis key
# DOVECOT_REPL_HEALTH each round and counts an error unless it equals "1".
# Returns 1 once err_count reaches DOVECOT_REPL_THRESHOLD.
dovecot_repl_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${DOVECOT_REPL_THRESHOLD}
  # The status is read at the top of every round; the former extra read
  # before the loop was never used (and passed "-r", the repeat option).
  # Reduce error count by 2 after restarting an unhealthy container.
  # Single-quoted so err_count is evaluated when USR1 arrives, not when the
  # trap is installed (where it is always 0 and the handler could never fire).
  trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
  while [ "${err_count}" -lt "${THRESHOLD}" ]; do
    err_c_cur=${err_count}
    D_REPL_STATUS=$(redis-cli --raw -h redis GET DOVECOT_REPL_HEALTH)
    if [[ "${D_REPL_STATUS}" != "1" ]]; then
      err_count=$(( err_count + 1 ))
    fi
    if [ "${err_c_cur}" -eq "${err_count}" ]; then
      # Clean round: recover one point, but never drop below zero
      if [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
    else
      diff_c=$(( err_c_cur - err_count ))
    fi
    progress "Dovecot replication" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 60
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
482
# Watchdog loop for the primary certificate: check_smtp (port 589) and
# check_imap (port 993) are run with "-D 7" against Postfix and Dovecot.
# Output goes to /tmp/certcheck (trimmed to 50 lines at the start of every
# round). The threshold is fixed at 7.
# Returns 1 once err_count reaches the threshold.
cert_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=7
  # Reduce error count by 2 after restarting an unhealthy container.
  # Single-quoted so err_count is evaluated when USR1 arrives, not when the
  # trap is installed (where it is always 0 and the handler could never fire).
  trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
  while [ "${err_count}" -lt "${THRESHOLD}" ]; do
    touch /tmp/certcheck; echo "$(tail -50 /tmp/certcheck)" > /tmp/certcheck
    # Same service names as postfix_checks/dovecot_checks: the Docker API
    # lookup in get_container_ip compares the service label for equality.
    host_ip_postfix=$(get_container_ip postfix-mailcow)
    host_ip_dovecot=$(get_container_ip dovecot-mailcow)
    err_c_cur=${err_count}
    /usr/lib/nagios/plugins/check_smtp -H "${host_ip_postfix}" -p 589 -4 -S -D 7 >> /tmp/certcheck 2>&1; err_count=$(( err_count + $? ))
    /usr/lib/nagios/plugins/check_imap -H "${host_ip_dovecot}" -p 993 -4 -S -D 7 >> /tmp/certcheck 2>&1; err_count=$(( err_count + $? ))
    if [ "${err_c_cur}" -eq "${err_count}" ]; then
      # Clean round: recover one point, but never drop below zero
      if [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
    else
      diff_c=$(( err_c_cur - err_count ))
    fi
    progress "Primary certificate expiry check" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    # Always sleep 5 minutes, mail notifications are limited
    sleep 300
  done
  return 1
}
504
# Watchdog loop for php-fpm-mailcow: probes TCP ports 9001 and 9002 with
# check_tcp. Output goes to /tmp/php-fpm-mailcow (trimmed to 50 lines at the
# start of every round).
# Returns 1 once err_count reaches PHPFPM_THRESHOLD.
phpfpm_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${PHPFPM_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container.
  # Single-quoted so err_count is evaluated when USR1 arrives, not when the
  # trap is installed (where it is always 0 and the handler could never fire).
  trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
  while [ "${err_count}" -lt "${THRESHOLD}" ]; do
    touch /tmp/php-fpm-mailcow; echo "$(tail -50 /tmp/php-fpm-mailcow)" > /tmp/php-fpm-mailcow
    host_ip=$(get_container_ip php-fpm-mailcow)
    err_c_cur=${err_count}
    /usr/lib/nagios/plugins/check_tcp -H "${host_ip}" -p 9001 >> /tmp/php-fpm-mailcow 2>&1; err_count=$(( err_count + $? ))
    /usr/lib/nagios/plugins/check_tcp -H "${host_ip}" -p 9002 >> /tmp/php-fpm-mailcow 2>&1; err_count=$(( err_count + $? ))
    if [ "${err_c_cur}" -eq "${err_count}" ]; then
      # Clean round: recover one point, but never drop below zero
      if [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
    else
      diff_c=$(( err_c_cur - err_count ))
    fi
    progress "PHP-FPM" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
530
# Watchdog loop for applied ratelimits: compares the qid of the newest RL_LOG
# entry in Redis with the one seen in the previous round. A change counts as
# an error and a report of the last 10 entries is written to /tmp/ratelimit.
# Returns 1 once err_count reaches RATELIMIT_THRESHOLD.
ratelimit_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${RATELIMIT_THRESHOLD}
  RL_LOG_STATUS=$(redis-cli -h redis LRANGE RL_LOG 0 0 | jq .qid)
  # Reduce error count by 2 after restarting an unhealthy container.
  # Single-quoted so err_count is evaluated when USR1 arrives, not when the
  # trap is installed (where it is always 0 and the handler could never fire).
  trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
  while [ "${err_count}" -lt "${THRESHOLD}" ]; do
    err_c_cur=${err_count}
    RL_LOG_STATUS_PREV=${RL_LOG_STATUS}
    RL_LOG_STATUS=$(redis-cli -h redis LRANGE RL_LOG 0 0 | jq .qid)
    # Right-hand side quoted: compare as a string, not as a glob pattern
    if [[ "${RL_LOG_STATUS_PREV}" != "${RL_LOG_STATUS}" ]]; then
      err_count=$(( err_count + 1 ))
      echo 'Last 10 applied ratelimits (may overlap with previous reports).' > /tmp/ratelimit
      echo 'Full ratelimit buckets can be emptied by deleting the ratelimit hash from within mailcow UI (see /debug -> Protocols -> Ratelimit):' >> /tmp/ratelimit
      echo >> /tmp/ratelimit
      # LRANGE bounds are inclusive: 0..9 yields the 10 entries announced above
      redis-cli --raw -h redis LRANGE RL_LOG 0 9 | jq . >> /tmp/ratelimit
    fi
    if [ "${err_c_cur}" -eq "${err_count}" ]; then
      # Clean round: recover one point, but never drop below zero
      if [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
    else
      diff_c=$(( err_c_cur - err_count ))
    fi
    progress "Ratelimit" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
562
# Watchdog loop for the mail queue: counts the files below
# /var/spool/postfix/deferred and counts an error when the number reaches
# MAILQ_CRIT. A status line is appended to /tmp/mail_queue_status (trimmed to
# 50 lines at the start of every round).
# Returns 1 once err_count reaches MAILQ_THRESHOLD.
mailq_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${MAILQ_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container.
  # Single-quoted so err_count is evaluated when USR1 arrives, not when the
  # trap is installed (where it is always 0 and the handler could never fire).
  trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
  while [ "${err_count}" -lt "${THRESHOLD}" ]; do
    touch /tmp/mail_queue_status; echo "$(tail -50 /tmp/mail_queue_status)" > /tmp/mail_queue_status
    MAILQ_LOG_STATUS=$(find /var/spool/postfix/deferred -type f | wc -l)
    # Logged once per round (the critical branch used to repeat the same line)
    echo "Mail queue contains ${MAILQ_LOG_STATUS} items (critical limit is ${MAILQ_CRIT}) at $(date)" >> /tmp/mail_queue_status
    err_c_cur=${err_count}
    if [ "${MAILQ_LOG_STATUS}" -ge "${MAILQ_CRIT}" ]; then
      err_count=$(( err_count + 1 ))
    fi
    if [ "${err_c_cur}" -eq "${err_count}" ]; then
      # Clean round: recover one point, but never drop below zero
      if [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
    else
      diff_c=$(( err_c_cur - err_count ))
    fi
    progress "Mail queue" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 60
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
591
# Watchdog loop for new fail2ban bans: compares the keys of the Redis hash
# F2B_ACTIVE_BANS with those of the previous round. Newly added keys count as
# an error and are stored (filtered to address characters) in the Redis key
# F2B_RES.
# Returns 1 once err_count reaches FAIL2BAN_THRESHOLD.
fail2ban_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${FAIL2BAN_THRESHOLD}
  F2B_LOG_STATUS=($(${REDIS_CMDLINE} --raw HKEYS F2B_ACTIVE_BANS))
  F2B_RES=
  # Reduce error count by 2 after restarting an unhealthy container.
  # Single-quoted so err_count is evaluated when USR1 arrives, not when the
  # trap is installed (where it is always 0 and the handler could never fire).
  trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
  while [ "${err_count}" -lt "${THRESHOLD}" ]; do
    err_c_cur=${err_count}
    F2B_LOG_STATUS_PREV=("${F2B_LOG_STATUS[@]}")
    F2B_LOG_STATUS=($(${REDIS_CMDLINE} --raw HKEYS F2B_ACTIVE_BANS))
    array_diff F2B_RES F2B_LOG_STATUS F2B_LOG_STATUS_PREV
    if [[ -n "${F2B_RES[0]}" ]]; then
      err_count=$(( err_count + 1 ))
      echo -n "${F2B_RES[@]}" | tr -cd "[a-fA-F0-9.:/] " | timeout 3s ${REDIS_CMDLINE} -x SET F2B_RES > /dev/null
      if [ $? -ne 0 ]; then
        # Plain DEL: "-x" would take an extra argument from stdin
        ${REDIS_CMDLINE} DEL F2B_RES > /dev/null
      fi
    fi
    if [ "${err_c_cur}" -eq "${err_count}" ]; then
      # Clean round: recover one point, but never drop below zero
      if [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
    else
      diff_c=$(( err_c_cur - err_count ))
    fi
    progress "Fail2ban" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
625
# Watchdog loop for ACME failures: watches the Redis key ACME_FAIL_TIME and
# counts an error whenever its value differs from the previous round.
# The key is initialised to 0 when it is empty at start.
# Returns 1 once err_count reaches ACME_THRESHOLD.
acme_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${ACME_THRESHOLD}
  ACME_LOG_STATUS=$(redis-cli -h redis GET ACME_FAIL_TIME)
  if [[ -z "${ACME_LOG_STATUS}" ]]; then
    ${REDIS_CMDLINE} SET ACME_FAIL_TIME 0
    ACME_LOG_STATUS=0
  fi
  # Reduce error count by 2 after restarting an unhealthy container.
  # Single-quoted so err_count is evaluated when USR1 arrives, not when the
  # trap is installed (where it is always 0 and the handler could never fire).
  trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
  while [ "${err_count}" -lt "${THRESHOLD}" ]; do
    err_c_cur=${err_count}
    ACME_LOG_STATUS_PREV=${ACME_LOG_STATUS}
    # Clear the value first: otherwise the retry loop below is skipped (the
    # status is never empty here) and the key would never be read again.
    ACME_LOG_STATUS=
    ACME_LC=0
    until [[ -n ${ACME_LOG_STATUS} ]] || [ "${ACME_LC}" -ge 3 ]; do
      # Wait only before a retry
      [ "${ACME_LC}" -gt 0 ] && sleep 3
      ACME_LOG_STATUS=$(redis-cli -h redis GET ACME_FAIL_TIME 2> /dev/null)
      ACME_LC=$((ACME_LC+1))
    done
    # No answer after all retries: keep the previous value instead of
    # reporting a change that did not happen
    [[ -z ${ACME_LOG_STATUS} ]] && ACME_LOG_STATUS=${ACME_LOG_STATUS_PREV}
    if [[ "${ACME_LOG_STATUS_PREV}" != "${ACME_LOG_STATUS}" ]]; then
      err_count=$(( err_count + 1 ))
    fi
    if [ "${err_c_cur}" -eq "${err_count}" ]; then
      # Clean round: recover one point, but never drop below zero
      if [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
    else
      diff_c=$(( err_c_cur - err_count ))
    fi
    progress "ACME" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
662
# Watchdog loop for ipv6nat-mailcow: using the dockerapi container list, the
# latest start time of the ipv6nat container is compared with the latest start
# time of the other containers of this compose project. An error is counted
# when ipv6nat started less than 30 seconds after them (or the difference
# cannot be computed).
# Returns 1 once err_count reaches IPV6NAT_THRESHOLD.
ipv6nat_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${IPV6NAT_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container.
  # Single-quoted so err_count is evaluated when USR1 arrives, not when the
  # trap is installed (where it is always 0 and the handler could never fire).
  trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
  while [ "${err_count}" -lt "${THRESHOLD}" ]; do
    err_c_cur=${err_count}
    CONTAINERS=$(curl --silent --insecure https://dockerapi/containers/json)
    # The JSON is passed on quoted so it is not word-split or glob-expanded
    IPV6NAT_CONTAINER_ID=$(printf '%s\n' "${CONTAINERS}" | jq -r ".[] | {name: .Config.Labels[\"com.docker.compose.service\"], project: .Config.Labels[\"com.docker.compose.project\"], id: .Id}" | jq -rc "select( .name | tostring | contains(\"ipv6nat-mailcow\")) | select( .project | tostring | contains(\"${COMPOSE_PROJECT_NAME,,}\")) | .id")
    if [[ -n ${IPV6NAT_CONTAINER_ID} ]]; then
      LATEST_STARTED="$(printf '%s\n' "${CONTAINERS}" | jq -r ".[] | {name: .Config.Labels[\"com.docker.compose.service\"], project: .Config.Labels[\"com.docker.compose.project\"], StartedAt: .State.StartedAt}" | jq -rc "select( .project | tostring | contains(\"${COMPOSE_PROJECT_NAME,,}\")) | select( .name | tostring | contains(\"ipv6nat-mailcow\") | not)" | jq -rc .StartedAt | xargs -n1 date +%s -d | sort -n | tail -n1)"
      LATEST_IPV6NAT="$(printf '%s\n' "${CONTAINERS}" | jq -r ".[] | {name: .Config.Labels[\"com.docker.compose.service\"], project: .Config.Labels[\"com.docker.compose.project\"], StartedAt: .State.StartedAt}" | jq -rc "select( .project | tostring | contains(\"${COMPOSE_PROJECT_NAME,,}\")) | select( .name | tostring | contains(\"ipv6nat-mailcow\"))" | jq -rc .StartedAt | xargs -n1 date +%s -d | sort -n | tail -n1)"
      # Stays empty (which compares as 0 below) when a timestamp is missing
      DIFFERENCE_START_TIME=
      if [[ -n ${LATEST_IPV6NAT} && -n ${LATEST_STARTED} ]]; then
        DIFFERENCE_START_TIME=$(( LATEST_IPV6NAT - LATEST_STARTED ))
      fi
      if [[ "${DIFFERENCE_START_TIME}" -lt 30 ]]; then
        err_count=$(( err_count + 1 ))
      fi
    fi
    if [ "${err_c_cur}" -eq "${err_count}" ]; then
      # Clean round: recover one point, but never drop below zero
      if [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
    else
      diff_c=$(( err_c_cur - err_count ))
    fi
    progress "IPv6 NAT" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 30
    else
      diff_c=0
      sleep 300
    fi
  done
  return 1
}
694
695
# Watchdog loop for rspamd-mailcow: submits a minimal message to the scan
# endpoint on the rspamd unix socket and expects a required score of 9999,
# then probes port 9900 and treats a curl timeout (exit code 28) as a failure.
# Output goes to /tmp/rspamd-mailcow (trimmed to 50 lines at the start of
# every round).
# Returns 1 once err_count reaches RSPAMD_THRESHOLD.
rspamd_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${RSPAMD_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container.
  # Single-quoted so err_count is evaluated when USR1 arrives, not when the
  # trap is installed (where it is always 0 and the handler could never fire).
  trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
  while [ "${err_count}" -lt "${THRESHOLD}" ]; do
    touch /tmp/rspamd-mailcow; echo "$(tail -50 /tmp/rspamd-mailcow)" > /tmp/rspamd-mailcow
    host_ip=$(get_container_ip rspamd-mailcow)
    err_c_cur=${err_count}
    # Absolute path: the former relative "usr/bin/curl" only resolved when the
    # working directory happened to be /
    SCORE=$(printf 'To: null@localhost\nFrom: watchdog@localhost\n\nEmpty\n\n' | /usr/bin/curl --max-time 10 -s --data-binary @- --unix-socket /var/lib/rspamd/rspamd.sock http://rspamd/scan | jq -rc .default.required_score)
    if [[ ${SCORE} != "9999" ]]; then
      echo "Rspamd settings check failed, score returned: ${SCORE}" >> /tmp/rspamd-mailcow
      err_count=$(( err_count + 1 ))
    else
      echo "Rspamd settings check succeeded, score returned: ${SCORE}" >> /tmp/rspamd-mailcow
    fi
    # A dirty hack until a PING PONG event is implemented to worker proxy
    # We expect an empty response, not a timeout
    if [ "$(curl -s --max-time 10 "${host_ip}:9900" 2> /dev/null ; echo $?)" == "28" ]; then
      echo "Milter check failed" >> /tmp/rspamd-mailcow; err_count=$(( err_count + 1 ));
    else
      echo "Milter check succeeded" >> /tmp/rspamd-mailcow
    fi
    if [ "${err_c_cur}" -eq "${err_count}" ]; then
      # Clean round: recover one point, but never drop below zero
      if [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
    else
      diff_c=$(( err_c_cur - err_count ))
    fi
    progress "Rspamd" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
737
# Watchdog loop for olefy-mailcow: sends PING to port 10055 with check_tcp.
# Output goes to /tmp/olefy-mailcow (trimmed to 50 lines at the start of every
# round).
# Returns 1 once err_count reaches OLEFY_THRESHOLD.
olefy_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${OLEFY_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container.
  # Single-quoted so err_count is evaluated when USR1 arrives, not when the
  # trap is installed (where it is always 0 and the handler could never fire).
  trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
  while [ "${err_count}" -lt "${THRESHOLD}" ]; do
    touch /tmp/olefy-mailcow; echo "$(tail -50 /tmp/olefy-mailcow)" > /tmp/olefy-mailcow
    host_ip=$(get_container_ip olefy-mailcow)
    err_c_cur=${err_count}
    /usr/lib/nagios/plugins/check_tcp -4 -H "${host_ip}" -p 10055 -s "PING\n" >> /tmp/olefy-mailcow 2>&1; err_count=$(( err_count + $? ))
    if [ "${err_c_cur}" -eq "${err_count}" ]; then
      # Clean round: recover one point, but never drop below zero
      if [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
    else
      diff_c=$(( err_c_cur - err_count ))
    fi
    progress "Olefy" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}
762
# Announce the watchdog start by mail, but only when a notification
# address is configured in WATCHDOG_NOTIFY_EMAIL.
[[ -z ${WATCHDOG_NOTIFY_EMAIL} ]] || mail_error "watchdog-mailcow" "Watchdog started monitoring mailcow."
767
# Create watchdog agents
#
# Every agent is a background subshell that runs one *_checks function in an
# endless loop. Each time the function returns non-zero, the agent logs a
# message and writes a token to /tmp/com_pipe.

# Start one watchdog agent in the background and register its PID.
# Arguments: $1 - name of the check function to run in a loop
#            $2 - text handed to log_msg when the check function fails
#            $3 - token written to /tmp/com_pipe when the check function fails
# Globals:   PID (written), BACKGROUND_TASKS (appended)
# Outputs:   "Spawned <function> with PID <pid>" on stdout
spawn_watchdog_agent() {
  local agent_check_fn=${1}
  local agent_limit_msg=${2}
  local agent_pipe_token=${3}
  (
    while true; do
      if ! "${agent_check_fn}"; then
        log_msg "${agent_limit_msg}"
        echo "${agent_pipe_token}" > /tmp/com_pipe
      fi
    done
  ) &
  PID=$!
  echo "Spawned ${agent_check_fn} with PID ${PID}"
  BACKGROUND_TASKS+=("${PID}")
}

spawn_watchdog_agent nginx_checks "Nginx hit error limit" nginx-mailcow

if [[ ${WATCHDOG_EXTERNAL_CHECKS} =~ ^([yY][eE][sS]|[yY])+$ ]]; then
  spawn_watchdog_agent external_checks "External checks hit error limit" external_checks
fi

if [[ ${WATCHDOG_MYSQL_REPLICATION_CHECKS} =~ ^([yY][eE][sS]|[yY])+$ ]]; then
  spawn_watchdog_agent mysql_repl_checks "MySQL replication check hit error limit" mysql_repl_checks
fi

spawn_watchdog_agent mysql_checks "MySQL hit error limit" mysql-mailcow
spawn_watchdog_agent redis_checks "Local Redis hit error limit" redis-mailcow
spawn_watchdog_agent phpfpm_checks "PHP-FPM hit error limit" php-fpm-mailcow

if [[ "${SKIP_SOGO}" =~ ^([nN][oO]|[nN])+$ ]]; then
  spawn_watchdog_agent sogo_checks "SOGo hit error limit" sogo-mailcow
fi

# Quoted with a default of 0 so that an unset or empty CHECK_UNBOUND skips the
# agent quietly instead of tripping "[: -eq: unary operator expected".
if [ "${CHECK_UNBOUND:-0}" -eq 1 ]; then
  spawn_watchdog_agent unbound_checks "Unbound hit error limit" unbound-mailcow
fi

if [[ "${SKIP_CLAMD}" =~ ^([nN][oO]|[nN])+$ ]]; then
  spawn_watchdog_agent clamd_checks "Clamd hit error limit" clamd-mailcow
fi

spawn_watchdog_agent postfix_checks "Postfix hit error limit" postfix-mailcow
spawn_watchdog_agent mailq_checks "Mail queue hit error limit" mail_queue_status
spawn_watchdog_agent dovecot_checks "Dovecot hit error limit" dovecot-mailcow
# NOTE(review): this log text is identical to the one of dovecot_checks above;
# kept unchanged here, only the pipe token tells the two agents apart.
spawn_watchdog_agent dovecot_repl_checks "Dovecot hit error limit" dovecot_repl_checks
spawn_watchdog_agent rspamd_checks "Rspamd hit error limit" rspamd-mailcow
spawn_watchdog_agent ratelimit_checks "Ratelimit hit error limit" ratelimit
spawn_watchdog_agent fail2ban_checks "Fail2ban hit error limit" fail2ban
spawn_watchdog_agent cert_checks "Cert check hit error limit" certcheck
spawn_watchdog_agent olefy_checks "Olefy hit error limit" olefy-mailcow
spawn_watchdog_agent acme_checks "ACME client hit error limit" acme-mailcow
spawn_watchdog_agent ipv6nat_checks "IPv6 NAT warning: ipv6nat-mailcow container was not started at least 30s after siblings (not an error)" ipv6nat-mailcow
1019
# Supervise the watchdog agents: as soon as one of them no longer exists, send
# TERM to PID 1 to stop the script and wait for the respawn by Docker
# (restart:always:n). One agent PID is probed every 10 seconds.
(
  while :; do
    for worker_pid in "${BACKGROUND_TASKS[@]}"; do
      # kill -0 delivers no signal, it only tests whether the PID is alive
      kill -0 "${worker_pid}" 1>&2 || {
        log_msg "Worker ${worker_pid} died, stopping watchdog and waiting for respawn..."
        kill -TERM 1
      }
      sleep 10
    done
  done
) &
1032
# Watch the dockerapi endpoint (dockerapi:443, probed with nc every 3 seconds).
# While it is unreachable all agents are suspended with STOP; when it answers
# again they are resumed with CONT and then sent USR1.
(
  while :; do
    # Idle for as long as dockerapi accepts connections
    until ! nc -z dockerapi 443; do
      sleep 3
    done
    log_msg "Cannot find dockerapi-mailcow, waiting to recover..."
    kill -STOP "${BACKGROUND_TASKS[@]}"
    # Hold the agents until dockerapi is back
    while ! nc -z dockerapi 443; do
      sleep 3
    done
    kill -CONT "${BACKGROUND_TASKS[@]}"
    kill -USR1 "${BACKGROUND_TASKS[@]}"
  done
) &
1048
# Main loop: act on the tokens the agents write to /tmp/com_pipe once their
# error threshold is reached. Each round reads one token, prints the matching
# /tmp/<token> file when it is non-empty, and then dispatches on the token.
while :; do
  CONTAINER_ID=
  HAS_INITDB=
  read com_pipe_answer </tmp/com_pipe
  if [ -s "/tmp/${com_pipe_answer}" ]; then
    cat "/tmp/${com_pipe_answer}"
  fi
  case "${com_pipe_answer}" in
    ratelimit)
      log_msg "At least one ratelimit was applied"
      [[ -n ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}"
      ;;
    mail_queue_status)
      log_msg "Mail queue status is critical"
      [[ -n ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}"
      ;;
    external_checks)
      log_msg "Your mailcow is an open relay!"
      # A second argument overrides the default "service was restarted" mail text
      [[ -n ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}" "Please stop mailcow now and check your network configuration!"
      ;;
    mysql_repl_checks)
      log_msg "MySQL replication is not working properly"
      # Custom mail text, at most one mail per 10 minutes
      [[ -n ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}" "Please check the SQL replication status" 600
      ;;
    dovecot_repl_checks)
      log_msg "Dovecot replication is not working properly"
      # Custom mail text, at most one mail per 10 minutes
      [[ -n ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}" "Please check the Dovecot replicator status" 600
      ;;
    certcheck)
      log_msg "Certificates are about to expire"
      # Custom mail text, at most one mail per day
      [[ -n ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}" "Please renew your certificate" 86400
      ;;
    acme-mailcow)
      log_msg "acme-mailcow did not complete successfully"
      # Custom mail text; this token must be handled before the generic
      # *-mailcow arm below so that the container is not restarted
      [[ -n ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}" "Please check acme-mailcow for further information."
      ;;
    fail2ban)
      # Fetch the list stored under the Redis key F2B_RES, split into words
      F2B_RES=($(timeout 4s ${REDIS_CMDLINE} --raw GET F2B_RES 2> /dev/null))
      if [[ -n "${F2B_RES}" ]]; then
        ${REDIS_CMDLINE} DEL F2B_RES > /dev/null
        host=
        for host in "${F2B_RES[@]}"; do
          log_msg "Banned ${host}"
          # Refresh /tmp/fail2ban with the whois record of this host
          rm /tmp/fail2ban 2> /dev/null
          timeout 2s whois "${host}" > /tmp/fail2ban
          if [[ -n ${WATCHDOG_NOTIFY_EMAIL} ]] && [[ ${WATCHDOG_NOTIFY_BAN} =~ ^([yY][eE][sS]|[yY])+$ ]]; then
            mail_error "${com_pipe_answer}" "IP ban: ${host}"
          fi
        done
      fi
      ;;
    ?*-mailcow*)
      # Any other token containing "<something>-mailcow" names a container
      # that may have to be restarted. Pause all agents meanwhile.
      kill -STOP ${BACKGROUND_TASKS[*]}
      sleep 10
      # Look up the container ID by compose service and compose project label
      CONTAINER_ID=$(curl --silent --insecure https://dockerapi/containers/json |
        jq -r ".[] | {name: .Config.Labels[\"com.docker.compose.service\"], project: .Config.Labels[\"com.docker.compose.project\"], id: .Id}" |
        jq -rc "select( .name | tostring | contains(\"${com_pipe_answer}\")) | select( .project | tostring | contains(\"${COMPOSE_PROJECT_NAME,,}\")) | .id")
      if [[ -n ${CONTAINER_ID} ]]; then
        if [[ "${com_pipe_answer}" == "php-fpm-mailcow" ]]; then
          # Non-empty when the process list of the container shows init_db.inc.php
          HAS_INITDB=$(curl --silent --insecure -XPOST https://dockerapi/containers/${CONTAINER_ID}/top | jq '.msg.Processes[] | contains(["php -c /usr/local/etc/php -f /web/inc/init_db.inc.php"])' | grep true)
        fi
        # Seconds elapsed since the container's State.StartedAt
        S_RUNNING=$(($(date +%s) - $(curl --silent --insecure https://dockerapi/containers/${CONTAINER_ID}/json | jq .State.StartedAt | xargs -n1 date +%s -d)))
        if [ ${S_RUNNING} -lt 360 ]; then
          log_msg "Container is running for less than 360 seconds, skipping action..."
        elif [[ -n ${HAS_INITDB} ]]; then
          log_msg "Database is being initialized by php-fpm-mailcow, not restarting but delaying checks for a minute..."
          sleep 60
        else
          log_msg "Sending restart command to ${CONTAINER_ID}..."
          curl --silent --insecure -XPOST https://dockerapi/containers/${CONTAINER_ID}/restart
          # No mail is sent for restarts of ipv6nat-mailcow
          if [[ ${com_pipe_answer} != "ipv6nat-mailcow" && -n ${WATCHDOG_NOTIFY_EMAIL} ]]; then
            mail_error "${com_pipe_answer}"
          fi
          log_msg "Wait for restarted container to settle and continue watching..."
          sleep 35
        fi
      fi
      # Resume the agents, then send them USR1
      kill -CONT ${BACKGROUND_TASKS[*]}
      sleep 1
      kill -USR1 ${BACKGROUND_TASKS[*]}
      ;;
  esac
done
1126done