#!/bin/bash

# Leave via the EXIT trap on INT/TERM; the EXIT trap signals the whole
# process group so background check agents do not outlive the watchdog.
trap "exit" INT TERM
trap "kill 0" EXIT

# Prepare
BACKGROUND_TASKS=()
echo "Waiting for containers to settle..."
for i in {30..1}; do
  echo "${i}"
  sleep 1
done

if [[ "${USE_WATCHDOG}" =~ ^([nN][oO]|[nN])+$ ]]; then
  echo -e "$(date) - USE_WATCHDOG=n, skipping watchdog..."
  sleep 365d
  # Quoted so a script path containing whitespace is re-executed intact
  exec "$(readlink -f "$0")"
fi

if [[ "${WATCHDOG_VERBOSE}" =~ ^([yY][eE][sS]|[yY])+$ ]]; then
  SMTP_VERBOSE="--verbose"
  set -xv
else
  SMTP_VERBOSE=""
  # Non-verbose mode discards everything written to stderr from here on
  exec 2>/dev/null
fi

# Checks pipe their corresponding container name in this pipe
if [[ ! -p /tmp/com_pipe ]]; then
  mkfifo /tmp/com_pipe
fi

# Wait for containers
while ! mysqladmin status --socket=/var/run/mysqld/mysqld.sock -u"${DBUSER}" -p"${DBPASS}" --silent; do
  echo "Waiting for SQL..."
  sleep 2
done

# Do not attempt to write to slave
# REDIS_CMDLINE is intentionally a plain string: it is expanded unquoted
# wherever it is used so that it splits into command and arguments.
if [[ -n ${REDIS_SLAVEOF_IP} ]]; then
  REDIS_CMDLINE="redis-cli -h ${REDIS_SLAVEOF_IP} -p ${REDIS_SLAVEOF_PORT}"
else
  REDIS_CMDLINE="redis-cli -h redis -p 6379"
fi

until [[ $(${REDIS_CMDLINE} PING) == "PONG" ]]; do
  echo "Waiting for Redis..."
  sleep 2
done

${REDIS_CMDLINE} DEL F2B_RES > /dev/null

# Common functions
# Look up this host's IPv6 address by querying one of two lookup hosts,
# picked at random on every attempt. Makes at most 10 attempts with a 1s
# pause between them.
# Outputs: the address on stdout, or an empty line if no response looked
#          like an IPv6 address.
get_ipv6() {
  local IPV6=
  local TRY=0
  local -a IPV6_SRCS=("ip6.mailcow.email" "ip6.nevondo.com")
  while [[ -z ${IPV6} && ${TRY} -lt 10 ]]; do
    # Pause only before a retry, never after a successful lookup
    [[ ${TRY} -gt 0 ]] && sleep 1
    IPV6=$(curl --connect-timeout 3 -m 10 -L6s "${IPV6_SRCS[RANDOM % ${#IPV6_SRCS[@]}]}" | grep "^\([0-9a-fA-F]\{0,4\}:\)\{1,7\}[0-9a-fA-F]\{0,4\}$")
    TRY=$((TRY+1))
  done
  echo "${IPV6}"
}

# Store in the array named ${1} every element of the array named ${2} that
# does not occur in the array named ${3}. The result is in sorted order.
# Arguments are variable NAMES, not values; they are expanded through eval
# and must therefore never come from untrusted input.
array_diff() {
  # https://stackoverflow.com/questions/2312762, Alex Offshore
  eval local ARR1=\(\"\${$2[@]}\"\)
  eval local ARR2=\(\"\${$3[@]}\"\)
  # comm -23 keeps the lines that appear only in the first sorted input
  mapfile -t $1 < <(comm -23 <(printf '%s\n' "${ARR1[@]}" | sort) <(printf '%s\n' "${ARR2[@]}" | sort))
}

# Record and print the health level of a service.
# Arguments: $1 service label, $2 total health points, $3 current health
#            points, $4 health trend (defaults to 0)
# Pushes a JSON entry to the WATCHDOG_LOG list via REDIS_CMDLINE and prints
# a summary line through log_msg.
# Returns:   10 when the current health is 0 or below (dead service).
#            Callers compare the status against 10 only.
progress() {
  SERVICE=${1}
  TOTAL=${2}
  CURRENT=${3}
  DIFF=${4}
  [[ -z ${DIFF} ]] && DIFF=0
  [[ -z ${TOTAL} || -z ${CURRENT} ]] && return
  # A total of 0 would make the percentage below divide by zero
  [[ ${TOTAL} -eq 0 ]] && return
  [[ ${CURRENT} -gt ${TOTAL} ]] && return
  [[ ${CURRENT} -lt 0 ]] && CURRENT=0
  # Integer percentage rounded to nearest: the first term adds 1 when the
  # fractional part is at least one half
  PERCENT=$(( 200 * ${CURRENT} / ${TOTAL} % 2 + 100 * ${CURRENT} / ${TOTAL} ))
  ${REDIS_CMDLINE} LPUSH WATCHDOG_LOG "{\"time\":\"$(date +%s)\",\"service\":\"${SERVICE}\",\"lvl\":\"${PERCENT}\",\"hpnow\":\"${CURRENT}\",\"hptotal\":\"${TOTAL}\",\"hpdiff\":\"${DIFF}\"}" > /dev/null
  log_msg "${SERVICE} health level: ${PERCENT}% (${CURRENT}/${TOTAL}), health trend: ${DIFF}" no_redis
  # Return 10 to indicate a dead service
  [ ${CURRENT} -le 0 ] && return 10
}

# Print a timestamped message and, unless ${2} is "no_redis", also push it
# to the WATCHDOG_LOG list via REDIS_CMDLINE.
# Arguments: $1 message text, $2 optional literal "no_redis"
log_msg() {
  if [[ ${2} != "no_redis" ]]; then
    # Characters that could break the hand-built JSON are replaced by spaces
    ${REDIS_CMDLINE} LPUSH WATCHDOG_LOG "{\"time\":\"$(date +%s)\",\"message\":\"$(printf '%s' "${1}" | \
      tr '\r\n%&;$"_[]{}-' ' ')\"}" > /dev/null
  fi
  # printf with quoted arguments keeps the message literal: no glob
  # expansion or word splitting of its contents
  printf '%s %s\n' "$(date)" "${1}"
}

# Send a notification mail about service ${1} to every comma-separated
# address in WATCHDOG_NOTIFY_EMAIL, using ./smtp-cli.
# Arguments: $1 service name (required; also selects "/tmp/${1}" as body)
#            $2 optional message text for the body
#            $3 optional throttle period in seconds
# Returns:   1 when $1 is empty, when the mail is throttled, or as soon as
#            one recipient domain has no MX record (recipients after it
#            are not attempted).
function mail_error() {
  THROTTLE=
  [[ -z ${1} ]] && return 1
  # If exists, body will be the content of "/tmp/${1}", even if ${2} is set
  if [[ -z ${2} ]]; then
    BODY="Service was restarted on $(date), please check your mailcow installation."
  else
    BODY="$(date) - ${2}"
  fi
  # If exists, mail will be throttled by argument in seconds
  [[ ! -z ${3} ]] && THROTTLE=${3}
  if [[ ! -z ${THROTTLE} ]]; then
    TTL_LEFT="$(${REDIS_CMDLINE} TTL THROTTLE_${1} 2> /dev/null)"
    if [[ "${TTL_LEFT}" == "-2" ]]; then
      # Delay key not found, setting a delay key now
      ${REDIS_CMDLINE} SET THROTTLE_${1} 1 EX ${THROTTLE}
    else
      log_msg "Not sending notification email now, blocked for ${TTL_LEFT} seconds..."
      return 1
    fi
  fi
  # Strip the first double quote and a trailing double quote
  WATCHDOG_NOTIFY_EMAIL=$(echo "${WATCHDOG_NOTIFY_EMAIL}" | sed 's/"//;s|"$||')
  # Some exceptions for subject and body formats
  if [[ ${1} == "fail2ban" ]]; then
    SUBJECT="${BODY}"
    BODY="Please see netfilter-mailcow for more details and triggered rules."
  else
    SUBJECT="${WATCHDOG_SUBJECT}: ${1}"
  fi
  IFS=',' read -r -a MAIL_RCPTS <<< "${WATCHDOG_NOTIFY_EMAIL}"
  for rcpt in "${MAIL_RCPTS[@]}"; do
    RCPT_DOMAIN=
    RCPT_MX=
    # rcpt is left unquoted on purpose so surrounding whitespace is trimmed
    RCPT_DOMAIN=$(echo ${rcpt} | awk -F @ '{print $NF}')
    CHECK_FOR_VALID_MX=$(dig +short ${RCPT_DOMAIN} mx)
    if [[ -z ${CHECK_FOR_VALID_MX} ]]; then
      log_msg "Cannot determine MX for ${rcpt}, skipping email notification..."
      return 1
    fi
    [ -f "/tmp/${1}" ] && BODY="/tmp/${1}"
    # ${SMTP_VERBOSE:+...} expands to nothing when verbose mode is off; a
    # quoted empty variable would hand smtp-cli an empty argument instead
    timeout 10s ./smtp-cli --missing-modules-ok \
      ${SMTP_VERBOSE:+"${SMTP_VERBOSE}"} \
      --charset=UTF-8 \
      --subject="${SUBJECT}" \
      --body-plain="${BODY}" \
      --add-header="X-Priority: 1" \
      --to=${rcpt} \
      --from="watchdog@${MAILCOW_HOSTNAME}" \
      --hello-host=${MAILCOW_HOSTNAME} \
      --ipv4
    if [[ $? -eq 1 ]]; then # exit code 1 is fine
      log_msg "Sent notification email to ${rcpt}"
    else
      if [[ "${SMTP_VERBOSE}" == "" ]]; then
        log_msg "Error while sending notification email to ${rcpt}. You can enable verbose logging by setting 'WATCHDOG_VERBOSE=y' in mailcow.conf."
      else
        log_msg "Error while sending notification email to ${rcpt}."
      fi
    fi
  done
}

# Resolve the IPv4 address of a container.
# Arguments: $1 container / compose service name
# Globals:   IP_BY_DOCKER_API (0 = resolve via DNS, otherwise query the
#            dockerapi HTTP endpoint), COMPOSE_PROJECT_NAME, IPV4_NETWORK
# Outputs:   the address on stdout, or 240.0.0.0 when none was found
#            within 5 attempts.
get_container_ip() {
  # ${1} is container
  CONTAINER_ID=()
  CONTAINER_IPS=()
  CONTAINER_IP=
  LOOP_C=1
  until [[ ${CONTAINER_IP} =~ ^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]] || [[ ${LOOP_C} -gt 5 ]]; do
    if [ ${IP_BY_DOCKER_API} -eq 0 ]; then
      CONTAINER_IP=$(dig a "${1}" +short)
    else
      sleep 0.5
      # get long container id for exact match
      CONTAINER_ID=($(curl --silent --insecure https://dockerapi/containers/json | jq -r ".[] | {name: .Config.Labels[\"com.docker.compose.service\"], project: .Config.Labels[\"com.docker.compose.project\"], id: .Id}" | jq -rc "select( .name | tostring == \"${1}\") | select( .project | tostring | contains(\"${COMPOSE_PROJECT_NAME,,}\")) | .id"))
      # returned id can have multiple elements (if scaled), shuffle for random test
      CONTAINER_ID=($(printf "%s\n" "${CONTAINER_ID[@]}" | shuf))
      if [[ ! -z ${CONTAINER_ID} ]]; then
        for matched_container in "${CONTAINER_ID[@]}"; do
          CONTAINER_IPS=($(curl --silent --insecure https://dockerapi/containers/${matched_container}/json | jq -r '.NetworkSettings.Networks[].IPAddress'))
          for ip_match in "${CONTAINER_IPS[@]}"; do
            # grep will do nothing if one of these vars is empty
            [[ -z ${ip_match} ]] && continue
            [[ -z ${IPV4_NETWORK} ]] && continue
            # only return ips that are part of our network
            # (-F: the dots in IPV4_NETWORK are literal, not regex wildcards)
            if ! grep -qF -- "${IPV4_NETWORK}" <<< "${ip_match}"; then
              continue
            else
              CONTAINER_IP=${ip_match}
              break
            fi
          done
          [[ ! -z ${CONTAINER_IP} ]] && break
        done
      fi
    fi
    LOOP_C=$((LOOP_C + 1))
  done
  # Decide on the result itself rather than on the attempt counter, so an
  # address found on the final attempt is not thrown away
  if [[ ${CONTAINER_IP} =~ ^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]]; then
    echo "${CONTAINER_IP}"
  else
    echo 240.0.0.0
  fi
}

# One-time check
# If an address from the first three groups of IPV6_NETWORK is configured
# on a local interface, verify that an IPv6 address can be determined and
# send a notification mail otherwise. The non-empty guard matters because
# an empty grep pattern would match any "ip a s" output.
if [[ -n ${IPV6_NETWORK} ]] && grep -qi "$(echo ${IPV6_NETWORK} | cut -d: -f1-3)" <<< "$(ip a s)"; then
  if [[ -z "$(get_ipv6)" ]]; then
    mail_error "ipv6-config" "enable_ipv6 is true in docker-compose.yml, but an IPv6 link could not be established. Please verify your IPv6 connection."
  fi
fi

# Health loop for the external check service at checks.mailcow.email.
# Posts the installation GUID over IPv4 and IPv6; a "critical" response
# counts as one error and its .out field is saved to /tmp/external_checks.
# Loops until err_count reaches EXTERNAL_CHECKS_THRESHOLD, then returns 1.
external_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${EXTERNAL_CHECKS_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container
  GUID=$(mysql -u${DBUSER} -p${DBPASS} ${DBNAME} -e "SELECT version FROM versions WHERE application = 'GUID'" -BN)
  # Single quotes: the handler is expanded when USR1 arrives and therefore
  # works on the live err_count, not on the 0 it holds right now
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    err_c_cur=${err_count}
    CHECK_REPONSE="$(curl --connect-timeout 3 -m 10 -4 -s https://checks.mailcow.email -X POST -dguid=${GUID} 2> /dev/null)"
    if [[ ! -z "${CHECK_REPONSE}" ]] && [[ "$(echo "${CHECK_REPONSE}" | jq -r .response)" == "critical" ]]; then
      echo "${CHECK_REPONSE}" | jq -r .out > /tmp/external_checks
      err_count=$(( ${err_count} + 1 ))
    fi
    CHECK_REPONSE6="$(curl --connect-timeout 3 -m 10 -6 -s https://checks.mailcow.email -X POST -dguid=${GUID} 2> /dev/null)"
    if [[ ! -z "${CHECK_REPONSE6}" ]] && [[ "$(echo "${CHECK_REPONSE6}" | jq -r .response)" == "critical" ]]; then
      # Report the IPv6 response here, not the IPv4 one
      echo "${CHECK_REPONSE6}" | jq -r .out > /tmp/external_checks
      err_count=$(( ${err_count} + 1 ))
    fi
    # No new error this pass: recover one health point (never below 0)
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "External checks" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 60
    else
      diff_c=0
      sleep $(( ( RANDOM % 20 ) + 1800 ))
    fi
  done
  return 1
}

# Health loop for nginx-mailcow: HTTP probe of "/" on port 8081.
# Probe output is appended to /tmp/nginx-mailcow, which is cut back to its
# last 50 lines on every pass. Loops until err_count reaches
# NGINX_THRESHOLD, then returns 1.
nginx_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${NGINX_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container
  # Single quotes: the handler is expanded when USR1 arrives and therefore
  # works on the live err_count, not on the 0 it holds right now
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    touch /tmp/nginx-mailcow; echo "$(tail -50 /tmp/nginx-mailcow)" > /tmp/nginx-mailcow
    host_ip=$(get_container_ip nginx-mailcow)
    err_c_cur=${err_count}
    # The plugin's exit status is added to the error counter
    /usr/lib/nagios/plugins/check_http -4 -H ${host_ip} -u / -p 8081 2>> /tmp/nginx-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    # No new error this pass: recover one health point (never below 0)
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "Nginx" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}

# Health loop for unbound-mailcow: resolves stackoverflow.com through the
# container and checks that a DNSSEC query for "com" comes back with the
# "ad" flag. Output goes to /tmp/unbound-mailcow (kept to 50 lines).
# Loops until err_count reaches UNBOUND_THRESHOLD, then returns 1.
unbound_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${UNBOUND_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container
  # Single quotes: the handler is expanded when USR1 arrives and therefore
  # works on the live err_count, not on the 0 it holds right now
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    touch /tmp/unbound-mailcow; echo "$(tail -50 /tmp/unbound-mailcow)" > /tmp/unbound-mailcow
    host_ip=$(get_container_ip unbound-mailcow)
    err_c_cur=${err_count}
    /usr/lib/nagios/plugins/check_dns -s ${host_ip} -H stackoverflow.com 2>> /tmp/unbound-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    # grep -E replaces the deprecated egrep alias
    DNSSEC=$(dig com +dnssec | grep -E 'flags:.+ad')
    if [[ -z ${DNSSEC} ]]; then
      echo "DNSSEC failure" 2>> /tmp/unbound-mailcow 1>&2
      err_count=$(( ${err_count} + 1))
    else
      echo "DNSSEC check succeeded" 2>> /tmp/unbound-mailcow 1>&2
    fi
    # No new error this pass: recover one health point (never below 0)
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "Unbound" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}

# Health loop for the local redis container: sends PING to redis-mailcow
# on port 6379 and expects PONG. Output goes to /tmp/redis-mailcow (kept
# to 50 lines). Loops until err_count reaches REDIS_THRESHOLD, then
# returns 1.
redis_checks() {
  # A check for the local redis container
  err_count=0
  diff_c=0
  THRESHOLD=${REDIS_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container
  # Single quotes: the handler is expanded when USR1 arrives and therefore
  # works on the live err_count, not on the 0 it holds right now
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    touch /tmp/redis-mailcow; echo "$(tail -50 /tmp/redis-mailcow)" > /tmp/redis-mailcow
    # host_ip is looked up here, but the probe below addresses the
    # container by name
    host_ip=$(get_container_ip redis-mailcow)
    err_c_cur=${err_count}
    /usr/lib/nagios/plugins/check_tcp -4 -H redis-mailcow -p 6379 -E -s "PING\n" -q "QUIT" -e "PONG" 2>> /tmp/redis-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    # No new error this pass: recover one health point (never below 0)
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "Redis" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}

# Health loop for MySQL/MariaDB: connection check plus a simple query
# against information_schema over the local socket. Output goes to
# /tmp/mysql-mailcow (kept to 50 lines). Loops until err_count reaches
# MYSQL_THRESHOLD, then returns 1.
mysql_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${MYSQL_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container
  # Single quotes: the handler is expanded when USR1 arrives and therefore
  # works on the live err_count, not on the 0 it holds right now
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    touch /tmp/mysql-mailcow; echo "$(tail -50 /tmp/mysql-mailcow)" > /tmp/mysql-mailcow
    err_c_cur=${err_count}
    /usr/lib/nagios/plugins/check_mysql -s /var/run/mysqld/mysqld.sock -u ${DBUSER} -p ${DBPASS} -d ${DBNAME} 2>> /tmp/mysql-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    /usr/lib/nagios/plugins/check_mysql_query -s /var/run/mysqld/mysqld.sock -u ${DBUSER} -p ${DBPASS} -d ${DBNAME} -q "SELECT COUNT(*) FROM information_schema.tables" 2>> /tmp/mysql-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    # No new error this pass: recover one health point (never below 0)
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "MySQL/MariaDB" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}

# Health loop for MySQL/MariaDB replication using the
# check_mysql_slavestatus.sh plugin as root over the local socket. Output
# goes to /tmp/mysql_repl_checks (kept to 50 lines). Loops until err_count
# reaches MYSQL_REPLICATION_THRESHOLD, then returns 1.
mysql_repl_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${MYSQL_REPLICATION_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container
  # Single quotes: the handler is expanded when USR1 arrives and therefore
  # works on the live err_count, not on the 0 it holds right now
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    touch /tmp/mysql_repl_checks; echo "$(tail -50 /tmp/mysql_repl_checks)" > /tmp/mysql_repl_checks
    err_c_cur=${err_count}
    /usr/lib/nagios/plugins/check_mysql_slavestatus.sh -S /var/run/mysqld/mysqld.sock -u root -p ${DBROOT} 2>> /tmp/mysql_repl_checks 1>&2; err_count=$(( ${err_count} + $? ))
    # No new error this pass: recover one health point (never below 0)
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "MySQL/MariaDB replication" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 60
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}

# Health loop for sogo-mailcow: HTTP probe of /SOGo.index/ on port 20000.
# Output goes to /tmp/sogo-mailcow (kept to 50 lines). Loops until
# err_count reaches SOGO_THRESHOLD, then returns 1.
sogo_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${SOGO_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container
  # Single quotes: the handler is expanded when USR1 arrives and therefore
  # works on the live err_count, not on the 0 it holds right now
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    touch /tmp/sogo-mailcow; echo "$(tail -50 /tmp/sogo-mailcow)" > /tmp/sogo-mailcow
    host_ip=$(get_container_ip sogo-mailcow)
    err_c_cur=${err_count}
    /usr/lib/nagios/plugins/check_http -4 -H ${host_ip} -u /SOGo.index/ -p 20000 2>> /tmp/sogo-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    # No new error this pass: recover one health point (never below 0)
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "SOGo" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}

# Health loop for postfix-mailcow: an SMTP dialogue and a STARTTLS probe,
# both on port 589. Output goes to /tmp/postfix-mailcow (kept to 50
# lines). Loops until err_count reaches POSTFIX_THRESHOLD, then returns 1.
postfix_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${POSTFIX_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container
  # Single quotes: the handler is expanded when USR1 arrives and therefore
  # works on the live err_count, not on the 0 it holds right now
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    touch /tmp/postfix-mailcow; echo "$(tail -50 /tmp/postfix-mailcow)" > /tmp/postfix-mailcow
    host_ip=$(get_container_ip postfix-mailcow)
    err_c_cur=${err_count}
    /usr/lib/nagios/plugins/check_smtp -4 -H ${host_ip} -p 589 -f "watchdog@invalid" -C "RCPT TO:watchdog@localhost" -C DATA -C . -R 250 2>> /tmp/postfix-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    /usr/lib/nagios/plugins/check_smtp -4 -H ${host_ip} -p 589 -S 2>> /tmp/postfix-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    # No new error this pass: recover one health point (never below 0)
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "Postfix" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}

# Health loop for clamd-mailcow using the check_clamd plugin. Output goes
# to /tmp/clamd-mailcow (kept to 50 lines). Loops until err_count reaches
# CLAMD_THRESHOLD, then returns 1.
clamd_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${CLAMD_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container
  # Single quotes: the handler is expanded when USR1 arrives and therefore
  # works on the live err_count, not on the 0 it holds right now
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    touch /tmp/clamd-mailcow; echo "$(tail -50 /tmp/clamd-mailcow)" > /tmp/clamd-mailcow
    host_ip=$(get_container_ip clamd-mailcow)
    err_c_cur=${err_count}
    /usr/lib/nagios/plugins/check_clamd -4 -H ${host_ip} 2>> /tmp/clamd-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    # No new error this pass: recover one health point (never below 0)
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "Clamd" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 120 ) + 20 ))
    fi
  done
  return 1
}

# Health loop for dovecot-mailcow: probes port 24 with an SMTP-style
# dialogue (-L), IMAP on 993 (TLS) and 143, and TCP banners on ports 10001
# and 4190. Output goes to /tmp/dovecot-mailcow (kept to 50 lines). Loops
# until err_count reaches DOVECOT_THRESHOLD, then returns 1.
dovecot_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${DOVECOT_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container
  # Single quotes: the handler is expanded when USR1 arrives and therefore
  # works on the live err_count, not on the 0 it holds right now
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    touch /tmp/dovecot-mailcow; echo "$(tail -50 /tmp/dovecot-mailcow)" > /tmp/dovecot-mailcow
    host_ip=$(get_container_ip dovecot-mailcow)
    err_c_cur=${err_count}
    /usr/lib/nagios/plugins/check_smtp -4 -H ${host_ip} -p 24 -f "watchdog@invalid" -C "RCPT TO:<watchdog@invalid>" -L -R "User doesn't exist" 2>> /tmp/dovecot-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    /usr/lib/nagios/plugins/check_imap -4 -H ${host_ip} -p 993 -S -e "OK " 2>> /tmp/dovecot-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    /usr/lib/nagios/plugins/check_imap -4 -H ${host_ip} -p 143 -e "OK " 2>> /tmp/dovecot-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    /usr/lib/nagios/plugins/check_tcp -4 -H ${host_ip} -p 10001 -e "VERSION" 2>> /tmp/dovecot-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    /usr/lib/nagios/plugins/check_tcp -4 -H ${host_ip} -p 4190 -e "Dovecot ready" 2>> /tmp/dovecot-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    # No new error this pass: recover one health point (never below 0)
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "Dovecot" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}

# Health loop for Dovecot replication: reads the DOVECOT_REPL_HEALTH key
# from redis and counts an error whenever it is not "1". Loops until
# err_count reaches DOVECOT_REPL_THRESHOLD, then returns 1.
dovecot_repl_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${DOVECOT_REPL_THRESHOLD}
  # --raw, matching the read inside the loop; "-r" is redis-cli's
  # repeat-count option and would consume "GET" as its argument
  D_REPL_STATUS=$(redis-cli --raw -h redis GET DOVECOT_REPL_HEALTH)
  # Reduce error count by 2 after restarting an unhealthy container
  # Single quotes: the handler is expanded when USR1 arrives and therefore
  # works on the live err_count, not on the 0 it holds right now
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    err_c_cur=${err_count}
    D_REPL_STATUS=$(redis-cli --raw -h redis GET DOVECOT_REPL_HEALTH)
    if [[ "${D_REPL_STATUS}" != "1" ]]; then
      err_count=$(( ${err_count} + 1 ))
    fi
    # No new error this pass: recover one health point (never below 0)
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "Dovecot replication" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 60
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}

# Certificate expiry loop: TLS probes against postfix (port 589) and
# dovecot (port 993) with the plugins' "-D 7" option. Output goes to
# /tmp/certcheck (kept to 50 lines). The threshold is fixed at 7; returns
# 1 once err_count reaches it.
cert_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=7
  # Reduce error count by 2 after restarting an unhealthy container
  # Single quotes: the handler is expanded when USR1 arrives and therefore
  # works on the live err_count, not on the 0 it holds right now
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    touch /tmp/certcheck; echo "$(tail -50 /tmp/certcheck)" > /tmp/certcheck
    host_ip_postfix=$(get_container_ip postfix)
    host_ip_dovecot=$(get_container_ip dovecot)
    err_c_cur=${err_count}
    /usr/lib/nagios/plugins/check_smtp -H ${host_ip_postfix} -p 589 -4 -S -D 7 2>> /tmp/certcheck 1>&2; err_count=$(( ${err_count} + $? ))
    /usr/lib/nagios/plugins/check_imap -H ${host_ip_dovecot} -p 993 -4 -S -D 7 2>> /tmp/certcheck 1>&2; err_count=$(( ${err_count} + $? ))
    # No new error this pass: recover one health point (never below 0)
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "Primary certificate expiry check" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    # Always sleep 5 minutes, mail notifications are limited
    sleep 300
  done
  return 1
}

# Health loop for php-fpm-mailcow: TCP connect probes on ports 9001 and
# 9002. Output goes to /tmp/php-fpm-mailcow (kept to 50 lines). Loops
# until err_count reaches PHPFPM_THRESHOLD, then returns 1.
phpfpm_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${PHPFPM_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container
  # Single quotes: the handler is expanded when USR1 arrives and therefore
  # works on the live err_count, not on the 0 it holds right now
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    touch /tmp/php-fpm-mailcow; echo "$(tail -50 /tmp/php-fpm-mailcow)" > /tmp/php-fpm-mailcow
    host_ip=$(get_container_ip php-fpm-mailcow)
    err_c_cur=${err_count}
    /usr/lib/nagios/plugins/check_tcp -H ${host_ip} -p 9001 2>> /tmp/php-fpm-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    /usr/lib/nagios/plugins/check_tcp -H ${host_ip} -p 9002 2>> /tmp/php-fpm-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    # No new error this pass: recover one health point (never below 0)
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "PHP-FPM" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}

# Watches the newest RL_LOG entry in redis. Every time its .qid differs
# from the previous pass, an error is counted and a report of recent
# entries is written to /tmp/ratelimit. Loops until err_count reaches
# RATELIMIT_THRESHOLD, then returns 1.
ratelimit_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${RATELIMIT_THRESHOLD}
  RL_LOG_STATUS=$(redis-cli -h redis LRANGE RL_LOG 0 0 | jq .qid)
  # Reduce error count by 2 after restarting an unhealthy container
  # Single quotes: the handler is expanded when USR1 arrives and therefore
  # works on the live err_count, not on the 0 it holds right now
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    err_c_cur=${err_count}
    RL_LOG_STATUS_PREV=${RL_LOG_STATUS}
    RL_LOG_STATUS=$(redis-cli -h redis LRANGE RL_LOG 0 0 | jq .qid)
    # Right-hand side quoted so it is compared as a string, not as a glob
    if [[ "${RL_LOG_STATUS_PREV}" != "${RL_LOG_STATUS}" ]]; then
      err_count=$(( ${err_count} + 1 ))
      echo 'Last 10 applied ratelimits (may overlap with previous reports).' > /tmp/ratelimit
      echo 'Full ratelimit buckets can be emptied by deleting the ratelimit hash from within mailcow UI (see /debug -> Protocols -> Ratelimit):' >> /tmp/ratelimit
      echo >> /tmp/ratelimit
      redis-cli --raw -h redis LRANGE RL_LOG 0 10 | jq . >> /tmp/ratelimit
    fi
    # No new error this pass: recover one health point (never below 0)
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "Ratelimit" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}

# Health loop for the mail queue: counts files below
# /var/spool/postfix/deferred and records an error when the count reaches
# MAILQ_CRIT. Status lines go to /tmp/mail_queue_status (kept to 50
# lines). Loops until err_count reaches MAILQ_THRESHOLD, then returns 1.
mailq_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${MAILQ_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container
  # Single quotes: the handler is expanded when USR1 arrives and therefore
  # works on the live err_count, not on the 0 it holds right now
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    touch /tmp/mail_queue_status; echo "$(tail -50 /tmp/mail_queue_status)" > /tmp/mail_queue_status
    MAILQ_LOG_STATUS=$(find /var/spool/postfix/deferred -type f | wc -l)
    echo "Mail queue contains ${MAILQ_LOG_STATUS} items (critical limit is ${MAILQ_CRIT}) at $(date)" >> /tmp/mail_queue_status
    err_c_cur=${err_count}
    if [ ${MAILQ_LOG_STATUS} -ge ${MAILQ_CRIT} ]; then
      err_count=$(( ${err_count} + 1 ))
      # The status line is written a second time when the limit is hit
      echo "Mail queue contains ${MAILQ_LOG_STATUS} items (critical limit is ${MAILQ_CRIT}) at $(date)" >> /tmp/mail_queue_status
    fi
    # No new error this pass: recover one health point (never below 0)
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "Mail queue" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 60
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}

# Watches the F2B_ACTIVE_BANS hash keys in redis. Keys that were not
# present on the previous pass count as one error and are stored
# (restricted to hex digits, ".", ":", "/", brackets and spaces) in the
# F2B_RES key. Loops until err_count reaches FAIL2BAN_THRESHOLD, then
# returns 1.
fail2ban_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${FAIL2BAN_THRESHOLD}
  F2B_LOG_STATUS=($(${REDIS_CMDLINE} --raw HKEYS F2B_ACTIVE_BANS))
  F2B_RES=
  # Reduce error count by 2 after restarting an unhealthy container
  # Single quotes: the handler is expanded when USR1 arrives and therefore
  # works on the live err_count, not on the 0 it holds right now
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    err_c_cur=${err_count}
    F2B_LOG_STATUS_PREV=(${F2B_LOG_STATUS[@]})
    F2B_LOG_STATUS=($(${REDIS_CMDLINE} --raw HKEYS F2B_ACTIVE_BANS))
    # F2B_RES := keys in the current list that the previous list lacked
    array_diff F2B_RES F2B_LOG_STATUS F2B_LOG_STATUS_PREV
    if [[ ! -z "${F2B_RES}" ]]; then
      err_count=$(( ${err_count} + 1 ))
      echo -n "${F2B_RES[@]}" | tr -cd "[a-fA-F0-9.:/] " | timeout 3s ${REDIS_CMDLINE} -x SET F2B_RES > /dev/null
      if [ $? -ne 0 ]; then
        # No -x here: DEL takes no value, and -x would make redis-cli
        # read an extra argument from stdin
        ${REDIS_CMDLINE} DEL F2B_RES
      fi
    fi
    # No new error this pass: recover one health point (never below 0)
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "Fail2ban" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}

# Watches the ACME_FAIL_TIME key in redis; every change of its value
# counts as one error. The key is initialised to 0 when it is missing.
# Loops until err_count reaches ACME_THRESHOLD, then returns 1.
acme_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${ACME_THRESHOLD}
  ACME_LOG_STATUS=$(redis-cli -h redis GET ACME_FAIL_TIME)
  if [[ -z "${ACME_LOG_STATUS}" ]]; then
    ${REDIS_CMDLINE} SET ACME_FAIL_TIME 0
    ACME_LOG_STATUS=0
  fi
  # Reduce error count by 2 after restarting an unhealthy container
  # Single quotes: the handler is expanded when USR1 arrives and therefore
  # works on the live err_count, not on the 0 it holds right now
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    err_c_cur=${err_count}
    ACME_LOG_STATUS_PREV=${ACME_LOG_STATUS}
    ACME_LC=0
    # Clear the status so the polling loop below actually runs; left
    # non-empty it would be skipped and the key never read again
    ACME_LOG_STATUS=
    until [[ ! -z ${ACME_LOG_STATUS} ]] || [ ${ACME_LC} -ge 3 ]; do
      ACME_LOG_STATUS=$(redis-cli -h redis GET ACME_FAIL_TIME 2> /dev/null)
      # Wait only before another attempt
      [[ -z ${ACME_LOG_STATUS} ]] && sleep 3
      ACME_LC=$((ACME_LC+1))
    done
    # Redis gave no answer in 3 attempts: keep the previous value so an
    # unreachable redis is not reported as an ACME failure
    [[ -z ${ACME_LOG_STATUS} ]] && ACME_LOG_STATUS=${ACME_LOG_STATUS_PREV}
    if [[ "${ACME_LOG_STATUS_PREV}" != "${ACME_LOG_STATUS}" ]]; then
      err_count=$(( ${err_count} + 1 ))
    fi
    # No new error this pass: recover one health point (never below 0)
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "ACME" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}

# Health loop for rspamd-mailcow. Two probes per pass:
#  - scan a minimal message through the rspamd unix socket and expect a
#    required_score of 9999 in the reply,
#  - connect to port 9900 of the container and treat curl exit status 28
#    (timeout) as failure.
# Output goes to /tmp/rspamd-mailcow (kept to 50 lines). Loops until
# err_count reaches RSPAMD_THRESHOLD, then returns 1.
rspamd_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${RSPAMD_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container
  # Single quotes: the handler is expanded when USR1 arrives and therefore
  # works on the live err_count, not on the 0 it holds right now
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    touch /tmp/rspamd-mailcow; echo "$(tail -50 /tmp/rspamd-mailcow)" > /tmp/rspamd-mailcow
    host_ip=$(get_container_ip rspamd-mailcow)
    err_c_cur=${err_count}
    # curl is resolved through PATH like every other curl call in this
    # file; the former relative "usr/bin/curl" only worked from "/"
    SCORE=$(echo 'To: null@localhost
From: watchdog@localhost

Empty
' | curl --max-time 10 -s --data-binary @- --unix-socket /var/lib/rspamd/rspamd.sock http://rspamd/scan | jq -rc .default.required_score)
    if [[ ${SCORE} != "9999" ]]; then
      echo "Rspamd settings check failed, score returned: ${SCORE}" 2>> /tmp/rspamd-mailcow 1>&2
      err_count=$(( ${err_count} + 1))
    else
      echo "Rspamd settings check succeeded, score returned: ${SCORE}" 2>> /tmp/rspamd-mailcow 1>&2
    fi
    # A dirty hack until a PING PONG event is implemented to worker proxy
    # We expect an empty response, not a timeout
    if [ "$(curl -s --max-time 10 ${host_ip}:9900 2> /dev/null ; echo $?)" == "28" ]; then
      echo "Milter check failed" 2>> /tmp/rspamd-mailcow 1>&2; err_count=$(( ${err_count} + 1 ));
    else
      echo "Milter check succeeded" 2>> /tmp/rspamd-mailcow 1>&2
    fi
    # No new error this pass: recover one health point (never below 0)
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "Rspamd" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}

# Health loop for olefy-mailcow: sends "PING\n" to TCP port 10055. Output
# goes to /tmp/olefy-mailcow (kept to 50 lines). Loops until err_count
# reaches OLEFY_THRESHOLD, then returns 1.
olefy_checks() {
  err_count=0
  diff_c=0
  THRESHOLD=${OLEFY_THRESHOLD}
  # Reduce error count by 2 after restarting an unhealthy container
  # Single quotes: the handler is expanded when USR1 arrives and therefore
  # works on the live err_count, not on the 0 it holds right now
  trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
  while [ ${err_count} -lt ${THRESHOLD} ]; do
    touch /tmp/olefy-mailcow; echo "$(tail -50 /tmp/olefy-mailcow)" > /tmp/olefy-mailcow
    host_ip=$(get_container_ip olefy-mailcow)
    err_c_cur=${err_count}
    /usr/lib/nagios/plugins/check_tcp -4 -H ${host_ip} -p 10055 -s "PING\n" 2>> /tmp/olefy-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
    # No new error this pass: recover one health point (never below 0)
    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
    progress "Olefy" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
    if [[ $? == 10 ]]; then
      diff_c=0
      sleep 1
    else
      diff_c=0
      sleep $(( ( RANDOM % 60 ) + 20 ))
    fi
  done
  return 1
}

# Notify about start: a mail is only sent when WATCHDOG_NOTIFY_EMAIL is non-empty
[[ -z ${WATCHDOG_NOTIFY_EMAIL} ]] || mail_error "watchdog-mailcow" "Watchdog started monitoring mailcow."
752
# Create watchdog agents

# Start one watchdog agent as a background subshell.
# Arguments:
#   $1 - name of the check function that is run in an endless loop
#   $2 - message handed to log_msg whenever the check function returns non-zero
#   $3 - token written to /tmp/com_pipe, read by the action loop at the end of this script
# Globals written:
#   PID              - PID of the subshell that was just started
#   BACKGROUND_TASKS - the PID is appended
# Outputs: "Spawned <function> with PID <pid>" on stdout
spawn_watchdog_agent() {
  # Prefixed names: these locals are inherited by the subshell and must not
  # collide with the globals the check functions assign.
  local _wd_check=$1 _wd_message=$2 _wd_token=$3
  (
    while true; do
      if ! "${_wd_check}"; then
        log_msg "${_wd_message}"
        echo "${_wd_token}" > /tmp/com_pipe
      fi
    done
  ) &
  PID=$!
  echo "Spawned ${_wd_check} with PID ${PID}"
  BACKGROUND_TASKS+=("${PID}")
}

spawn_watchdog_agent nginx_checks "Nginx hit error limit" nginx-mailcow

if [[ ${WATCHDOG_EXTERNAL_CHECKS} =~ ^([yY][eE][sS]|[yY])+$ ]]; then
  spawn_watchdog_agent external_checks "External checks hit error limit" external_checks
fi

if [[ ${WATCHDOG_MYSQL_REPLICATION_CHECKS} =~ ^([yY][eE][sS]|[yY])+$ ]]; then
  spawn_watchdog_agent mysql_repl_checks "MySQL replication check hit error limit" mysql_repl_checks
fi

spawn_watchdog_agent mysql_checks "MySQL hit error limit" mysql-mailcow
spawn_watchdog_agent redis_checks "Local Redis hit error limit" redis-mailcow
spawn_watchdog_agent phpfpm_checks "PHP-FPM hit error limit" php-fpm-mailcow

if [[ "${SKIP_SOGO}" =~ ^([nN][oO]|[nN])+$ ]]; then
  spawn_watchdog_agent sogo_checks "SOGo hit error limit" sogo-mailcow
fi

# Default to 0 so an unset or empty CHECK_UNBOUND skips the agent without
# tripping a "[" syntax error.
if [ "${CHECK_UNBOUND:-0}" -eq 1 ]; then
  spawn_watchdog_agent unbound_checks "Unbound hit error limit" unbound-mailcow
fi

if [[ "${SKIP_CLAMD}" =~ ^([nN][oO]|[nN])+$ ]]; then
  spawn_watchdog_agent clamd_checks "Clamd hit error limit" clamd-mailcow
fi

spawn_watchdog_agent postfix_checks "Postfix hit error limit" postfix-mailcow
spawn_watchdog_agent mailq_checks "Mail queue hit error limit" mail_queue_status
spawn_watchdog_agent dovecot_checks "Dovecot hit error limit" dovecot-mailcow
# Distinct wording so the log can tell the replication agent from dovecot_checks
spawn_watchdog_agent dovecot_repl_checks "Dovecot replication hit error limit" dovecot_repl_checks
spawn_watchdog_agent rspamd_checks "Rspamd hit error limit" rspamd-mailcow
spawn_watchdog_agent ratelimit_checks "Ratelimit hit error limit" ratelimit
spawn_watchdog_agent fail2ban_checks "Fail2ban hit error limit" fail2ban
spawn_watchdog_agent cert_checks "Cert check hit error limit" certcheck
spawn_watchdog_agent olefy_checks "Olefy hit error limit" olefy-mailcow
spawn_watchdog_agent acme_checks "ACME client hit error limit" acme-mailcow
992
# Monitor watchdog agents: stop the script when an agent has died and wait for respawn by Docker (restart:always:n)
(
  while :; do
    for worker_pid in "${BACKGROUND_TASKS[@]}"; do
      # kill -0 delivers no signal, it only tests whether the PID can be signalled
      kill -0 "${worker_pid}" 1>&2 || {
        log_msg "Worker ${worker_pid} died, stopping watchdog and waiting for respawn..."
        # NOTE(review): presumably PID 1 is this script running as the container
        # entrypoint, whose TERM trap makes it exit - confirm.
        kill -TERM 1
      }
      # The pause follows every single PID probe, not every full pass over the list
      sleep 10
    done
  done
) &
1005
# Monitor dockerapi
(
  while :; do
    # Idle for as long as dockerapi accepts connections on port 443
    until ! nc -z dockerapi 443; do
      sleep 3
    done
    log_msg "Cannot find dockerapi-mailcow, waiting to recover..."
    # Freeze every agent while dockerapi cannot be reached
    kill -STOP "${BACKGROUND_TASKS[@]}"
    while ! nc -z dockerapi 443; do
      sleep 3
    done
    # Resume the agents, then signal USR1 (check functions install a USR1 trap)
    kill -CONT "${BACKGROUND_TASKS[@]}"
    kill -USR1 "${BACKGROUND_TASKS[@]}"
  done
) &
1021
# Actions when threshold limit is reached
while :; do
  CONTAINER_ID=
  HAS_INITDB=
  # Wait for the next token an agent writes to the pipe
  read com_pipe_answer </tmp/com_pipe
  # Dump /tmp/<token> when it exists and is not empty
  # (olefy_checks, for instance, logs to /tmp/olefy-mailcow)
  [ -s "/tmp/${com_pipe_answer}" ] && cat "/tmp/${com_pipe_answer}"
  case "${com_pipe_answer}" in
    ratelimit)
      log_msg "At least one ratelimit was applied"
      [[ -n ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}"
      ;;
    mail_queue_status)
      log_msg "Mail queue status is critical"
      [[ -n ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}"
      ;;
    external_checks)
      log_msg "Your mailcow is an open relay!"
      # The second argument overrides the mail text; without it the mail says the service was restarted at ...
      [[ -n ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}" "Please stop mailcow now and check your network configuration!"
      ;;
    mysql_repl_checks)
      log_msg "MySQL replication is not working properly"
      # Custom mail text; third argument 600 = at most one mail per 10 minutes
      [[ -n ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}" "Please check the SQL replication status" 600
      ;;
    dovecot_repl_checks)
      log_msg "Dovecot replication is not working properly"
      # Custom mail text; third argument 600 = at most one mail per 10 minutes
      [[ -n ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}" "Please check the Dovecot replicator status" 600
      ;;
    certcheck)
      log_msg "Certificates are about to expire"
      # Custom mail text; third argument 86400 = only mail once a day
      [[ -n ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}" "Please renew your certificate" 86400
      ;;
    acme-mailcow)
      # Must stay ahead of the generic *-mailcow arm: acme-mailcow is reported, never restarted
      log_msg "acme-mailcow did not complete successfully"
      [[ -n ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}" "Please check acme-mailcow for further information."
      ;;
    fail2ban)
      # Word splitting of the Redis reply into array elements is intended
      F2B_RES=($(timeout 4s ${REDIS_CMDLINE} --raw GET F2B_RES 2> /dev/null))
      if [[ -n "${F2B_RES[0]}" ]]; then
        ${REDIS_CMDLINE} DEL F2B_RES > /dev/null
        host=
        for host in "${F2B_RES[@]}"; do
          log_msg "Banned ${host}"
          rm /tmp/fail2ban 2> /dev/null
          timeout 2s whois "${host}" > /tmp/fail2ban
          if [[ -n ${WATCHDOG_NOTIFY_EMAIL} && ${WATCHDOG_NOTIFY_BAN} =~ ^([yY][eE][sS]|[yY])+$ ]]; then
            mail_error "${com_pipe_answer}" "IP ban: ${host}"
          fi
        done
      fi
      ;;
    ?*-mailcow*)
      # Any token that contains "-mailcow" after at least one other character
      kill -STOP "${BACKGROUND_TASKS[@]}"
      sleep 10
      # Resolve the container id by compose service name and compose project name
      CONTAINER_ID=$(curl --silent --insecure https://dockerapi/containers/json \
        | jq -r ".[] | {name: .Config.Labels[\"com.docker.compose.service\"], project: .Config.Labels[\"com.docker.compose.project\"], id: .Id}" \
        | jq -rc "select( .name | tostring | contains(\"${com_pipe_answer}\")) | select( .project | tostring | contains(\"${COMPOSE_PROJECT_NAME,,}\")) | .id")
      if [[ -n ${CONTAINER_ID} ]]; then
        # NOTE(review): CONTAINER_ID stays unquoted in the URLs below, exactly as
        # before; more than one matching id would be split into several words.
        if [[ "${com_pipe_answer}" == "php-fpm-mailcow" ]]; then
          # Non-empty when the container's process list contains the init_db run
          HAS_INITDB=$(curl --silent --insecure -XPOST https://dockerapi/containers/${CONTAINER_ID}/top | jq '.msg.Processes[] | contains(["php -c /usr/local/etc/php -f /web/inc/init_db.inc.php"])' | grep true)
        fi
        # Seconds since the container's State.StartedAt
        S_RUNNING=$(($(date +%s) - $(curl --silent --insecure https://dockerapi/containers/${CONTAINER_ID}/json | jq .State.StartedAt | xargs -n1 date +%s -d)))
        if [[ ${S_RUNNING} -lt 360 ]]; then
          log_msg "Container is running for less than 360 seconds, skipping action..."
        elif [[ -n ${HAS_INITDB} ]]; then
          log_msg "Database is being initialized by php-fpm-mailcow, not restarting but delaying checks for a minute..."
          sleep 60
        else
          log_msg "Sending restart command to ${CONTAINER_ID}..."
          curl --silent --insecure -XPOST https://dockerapi/containers/${CONTAINER_ID}/restart
          [[ -n ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}"
          log_msg "Wait for restarted container to settle and continue watching..."
          sleep 35
        fi
      fi
      # Resume the agents and signal USR1 (check functions install a USR1 trap)
      kill -CONT "${BACKGROUND_TASKS[@]}"
      sleep 1
      kill -USR1 "${BACKGROUND_TASKS[@]}"
      ;;
  esac
done