Blame - mailcow/src/mailcow-dockerized/data/Dockerfiles/watchdog/watchdog.sh - kubeia

blob: 4013cb4e61cf8030f24643095854b65e139dff86 [file] [log] [blame]

#!/bin/bash
#
# Watchdog start-up: waits for SQL and Redis to answer before any check runs.
# Reads: USE_WATCHDOG, DBUSER, DBPASS, REDIS_SLAVEOF_IP, REDIS_SLAVEOF_PORT
# Sets:  BACKGROUND_TASKS, REDIS_CMDLINE

trap "exit" INT TERM
trap "kill 0" EXIT

# Prepare
BACKGROUND_TASKS=()
echo "Waiting for containers to settle..."
sleep 30

# Watchdog disabled: idle for a year, then re-execute this script
if [[ "${USE_WATCHDOG}" =~ ^([nN][oO]|[nN])+$ ]]; then
  echo -e "$(date) - USE_WATCHDOG=n, skipping watchdog..."
  sleep 365d
  # Quoted so a script path containing whitespace is still executed as one word
  exec "$(readlink -f "$0")"
fi

# Checks pipe their corresponding container name in this pipe
if [[ ! -p /tmp/com_pipe ]]; then
  mkfifo /tmp/com_pipe
fi

# Wait for containers
while ! mysqladmin status --socket=/var/run/mysqld/mysqld.sock -u"${DBUSER}" -p"${DBPASS}" --silent; do
  echo "Waiting for SQL..."
  sleep 2
done

# Do not attempt to write to slave
if [[ -n ${REDIS_SLAVEOF_IP} ]]; then
  REDIS_CMDLINE="redis-cli -h ${REDIS_SLAVEOF_IP} -p ${REDIS_SLAVEOF_PORT}"
else
  REDIS_CMDLINE="redis-cli -h redis -p 6379"
fi

# REDIS_CMDLINE holds a whole command line and is expanded unquoted on purpose
until [[ $(${REDIS_CMDLINE} PING) == "PONG" ]]; do
  echo "Waiting for Redis..."
  sleep 2
done

${REDIS_CMDLINE} DEL F2B_RES > /dev/null
				41
				42	# Common functions
				43	get_ipv6(){
				44	local IPV6=
				45	local IPV6_SRCS=
				46	local TRY=
				47	IPV6_SRCS[0]="ip6.korves.net"
				48	IPV6_SRCS[1]="ip6.mailcow.email"
				49	until [[ ! -z ${IPV6} ]] \|\| [[ ${TRY} -ge 10 ]]; do
				50	IPV6=$(curl --connect-timeout 3 -m 10 -L6s ${IPV6_SRCS[$RANDOM % ${#IPV6_SRCS[@]} ]} \| grep "^$[0-9a-fA-F]\{0,4\}:$\{1,7\}[0-9a-fA-F]\{0,4\}$")
				51	[[ ! -z ${TRY} ]] && sleep 1
				52	TRY=$((TRY+1))
				53	done
				54	echo ${IPV6}
				55	}
				56
				57	array_diff() {
				58	# https://stackoverflow.com/questions/2312762, Alex Offshore
				59	eval local ARR1=$\"\${$2[@]}\"$
				60	eval local ARR2=$\"\${$3[@]}\"$
				61	local IFS=$'\n'
				62	mapfile -t $1 < <(comm -23 <(echo "${ARR1[]}" \| sort) <(echo "${ARR2[]}" \| sort))
				63	}
				64
				65	progress() {
				66	SERVICE=${1}
				67	TOTAL=${2}
				68	CURRENT=${3}
				69	DIFF=${4}
				70	[[ -z ${DIFF} ]] && DIFF=0
				71	[[ -z ${TOTAL} \|\| -z ${CURRENT} ]] && return
				72	[[ ${CURRENT} -gt ${TOTAL} ]] && return
				73	[[ ${CURRENT} -lt 0 ]] && CURRENT=0
				74	PERCENT=$(( 200 * ${CURRENT} / ${TOTAL} % 2 + 100 * ${CURRENT} / ${TOTAL} ))
				75	${REDIS_CMDLINE} LPUSH WATCHDOG_LOG "{\"time\":\"$(date +%s)\",\"service\":\"${SERVICE}\",\"lvl\":\"${PERCENT}\",\"hpnow\":\"${CURRENT}\",\"hptotal\":\"${TOTAL}\",\"hpdiff\":\"${DIFF}\"}" > /dev/null
				76	log_msg "${SERVICE} health level: ${PERCENT}% (${CURRENT}/${TOTAL}), health trend: ${DIFF}" no_redis
				77	# Return 10 to indicate a dead service
				78	[ ${CURRENT} -le 0 ] && return 10
				79	}
				80
				81	log_msg() {
				82	if [[ ${2} != "no_redis" ]]; then
				83	${REDIS_CMDLINE} LPUSH WATCHDOG_LOG "{\"time\":\"$(date +%s)\",\"message\":\"$(printf '%s' "${1}" \| \
				84	tr '\r\n%&;$"_[]{}-' ' ')\"}" > /dev/null
				85	fi
				86	echo $(date) $(printf '%s\n' "${1}")
				87	}
				88
				89	function mail_error() {
				90	THROTTLE=
				91	[[ -z ${1} ]] && return 1
				92	# If exists, body will be the content of "/tmp/${1}", even if ${2} is set
				93	[[ -z ${2} ]] && BODY="Service was restarted on $(date), please check your mailcow installation." \|\| BODY="$(date) - ${2}"
				94	# If exists, mail will be throttled by argument in seconds
				95	[[ ! -z ${3} ]] && THROTTLE=${3}
				96	if [[ ! -z ${THROTTLE} ]]; then
				97	TTL_LEFT="$(${REDIS_CMDLINE} TTL THROTTLE_${1} 2> /dev/null)"
				98	if [[ "${TTL_LEFT}" == "-2" ]]; then
				99	# Delay key not found, setting a delay key now
				100	${REDIS_CMDLINE} SET THROTTLE_${1} 1 EX ${THROTTLE}
				101	else
				102	log_msg "Not sending notification email now, blocked for ${TTL_LEFT} seconds..."
				103	return 1
				104	fi
				105	fi
				106	WATCHDOG_NOTIFY_EMAIL=$(echo "${WATCHDOG_NOTIFY_EMAIL}" \| sed 's/"//;s\|"$\|\|')
				107	# Some exceptions for subject and body formats
				108	if [[ ${1} == "fail2ban" ]]; then
				109	SUBJECT="${BODY}"
				110	BODY="Please see netfilter-mailcow for more details and triggered rules."
				111	else
				112	SUBJECT="Watchdog ALERT: ${1}"
				113	fi
				114	IFS=',' read -r -a MAIL_RCPTS <<< "${WATCHDOG_NOTIFY_EMAIL}"
				115	for rcpt in "${MAIL_RCPTS[@]}"; do
				116	RCPT_DOMAIN=
				117	#RCPT_MX=
				118	RCPT_DOMAIN=$(echo ${rcpt} \| awk -F @ {'print $NF'})
				119	# Latest smtp-cli looks up mx via dns
				120	#RCPT_MX=$(dig +short ${RCPT_DOMAIN} mx \| sort -n \| awk '{print $2; exit}')
				121	#if [[ -z ${RCPT_MX} ]]; then
				122	# log_msg "Cannot determine MX for ${rcpt}, skipping email notification..."
				123	# return 1
				124	#fi
				125	[ -f "/tmp/${1}" ] && BODY="/tmp/${1}"
				126	timeout 10s ./smtp-cli --missing-modules-ok \
				127	--charset=UTF-8 \
				128	--subject="${SUBJECT}" \
				129	--body-plain="${BODY}" \
				130	--add-header="X-Priority: 1" \
				131	--to=${rcpt} \
				132	--from="watchdog@${MAILCOW_HOSTNAME}" \
				133	--hello-host=${MAILCOW_HOSTNAME} \
				134	--ipv4
				135	#--server="${RCPT_MX}"
				136	log_msg "Sent notification email to ${rcpt}"
				137	done
				138	}
				139
				140	get_container_ip() {
				141	# ${1} is container
				142	CONTAINER_ID=()
				143	CONTAINER_IPS=()
				144	CONTAINER_IP=
				145	LOOP_C=1
				146	until [[ ${CONTAINER_IP} =~ ^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]] \|\| [[ ${LOOP_C} -gt 5 ]]; do
				147	if [ ${IP_BY_DOCKER_API} -eq 0 ]; then
				148	CONTAINER_IP=$(dig a "${1}" +short)
				149	else
				150	sleep 0.5
				151	# get long container id for exact match
				152	CONTAINER_ID=($(curl --silent --insecure https://dockerapi/containers/json \| jq -r ".[] \| {name: .Config.Labels[\"com.docker.compose.service\"], project: .Config.Labels[\"com.docker.compose.project\"], id: .Id}" \| jq -rc "select( .name \| tostring == \"${1}\") \| select( .project \| tostring \| contains(\"${COMPOSE_PROJECT_NAME,,}\")) \| .id"))
				153	# returned id can have multiple elements (if scaled), shuffle for random test
				154	CONTAINER_ID=($(printf "%s\n" "${CONTAINER_ID[@]}" \| shuf))
				155	if [[ ! -z ${CONTAINER_ID} ]]; then
				156	for matched_container in "${CONTAINER_ID[@]}"; do
				157	CONTAINER_IPS=($(curl --silent --insecure https://dockerapi/containers/${matched_container}/json \| jq -r '.NetworkSettings.Networks[].IPAddress'))
				158	for ip_match in "${CONTAINER_IPS[@]}"; do
				159	# grep will do nothing if one of these vars is empty
				160	[[ -z ${ip_match} ]] && continue
				161	[[ -z ${IPV4_NETWORK} ]] && continue
				162	# only return ips that are part of our network
				163	if ! grep -q ${IPV4_NETWORK} <(echo ${ip_match}); then
				164	continue
				165	else
				166	CONTAINER_IP=${ip_match}
				167	break
				168	fi
				169	done
				170	[[ ! -z ${CONTAINER_IP} ]] && break
				171	done
				172	fi
				173	fi
				174	LOOP_C=$((LOOP_C + 1))
				175	done
				176	[[ ${LOOP_C} -gt 5 ]] && echo 240.0.0.0 \|\| echo ${CONTAINER_IP}
				177	}
				178
				179	# One-time check
				180	if grep -qi "$(echo ${IPV6_NETWORK} \| cut -d: -f1-3)" <<< "$(ip a s)"; then
				181	if [[ -z "$(get_ipv6)" ]]; then
				182	mail_error "ipv6-config" "enable_ipv6 is true in docker-compose.yml, but an IPv6 link could not be established. Please verify your IPv6 connection."
				183	fi
				184	fi
				185
				186	external_checks() {
				187	err_count=0
				188	diff_c=0
				189	THRESHOLD=${EXTERNAL_CHECKS_THRESHOLD}
				190	# Reduce error count by 2 after restarting an unhealthy container
				191	GUID=$(mysql -u${DBUSER} -p${DBPASS} ${DBNAME} -e "SELECT version FROM versions WHERE application = 'GUID'" -BN)
				192	trap "[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))" USR1
				193	while [ ${err_count} -lt ${THRESHOLD} ]; do
				194	err_c_cur=${err_count}
				195	CHECK_REPONSE="$(curl --connect-timeout 3 -m 10 -4 -s https://checks.mailcow.email -X POST -dguid=${GUID} 2> /dev/null)"
				196	if [[ ! -z "${CHECK_REPONSE}" ]] && [[ "$(echo ${CHECK_REPONSE} \| jq -r .response)" == "critical" ]]; then
				197	echo ${CHECK_REPONSE} \| jq -r .out > /tmp/external_checks
				198	err_count=$(( ${err_count} + 1 ))
				199	fi
				200	CHECK_REPONSE6="$(curl --connect-timeout 3 -m 10 -6 -s https://checks.mailcow.email -X POST -dguid=${GUID} 2> /dev/null)"
				201	if [[ ! -z "${CHECK_REPONSE6}" ]] && [[ "$(echo ${CHECK_REPONSE6} \| jq -r .response)" == "critical" ]]; then
				202	echo ${CHECK_REPONSE} \| jq -r .out > /tmp/external_checks
				203	err_count=$(( ${err_count} + 1 ))
				204	fi
				205	[ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
				206	[ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
				207	progress "External checks" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
				208	if [[ $? == 10 ]]; then
				209	diff_c=0
				210	sleep 60
				211	else
				212	diff_c=0
				213	sleep $(( ( RANDOM % 20 ) + 120 ))
				214	fi
				215	done
				216	return 1
				217	}
				218
				219	nginx_checks() {
				220	err_count=0
				221	diff_c=0
				222	THRESHOLD=${NGINX_THRESHOLD}
				223	# Reduce error count by 2 after restarting an unhealthy container
				224	trap "[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))" USR1
				225	while [ ${err_count} -lt ${THRESHOLD} ]; do
				226	touch /tmp/nginx-mailcow; echo "$(tail -50 /tmp/nginx-mailcow)" > /tmp/nginx-mailcow
				227	host_ip=$(get_container_ip nginx-mailcow)
				228	err_c_cur=${err_count}
				229	/usr/lib/nagios/plugins/check_http -4 -H ${host_ip} -u / -p 8081 2>> /tmp/nginx-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
				230	[ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
				231	[ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
				232	progress "Nginx" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
				233	if [[ $? == 10 ]]; then
				234	diff_c=0
				235	sleep 1
				236	else
				237	diff_c=0
				238	sleep $(( ( RANDOM % 60 ) + 20 ))
				239	fi
				240	done
				241	return 1
				242	}
				243
				244	unbound_checks() {
				245	err_count=0
				246	diff_c=0
				247	THRESHOLD=${UNBOUND_THRESHOLD}
				248	# Reduce error count by 2 after restarting an unhealthy container
				249	trap "[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))" USR1
				250	while [ ${err_count} -lt ${THRESHOLD} ]; do
				251	touch /tmp/unbound-mailcow; echo "$(tail -50 /tmp/unbound-mailcow)" > /tmp/unbound-mailcow
				252	host_ip=$(get_container_ip unbound-mailcow)
				253	err_c_cur=${err_count}
Matthias Andreas Benkard	38837a8	2021-01-02 11:13:53 +0100	[diff] [blame]	254	/usr/bin/nslookup -sil stackoverflow.com "${host_ip}" 2>> /tmp/unbound-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
Matthias Andreas Benkard	4e2e5d9	2021-01-02 07:43:06 +0100	[diff] [blame]	255	DNSSEC=$(dig com +dnssec "@${host_ip}" \| egrep 'flags:.+ad')
Matthias Andreas Benkard	b382b10	2021-01-02 15:32:21 +0100	[diff] [blame]	256	if [[ -z ${DNSSEC} ]]; then
				257	echo "DNSSEC failure" 2>> /tmp/unbound-mailcow 1>&2
				258	err_count=$(( ${err_count} + 1))
				259	else
				260	echo "DNSSEC check succeeded" 2>> /tmp/unbound-mailcow 1>&2
				261	fi
				262	[ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
				263	[ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
				264	progress "Unbound" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
				265	if [[ $? == 10 ]]; then
				266	diff_c=0
				267	sleep 1
				268	else
				269	diff_c=0
				270	sleep $(( ( RANDOM % 60 ) + 20 ))
				271	fi
				272	done
				273	return 1
				274	}
				275
				276	redis_checks() {
				277	# A check for the local redis container
				278	err_count=0
				279	diff_c=0
				280	THRESHOLD=${REDIS_THRESHOLD}
				281	# Reduce error count by 2 after restarting an unhealthy container
				282	trap "[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))" USR1
				283	while [ ${err_count} -lt ${THRESHOLD} ]; do
				284	touch /tmp/redis-mailcow; echo "$(tail -50 /tmp/redis-mailcow)" > /tmp/redis-mailcow
				285	host_ip=$(get_container_ip redis-mailcow)
				286	err_c_cur=${err_count}
				287	/usr/lib/nagios/plugins/check_tcp -4 -H redis-mailcow -p 6379 -E -s "PING\n" -q "QUIT" -e "PONG" 2>> /tmp/redis-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
				288	[ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
				289	[ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
				290	progress "Redis" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
				291	if [[ $? == 10 ]]; then
				292	diff_c=0
				293	sleep 1
				294	else
				295	diff_c=0
				296	sleep $(( ( RANDOM % 60 ) + 20 ))
				297	fi
				298	done
				299	return 1
				300	}
				301
				302	mysql_checks() {
				303	err_count=0
				304	diff_c=0
				305	THRESHOLD=${MYSQL_THRESHOLD}
				306	# Reduce error count by 2 after restarting an unhealthy container
				307	trap "[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))" USR1
				308	while [ ${err_count} -lt ${THRESHOLD} ]; do
				309	touch /tmp/mysql-mailcow; echo "$(tail -50 /tmp/mysql-mailcow)" > /tmp/mysql-mailcow
				310	err_c_cur=${err_count}
				311	/usr/lib/nagios/plugins/check_mysql -s /var/run/mysqld/mysqld.sock -u ${DBUSER} -p ${DBPASS} -d ${DBNAME} 2>> /tmp/mysql-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
				312	/usr/lib/nagios/plugins/check_mysql_query -s /var/run/mysqld/mysqld.sock -u ${DBUSER} -p ${DBPASS} -d ${DBNAME} -q "SELECT COUNT(*) FROM information_schema.tables" 2>> /tmp/mysql-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
				313	[ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
				314	[ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
				315	progress "MySQL/MariaDB" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
				316	if [[ $? == 10 ]]; then
				317	diff_c=0
				318	sleep 1
				319	else
				320	diff_c=0
				321	sleep $(( ( RANDOM % 60 ) + 20 ))
				322	fi
				323	done
				324	return 1
				325	}
				326
				327	mysql_repl_checks() {
				328	err_count=0
				329	diff_c=0
				330	THRESHOLD=${MYSQL_REPLICATION_THRESHOLD}
				331	# Reduce error count by 2 after restarting an unhealthy container
				332	trap "[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))" USR1
				333	while [ ${err_count} -lt ${THRESHOLD} ]; do
				334	touch /tmp/mysql_repl_checks; echo "$(tail -50 /tmp/mysql_repl_checks)" > /tmp/mysql_repl_checks
				335	err_c_cur=${err_count}
				336	/usr/lib/nagios/plugins/check_mysql_slavestatus.sh -S /var/run/mysqld/mysqld.sock -u root -p ${DBROOT} 2>> /tmp/mysql_repl_checks 1>&2; err_count=$(( ${err_count} + $? ))
				337	[ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
				338	[ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
				339	progress "MySQL/MariaDB replication" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
				340	if [[ $? == 10 ]]; then
				341	diff_c=0
				342	sleep 60
				343	else
				344	diff_c=0
				345	sleep $(( ( RANDOM % 60 ) + 20 ))
				346	fi
				347	done
				348	return 1
				349	}
				350
				351	sogo_checks() {
				352	err_count=0
				353	diff_c=0
				354	THRESHOLD=${SOGO_THRESHOLD}
				355	# Reduce error count by 2 after restarting an unhealthy container
				356	trap "[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))" USR1
				357	while [ ${err_count} -lt ${THRESHOLD} ]; do
				358	touch /tmp/sogo-mailcow; echo "$(tail -50 /tmp/sogo-mailcow)" > /tmp/sogo-mailcow
				359	host_ip=$(get_container_ip sogo-mailcow)
				360	err_c_cur=${err_count}
				361	/usr/lib/nagios/plugins/check_http -4 -H ${host_ip} -u /SOGo.index/ -p 20000 -R "SOGo\.MainUI" 2>> /tmp/sogo-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
				362	[ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
				363	[ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
				364	progress "SOGo" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
				365	if [[ $? == 10 ]]; then
				366	diff_c=0
				367	sleep 1
				368	else
				369	diff_c=0
				370	sleep $(( ( RANDOM % 60 ) + 20 ))
				371	fi
				372	done
				373	return 1
				374	}
				375
				376	postfix_checks() {
				377	err_count=0
				378	diff_c=0
				379	THRESHOLD=${POSTFIX_THRESHOLD}
				380	# Reduce error count by 2 after restarting an unhealthy container
				381	trap "[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))" USR1
				382	while [ ${err_count} -lt ${THRESHOLD} ]; do
				383	touch /tmp/postfix-mailcow; echo "$(tail -50 /tmp/postfix-mailcow)" > /tmp/postfix-mailcow
				384	host_ip=$(get_container_ip postfix-mailcow)
				385	err_c_cur=${err_count}
				386	/usr/lib/nagios/plugins/check_smtp -4 -H ${host_ip} -p 589 -f "watchdog@invalid" -C "RCPT TO:watchdog@localhost" -C DATA -C . -R 250 2>> /tmp/postfix-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
				387	/usr/lib/nagios/plugins/check_smtp -4 -H ${host_ip} -p 589 -S 2>> /tmp/postfix-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
				388	[ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
				389	[ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
				390	progress "Postfix" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
				391	if [[ $? == 10 ]]; then
				392	diff_c=0
				393	sleep 1
				394	else
				395	diff_c=0
				396	sleep $(( ( RANDOM % 60 ) + 20 ))
				397	fi
				398	done
				399	return 1
				400	}
				401
				402	clamd_checks() {
				403	err_count=0
				404	diff_c=0
				405	THRESHOLD=${CLAMD_THRESHOLD}
				406	# Reduce error count by 2 after restarting an unhealthy container
				407	trap "[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))" USR1
				408	while [ ${err_count} -lt ${THRESHOLD} ]; do
				409	touch /tmp/clamd-mailcow; echo "$(tail -50 /tmp/clamd-mailcow)" > /tmp/clamd-mailcow
				410	host_ip=$(get_container_ip clamd-mailcow)
				411	err_c_cur=${err_count}
				412	/usr/lib/nagios/plugins/check_clamd -4 -H ${host_ip} 2>> /tmp/clamd-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
				413	[ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
				414	[ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
				415	progress "Clamd" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
				416	if [[ $? == 10 ]]; then
				417	diff_c=0
				418	sleep 1
				419	else
				420	diff_c=0
				421	sleep $(( ( RANDOM % 120 ) + 20 ))
				422	fi
				423	done
				424	return 1
				425	}
				426
				427	dovecot_checks() {
				428	err_count=0
				429	diff_c=0
				430	THRESHOLD=${DOVECOT_THRESHOLD}
				431	# Reduce error count by 2 after restarting an unhealthy container
				432	trap "[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))" USR1
				433	while [ ${err_count} -lt ${THRESHOLD} ]; do
				434	touch /tmp/dovecot-mailcow; echo "$(tail -50 /tmp/dovecot-mailcow)" > /tmp/dovecot-mailcow
				435	host_ip=$(get_container_ip dovecot-mailcow)
				436	err_c_cur=${err_count}
				437	/usr/lib/nagios/plugins/check_smtp -4 -H ${host_ip} -p 24 -f "watchdog@invalid" -C "RCPT TO:<watchdog@invalid>" -L -R "User doesn't exist" 2>> /tmp/dovecot-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
				438	/usr/lib/nagios/plugins/check_imap -4 -H ${host_ip} -p 993 -S -e "OK " 2>> /tmp/dovecot-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
				439	/usr/lib/nagios/plugins/check_imap -4 -H ${host_ip} -p 143 -e "OK " 2>> /tmp/dovecot-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
				440	/usr/lib/nagios/plugins/check_tcp -4 -H ${host_ip} -p 10001 -e "VERSION" 2>> /tmp/dovecot-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
				441	/usr/lib/nagios/plugins/check_tcp -4 -H ${host_ip} -p 4190 -e "Dovecot ready" 2>> /tmp/dovecot-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
				442	[ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
				443	[ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
				444	progress "Dovecot" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
				445	if [[ $? == 10 ]]; then
				446	diff_c=0
				447	sleep 1
				448	else
				449	diff_c=0
				450	sleep $(( ( RANDOM % 60 ) + 20 ))
				451	fi
				452	done
				453	return 1
				454	}
				455
				456	dovecot_repl_checks() {
				457	err_count=0
				458	diff_c=0
				459	THRESHOLD=${DOVECOT_REPL_THRESHOLD}
				460	D_REPL_STATUS=$(redis-cli -h redis -r GET DOVECOT_REPL_HEALTH)
				461	# Reduce error count by 2 after restarting an unhealthy container
				462	trap "[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))" USR1
				463	while [ ${err_count} -lt ${THRESHOLD} ]; do
				464	err_c_cur=${err_count}
				465	D_REPL_STATUS=$(redis-cli --raw -h redis GET DOVECOT_REPL_HEALTH)
				466	if [[ "${D_REPL_STATUS}" != "1" ]]; then
				467	err_count=$(( ${err_count} + 1 ))
				468	fi
				469	[ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
				470	[ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
				471	progress "Dovecot replication" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
				472	if [[ $? == 10 ]]; then
				473	diff_c=0
				474	sleep 60
				475	else
				476	diff_c=0
				477	sleep $(( ( RANDOM % 60 ) + 20 ))
				478	fi
				479	done
				480	return 1
				481	}
				482
				483	cert_checks() {
				484	err_count=0
				485	diff_c=0
				486	THRESHOLD=7
				487	# Reduce error count by 2 after restarting an unhealthy container
				488	trap "[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))" USR1
				489	while [ ${err_count} -lt ${THRESHOLD} ]; do
				490	touch /tmp/certcheck; echo "$(tail -50 /tmp/certcheck)" > /tmp/certcheck
				491	host_ip_postfix=$(get_container_ip postfix)
				492	host_ip_dovecot=$(get_container_ip dovecot)
				493	err_c_cur=${err_count}
				494	/usr/lib/nagios/plugins/check_smtp -H ${host_ip_postfix} -p 589 -4 -S -D 7 2>> /tmp/certcheck 1>&2; err_count=$(( ${err_count} + $? ))
				495	/usr/lib/nagios/plugins/check_imap -H ${host_ip_dovecot} -p 993 -4 -S -D 7 2>> /tmp/certcheck 1>&2; err_count=$(( ${err_count} + $? ))
				496	[ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
				497	[ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
				498	progress "Primary certificate expiry check" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
				499	# Always sleep 5 minutes, mail notifications are limited
				500	sleep 300
				501	done
				502	return 1
				503	}
				504
				505	phpfpm_checks() {
				506	err_count=0
				507	diff_c=0
				508	THRESHOLD=${PHPFPM_THRESHOLD}
				509	# Reduce error count by 2 after restarting an unhealthy container
				510	trap "[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))" USR1
				511	while [ ${err_count} -lt ${THRESHOLD} ]; do
				512	touch /tmp/php-fpm-mailcow; echo "$(tail -50 /tmp/php-fpm-mailcow)" > /tmp/php-fpm-mailcow
				513	host_ip=$(get_container_ip php-fpm-mailcow)
				514	err_c_cur=${err_count}
				515	/usr/lib/nagios/plugins/check_tcp -H ${host_ip} -p 9001 2>> /tmp/php-fpm-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
				516	/usr/lib/nagios/plugins/check_tcp -H ${host_ip} -p 9002 2>> /tmp/php-fpm-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
				517	[ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
				518	[ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
				519	progress "PHP-FPM" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
				520	if [[ $? == 10 ]]; then
				521	diff_c=0
				522	sleep 1
				523	else
				524	diff_c=0
				525	sleep $(( ( RANDOM % 60 ) + 20 ))
				526	fi
				527	done
				528	return 1
				529	}
				530
				531	ratelimit_checks() {
				532	err_count=0
				533	diff_c=0
				534	THRESHOLD=${RATELIMIT_THRESHOLD}
				535	RL_LOG_STATUS=$(redis-cli -h redis LRANGE RL_LOG 0 0 \| jq .qid)
				536	# Reduce error count by 2 after restarting an unhealthy container
				537	trap "[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))" USR1
				538	while [ ${err_count} -lt ${THRESHOLD} ]; do
				539	err_c_cur=${err_count}
				540	RL_LOG_STATUS_PREV=${RL_LOG_STATUS}
				541	RL_LOG_STATUS=$(redis-cli -h redis LRANGE RL_LOG 0 0 \| jq .qid)
				542	if [[ ${RL_LOG_STATUS_PREV} != ${RL_LOG_STATUS} ]]; then
				543	err_count=$(( ${err_count} + 1 ))
				544	echo 'Last 10 applied ratelimits (may overlap with previous reports).' > /tmp/ratelimit
				545	echo 'Full ratelimit buckets can be emptied by deleting the ratelimit hash from within mailcow UI (see /debug -> Protocols -> Ratelimit):' >> /tmp/ratelimit
				546	echo >> /tmp/ratelimit
				547	redis-cli --raw -h redis LRANGE RL_LOG 0 10 \| jq . >> /tmp/ratelimit
				548	fi
				549	[ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
				550	[ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
				551	progress "Ratelimit" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
				552	if [[ $? == 10 ]]; then
				553	diff_c=0
				554	sleep 1
				555	else
				556	diff_c=0
				557	sleep $(( ( RANDOM % 60 ) + 20 ))
				558	fi
				559	done
				560	return 1
				561	}
				562
				563	mailq_checks() {
				564	err_count=0
				565	diff_c=0
				566	THRESHOLD=${MAILQ_THRESHOLD}
				567	# Reduce error count by 2 after restarting an unhealthy container
				568	trap "[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))" USR1
				569	while [ ${err_count} -lt ${THRESHOLD} ]; do
				570	touch /tmp/mail_queue_status; echo "$(tail -50 /tmp/mail_queue_status)" > /tmp/mail_queue_status
				571	MAILQ_LOG_STATUS=$(find /var/spool/postfix/deferred -type f \| wc -l)
				572	echo "Mail queue contains ${MAILQ_LOG_STATUS} items (critical limit is ${MAILQ_CRIT}) at $(date)" >> /tmp/mail_queue_status
				573	err_c_cur=${err_count}
				574	if [ ${MAILQ_LOG_STATUS} -ge ${MAILQ_CRIT} ]; then
				575	err_count=$(( ${err_count} + 1 ))
				576	echo "Mail queue contains ${MAILQ_LOG_STATUS} items (critical limit is ${MAILQ_CRIT}) at $(date)" >> /tmp/mail_queue_status
				577	fi
				578	[ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
				579	[ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
				580	progress "Mail queue" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
				581	if [[ $? == 10 ]]; then
				582	diff_c=0
				583	sleep 60
				584	else
				585	diff_c=0
				586	sleep $(( ( RANDOM % 60 ) + 20 ))
				587	fi
				588	done
				589	return 1
				590	}
				591
				592	fail2ban_checks() {
				593	err_count=0
				594	diff_c=0
				595	THRESHOLD=${FAIL2BAN_THRESHOLD}
				596	F2B_LOG_STATUS=($(${REDIS_CMDLINE} --raw HKEYS F2B_ACTIVE_BANS))
				597	F2B_RES=
				598	# Reduce error count by 2 after restarting an unhealthy container
				599	trap "[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))" USR1
				600	while [ ${err_count} -lt ${THRESHOLD} ]; do
				601	err_c_cur=${err_count}
				602	F2B_LOG_STATUS_PREV=(${F2B_LOG_STATUS[@]})
				603	F2B_LOG_STATUS=($(${REDIS_CMDLINE} --raw HKEYS F2B_ACTIVE_BANS))
				604	array_diff F2B_RES F2B_LOG_STATUS F2B_LOG_STATUS_PREV
				605	if [[ ! -z "${F2B_RES}" ]]; then
				606	err_count=$(( ${err_count} + 1 ))
				607	echo -n "${F2B_RES[@]}" \| tr -cd "[a-fA-F0-9.:/] " \| timeout 3s ${REDIS_CMDLINE} -x SET F2B_RES > /dev/null
				608	if [ $? -ne 0 ]; then
				609	${REDIS_CMDLINE} -x DEL F2B_RES
				610	fi
				611	fi
				612	[ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
				613	[ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
				614	progress "Fail2ban" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
				615	if [[ $? == 10 ]]; then
				616	diff_c=0
				617	sleep 1
				618	else
				619	diff_c=0
				620	sleep $(( ( RANDOM % 60 ) + 20 ))
				621	fi
				622	done
				623	return 1
				624	}
				625
				626	acme_checks() {
				627	err_count=0
				628	diff_c=0
				629	THRESHOLD=${ACME_THRESHOLD}
				630	ACME_LOG_STATUS=$(redis-cli -h redis GET ACME_FAIL_TIME)
				631	if [[ -z "${ACME_LOG_STATUS}" ]]; then
				632	${REDIS_CMDLINE} SET ACME_FAIL_TIME 0
				633	ACME_LOG_STATUS=0
				634	fi
				635	# Reduce error count by 2 after restarting an unhealthy container
				636	trap "[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))" USR1
				637	while [ ${err_count} -lt ${THRESHOLD} ]; do
				638	err_c_cur=${err_count}
				639	ACME_LOG_STATUS_PREV=${ACME_LOG_STATUS}
				640	ACME_LC=0
				641	until [[ ! -z ${ACME_LOG_STATUS} ]] \|\| [ ${ACME_LC} -ge 3 ]; do
				642	ACME_LOG_STATUS=$(redis-cli -h redis GET ACME_FAIL_TIME 2> /dev/null)
				643	sleep 3
				644	ACME_LC=$((ACME_LC+1))
				645	done
				646	if [[ ${ACME_LOG_STATUS_PREV} != ${ACME_LOG_STATUS} ]]; then
				647	err_count=$(( ${err_count} + 1 ))
				648	fi
				649	[ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
				650	[ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
				651	progress "ACME" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
				652	if [[ $? == 10 ]]; then
				653	diff_c=0
				654	sleep 1
				655	else
				656	diff_c=0
				657	sleep $(( ( RANDOM % 60 ) + 20 ))
				658	fi
				659	done
				660	return 1
				661	}
				662
				663	ipv6nat_checks() {
				664	err_count=0
				665	diff_c=0
				666	THRESHOLD=${IPV6NAT_THRESHOLD}
				667	# Reduce error count by 2 after restarting an unhealthy container
				668	trap "[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))" USR1
				669	while [ ${err_count} -lt ${THRESHOLD} ]; do
				670	err_c_cur=${err_count}
				671	CONTAINERS=$(curl --silent --insecure https://dockerapi/containers/json)
				672	IPV6NAT_CONTAINER_ID=$(echo ${CONTAINERS} \| jq -r ".[] \| {name: .Config.Labels[\"com.docker.compose.service\"], project: .Config.Labels[\"com.docker.compose.project\"], id: .Id}" \| jq -rc "select( .name \| tostring \| contains(\"ipv6nat-mailcow\")) \| select( .project \| tostring \| contains(\"${COMPOSE_PROJECT_NAME,,}\")) \| .id")
				673	if [[ ! -z ${IPV6NAT_CONTAINER_ID} ]]; then
				674	LATEST_STARTED="$(echo ${CONTAINERS} \| jq -r ".[] \| {name: .Config.Labels[\"com.docker.compose.service\"], project: .Config.Labels[\"com.docker.compose.project\"], StartedAt: .State.StartedAt}" \| jq -rc "select( .project \| tostring \| contains(\"${COMPOSE_PROJECT_NAME,,}\")) \| select( .name \| tostring \| contains(\"ipv6nat-mailcow\") \| not)" \| jq -rc .StartedAt \| xargs -n1 date +%s -d \| sort \| tail -n1)"
				675	LATEST_IPV6NAT="$(echo ${CONTAINERS} \| jq -r ".[] \| {name: .Config.Labels[\"com.docker.compose.service\"], project: .Config.Labels[\"com.docker.compose.project\"], StartedAt: .State.StartedAt}" \| jq -rc "select( .project \| tostring \| contains(\"${COMPOSE_PROJECT_NAME,,}\")) \| select( .name \| tostring \| contains(\"ipv6nat-mailcow\"))" \| jq -rc .StartedAt \| xargs -n1 date +%s -d \| sort \| tail -n1)"
				676	DIFFERENCE_START_TIME=$(expr ${LATEST_IPV6NAT} - ${LATEST_STARTED} 2>/dev/null)
				677	if [[ "${DIFFERENCE_START_TIME}" -lt 30 ]]; then
				678	err_count=$(( ${err_count} + 1 ))
				679	fi
				680	fi
				681	[ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
				682	[ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
				683	progress "IPv6 NAT" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
				684	if [[ $? == 10 ]]; then
				685	diff_c=0
				686	sleep 30
				687	else
				688	diff_c=0
				689	sleep 300
				690	fi
				691	done
				692	return 1
				693	}
				694
				695
				696	rspamd_checks() {
				697	err_count=0
				698	diff_c=0
				699	THRESHOLD=${RSPAMD_THRESHOLD}
				700	# Reduce error count by 2 after restarting an unhealthy container
				701	trap "[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))" USR1
				702	while [ ${err_count} -lt ${THRESHOLD} ]; do
				703	touch /tmp/rspamd-mailcow; echo "$(tail -50 /tmp/rspamd-mailcow)" > /tmp/rspamd-mailcow
				704	host_ip=$(get_container_ip rspamd-mailcow)
				705	err_c_cur=${err_count}
				706	SCORE=$(echo 'To: null@localhost
				707	From: watchdog@localhost
				708
				709	Empty
				710	' \| usr/bin/curl --max-time 10 -s --data-binary @- --unix-socket /var/lib/rspamd/rspamd.sock http://rspamd/scan \| jq -rc .default.required_score)
				711	if [[ ${SCORE} != "9999" ]]; then
				712	echo "Rspamd settings check failed, score returned: ${SCORE}" 2>> /tmp/rspamd-mailcow 1>&2
				713	err_count=$(( ${err_count} + 1))
				714	else
				715	echo "Rspamd settings check succeeded, score returned: ${SCORE}" 2>> /tmp/rspamd-mailcow 1>&2
				716	fi
				717	# A dirty hack until a PING PONG event is implemented to worker proxy
				718	# We expect an empty response, not a timeout
				719	if [ "$(curl -s --max-time 10 ${host_ip}:9900 2> /dev/null ; echo $?)" == "28" ]; then
				720	echo "Milter check failed" 2>> /tmp/rspamd-mailcow 1>&2; err_count=$(( ${err_count} + 1 ));
				721	else
				722	echo "Milter check succeeded" 2>> /tmp/rspamd-mailcow 1>&2
				723	fi
				724	[ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
				725	[ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
				726	progress "Rspamd" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
				727	if [[ $? == 10 ]]; then
				728	diff_c=0
				729	sleep 1
				730	else
				731	diff_c=0
				732	sleep $(( ( RANDOM % 60 ) + 20 ))
				733	fi
				734	done
				735	return 1
				736	}
				737
				738	olefy_checks() {
				739	err_count=0
				740	diff_c=0
				741	THRESHOLD=${OLEFY_THRESHOLD}
				742	# Reduce error count by 2 after restarting an unhealthy container
				743	trap "[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))" USR1
				744	while [ ${err_count} -lt ${THRESHOLD} ]; do
				745	touch /tmp/olefy-mailcow; echo "$(tail -50 /tmp/olefy-mailcow)" > /tmp/olefy-mailcow
				746	host_ip=$(get_container_ip olefy-mailcow)
				747	err_c_cur=${err_count}
				748	/usr/lib/nagios/plugins/check_tcp -4 -H ${host_ip} -p 10055 -s "PING\n" 2>> /tmp/olefy-mailcow 1>&2; err_count=$(( ${err_count} + $? ))
				749	[ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
				750	[ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
				751	progress "Olefy" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
				752	if [[ $? == 10 ]]; then
				753	diff_c=0
				754	sleep 1
				755	else
				756	diff_c=0
				757	sleep $(( ( RANDOM % 60 ) + 20 ))
				758	fi
				759	done
				760	return 1
				761	}
				762
				763	# Notify about start
				764	if [[ ! -z ${WATCHDOG_NOTIFY_EMAIL} ]]; then
				765	mail_error "watchdog-mailcow" "Watchdog started monitoring mailcow."
				766	fi
				767
				768	# Create watchdog agents
				769
				770	(
				771	while true; do
				772	if ! nginx_checks; then
				773	log_msg "Nginx hit error limit"
				774	echo nginx-mailcow > /tmp/com_pipe
				775	fi
				776	done
				777	) &
				778	PID=$!
				779	echo "Spawned nginx_checks with PID ${PID}"
				780	BACKGROUND_TASKS+=(${PID})
				781
				782	if [[ ${WATCHDOG_EXTERNAL_CHECKS} =~ ^([yY][eE][sS]\|[yY])+$ ]]; then
				783	(
				784	while true; do
				785	if ! external_checks; then
				786	log_msg "External checks hit error limit"
				787	echo external_checks > /tmp/com_pipe
				788	fi
				789	done
				790	) &
				791	PID=$!
				792	echo "Spawned external_checks with PID ${PID}"
				793	BACKGROUND_TASKS+=(${PID})
				794	fi
				795
				796	if [[ ${WATCHDOG_MYSQL_REPLICATION_CHECKS} =~ ^([yY][eE][sS]\|[yY])+$ ]]; then
				797	(
				798	while true; do
				799	if ! mysql_repl_checks; then
				800	log_msg "MySQL replication check hit error limit"
				801	echo mysql_repl_checks > /tmp/com_pipe
				802	fi
				803	done
				804	) &
				805	PID=$!
				806	echo "Spawned mysql_repl_checks with PID ${PID}"
				807	BACKGROUND_TASKS+=(${PID})
				808	fi
				809
				810	(
				811	while true; do
				812	if ! mysql_checks; then
				813	log_msg "MySQL hit error limit"
				814	echo mysql-mailcow > /tmp/com_pipe
				815	fi
				816	done
				817	) &
				818	PID=$!
				819	echo "Spawned mysql_checks with PID ${PID}"
				820	BACKGROUND_TASKS+=(${PID})
				821
				822	(
				823	while true; do
				824	if ! redis_checks; then
				825	log_msg "Local Redis hit error limit"
				826	echo redis-mailcow > /tmp/com_pipe
				827	fi
				828	done
				829	) &
				830	PID=$!
				831	echo "Spawned redis_checks with PID ${PID}"
				832	BACKGROUND_TASKS+=(${PID})
				833
				834	(
				835	while true; do
				836	if ! phpfpm_checks; then
				837	log_msg "PHP-FPM hit error limit"
				838	echo php-fpm-mailcow > /tmp/com_pipe
				839	fi
				840	done
				841	) &
				842	PID=$!
				843	echo "Spawned phpfpm_checks with PID ${PID}"
				844	BACKGROUND_TASKS+=(${PID})
				845
				846	if [[ "${SKIP_SOGO}" =~ ^([nN][oO]\|[nN])+$ ]]; then
				847	(
				848	while true; do
				849	if ! sogo_checks; then
				850	log_msg "SOGo hit error limit"
				851	echo sogo-mailcow > /tmp/com_pipe
				852	fi
				853	done
				854	) &
				855	PID=$!
				856	echo "Spawned sogo_checks with PID ${PID}"
				857	BACKGROUND_TASKS+=(${PID})
				858	fi
				859
				860	if [ ${CHECK_UNBOUND} -eq 1 ]; then
				861	(
				862	while true; do
				863	if ! unbound_checks; then
				864	log_msg "Unbound hit error limit"
				865	echo unbound-mailcow > /tmp/com_pipe
				866	fi
				867	done
				868	) &
				869	PID=$!
				870	echo "Spawned unbound_checks with PID ${PID}"
				871	BACKGROUND_TASKS+=(${PID})
				872	fi
				873
				874	if [[ "${SKIP_CLAMD}" =~ ^([nN][oO]\|[nN])+$ ]]; then
				875	(
				876	while true; do
				877	if ! clamd_checks; then
				878	log_msg "Clamd hit error limit"
				879	echo clamd-mailcow > /tmp/com_pipe
				880	fi
				881	done
				882	) &
				883	PID=$!
				884	echo "Spawned clamd_checks with PID ${PID}"
				885	BACKGROUND_TASKS+=(${PID})
				886	fi
				887
				888	(
				889	while true; do
				890	if ! postfix_checks; then
				891	log_msg "Postfix hit error limit"
				892	echo postfix-mailcow > /tmp/com_pipe
				893	fi
				894	done
				895	) &
				896	PID=$!
				897	echo "Spawned postfix_checks with PID ${PID}"
				898	BACKGROUND_TASKS+=(${PID})
				899
				900	(
				901	while true; do
				902	if ! mailq_checks; then
				903	log_msg "Mail queue hit error limit"
				904	echo mail_queue_status > /tmp/com_pipe
				905	fi
				906	done
				907	) &
				908	PID=$!
				909	echo "Spawned mailq_checks with PID ${PID}"
				910	BACKGROUND_TASKS+=(${PID})
				911
				912	(
				913	while true; do
				914	if ! dovecot_checks; then
				915	log_msg "Dovecot hit error limit"
				916	echo dovecot-mailcow > /tmp/com_pipe
				917	fi
				918	done
				919	) &
				920	PID=$!
				921	echo "Spawned dovecot_checks with PID ${PID}"
				922	BACKGROUND_TASKS+=(${PID})
				923
				924	(
				925	while true; do
				926	if ! dovecot_repl_checks; then
				927	log_msg "Dovecot hit error limit"
				928	echo dovecot_repl_checks > /tmp/com_pipe
				929	fi
				930	done
				931	) &
				932	PID=$!
				933	echo "Spawned dovecot_repl_checks with PID ${PID}"
				934	BACKGROUND_TASKS+=(${PID})
				935
				936	(
				937	while true; do
				938	if ! rspamd_checks; then
				939	log_msg "Rspamd hit error limit"
				940	echo rspamd-mailcow > /tmp/com_pipe
				941	fi
				942	done
				943	) &
				944	PID=$!
				945	echo "Spawned rspamd_checks with PID ${PID}"
				946	BACKGROUND_TASKS+=(${PID})
				947
				948	(
				949	while true; do
				950	if ! ratelimit_checks; then
				951	log_msg "Ratelimit hit error limit"
				952	echo ratelimit > /tmp/com_pipe
				953	fi
				954	done
				955	) &
				956	PID=$!
				957	echo "Spawned ratelimit_checks with PID ${PID}"
				958	BACKGROUND_TASKS+=(${PID})
				959
				960	(
				961	while true; do
				962	if ! fail2ban_checks; then
				963	log_msg "Fail2ban hit error limit"
				964	echo fail2ban > /tmp/com_pipe
				965	fi
				966	done
				967	) &
				968	PID=$!
				969	echo "Spawned fail2ban_checks with PID ${PID}"
				970	BACKGROUND_TASKS+=(${PID})
				971
				972	(
				973	while true; do
				974	if ! cert_checks; then
				975	log_msg "Cert check hit error limit"
				976	echo certcheck > /tmp/com_pipe
				977	fi
				978	done
				979	) &
				980	PID=$!
				981	echo "Spawned cert_checks with PID ${PID}"
				982	BACKGROUND_TASKS+=(${PID})
				983
				984	(
				985	while true; do
				986	if ! olefy_checks; then
				987	log_msg "Olefy hit error limit"
				988	echo olefy-mailcow > /tmp/com_pipe
				989	fi
				990	done
				991	) &
				992	PID=$!
				993	echo "Spawned olefy_checks with PID ${PID}"
				994	BACKGROUND_TASKS+=(${PID})
				995
				996	(
				997	while true; do
				998	if ! acme_checks; then
				999	log_msg "ACME client hit error limit"
				1000	echo acme-mailcow > /tmp/com_pipe
				1001	fi
				1002	done
				1003	) &
				1004	PID=$!
				1005	echo "Spawned acme_checks with PID ${PID}"
				1006	BACKGROUND_TASKS+=(${PID})
				1007
				1008	(
				1009	while true; do
				1010	if ! ipv6nat_checks; then
				1011	log_msg "IPv6 NAT warning: ipv6nat-mailcow container was not started at least 30s after siblings (not an error)"
				1012	echo ipv6nat-mailcow > /tmp/com_pipe
				1013	fi
				1014	done
				1015	) &
				1016	PID=$!
				1017	echo "Spawned ipv6nat_checks with PID ${PID}"
				1018	BACKGROUND_TASKS+=(${PID})
				1019
				1020	# Monitor watchdog agents, stop script when agents fails and wait for respawn by Docker (restart:always:n)
				1021	(
				1022	while true; do
				1023	for bg_task in ${BACKGROUND_TASKS[*]}; do
				1024	if ! kill -0 ${bg_task} 1>&2; then
				1025	log_msg "Worker ${bg_task} died, stopping watchdog and waiting for respawn..."
				1026	kill -TERM 1
				1027	fi
				1028	sleep 10
				1029	done
				1030	done
				1031	) &
				1032
				1033	# Monitor dockerapi
				1034	(
				1035	while true; do
				1036	while nc -z dockerapi 443; do
				1037	sleep 3
				1038	done
				1039	log_msg "Cannot find dockerapi-mailcow, waiting to recover..."
				1040	kill -STOP ${BACKGROUND_TASKS[*]}
				1041	until nc -z dockerapi 443; do
				1042	sleep 3
				1043	done
				1044	kill -CONT ${BACKGROUND_TASKS[*]}
				1045	kill -USR1 ${BACKGROUND_TASKS[*]}
				1046	done
				1047	) &
				1048
				1049	# Actions when threshold limit is reached
				1050	while true; do
				1051	CONTAINER_ID=
				1052	HAS_INITDB=
				1053	read com_pipe_answer </tmp/com_pipe
				1054	if [ -s "/tmp/${com_pipe_answer}" ]; then
				1055	cat "/tmp/${com_pipe_answer}"
				1056	fi
				1057	if [[ ${com_pipe_answer} == "ratelimit" ]]; then
				1058	log_msg "At least one ratelimit was applied"
				1059	[[ ! -z ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}"
				1060	elif [[ ${com_pipe_answer} == "mail_queue_status" ]]; then
				1061	log_msg "Mail queue status is critical"
				1062	[[ ! -z ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}"
				1063	elif [[ ${com_pipe_answer} == "external_checks" ]]; then
				1064	log_msg "Your mailcow is an open relay!"
				1065	# Define $2 to override message text, else print service was restarted at ...
				1066	[[ ! -z ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}" "Please stop mailcow now and check your network configuration!"
				1067	elif [[ ${com_pipe_answer} == "mysql_repl_checks" ]]; then
				1068	log_msg "MySQL replication is not working properly"
				1069	# Define $2 to override message text, else print service was restarted at ...
				1070	# Once mail per 10 minutes
				1071	[[ ! -z ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}" "Please check the SQL replication status" 600
				1072	elif [[ ${com_pipe_answer} == "dovecot_repl_checks" ]]; then
				1073	log_msg "Dovecot replication is not working properly"
				1074	# Define $2 to override message text, else print service was restarted at ...
				1075	# Once mail per 10 minutes
				1076	[[ ! -z ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}" "Please check the Dovecot replicator status" 600
				1077	elif [[ ${com_pipe_answer} == "certcheck" ]]; then
				1078	log_msg "Certificates are about to expire"
				1079	# Define $2 to override message text, else print service was restarted at ...
				1080	# Only mail once a day
				1081	[[ ! -z ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}" "Please renew your certificate" 86400
				1082	elif [[ ${com_pipe_answer} == "acme-mailcow" ]]; then
				1083	log_msg "acme-mailcow did not complete successfully"
				1084	# Define $2 to override message text, else print service was restarted at ...
				1085	[[ ! -z ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}" "Please check acme-mailcow for further information."
				1086	elif [[ ${com_pipe_answer} == "fail2ban" ]]; then
				1087	F2B_RES=($(timeout 4s ${REDIS_CMDLINE} --raw GET F2B_RES 2> /dev/null))
				1088	if [[ ! -z "${F2B_RES}" ]]; then
				1089	${REDIS_CMDLINE} DEL F2B_RES > /dev/null
				1090	host=
				1091	for host in "${F2B_RES[@]}"; do
				1092	log_msg "Banned ${host}"
				1093	rm /tmp/fail2ban 2> /dev/null
				1094	timeout 2s whois "${host}" > /tmp/fail2ban
				1095	[[ ! -z ${WATCHDOG_NOTIFY_EMAIL} ]] && [[ ${WATCHDOG_NOTIFY_BAN} =~ ^([yY][eE][sS]\|[yY])+$ ]] && mail_error "${com_pipe_answer}" "IP ban: ${host}"
				1096	done
				1097	fi
				1098	elif [[ ${com_pipe_answer} =~ .+-mailcow ]]; then
				1099	kill -STOP ${BACKGROUND_TASKS[*]}
				1100	sleep 10
				1101	CONTAINER_ID=$(curl --silent --insecure https://dockerapi/containers/json \| jq -r ".[] \| {name: .Config.Labels[\"com.docker.compose.service\"], project: .Config.Labels[\"com.docker.compose.project\"], id: .Id}" \| jq -rc "select( .name \| tostring \| contains(\"${com_pipe_answer}\")) \| select( .project \| tostring \| contains(\"${COMPOSE_PROJECT_NAME,,}\")) \| .id")
				1102	if [[ ! -z ${CONTAINER_ID} ]]; then
				1103	if [[ "${com_pipe_answer}" == "php-fpm-mailcow" ]]; then
				1104	HAS_INITDB=$(curl --silent --insecure -XPOST https://dockerapi/containers/${CONTAINER_ID}/top \| jq '.msg.Processes[] \| contains(["php -c /usr/local/etc/php -f /web/inc/init_db.inc.php"])' \| grep true)
				1105	fi
				1106	S_RUNNING=$(($(date +%s) - $(curl --silent --insecure https://dockerapi/containers/${CONTAINER_ID}/json \| jq .State.StartedAt \| xargs -n1 date +%s -d)))
				1107	if [ ${S_RUNNING} -lt 360 ]; then
				1108	log_msg "Container is running for less than 360 seconds, skipping action..."
				1109	elif [[ ! -z ${HAS_INITDB} ]]; then
				1110	log_msg "Database is being initialized by php-fpm-mailcow, not restarting but delaying checks for a minute..."
				1111	sleep 60
				1112	else
				1113	log_msg "Sending restart command to ${CONTAINER_ID}..."
				1114	curl --silent --insecure -XPOST https://dockerapi/containers/${CONTAINER_ID}/restart
				1115	if [[ ${com_pipe_answer} != "ipv6nat-mailcow" ]]; then
				1116	[[ ! -z ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}"
				1117	fi
				1118	log_msg "Wait for restarted container to settle and continue watching..."
				1119	sleep 35
				1120	fi
				1121	fi
				1122	kill -CONT ${BACKGROUND_TASKS[*]}
				1123	sleep 1
				1124	kill -USR1 ${BACKGROUND_TASKS[*]}
				1125	fi
				1126	done