linux:watchdoginlinux
no way to compare when less than two revisions
Differences
This shows you the differences between two versions of the page.
Last revision | |||
— | linux:watchdoginlinux [2016/09/17 17:00] – created rlunaro | ||
---|---|---|---|
Line 1: | Line 1: | ||
+ | ====== Creating a watchdog in Linux ====== | ||
+ | |||
+ | ===== Intro ===== | ||
+ | |||
+ | I've created a simple process that monitors whether certain processes are running and, if they aren't, restarts them. | ||
+ | |||
+ | My original purpose for this was to monitor two Minecraft servers I am running on my server, so there are some specifics that only work for this case. However, they are easily changed to fit other purposes. | ||
+ | |||
+ | ===== The code ===== | ||
+ | |||
+ | |||
+ | < | ||
+ | #!/bin/bash | ||
+ | # | ||
+ | # watchdog - monitors a process | ||
+ | # | ||
+ | # | ||
+ | # | ||
+ | |||
+ | pidfileList[0]="/ | ||
+ | pidfileList[1]="/ | ||
+ | |||
+ | startcmd[0]="/ | ||
+ | startcmd[1]="/ | ||
+ | |||
+ | logfile=/ | ||
+ | |||
+ | tries=0 | ||
+ | |||
+ | umask 022 | ||
+ | |||
+ | PATH=/ | ||
+ | |||
+ | |||
+ | # first we delete the log file | ||
+ | rm " | ||
+ | |||
+ | # to write a message to the log | ||
+ | function log() | ||
+ | { | ||
+ | now=$(date +" | ||
+ | echo "$now $1" >> $logfile | ||
+ | } # log | ||
+ | |||
+ | |||
+ | # do a lazy start: the first time it will wait | ||
+ | # 20 minutes to let the sistem to stabilize | ||
+ | log " | ||
+ | sleep 20m | ||
+ | log " | ||
+ | |||
+ | while [ true ] ; do | ||
+ | |||
+ | |||
+ | for(( i = 0; i < 50; i++ )) ; do | ||
+ | |||
+ | its_ok_to_launch=0 | ||
+ | if [ -n " | ||
+ | |||
+ | # get the pidfile value | ||
+ | pidfile=" | ||
+ | log " | ||
+ | # check the existence of this pidfile | ||
+ | if [ -e " | ||
+ | # get the pid value | ||
+ | pidvalue=$(cat $pidfile) | ||
+ | log "The file exists and contains the value $pidvalue" | ||
+ | # check existence of this pidvalue | ||
+ | line=$(ps aux | grep $pidvalue | grep minecraft | grep -v grep) | ||
+ | if [ -z " | ||
+ | # the process doesn' | ||
+ | # or the pid number doesn' | ||
+ | # to a minecraft server | ||
+ | log "There is no process with id: $pidvalue" | ||
+ | its_ok_to_launch=1 | ||
+ | fi # -z " | ||
+ | else | ||
+ | # if the pidfile doesn' | ||
+ | # it is correct to launch it | ||
+ | log "The file doesn' | ||
+ | its_ok_to_launch=1 | ||
+ | fi # -e $pidfile | ||
+ | |||
+ | fi # -n pidfileList[i] | ||
+ | |||
+ | if [ $its_ok_to_launch -eq 1 ] ; then | ||
+ | |||
+ | tries=$((tries+1)) | ||
+ | if [ $tries -le 6 ]; then | ||
+ | # attempt to start the process | ||
+ | # if the maximum reach attempts | ||
+ | # haven' | ||
+ | log " | ||
+ | of 6)" | ||
+ | ${startcmd[i]} | ||
+ | else | ||
+ | if [ $tries -eq 6 ] ; then | ||
+ | log "This is time number $tries, giving up" | ||
+ | fi # tries -eq 6 | ||
+ | fi # tries -le 6 | ||
+ | |||
+ | fi # its_ok_to_launc -eq 1 | ||
+ | |||
+ | done # for | ||
+ | |||
+ | log " | ||
+ | sleep 10m | ||
+ | done # true | ||
+ | |||
+ | </ | ||
+ | |||
+ | |||
+ | ==== This is what the program does ==== | ||
+ | |||
+ | First, it waits 20 minutes, to cover the case where this program is configured to run at server boot and the monitored processes haven't been started yet: | ||
+ | |||
+ | < | ||
+ | # do a lazy start: the first time it will wait | ||
+ | # 20 minutes to let the sistem to stabilize | ||
+ | log " | ||
+ | sleep 20m | ||
+ | log " | ||
+ | </ | ||
+ | |||
+ | Next, it runs forever, waking up every ten minutes: | ||
+ | |||
+ | < | ||
+ | |||
+ | while [ true ] ; do | ||
+ | |||
+ | .... | ||
+ | |||
+ | log " | ||
+ | sleep 10m | ||
+ | done # true | ||
+ | </ | ||
+ | |||
+ | |||
+ | Next, a for loop is run to traverse the array pidfileList: | ||
+ | |||
+ | < | ||
+ | |||
+ | for(( i = 0; i < 50; i++ )) ; do | ||
+ | |||
+ | .... | ||
+ | |||
+ | done # for | ||
+ | |||
+ | </ | ||
+ | |||
+ | For every element in the array that is not empty.... | ||
+ | |||
+ | < | ||
+ | |||
+ | its_ok_to_launch=0 | ||
+ | if [ -n " | ||
+ | |||
+ | .... | ||
+ | |||
+ | fi # its_ok_to_launc -eq 1 | ||
+ | |||
+ | </ | ||
+ | |||
+ | |||
+ | Now comes the real work: read the content of the pidfile and store it in the pidvalue variable: | ||
+ | |||
+ | < | ||
+ | |||
+ | # get the pidfile value | ||
+ | pidfile=" | ||
+ | log " | ||
+ | # check the existence of this pidfile | ||
+ | if [ -e " | ||
+ | # get the pid value | ||
+ | pidvalue=$(cat $pidfile) | ||
+ | log "The file exists and contains the value $pidvalue" | ||
+ | .... | ||
+ | |||
+ | </ | ||
+ | |||
+ | |||
+ | Verify that this pidvalue corresponds to a real, existing process: | ||
+ | |||
+ | < | ||
+ | # check existence of this pidvalue | ||
+ | line=$(ps aux | grep $pidvalue | grep minecraft | grep -v grep) | ||
+ | if [ -z " | ||
+ | # the process doesn' | ||
+ | # or the pid number doesn' | ||
+ | # to a minecraft server | ||
+ | log "There is no process with id: $pidvalue" | ||
+ | its_ok_to_launch=1 | ||
+ | fi # -z " | ||
+ | else | ||
+ | # if the pidfile doesn' | ||
+ | # it is correct to launch it | ||
+ | log "The file doesn' | ||
+ | its_ok_to_launch=1 | ||
+ | fi # -e $pidfile | ||
+ | |||
+ | fi # -n pidfileList[i] | ||
+ | | ||
+ | </ | ||
+ | |||
+ | And if the process doesn' | ||
+ | |||
+ | < | ||
+ | |||
+ | if [ $its_ok_to_launch -eq 1 ] ; then | ||
+ | |||
+ | tries=$((tries+1)) | ||
+ | if [ $tries -le 6 ]; then | ||
+ | # attempt to start the process | ||
+ | # if the maximum reach attempts | ||
+ | # haven' | ||
+ | log " | ||
+ | of 6)" | ||
+ | ${startcmd[i]} | ||
+ | else | ||
+ | if [ $tries -eq 6 ] ; then | ||
+ | log "This is time number $tries, giving up" | ||
+ | fi # tries -eq 6 | ||
+ | fi # tries -le 6 | ||
+ | | ||
+ | </ | ||
+ | |||
+ | |||
+ | ===== Configuration ===== | ||
+ | |||
+ | You have to configure the pid files to be monitored (here is my example with the Minecraft servers): | ||
+ | |||
+ | < | ||
+ | |||
+ | pidfileList[0]="/ | ||
+ | pidfileList[1]="/ | ||
+ | |||
+ | </ | ||
+ | |||
+ | The commands that are run in the event of a failure: | ||
+ | |||
+ | < | ||
+ | startcmd[0]="/ | ||
+ | startcmd[1]="/ | ||
+ | </ | ||
+ | |||
+ | The location of the logfile: | ||
+ | |||
+ | < | ||
+ | logfile=/ | ||
+ | </ | ||
+ | |||
+ | And, in case you want to use it for other purposes, how each process is identified: | ||
+ | |||
+ | < | ||
+ | line=$(ps aux | grep $pidvalue | grep minecraft | grep -v grep) | ||
+ | </ | ||
+ | |||
+ | I've need to add this '' | ||
+ | |||
+ | |||
+ | |||
+ | |||
+ | ===== Installation ===== | ||
+ | |||
+ | I've used the file ''/ | ||
+ | |||
+ | |||
linux/watchdoginlinux.txt · Last modified: 2022/12/02 22:02 by 127.0.0.1