Problem mit Mon-Konfiguration

**anquijix** · 18.07.05, 14:54

Hallo zusammen

Ich habe meine vorhandene Mon-Konfiguration um ein paar zusätzliche Monitorings erweitert. Seitdem funktionieren nur noch die Monitore, die per Ping eine IP-Adresse überprüfen.
Die anderen Monitore, die Apache, Tomcat und MySQL überprüfen, werden gar nicht mehr angesprochen, und ich sehe nicht, warum. Meiner Meinung nach ist die Konfiguration richtig. Ein Start von Mon meldet keinerlei Fehler. Ich habe diverse Abhängigkeiten eingebaut (z.B. depend cluster:ping). Z.B. findet ein httpd-Test nur statt, wenn vorher der Ping-Test (192.168.0.168) erfolgreich verlief. Der Tomcat-Service wird wiederum nur überprüft, wenn der Test des Apache-Services erfolgreich verlief, und so ähnliche Sachen. Kann sein, dass ich mich mit den Abhängigkeiten ein bisschen vertan habe, nur sehe ich überhaupt nicht, wo das Problem liegt.
Falls mehr Infos benötigt werden, bitte melden.

Hier die Konfiguration.. ist ein bisschen gross..

Code:

	    
hostgroup cluster 192.168.0.168

## Watch for the virtual cluster IP and the services running behind it.
##
## IMPORTANT: mon treats a blank line as the end of a record. All services
## of this watch must therefore follow each other WITHOUT blank lines in
## between; use comment lines as visual separators instead. A blank line
## after the first service detaches every following service from the watch,
## so their monitors are never scheduled.
##
## NOTE(review): depending on the mon version, "depend" may only suppress
## alerts while the monitor itself keeps running -- check "depend" and
## "dep_behavior" in mon(8) if the monitor run itself has to be skipped.
watch cluster
	##
	## --- pingvip: is the virtual IP reachable via ICMP? ---
	service pingvip
		description Monitor for checking ICMP-Requests on 192.168.0.168
		## Run every 10 seconds
		interval 10s
		## Test with fping
		monitor fping.monitor -r 4 -t 6000
		## Active all week (Sunday-Saturday)
		period RESTART: wd {Sun-Sat}
			## First failure: try to restart the interface
			alert net_restart.alert -S "MASTER: No link to virtual IP!! Trying to restart the interface..." admin@host
			alertafter 1
			upalert mail.alert -S "MASTER: Virtual IP is back up again!" admin@host
			upalertafter 30s
		period RESTART_FAILED: wd {Sun-Sat}
			## After 2 failures: call the alert script that shuts down
			## heartbeat and sends a mail to the admin
			alert bring-ha-down.alert -S "MASTER: Restart VIP failed!! Bringing down heartbeat for takeover..." admin@host
			alertafter 2
	##
	## --- httpd: is the Apache process running? ---
	service httpd
		description Is the service up and running?
		## Depends on the ping test of the virtual IP
		depend cluster:pingvip
		## Monitor script that checks whether a process is still running
		## (the trailing ";;" keeps mon from appending the hostgroup members)
		monitor watch_process.monitor /usr/sbin/httpd ;;
		interval 15s
		## If the test fails, mon tries to restart httpd
		period RESTART: wd {Sun-Sat}
			alert mail.alert -S "MASTER: Service httpd NOT running!! Trying to restart..." admin@host
			alert httpd_restart.alert
			## After 1 failed test mon tries to restart httpd
			alertafter 1
			upalert mail.alert -S "MASTER: Webserver is back up!" admin@host
			## Upalert threshold: 30s (see upalertafter in mon(8))
			upalertafter 30s
		## If the restart did not help, heartbeat goes down for takeover
		period RESTART_FAILED: wd {Sun-Sat}
			alert bring-ha-down.alert -S "MASTER: Restart httpd failed! Bringing down heartbeat for takeover..." admin@host
			## This is after 3 failed tests
			alertafter 3
	##
	## --- httpd2: are the websites accessible? ---
	service httpd2
		description Are the websites accessible?
		## Depends on the httpd process test
		depend cluster:httpd
		## Call phttp.monitor to check a website
		monitor phttp.monitor 192.168.0.168
		interval 15s
		## If the test fails, mon tries to restart httpd
		period RESTART: wd {Sun-Sat}
			alert mail.alert -S "MASTER: HTTP running, but not accessible!!.. Trying to restart..." admin@host
			## NOTE(review): service httpd uses httpd_restart.alert --
			## confirm that http_restart.alert exists and is intended here
			alert http_restart.alert
			alertafter 1
			upalert mail.alert -S "MASTER: Webserver is back up!" admin@host
			upalertafter 30s
		period RESTART_FAILED: wd {Sun-Sat}
			alert bring-ha-down.alert -S "MASTER: Restart httpd failed! Bringing down heartbeat for takeover..." admin@host
			alertafter 3
	##
	## --- tomcat: is the Tomcat JVM process running? ---
	service tomcat
		description Monitor for checking tomcat functionality
		## Depends on the httpd process test
		depend cluster:httpd
		monitor watch_process.monitor /usr/lib/jvm/java/bin/java ;;
		interval 15s
		period RESTART: wd {Sun-Sat}
			alert mail.alert -S "MASTER: Service tomcat NOT running!! Trying to restart..." admin@host
			alert tomcat_restart.alert
			alertafter 1
			upalert mail.alert -S "MASTER: Tomcat service is back up!" admin@host
			upalertafter 30s
		period RESTART_FAILED: wd {Sun-Sat}
			alert bring-ha-down.alert -S "MASTER: Restart tomcat failed! Bringing down heartbeat for takeover..." admin@host
			alertafter 3
	##
	## --- mysql: is the mysqld process running? ---
	service mysql
		description Monitor for checking mysql functionality
		## Depends on the ping test of the virtual IP
		depend cluster:pingvip
		interval 15s
		monitor watch_process.monitor /usr/libexec/mysqld ;;
		period RESTART: wd {Sun-Sat}
			alert mail.alert -S "MASTER: Service MySQL is NOT running!! Trying to restart..." admin@host
			alert mysql_restart.alert
			alertafter 1
			upalert mail.alert -S "MASTER: Service MySQL is back up!" admin@host
			upalertafter 30s
		period RESTART_FAILED: wd {Sun-Sat}
			alert bring-ha-down.alert -S "MASTER Restart MySQL failed! Bringing down heartbeat for takeover..." admin@host
			alertafter 3
	##
	## --- mysql2: does the database accept connections? ---
	service mysql2
		description Check, if db is accessible
		## Depends on the mysqld process test
		depend cluster:mysql
		interval 15s
		monitor mysql.monitor --mode mysql --username=user --password=pwd --database=test --port=3306
		period RESTART: wd {Sun-Sat}
			alert mail.alert -S "MASTER: Service MySQL NOT running!! Trying to restart..." admin@host
			alert mysql_restart.alert
			alertafter 1
			upalert mail.alert -S "MASTER: Service MySQL is back up!" admin@host
			upalertafter 1m
		period RESTART_FAILED: wd {Sun-Sat}
			alert bring-ha-down.alert -S "MASTER: Restart MySQL failed! Bringing down heartbeat for takeover.." admin@host
			alertafter 3



## Direct link on eth1: ping the master side (10.0.0.1)
hostgroup directmaster 10.0.0.1

watch directmaster
	service ping
		description Monitor for checking ICMP-Requests on local eth1, if no link is avail, send an alert to the admin
		## fping check, scheduled every 10 seconds
		monitor fping.monitor -r 4 -t 6000
		interval 10s
		## Stage 1 (whole week): first failure triggers the interface restart script
		period RESTART: wd {Sun-Sat}
			alertafter 1
			alert net_restart.alert -S "MASTER: Lost direct link to Slave!! No more mirroring possible!!" admin@host
			upalertafter 30s
			upalert mail.alert -S "MASTER: Direct link to Slave is back up!" admin@host
		## Stage 2 (whole week): escalate after 3 failures
		period RESTART_FAILED: wd {Sun-Sat}
			alertafter 3
			alert bring-ha-down.alert -S "MASTER: Restart eth1 failed!! Still no mirroring possible! Solve the problem manually" admin@host

## Direct link on eth1: ping the slave side (10.0.0.2)
hostgroup directslave 10.0.0.2

watch directslave
	service ping
		description Monitor for checking ICMP-Requests on eth1, if no link is avail, send an alert to the admin
		## Tied to the result of the local eth1 check (directmaster:ping)
		depend directmaster:ping
		## fping check, scheduled every 10 seconds
		monitor fping.monitor -r 4 -t 6000
		interval 10s
		## Stage 1 (whole week): first failure triggers the slave-side eth1 restart script
		period RESTART: wd {Sun-Sat}
			alertafter 1
			alert net_eth1slave_restart.alert -S "MASTER: Lost direct link to Slave!! No more mirroring possible!!" admin@host
			upalertafter 30s
			upalert mail.alert -S "MASTER: Direct link to Slave is back up!" admin@host
		## Stage 2 (whole week): escalate after 3 failures
		period RESTART_FAILED: wd {Sun-Sat}
			alertafter 3
			alert bring-ha-down.alert -S "MASTER: Restart eth1 failed!! Still no mirroring possible! Solve the problem manually" admin@host


## Link to the slave over eth0 (192.108.234.167)
hostgroup slave 192.108.234.167

watch slave
	service ping
		description Monitor for checking ICMP-Requests on eth0, if no link is avail, send an alert to the admin
		## fping check, scheduled every 10 seconds
		monitor fping.monitor -r 4 -t 6000
		interval 10s
		## Stage 1 (whole week): first failure triggers the slave-side eth0 restart script
		period RESTART: wd {Sun-Sat}
			alertafter 1
			alert net_eth0slave_restart.alert -S "MASTER: Lost link Slave over eth0!! Try to restart Slave's interface..." admin@host
			upalertafter 30s
			upalert mail.alert -S "MASTER: Link to Slave over eth0 is back up!" admin@host
		## Stage 2 (whole week): after 3 failures only a mail is sent
		period RESTART_FAILED: wd {Sun-Sat}
			alertafter 3
			alert mail.alert -S "MASTER: Link to Slave over eth0 couldnt be fixed! Try it manually" admin@host

hostgroup firewall 192.108.234.50

## Reachability of the firewall, tied to the virtual IP check
watch firewall
	service ping
		description Check, if ping to firewall successes
		## Tied to the result of the virtual IP ping test (cluster:pingvip)
		depend cluster:pingvip
		## fping check, scheduled every 10 seconds
		monitor fping.monitor -r 4 -t 6000
		interval 10s
		## Single unlabeled period covering the whole week
		period wd {Sun-Sat}
			## Alerts are rate-limited with alertevery 10m
			alertevery 10m
			alert bring-ha-down.alert -S "Ping to Firewall 192.108.234.50 failed!! Is there some problem in the intranet?" admin@host