00001 /* Copyright (C) 2004 MySQL AB 00002 00003 This program is free software; you can redistribute it and/or modify 00004 it under the terms of the GNU General Public License as published by 00005 the Free Software Foundation; either version 2 of the License, or 00006 (at your option) any later version. 00007 00008 This program is distributed in the hope that it will be useful, 00009 but WITHOUT ANY WARRANTY; without even the implied warranty of 00010 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00011 GNU General Public License for more details. 00012 00013 You should have received a copy of the GNU General Public License 00014 along with this program; if not, write to the Free Software 00015 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ 00016 00017 00018 #if defined(__GNUC__) && defined(USE_PRAGMA_IMPLEMENTATION) 00019 #pragma implementation 00020 #endif 00021 00022 #include "guardian.h" 00023 00024 #include <string.h> 00025 #include <sys/types.h> 00026 #include <signal.h> 00027 00028 #include "instance.h" 00029 #include "instance_map.h" 00030 #include "log.h" 00031 #include "mysql_manager_error.h" 00032 00033 00034 pthread_handler_t guardian(void *arg) 00035 { 00036 Guardian_thread *guardian_thread= (Guardian_thread *) arg; 00037 guardian_thread->run(); 00038 return 0; 00039 } 00040 00041 00042 const char * 00043 Guardian_thread::get_instance_state_name(enum_instance_state state) 00044 { 00045 switch (state) { 00046 case NOT_STARTED: 00047 return "offline"; 00048 00049 case STARTING: 00050 return "starting"; 00051 00052 case STARTED: 00053 return "online"; 00054 00055 case JUST_CRASHED: 00056 return "failed"; 00057 00058 case CRASHED: 00059 return "crashed"; 00060 00061 case CRASHED_AND_ABANDONED: 00062 return "abandoned"; 00063 00064 case STOPPING: 00065 return "stopping"; 00066 } 00067 00068 return NULL; /* just to ignore compiler warning. */ 00069 } 00070 00071 00072 Guardian_thread::Guardian_thread(Thread_registry &thread_registry_arg, 00073 Instance_map *instance_map_arg, 00074 uint monitoring_interval_arg) : 00075 Guardian_thread_args(thread_registry_arg, instance_map_arg, 00076 monitoring_interval_arg), 00077 thread_info(pthread_self()), guarded_instances(0) 00078 { 00079 pthread_mutex_init(&LOCK_guardian, 0); 00080 pthread_cond_init(&COND_guardian, 0); 00081 shutdown_requested= FALSE; 00082 stopped= FALSE; 00083 init_alloc_root(&alloc, MEM_ROOT_BLOCK_SIZE, 0); 00084 } 00085 00086 00087 Guardian_thread::~Guardian_thread() 00088 { 00089 /* delay guardian destruction to the moment when no one needs it */ 00090 pthread_mutex_lock(&LOCK_guardian); 00091 free_root(&alloc, MYF(0)); 00092 pthread_mutex_unlock(&LOCK_guardian); 00093 pthread_mutex_destroy(&LOCK_guardian); 00094 pthread_cond_destroy(&COND_guardian); 00095 } 00096 00097 00098 void Guardian_thread::request_shutdown(bool stop_instances_arg) 00099 { 00100 pthread_mutex_lock(&LOCK_guardian); 00101 /* stop instances or just clean up Guardian repository */ 00102 stop_instances(stop_instances_arg); 00103 shutdown_requested= TRUE; 00104 pthread_mutex_unlock(&LOCK_guardian); 00105 } 00106 00107 00108 void Guardian_thread::process_instance(Instance *instance, 00109 GUARD_NODE *current_node, 00110 LIST **guarded_instances, 00111 LIST *node) 00112 { 00113 uint waitchild= (uint) Instance::DEFAULT_SHUTDOWN_DELAY; 00114 /* The amount of times, Guardian attempts to restart an instance */ 00115 int restart_retry= 100; 00116 time_t current_time= time(NULL); 00117 00118 if (current_node->state == STOPPING) 00119 { 00120 /* this brach is executed during shutdown */ 00121 if (instance->options.shutdown_delay) 00122 { 00123 /* 00124 NOTE: it is important to check shutdown_delay here, but use 00125 shutdown_delay_val. The idea is that if the option is unset, 00126 shutdown_delay will be NULL, but shutdown_delay_val will not be reset. 00127 */ 00128 waitchild= instance->options.shutdown_delay_val; 00129 } 00130 00131 /* this returns TRUE if and only if an instance was stopped for sure */ 00132 if (instance->is_crashed()) 00133 *guarded_instances= list_delete(*guarded_instances, node); 00134 else if ( (uint) (current_time - current_node->last_checked) > waitchild) 00135 { 00136 instance->kill_instance(SIGKILL); 00137 /* 00138 Later we do node= node->next. This is ok, as we are only removing 00139 the node from the list. The pointer to the next one is still valid. 00140 */ 00141 *guarded_instances= list_delete(*guarded_instances, node); 00142 } 00143 00144 return; 00145 } 00146 00147 if (instance->is_running()) 00148 { 00149 /* clear status fields */ 00150 current_node->restart_counter= 0; 00151 current_node->crash_moment= 0; 00152 current_node->state= STARTED; 00153 } 00154 else 00155 { 00156 switch (current_node->state) { 00157 case NOT_STARTED: 00158 instance->start(); 00159 current_node->last_checked= current_time; 00160 log_info("guardian: starting instance %s", 00161 instance->options.instance_name); 00162 current_node->state= STARTING; 00163 break; 00164 case STARTED: /* fallthrough */ 00165 case STARTING: /* let the instance start or crash */ 00166 if (instance->is_crashed()) 00167 { 00168 current_node->crash_moment= current_time; 00169 current_node->last_checked= current_time; 00170 current_node->state= JUST_CRASHED; 00171 /* fallthrough -- restart an instance immediately */ 00172 } 00173 else 00174 break; 00175 case JUST_CRASHED: 00176 if (current_time - current_node->crash_moment <= 2) 00177 { 00178 if (instance->is_crashed()) 00179 { 00180 instance->start(); 00181 log_info("guardian: starting instance %s", 00182 instance->options.instance_name); 00183 } 00184 } 00185 else 00186 current_node->state= CRASHED; 00187 break; 00188 case CRASHED: /* just regular restarts */ 00189 if (current_time - current_node->last_checked > 00190 monitoring_interval) 00191 { 00192 if ((current_node->restart_counter < restart_retry)) 00193 { 00194 if (instance->is_crashed()) 00195 { 00196 instance->start(); 00197 current_node->last_checked= current_time; 00198 current_node->restart_counter++; 00199 log_info("guardian: restarting instance %s", 00200 instance->options.instance_name); 00201 } 00202 } 00203 else 00204 { 00205 log_info("guardian: cannot start instance %s. Abandoning attempts " 00206 "to (re)start it", instance->options.instance_name); 00207 current_node->state= CRASHED_AND_ABANDONED; 00208 } 00209 } 00210 break; 00211 case CRASHED_AND_ABANDONED: 00212 break; /* do nothing */ 00213 default: 00214 DBUG_ASSERT(0); 00215 } 00216 } 00217 } 00218 00219 00220 /* 00221 Run guardian thread 00222 00223 SYNOPSYS 00224 run() 00225 00226 DESCRIPTION 00227 00228 Check for all guarded instances and restart them if needed. If everything 00229 is fine go and sleep for some time. 00230 */ 00231 00232 void Guardian_thread::run() 00233 { 00234 Instance *instance; 00235 LIST *node; 00236 struct timespec timeout; 00237 00238 thread_registry.register_thread(&thread_info); 00239 00240 my_thread_init(); 00241 pthread_mutex_lock(&LOCK_guardian); 00242 00243 /* loop, until all instances were shut down at the end */ 00244 while (!(shutdown_requested && (guarded_instances == NULL))) 00245 { 00246 node= guarded_instances; 00247 00248 while (node != NULL) 00249 { 00250 GUARD_NODE *current_node= (GUARD_NODE *) node->data; 00251 instance= ((GUARD_NODE *) node->data)->instance; 00252 process_instance(instance, current_node, &guarded_instances, node); 00253 00254 node= node->next; 00255 } 00256 timeout.tv_sec= time(NULL) + monitoring_interval; 00257 timeout.tv_nsec= 0; 00258 00259 /* check the loop predicate before sleeping */ 00260 if (!(shutdown_requested && (!(guarded_instances)))) 00261 thread_registry.cond_timedwait(&thread_info, &COND_guardian, 00262 &LOCK_guardian, &timeout); 00263 } 00264 00265 stopped= TRUE; 00266 pthread_mutex_unlock(&LOCK_guardian); 00267 /* now, when the Guardian is stopped we can stop the IM */ 00268 thread_registry.unregister_thread(&thread_info); 00269 thread_registry.request_shutdown(); 00270 my_thread_end(); 00271 } 00272 00273 00274 int Guardian_thread::is_stopped() 00275 { 00276 int var; 00277 pthread_mutex_lock(&LOCK_guardian); 00278 var= stopped; 00279 pthread_mutex_unlock(&LOCK_guardian); 00280 return var; 00281 } 00282 00283 00284 /* 00285 Initialize the list of guarded instances: loop through the Instance_map and 00286 add all of the instances, which don't have 'nonguarded' option specified. 00287 00288 SYNOPSYS 00289 Guardian_thread::init() 00290 00291 NOTE: The operation should be invoked with the following locks acquired: 00292 - Guardian_thread; 00293 - Instance_map; 00294 00295 RETURN 00296 0 - ok 00297 1 - error occured 00298 */ 00299 00300 int Guardian_thread::init() 00301 { 00302 Instance *instance; 00303 Instance_map::Iterator iterator(instance_map); 00304 00305 /* clear the list of guarded instances */ 00306 free_root(&alloc, MYF(0)); 00307 init_alloc_root(&alloc, MEM_ROOT_BLOCK_SIZE, 0); 00308 guarded_instances= NULL; 00309 00310 while ((instance= iterator.next())) 00311 { 00312 if (instance->options.nonguarded) 00313 continue; 00314 00315 if (guard(instance, TRUE)) /* do not lock guardian */ 00316 return 1; 00317 } 00318 00319 return 0; 00320 } 00321 00322 00323 /* 00324 Add instance to the Guardian list 00325 00326 SYNOPSYS 00327 guard() 00328 instance the instance to be guarded 00329 nolock whether we prefer do not lock Guardian here, 00330 but use external locking instead 00331 00332 DESCRIPTION 00333 00334 The instance is added to the guarded instances list. Usually guard() is 00335 called after we start an instance. 00336 00337 RETURN 00338 0 - ok 00339 1 - error occured 00340 */ 00341 00342 int Guardian_thread::guard(Instance *instance, bool nolock) 00343 { 00344 LIST *node; 00345 GUARD_NODE *content; 00346 00347 node= (LIST *) alloc_root(&alloc, sizeof(LIST)); 00348 content= (GUARD_NODE *) alloc_root(&alloc, sizeof(GUARD_NODE)); 00349 00350 if ((!(node)) || (!(content))) 00351 return 1; 00352 /* we store the pointers to instances from the instance_map's MEM_ROOT */ 00353 content->instance= instance; 00354 content->restart_counter= 0; 00355 content->crash_moment= 0; 00356 content->state= NOT_STARTED; 00357 node->data= (void*) content; 00358 00359 if (nolock) 00360 guarded_instances= list_add(guarded_instances, node); 00361 else 00362 { 00363 pthread_mutex_lock(&LOCK_guardian); 00364 guarded_instances= list_add(guarded_instances, node); 00365 pthread_mutex_unlock(&LOCK_guardian); 00366 } 00367 00368 return 0; 00369 } 00370 00371 00372 /* 00373 TODO: perhaps it would make sense to create a pool of the LIST nodeents 00374 and give them upon request. Now we are loosing a bit of memory when 00375 guarded instance was stopped and then restarted (since we cannot free just 00376 a piece of the MEM_ROOT). 00377 */ 00378 00379 int Guardian_thread::stop_guard(Instance *instance) 00380 { 00381 LIST *node; 00382 00383 pthread_mutex_lock(&LOCK_guardian); 00384 00385 node= find_instance_node(instance); 00386 00387 if (node != NULL) 00388 guarded_instances= list_delete(guarded_instances, node); 00389 00390 pthread_mutex_unlock(&LOCK_guardian); 00391 00392 /* if there is nothing to delete it is also fine */ 00393 return 0; 00394 } 00395 00396 /* 00397 An internal method which is called at shutdown to unregister instances and 00398 attempt to stop them if requested. 00399 00400 SYNOPSYS 00401 stop_instances() 00402 stop_instances_arg whether we should stop instances at shutdown 00403 00404 DESCRIPTION 00405 Loops through the guarded_instances list and prepares them for shutdown. 00406 If stop_instances was requested, we need to issue a stop command and change 00407 the state accordingly. Otherwise we simply delete an entry. 00408 00409 NOTE 00410 Guardian object should be locked by the calling function. 00411 00412 RETURN 00413 0 - ok 00414 1 - error occured 00415 */ 00416 00417 int Guardian_thread::stop_instances(bool stop_instances_arg) 00418 { 00419 LIST *node; 00420 node= guarded_instances; 00421 while (node != NULL) 00422 { 00423 if (!stop_instances_arg) 00424 { 00425 /* just forget about an instance */ 00426 guarded_instances= list_delete(guarded_instances, node); 00427 /* 00428 This should still work fine, as we have only removed the 00429 node from the list. The pointer to the next one is still valid 00430 */ 00431 node= node->next; 00432 } 00433 else 00434 { 00435 GUARD_NODE *current_node= (GUARD_NODE *) node->data; 00436 /* 00437 If instance is running or was running (and now probably hanging), 00438 request stop. 00439 */ 00440 if (current_node->instance->is_running() || 00441 (current_node->state == STARTED)) 00442 { 00443 current_node->state= STOPPING; 00444 current_node->last_checked= time(NULL); 00445 } 00446 else 00447 /* otherwise remove it from the list */ 00448 guarded_instances= list_delete(guarded_instances, node); 00449 /* But try to kill it anyway. Just in case */ 00450 current_node->instance->kill_instance(SIGTERM); 00451 node= node->next; 00452 } 00453 } 00454 return 0; 00455 } 00456 00457 00458 void Guardian_thread::lock() 00459 { 00460 pthread_mutex_lock(&LOCK_guardian); 00461 } 00462 00463 00464 void Guardian_thread::unlock() 00465 { 00466 pthread_mutex_unlock(&LOCK_guardian); 00467 } 00468 00469 00470 LIST *Guardian_thread::find_instance_node(Instance *instance) 00471 { 00472 LIST *node= guarded_instances; 00473 00474 while (node != NULL) 00475 { 00476 /* 00477 We compare only pointers, as we always use pointers from the 00478 instance_map's MEM_ROOT. 00479 */ 00480 if (((GUARD_NODE *) node->data)->instance == instance) 00481 return node; 00482 00483 node= node->next; 00484 } 00485 00486 return NULL; 00487 } 00488 00489 00490 bool Guardian_thread::is_active(Instance *instance) 00491 { 00492 bool guarded; 00493 00494 lock(); 00495 00496 guarded= find_instance_node(instance) != NULL; 00497 00498 /* is_running() can take a long time, so let's unlock mutex first. */ 00499 unlock(); 00500 00501 if (guarded) 00502 return true; 00503 00504 return instance->is_running(); 00505 }
1.4.7

