/******************************************************
The database server main program

NOTE: SQL Server 7 uses something which the documentation
calls user mode scheduled threads (UMS threads). One such
thread is usually allocated per processor. Win32
documentation does not know any UMS threads, which suggests
that the concept is internal to SQL Server 7. It may mean that
SQL Server 7 does all the scheduling of threads itself, even
in i/o waits. We should maybe modify InnoDB to use the same
technique, because thread switches within NT may be too slow.

SQL Server 7 also mentions fibers, which are cooperatively
scheduled threads. They can boost performance by 5 %,
according to Delaney and Soukup's book.

Windows 2000 will have something called thread pooling
(see msdn website), which we could possibly use.

Another possibility could be to use some very fast user space
thread library. This might confuse NT though.
00022 00023 (c) 1995 Innobase Oy 00024 00025 Created 10/8/1995 Heikki Tuuri 00026 *******************************************************/ 00027 /* Dummy comment */ 00028 #include "srv0srv.h" 00029 00030 #include "ut0mem.h" 00031 #include "os0proc.h" 00032 #include "mem0mem.h" 00033 #include "mem0pool.h" 00034 #include "sync0sync.h" 00035 #include "thr0loc.h" 00036 #include "que0que.h" 00037 #include "srv0que.h" 00038 #include "log0recv.h" 00039 #include "pars0pars.h" 00040 #include "usr0sess.h" 00041 #include "lock0lock.h" 00042 #include "trx0purge.h" 00043 #include "ibuf0ibuf.h" 00044 #include "buf0flu.h" 00045 #include "btr0sea.h" 00046 #include "dict0load.h" 00047 #include "dict0boot.h" 00048 #include "srv0start.h" 00049 #include "row0mysql.h" 00050 00051 /* This is set to TRUE if the MySQL user has set it in MySQL; currently 00052 affects only FOREIGN KEY definition parsing */ 00053 ibool srv_lower_case_table_names = FALSE; 00054 00055 /* The following counter is incremented whenever there is some user activity 00056 in the server */ 00057 ulint srv_activity_count = 0; 00058 00059 /* The following is the maximum allowed duration of a lock wait. */ 00060 ulint srv_fatal_semaphore_wait_threshold = 600; 00061 00062 /* How much data manipulation language (DML) statements need to be delayed, 00063 in microseconds, in order to reduce the lagging of the purge thread. 
*/ 00064 ulint srv_dml_needed_delay = 0; 00065 00066 ibool srv_lock_timeout_and_monitor_active = FALSE; 00067 ibool srv_error_monitor_active = FALSE; 00068 00069 const char* srv_main_thread_op_info = ""; 00070 00071 /* Prefix used by MySQL to indicate pre-5.1 table name encoding */ 00072 const char srv_mysql50_table_name_prefix[9] = "#mysql50#"; 00073 00074 /* Server parameters which are read from the initfile */ 00075 00076 /* The following three are dir paths which are catenated before file 00077 names, where the file name itself may also contain a path */ 00078 00079 char* srv_data_home = NULL; 00080 #ifdef UNIV_LOG_ARCHIVE 00081 char* srv_arch_dir = NULL; 00082 #endif /* UNIV_LOG_ARCHIVE */ 00083 00084 ibool srv_file_per_table = FALSE; /* store to its own file each table 00085 created by an user; data dictionary 00086 tables are in the system tablespace 00087 0 */ 00088 ibool srv_locks_unsafe_for_binlog = FALSE; /* Place locks to records only 00089 i.e. do not use next-key locking 00090 except on duplicate key checking and 00091 foreign key checking */ 00092 ulint srv_n_data_files = 0; 00093 char** srv_data_file_names = NULL; 00094 ulint* srv_data_file_sizes = NULL; /* size in database pages */ 00095 00096 ibool srv_auto_extend_last_data_file = FALSE; /* if TRUE, then we 00097 auto-extend the last data 00098 file */ 00099 ulint srv_last_file_size_max = 0; /* if != 0, this tells 00100 the max size auto-extending 00101 may increase the last data 00102 file size */ 00103 ulong srv_auto_extend_increment = 8; /* If the last data file is 00104 auto-extended, we add this 00105 many pages to it at a time */ 00106 ulint* srv_data_file_is_raw_partition = NULL; 00107 00108 /* If the following is TRUE we do not allow inserts etc. 
This protects 00109 the user from forgetting the 'newraw' keyword to my.cnf */ 00110 00111 ibool srv_created_new_raw = FALSE; 00112 00113 char** srv_log_group_home_dirs = NULL; 00114 00115 ulint srv_n_log_groups = ULINT_MAX; 00116 ulint srv_n_log_files = ULINT_MAX; 00117 ulint srv_log_file_size = ULINT_MAX; /* size in database pages */ 00118 ulint srv_log_buffer_size = ULINT_MAX; /* size in database pages */ 00119 ulong srv_flush_log_at_trx_commit = 1; 00120 00121 byte srv_latin1_ordering[256] /* The sort order table of the latin1 00122 character set. The following table is 00123 the MySQL order as of Feb 10th, 2002 */ 00124 = { 00125 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 00126 , 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F 00127 , 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 00128 , 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F 00129 , 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27 00130 , 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F 00131 , 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37 00132 , 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F 00133 , 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47 00134 , 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F 00135 , 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57 00136 , 0x58, 0x59, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F 00137 , 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47 00138 , 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F 00139 , 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57 00140 , 0x58, 0x59, 0x5A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F 00141 , 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87 00142 , 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F 00143 , 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97 00144 , 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F 00145 , 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7 00146 , 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF 00147 , 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7 00148 , 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF 00149 , 0x41, 0x41, 0x41, 0x41, 0x5C, 0x5B, 
0x5C, 0x43 00150 , 0x45, 0x45, 0x45, 0x45, 0x49, 0x49, 0x49, 0x49 00151 , 0x44, 0x4E, 0x4F, 0x4F, 0x4F, 0x4F, 0x5D, 0xD7 00152 , 0xD8, 0x55, 0x55, 0x55, 0x59, 0x59, 0xDE, 0xDF 00153 , 0x41, 0x41, 0x41, 0x41, 0x5C, 0x5B, 0x5C, 0x43 00154 , 0x45, 0x45, 0x45, 0x45, 0x49, 0x49, 0x49, 0x49 00155 , 0x44, 0x4E, 0x4F, 0x4F, 0x4F, 0x4F, 0x5D, 0xF7 00156 , 0xD8, 0x55, 0x55, 0x55, 0x59, 0x59, 0xDE, 0xFF 00157 }; 00158 00159 ulint srv_pool_size = ULINT_MAX; /* size in pages; MySQL inits 00160 this to size in kilobytes but 00161 we normalize this to pages in 00162 srv_boot() */ 00163 ulint srv_awe_window_size = 0; /* size in pages; MySQL inits 00164 this to bytes, but we 00165 normalize it to pages in 00166 srv_boot() */ 00167 ulint srv_mem_pool_size = ULINT_MAX; /* size in bytes */ 00168 ulint srv_lock_table_size = ULINT_MAX; 00169 00170 ulint srv_n_file_io_threads = ULINT_MAX; 00171 00172 #ifdef UNIV_LOG_ARCHIVE 00173 ibool srv_log_archive_on = FALSE; 00174 ibool srv_archive_recovery = 0; 00175 dulint srv_archive_recovery_limit_lsn; 00176 #endif /* UNIV_LOG_ARCHIVE */ 00177 00178 ulint srv_lock_wait_timeout = 1024 * 1024 * 1024; 00179 00180 char* srv_file_flush_method_str = NULL; 00181 ulint srv_unix_file_flush_method = SRV_UNIX_FDATASYNC; 00182 ulint srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED; 00183 00184 ulint srv_max_n_open_files = 300; 00185 00186 /* The InnoDB main thread tries to keep the ratio of modified pages 00187 in the buffer pool to all database pages in the buffer pool smaller than 00188 the following number. But it is not guaranteed that the value stays below 00189 that during a time of heavy update/insert activity. 
*/ 00190 00191 ulong srv_max_buf_pool_modified_pct = 90; 00192 00193 /* variable counts amount of data read in total (in bytes) */ 00194 ulint srv_data_read = 0; 00195 00196 /* here we count the amount of data written in total (in bytes) */ 00197 ulint srv_data_written = 0; 00198 00199 /* the number of the log write requests done */ 00200 ulint srv_log_write_requests = 0; 00201 00202 /* the number of physical writes to the log performed */ 00203 ulint srv_log_writes = 0; 00204 00205 /* amount of data written to the log files in bytes */ 00206 ulint srv_os_log_written = 0; 00207 00208 /* amount of writes being done to the log files */ 00209 ulint srv_os_log_pending_writes = 0; 00210 00211 /* we increase this counter, when there we don't have enough space in the 00212 log buffer and have to flush it */ 00213 ulint srv_log_waits = 0; 00214 00215 /* this variable counts the amount of times, when the doublewrite buffer 00216 was flushed */ 00217 ulint srv_dblwr_writes = 0; 00218 00219 /* here we store the number of pages that have been flushed to the 00220 doublewrite buffer */ 00221 ulint srv_dblwr_pages_written = 0; 00222 00223 /* in this variable we store the number of write requests issued */ 00224 ulint srv_buf_pool_write_requests = 0; 00225 00226 /* here we store the number of times when we had to wait for a free page 00227 in the buffer pool. It happens when the buffer pool is full and we need 00228 to make a flush, in order to be able to read or create a page. 
*/ 00229 ulint srv_buf_pool_wait_free = 0; 00230 00231 /* variable to count the number of pages that were written from buffer 00232 pool to the disk */ 00233 ulint srv_buf_pool_flushed = 0; 00234 00235 /* variable to count the number of buffer pool reads that led to the 00236 reading of a disk page */ 00237 ulint srv_buf_pool_reads = 0; 00238 00239 /* variable to count the number of sequential read-aheads */ 00240 ulint srv_read_ahead_seq = 0; 00241 00242 /* variable to count the number of random read-aheads */ 00243 ulint srv_read_ahead_rnd = 0; 00244 00245 /* structure to pass status variables to MySQL */ 00246 export_struc export_vars; 00247 00248 /* If the following is != 0 we do not allow inserts etc. This protects 00249 the user from forgetting the innodb_force_recovery keyword to my.cnf */ 00250 00251 ulint srv_force_recovery = 0; 00252 /*-----------------------*/ 00253 /* We are prepared for a situation that we have this many threads waiting for 00254 a semaphore inside InnoDB. innobase_start_or_create_for_mysql() sets the 00255 value. */ 00256 00257 ulint srv_max_n_threads = 0; 00258 00259 /* The following controls how many threads we let inside InnoDB concurrently: 00260 threads waiting for locks are not counted into the number because otherwise 00261 we could get a deadlock. MySQL creates a thread for each user session, and 00262 semaphore contention and convoy problems can occur withput this restriction. 00263 Value 10 should be good if there are less than 4 processors + 4 disks in the 00264 computer. Bigger computers need bigger values. Value 0 will disable the 00265 concurrency check. 
*/ 00266 00267 ulong srv_thread_concurrency = 0; 00268 ulong srv_commit_concurrency = 0; 00269 00270 os_fast_mutex_t srv_conc_mutex; /* this mutex protects srv_conc data 00271 structures */ 00272 lint srv_conc_n_threads = 0; /* number of OS threads currently 00273 inside InnoDB; it is not an error 00274 if this drops temporarily below zero 00275 because we do not demand that every 00276 thread increments this, but a thread 00277 waiting for a lock decrements this 00278 temporarily */ 00279 ulint srv_conc_n_waiting_threads = 0; /* number of OS threads waiting in the 00280 FIFO for a permission to enter InnoDB 00281 */ 00282 00283 typedef struct srv_conc_slot_struct srv_conc_slot_t; 00284 struct srv_conc_slot_struct{ 00285 os_event_t event; /* event to wait */ 00286 ibool reserved; /* TRUE if slot 00287 reserved */ 00288 ibool wait_ended; /* TRUE when another 00289 thread has already set 00290 the event and the 00291 thread in this slot is 00292 free to proceed; but 00293 reserved may still be 00294 TRUE at that point */ 00295 UT_LIST_NODE_T(srv_conc_slot_t) srv_conc_queue; /* queue node */ 00296 }; 00297 00298 UT_LIST_BASE_NODE_T(srv_conc_slot_t) srv_conc_queue; /* queue of threads 00299 waiting to get in */ 00300 srv_conc_slot_t* srv_conc_slots; /* array of wait 00301 slots */ 00302 00303 /* Number of times a thread is allowed to enter InnoDB within the same 00304 SQL query after it has once got the ticket at srv_conc_enter_innodb */ 00305 #define SRV_FREE_TICKETS_TO_ENTER srv_n_free_tickets_to_enter 00306 #define SRV_THREAD_SLEEP_DELAY srv_thread_sleep_delay 00307 /*-----------------------*/ 00308 /* If the following is set to 1 then we do not run purge and insert buffer 00309 merge to completion before shutdown. If it is set to 2, do not even flush the 00310 buffer pool to data files at the shutdown: we effectively 'crash' 00311 InnoDB (but lose no committed transactions). 
*/ 00312 ulint srv_fast_shutdown = 0; 00313 00314 /* Generate a innodb_status.<pid> file */ 00315 ibool srv_innodb_status = FALSE; 00316 00317 ibool srv_use_doublewrite_buf = TRUE; 00318 ibool srv_use_checksums = TRUE; 00319 00320 ibool srv_set_thread_priorities = TRUE; 00321 int srv_query_thread_priority = 0; 00322 00323 /* TRUE if the Address Windowing Extensions of Windows are used; then we must 00324 disable adaptive hash indexes */ 00325 ibool srv_use_awe = FALSE; 00326 ibool srv_use_adaptive_hash_indexes = TRUE; 00327 00328 /*-------------------------------------------*/ 00329 ulong srv_n_spin_wait_rounds = 20; 00330 ulong srv_n_free_tickets_to_enter = 500; 00331 ulong srv_thread_sleep_delay = 10000; 00332 ulint srv_spin_wait_delay = 5; 00333 ibool srv_priority_boost = TRUE; 00334 00335 ibool srv_print_thread_releases = FALSE; 00336 ibool srv_print_lock_waits = FALSE; 00337 ibool srv_print_buf_io = FALSE; 00338 ibool srv_print_log_io = FALSE; 00339 ibool srv_print_latch_waits = FALSE; 00340 00341 ulint srv_n_rows_inserted = 0; 00342 ulint srv_n_rows_updated = 0; 00343 ulint srv_n_rows_deleted = 0; 00344 ulint srv_n_rows_read = 0; 00345 #ifndef UNIV_HOTBACKUP 00346 static ulint srv_n_rows_inserted_old = 0; 00347 static ulint srv_n_rows_updated_old = 0; 00348 static ulint srv_n_rows_deleted_old = 0; 00349 static ulint srv_n_rows_read_old = 0; 00350 #endif /* !UNIV_HOTBACKUP */ 00351 00352 ulint srv_n_lock_wait_count = 0; 00353 ulint srv_n_lock_wait_current_count = 0; 00354 ib_longlong srv_n_lock_wait_time = 0; 00355 ulint srv_n_lock_max_wait_time = 0; 00356 00357 00358 /* 00359 Set the following to 0 if you want InnoDB to write messages on 00360 stderr on startup/shutdown 00361 */ 00362 ibool srv_print_verbose_log = TRUE; 00363 ibool srv_print_innodb_monitor = FALSE; 00364 ibool srv_print_innodb_lock_monitor = FALSE; 00365 ibool srv_print_innodb_tablespace_monitor = FALSE; 00366 ibool srv_print_innodb_table_monitor = FALSE; 00367 00368 /* Array of English 
strings describing the current state of an 00369 i/o handler thread */ 00370 00371 const char* srv_io_thread_op_info[SRV_MAX_N_IO_THREADS]; 00372 const char* srv_io_thread_function[SRV_MAX_N_IO_THREADS]; 00373 00374 time_t srv_last_monitor_time; 00375 00376 mutex_t srv_innodb_monitor_mutex; 00377 00378 /* Mutex for locking srv_monitor_file */ 00379 mutex_t srv_monitor_file_mutex; 00380 /* Temporary file for innodb monitor output */ 00381 FILE* srv_monitor_file; 00382 /* Mutex for locking srv_dict_tmpfile. 00383 This mutex has a very high rank; threads reserving it should not 00384 be holding any InnoDB latches. */ 00385 mutex_t srv_dict_tmpfile_mutex; 00386 /* Temporary file for output from the data dictionary */ 00387 FILE* srv_dict_tmpfile; 00388 /* Mutex for locking srv_misc_tmpfile. 00389 This mutex has a very low rank; threads reserving it should not 00390 acquire any further latches or sleep before releasing this one. */ 00391 mutex_t srv_misc_tmpfile_mutex; 00392 /* Temporary file for miscellanous diagnostic output */ 00393 FILE* srv_misc_tmpfile; 00394 00395 ulint srv_main_thread_process_no = 0; 00396 ulint srv_main_thread_id = 0; 00397 00398 /* 00399 IMPLEMENTATION OF THE SERVER MAIN PROGRAM 00400 ========================================= 00401 00402 There is the following analogue between this database 00403 server and an operating system kernel: 00404 00405 DB concept equivalent OS concept 00406 ---------- --------------------- 00407 transaction -- process; 00408 00409 query thread -- thread; 00410 00411 lock -- semaphore; 00412 00413 transaction set to 00414 the rollback state -- kill signal delivered to a process; 00415 00416 kernel -- kernel; 00417 00418 query thread execution: 00419 (a) without kernel mutex 00420 reserved -- process executing in user mode; 00421 (b) with kernel mutex reserved 00422 -- process executing in kernel mode; 00423 00424 The server is controlled by a master thread which runs at 00425 a priority higher than normal, that is, 
higher than user threads.
It sleeps most of the time, and wakes up, say, every 300 milliseconds,
to check whether there is anything happening in the server which
requires intervention of the master thread. Such situations may be,
for example, when flushing of dirty blocks is needed in the buffer
pool or old versions of database rows have to be cleaned away.

The threads which we call user threads serve the queries of
the clients and input from the console of the server.
They run at normal priority. The server may have several
communications endpoints. A dedicated set of user threads waits
at each of these endpoints ready to receive a client request.
Each request is taken by a single user thread, which then starts
processing and, when the result is ready, sends it to the client
and returns to wait at the same endpoint the thread started from.

So, we do not have dedicated communication threads listening at
the endpoints and dealing the jobs to dedicated worker threads.
Our architecture saves one thread switch per request, compared
to the solution with dedicated communication threads,
which amounts to 15 microseconds on 100 MHz Pentium
running NT. If the client
is communicating over a network, this saving is negligible, but
if the client resides in the same machine, maybe in an SMP machine
on a different processor from the server thread, the saving
can be important as the threads can communicate over shared
memory with an overhead of a few microseconds.

We may later implement a dedicated communication thread solution
for those endpoints which communicate over a network.

Our solution with user threads has two problems: for each endpoint
there has to be a number of listening threads.
If there are many
communication endpoints, it may be difficult to set the right number
of concurrent threads in the system, as many of the threads
may always be waiting at less busy endpoints. Another problem
is queuing of the messages, as the server internally does not
offer any queue for jobs.

Another group of user threads is intended for splitting the
queries and processing them in parallel. Let us call these
parallel communication threads. These threads are waiting for
parallelized tasks, suspended on event semaphores.

A single user thread waits for input from the console,
like a command to shut down the database.

Utility threads are a different group of threads which takes
care of the buffer pool flushing and other, mainly background
operations, in the server.
Some of these utility threads always run at a lower than normal
priority, so that they are always in background. Some of them
may dynamically boost their priority by the pri_adjust function,
even to higher than normal priority, if their task becomes urgent.
The running of utilities is controlled by high- and low-water marks
of urgency. The urgency may be measured by the number of dirty blocks
in the buffer pool, in the case of the flush thread, for example.
When the high-water mark is exceeded, a utility starts running, until
the urgency drops under the low-water mark. Then the utility thread
suspends itself to wait for an event. The master thread is
responsible for signaling this event when the utility thread is
again needed.

For each individual type of utility, some threads always remain
at lower than normal priority. This is because pri_adjust is implemented
so that the threads at normal or higher priority control their
share of running time by calling sleep.
Thus, if the load of the
system suddenly drops, these threads cannot necessarily utilize
the system fully. The background priority threads make up for this,
starting to run when the load drops.

When there is no activity in the system, also the master thread
suspends itself to wait for an event making
the server totally silent. The responsibility to signal this
event is on the user thread which again receives a message
from a client.

There is still one complication in our server design. If a
background utility thread obtains a resource (e.g., mutex) needed by a user
thread, and there is also some other user activity in the system,
the user thread may have to wait indefinitely long for the
resource, as the OS does not schedule a background thread if
there is some other runnable user thread. This problem is called
priority inversion in real-time programming.

One solution to the priority inversion problem would be to
keep record of which thread owns which resource and
in the above case boost the priority of the background thread
so that it will be scheduled and it can release the resource.
This solution is called priority inheritance in real-time programming.
A drawback of this solution is that the overhead of acquiring a mutex
increases slightly, maybe 0.2 microseconds on a 100 MHz Pentium, because
the thread has to call os_thread_get_curr_id.
This may be compared to 0.5 microsecond overhead for a mutex lock-unlock
pair. Note that the thread
cannot store the information in the resource, say mutex, itself,
because competing threads could wipe out the information if it is
stored before acquiring the mutex, and if it is stored afterwards,
the information is outdated for the time of one machine instruction,
at least.
(To be precise, the information could be stored to
lock_word in mutex if the machine supports atomic swap.)

The above solution with priority inheritance may become actual in the
future, but at the moment we plan to implement a more coarse solution,
which could be called a global priority inheritance. If a thread
has to wait for a long time, say 300 milliseconds, for a resource,
we just guess that it may be waiting for a resource owned by a background
thread, and boost the priority of all runnable background threads
to the normal level. The background threads then themselves adjust
their fixed priority back to background after releasing all resources
they had (or, at some fixed points in their program code).

What is the performance of the global priority inheritance solution?
We may weigh the length of the wait time 300 milliseconds, during
which the system processes some other thread
to the cost of boosting the priority of each runnable background
thread, rescheduling it, and lowering the priority again.
On 100 MHz Pentium + NT this overhead may be of the order 100
microseconds per thread. So, if the number of runnable background
threads is not very big, say < 100, the cost is tolerable.
Utility threads probably will access resources used by
user threads not very often, so collisions of user threads
to preempted utility threads should not happen very often.

The thread table contains
information of the current status of each thread existing in the system,
and also the event semaphores used in suspending the master thread
and utility and parallel communication threads when they have nothing to do.
The thread table can be seen as an analogue to the process table
in a traditional Unix implementation.
00555 00556 The thread table is also used in the global priority inheritance 00557 scheme. This brings in one additional complication: threads accessing 00558 the thread table must have at least normal fixed priority, 00559 because the priority inheritance solution does not work if a background 00560 thread is preempted while possessing the mutex protecting the thread table. 00561 So, if a thread accesses the thread table, its priority has to be 00562 boosted at least to normal. This priority requirement can be seen similar to 00563 the privileged mode used when processing the kernel calls in traditional 00564 Unix.*/ 00565 00566 /* Thread slot in the thread table */ 00567 struct srv_slot_struct{ 00568 os_thread_id_t id; /* thread id */ 00569 os_thread_t handle; /* thread handle */ 00570 ulint type; /* thread type: user, utility etc. */ 00571 ibool in_use; /* TRUE if this slot is in use */ 00572 ibool suspended; /* TRUE if the thread is waiting 00573 for the event of this slot */ 00574 ib_time_t suspend_time; /* time when the thread was 00575 suspended */ 00576 os_event_t event; /* event used in suspending the 00577 thread when it has nothing to do */ 00578 que_thr_t* thr; /* suspended query thread (only 00579 used for MySQL threads) */ 00580 }; 00581 00582 /* Table for MySQL threads where they will be suspended to wait for locks */ 00583 srv_slot_t* srv_mysql_table = NULL; 00584 00585 os_event_t srv_lock_timeout_thread_event; 00586 00587 srv_sys_t* srv_sys = NULL; 00588 00589 byte srv_pad1[64]; /* padding to prevent other memory update 00590 hotspots from residing on the same memory 00591 cache line */ 00592 mutex_t* kernel_mutex_temp;/* mutex protecting the server, trx structs, 00593 query threads, and lock table */ 00594 byte srv_pad2[64]; /* padding to prevent other memory update 00595 hotspots from residing on the same memory 00596 cache line */ 00597 00598 /* The following three values measure the urgency of the jobs of 00599 buffer, version, and insert 
threads. They may vary from 0 - 1000. 00600 The server mutex protects all these variables. The low-water values 00601 tell that the server can acquiesce the utility when the value 00602 drops below this low-water mark. */ 00603 00604 ulint srv_meter[SRV_MASTER + 1]; 00605 ulint srv_meter_low_water[SRV_MASTER + 1]; 00606 ulint srv_meter_high_water[SRV_MASTER + 1]; 00607 ulint srv_meter_high_water2[SRV_MASTER + 1]; 00608 ulint srv_meter_foreground[SRV_MASTER + 1]; 00609 00610 /* The following values give info about the activity going on in 00611 the database. They are protected by the server mutex. The arrays 00612 are indexed by the type of the thread. */ 00613 00614 ulint srv_n_threads_active[SRV_MASTER + 1]; 00615 ulint srv_n_threads[SRV_MASTER + 1]; 00616 00617 /************************************************************************* 00618 Sets the info describing an i/o thread current state. */ 00619 00620 void 00621 srv_set_io_thread_op_info( 00622 /*======================*/ 00623 ulint i, /* in: the 'segment' of the i/o thread */ 00624 const char* str) /* in: constant char string describing the 00625 state */ 00626 { 00627 ut_a(i < SRV_MAX_N_IO_THREADS); 00628 00629 srv_io_thread_op_info[i] = str; 00630 } 00631 00632 /************************************************************************* 00633 Accessor function to get pointer to n'th slot in the server thread 00634 table. */ 00635 static 00636 srv_slot_t* 00637 srv_table_get_nth_slot( 00638 /*===================*/ 00639 /* out: pointer to the slot */ 00640 ulint index) /* in: index of the slot */ 00641 { 00642 ut_a(index < OS_THREAD_MAX_N); 00643 00644 return(srv_sys->threads + index); 00645 } 00646 00647 #ifndef UNIV_HOTBACKUP 00648 /************************************************************************* 00649 Gets the number of threads in the system. 
*/ 00650 00651 ulint 00652 srv_get_n_threads(void) 00653 /*===================*/ 00654 { 00655 ulint i; 00656 ulint n_threads = 0; 00657 00658 mutex_enter(&kernel_mutex); 00659 00660 for (i = SRV_COM; i < SRV_MASTER + 1; i++) { 00661 00662 n_threads += srv_n_threads[i]; 00663 } 00664 00665 mutex_exit(&kernel_mutex); 00666 00667 return(n_threads); 00668 } 00669 00670 /************************************************************************* 00671 Reserves a slot in the thread table for the current thread. Also creates the 00672 thread local storage struct for the current thread. NOTE! The server mutex 00673 has to be reserved by the caller! */ 00674 static 00675 ulint 00676 srv_table_reserve_slot( 00677 /*===================*/ 00678 /* out: reserved slot index */ 00679 ulint type) /* in: type of the thread: one of SRV_COM, ... */ 00680 { 00681 srv_slot_t* slot; 00682 ulint i; 00683 00684 ut_a(type > 0); 00685 ut_a(type <= SRV_MASTER); 00686 00687 i = 0; 00688 slot = srv_table_get_nth_slot(i); 00689 00690 while (slot->in_use) { 00691 i++; 00692 slot = srv_table_get_nth_slot(i); 00693 } 00694 00695 ut_a(slot->in_use == FALSE); 00696 00697 slot->in_use = TRUE; 00698 slot->suspended = FALSE; 00699 slot->id = os_thread_get_curr_id(); 00700 slot->handle = os_thread_get_curr(); 00701 slot->type = type; 00702 00703 thr_local_create(); 00704 00705 thr_local_set_slot_no(os_thread_get_curr_id(), i); 00706 00707 return(i); 00708 } 00709 00710 /************************************************************************* 00711 Suspends the calling thread to wait for the event in its thread slot. 00712 NOTE! The server mutex has to be reserved by the caller! 
*/ 00713 static 00714 os_event_t 00715 srv_suspend_thread(void) 00716 /*====================*/ 00717 /* out: event for the calling thread to wait */ 00718 { 00719 srv_slot_t* slot; 00720 os_event_t event; 00721 ulint slot_no; 00722 ulint type; 00723 00724 #ifdef UNIV_SYNC_DEBUG 00725 ut_ad(mutex_own(&kernel_mutex)); 00726 #endif /* UNIV_SYNC_DEBUG */ 00727 00728 slot_no = thr_local_get_slot_no(os_thread_get_curr_id()); 00729 00730 if (srv_print_thread_releases) { 00731 fprintf(stderr, 00732 "Suspending thread %lu to slot %lu meter %lu\n", 00733 (ulong) os_thread_get_curr_id(), (ulong) slot_no, 00734 (ulong) srv_meter[SRV_RECOVERY]); 00735 } 00736 00737 slot = srv_table_get_nth_slot(slot_no); 00738 00739 type = slot->type; 00740 00741 ut_ad(type >= SRV_WORKER); 00742 ut_ad(type <= SRV_MASTER); 00743 00744 event = slot->event; 00745 00746 slot->suspended = TRUE; 00747 00748 ut_ad(srv_n_threads_active[type] > 0); 00749 00750 srv_n_threads_active[type]--; 00751 00752 os_event_reset(event); 00753 00754 return(event); 00755 } 00756 #endif /* !UNIV_HOTBACKUP */ 00757 00758 /************************************************************************* 00759 Releases threads of the type given from suspension in the thread table. 00760 NOTE! The server mutex has to be reserved by the caller! 
*/ 00761 00762 ulint 00763 srv_release_threads( 00764 /*================*/ 00765 /* out: number of threads released: this may be 00766 < n if not enough threads were suspended at the 00767 moment */ 00768 ulint type, /* in: thread type */ 00769 ulint n) /* in: number of threads to release */ 00770 { 00771 srv_slot_t* slot; 00772 ulint i; 00773 ulint count = 0; 00774 00775 ut_ad(type >= SRV_WORKER); 00776 ut_ad(type <= SRV_MASTER); 00777 ut_ad(n > 0); 00778 #ifdef UNIV_SYNC_DEBUG 00779 ut_ad(mutex_own(&kernel_mutex)); 00780 #endif /* UNIV_SYNC_DEBUG */ 00781 00782 for (i = 0; i < OS_THREAD_MAX_N; i++) { 00783 00784 slot = srv_table_get_nth_slot(i); 00785 00786 if (slot->in_use && slot->type == type && slot->suspended) { 00787 00788 slot->suspended = FALSE; 00789 00790 srv_n_threads_active[type]++; 00791 00792 os_event_set(slot->event); 00793 00794 if (srv_print_thread_releases) { 00795 fprintf(stderr, 00796 "Releasing thread %lu type %lu from slot %lu meter %lu\n", 00797 (ulong) slot->id, (ulong) type, (ulong) i, 00798 (ulong) srv_meter[SRV_RECOVERY]); 00799 } 00800 00801 count++; 00802 00803 if (count == n) { 00804 break; 00805 } 00806 } 00807 } 00808 00809 return(count); 00810 } 00811 00812 /************************************************************************* 00813 Returns the calling thread type. */ 00814 00815 ulint 00816 srv_get_thread_type(void) 00817 /*=====================*/ 00818 /* out: SRV_COM, ... */ 00819 { 00820 ulint slot_no; 00821 srv_slot_t* slot; 00822 ulint type; 00823 00824 mutex_enter(&kernel_mutex); 00825 00826 slot_no = thr_local_get_slot_no(os_thread_get_curr_id()); 00827 00828 slot = srv_table_get_nth_slot(slot_no); 00829 00830 type = slot->type; 00831 00832 ut_ad(type >= SRV_WORKER); 00833 ut_ad(type <= SRV_MASTER); 00834 00835 mutex_exit(&kernel_mutex); 00836 00837 return(type); 00838 } 00839 00840 /************************************************************************* 00841 Initializes the server. 
*/ 00842 00843 void 00844 srv_init(void) 00845 /*==========*/ 00846 { 00847 srv_conc_slot_t* conc_slot; 00848 srv_slot_t* slot; 00849 dict_table_t* table; 00850 ulint i; 00851 00852 srv_sys = mem_alloc(sizeof(srv_sys_t)); 00853 00854 kernel_mutex_temp = mem_alloc(sizeof(mutex_t)); 00855 mutex_create(&kernel_mutex, SYNC_KERNEL); 00856 00857 mutex_create(&srv_innodb_monitor_mutex, SYNC_NO_ORDER_CHECK); 00858 00859 srv_sys->threads = mem_alloc(OS_THREAD_MAX_N * sizeof(srv_slot_t)); 00860 00861 for (i = 0; i < OS_THREAD_MAX_N; i++) { 00862 slot = srv_table_get_nth_slot(i); 00863 slot->in_use = FALSE; 00864 slot->type=0; /* Avoid purify errors */ 00865 slot->event = os_event_create(NULL); 00866 ut_a(slot->event); 00867 } 00868 00869 srv_mysql_table = mem_alloc(OS_THREAD_MAX_N * sizeof(srv_slot_t)); 00870 00871 for (i = 0; i < OS_THREAD_MAX_N; i++) { 00872 slot = srv_mysql_table + i; 00873 slot->in_use = FALSE; 00874 slot->type = 0; 00875 slot->event = os_event_create(NULL); 00876 ut_a(slot->event); 00877 } 00878 00879 srv_lock_timeout_thread_event = os_event_create(NULL); 00880 00881 for (i = 0; i < SRV_MASTER + 1; i++) { 00882 srv_n_threads_active[i] = 0; 00883 srv_n_threads[i] = 0; 00884 srv_meter[i] = 30; 00885 srv_meter_low_water[i] = 50; 00886 srv_meter_high_water[i] = 100; 00887 srv_meter_high_water2[i] = 200; 00888 srv_meter_foreground[i] = 250; 00889 } 00890 00891 UT_LIST_INIT(srv_sys->tasks); 00892 00893 /* create dummy table and index for old-style infimum and supremum */ 00894 table = dict_mem_table_create("SYS_DUMMY1", 00895 DICT_HDR_SPACE, 1, 0); 00896 dict_mem_table_add_col(table, "DUMMY", DATA_CHAR, 00897 DATA_ENGLISH | DATA_NOT_NULL, 8, 0); 00898 00899 srv_sys->dummy_ind1 = dict_mem_index_create("SYS_DUMMY1", 00900 "SYS_DUMMY1", DICT_HDR_SPACE, 0, 1); 00901 dict_index_add_col(srv_sys->dummy_ind1, 00902 dict_table_get_nth_col(table, 0), 0); 00903 srv_sys->dummy_ind1->table = table; 00904 /* create dummy table and index for new-style infimum and supremum 
*/ 00905 table = dict_mem_table_create("SYS_DUMMY2", 00906 DICT_HDR_SPACE, 1, DICT_TF_COMPACT); 00907 dict_mem_table_add_col(table, "DUMMY", DATA_CHAR, 00908 DATA_ENGLISH | DATA_NOT_NULL, 8, 0); 00909 srv_sys->dummy_ind2 = dict_mem_index_create("SYS_DUMMY2", 00910 "SYS_DUMMY2", DICT_HDR_SPACE, 0, 1); 00911 dict_index_add_col(srv_sys->dummy_ind2, 00912 dict_table_get_nth_col(table, 0), 0); 00913 srv_sys->dummy_ind2->table = table; 00914 00915 /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */ 00916 srv_sys->dummy_ind1->cached = srv_sys->dummy_ind2->cached = TRUE; 00917 00918 /* Init the server concurrency restriction data structures */ 00919 00920 os_fast_mutex_init(&srv_conc_mutex); 00921 00922 UT_LIST_INIT(srv_conc_queue); 00923 00924 srv_conc_slots = mem_alloc(OS_THREAD_MAX_N * sizeof(srv_conc_slot_t)); 00925 00926 for (i = 0; i < OS_THREAD_MAX_N; i++) { 00927 conc_slot = srv_conc_slots + i; 00928 conc_slot->reserved = FALSE; 00929 conc_slot->event = os_event_create(NULL); 00930 ut_a(conc_slot->event); 00931 } 00932 } 00933 00934 /************************************************************************* 00935 Frees the OS fast mutex created in srv_init(). */ 00936 00937 void 00938 srv_free(void) 00939 /*==========*/ 00940 { 00941 os_fast_mutex_free(&srv_conc_mutex); 00942 } 00943 00944 /************************************************************************* 00945 Initializes the synchronization primitives, memory system, and the thread 00946 local storage. */ 00947 00948 void 00949 srv_general_init(void) 00950 /*==================*/ 00951 { 00952 os_sync_init(); 00953 sync_init(); 00954 mem_init(srv_mem_pool_size); 00955 thr_local_init(); 00956 } 00957 00958 /*======================= InnoDB Server FIFO queue =======================*/ 00959 00960 /* Maximum allowable purge history length. <=0 means 'infinite'. 
*/ 00961 ulong srv_max_purge_lag = 0; 00962 00963 /************************************************************************* 00964 Puts an OS thread to wait if there are too many concurrent threads 00965 (>= srv_thread_concurrency) inside InnoDB. The threads wait in a FIFO queue. */ 00966 00967 void 00968 srv_conc_enter_innodb( 00969 /*==================*/ 00970 trx_t* trx) /* in: transaction object associated with the 00971 thread */ 00972 { 00973 ibool has_slept = FALSE; 00974 srv_conc_slot_t* slot = NULL; 00975 ulint i; 00976 00977 /* If trx has 'free tickets' to enter the engine left, then use one 00978 such ticket */ 00979 00980 if (trx->n_tickets_to_enter_innodb > 0) { 00981 trx->n_tickets_to_enter_innodb--; 00982 00983 return; 00984 } 00985 00986 os_fast_mutex_lock(&srv_conc_mutex); 00987 retry: 00988 if (trx->declared_to_be_inside_innodb) { 00989 ut_print_timestamp(stderr); 00990 fputs( 00991 " InnoDB: Error: trying to declare trx to enter InnoDB, but\n" 00992 "InnoDB: it already is declared.\n", stderr); 00993 trx_print(stderr, trx, 0); 00994 putc('\n', stderr); 00995 os_fast_mutex_unlock(&srv_conc_mutex); 00996 00997 return; 00998 } 00999 01000 if (srv_conc_n_threads < (lint)srv_thread_concurrency) { 01001 01002 srv_conc_n_threads++; 01003 trx->declared_to_be_inside_innodb = TRUE; 01004 trx->n_tickets_to_enter_innodb = SRV_FREE_TICKETS_TO_ENTER; 01005 01006 os_fast_mutex_unlock(&srv_conc_mutex); 01007 01008 return; 01009 } 01010 01011 /* If the transaction is not holding resources, 01012 let it sleep for SRV_THREAD_SLEEP_DELAY microseconds, and try again then */ 01013 01014 if (!has_slept && !trx->has_search_latch 01015 && NULL == UT_LIST_GET_FIRST(trx->trx_locks)) { 01016 01017 has_slept = TRUE; /* We let is sleep only once to avoid 01018 starvation */ 01019 01020 srv_conc_n_waiting_threads++; 01021 01022 os_fast_mutex_unlock(&srv_conc_mutex); 01023 01024 trx->op_info = "sleeping before joining InnoDB queue"; 01025 01026 /* Peter Zaitsev suggested that 
we take the sleep away 01027 altogether. But the sleep may be good in pathological 01028 situations of lots of thread switches. Simply put some 01029 threads aside for a while to reduce the number of thread 01030 switches. */ 01031 if (SRV_THREAD_SLEEP_DELAY > 0) { 01032 os_thread_sleep(SRV_THREAD_SLEEP_DELAY); 01033 } 01034 01035 trx->op_info = ""; 01036 01037 os_fast_mutex_lock(&srv_conc_mutex); 01038 01039 srv_conc_n_waiting_threads--; 01040 01041 goto retry; 01042 } 01043 01044 /* Too many threads inside: put the current thread to a queue */ 01045 01046 for (i = 0; i < OS_THREAD_MAX_N; i++) { 01047 slot = srv_conc_slots + i; 01048 01049 if (!slot->reserved) { 01050 01051 break; 01052 } 01053 } 01054 01055 if (i == OS_THREAD_MAX_N) { 01056 /* Could not find a free wait slot, we must let the 01057 thread enter */ 01058 01059 srv_conc_n_threads++; 01060 trx->declared_to_be_inside_innodb = TRUE; 01061 trx->n_tickets_to_enter_innodb = 0; 01062 01063 os_fast_mutex_unlock(&srv_conc_mutex); 01064 01065 return; 01066 } 01067 01068 /* Release possible search system latch this thread has */ 01069 if (trx->has_search_latch) { 01070 trx_search_latch_release_if_reserved(trx); 01071 } 01072 01073 /* Add to the queue */ 01074 slot->reserved = TRUE; 01075 slot->wait_ended = FALSE; 01076 01077 UT_LIST_ADD_LAST(srv_conc_queue, srv_conc_queue, slot); 01078 01079 os_event_reset(slot->event); 01080 01081 srv_conc_n_waiting_threads++; 01082 01083 os_fast_mutex_unlock(&srv_conc_mutex); 01084 01085 /* Go to wait for the event; when a thread leaves InnoDB it will 01086 release this thread */ 01087 01088 trx->op_info = "waiting in InnoDB queue"; 01089 01090 os_event_wait(slot->event); 01091 01092 trx->op_info = ""; 01093 01094 os_fast_mutex_lock(&srv_conc_mutex); 01095 01096 srv_conc_n_waiting_threads--; 01097 01098 /* NOTE that the thread which released this thread already 01099 incremented the thread counter on behalf of this thread */ 01100 01101 slot->reserved = FALSE; 01102 01103 
UT_LIST_REMOVE(srv_conc_queue, srv_conc_queue, slot); 01104 01105 trx->declared_to_be_inside_innodb = TRUE; 01106 trx->n_tickets_to_enter_innodb = SRV_FREE_TICKETS_TO_ENTER; 01107 01108 os_fast_mutex_unlock(&srv_conc_mutex); 01109 } 01110 01111 /************************************************************************* 01112 This lets a thread enter InnoDB regardless of the number of threads inside 01113 InnoDB. This must be called when a thread ends a lock wait. */ 01114 01115 void 01116 srv_conc_force_enter_innodb( 01117 /*========================*/ 01118 trx_t* trx) /* in: transaction object associated with the 01119 thread */ 01120 { 01121 if (UNIV_LIKELY(!srv_thread_concurrency)) { 01122 01123 return; 01124 } 01125 01126 os_fast_mutex_lock(&srv_conc_mutex); 01127 01128 srv_conc_n_threads++; 01129 trx->declared_to_be_inside_innodb = TRUE; 01130 trx->n_tickets_to_enter_innodb = 0; 01131 01132 os_fast_mutex_unlock(&srv_conc_mutex); 01133 } 01134 01135 /************************************************************************* 01136 This must be called when a thread exits InnoDB in a lock wait or at the 01137 end of an SQL statement. 
*/ 01138 01139 void 01140 srv_conc_force_exit_innodb( 01141 /*=======================*/ 01142 trx_t* trx) /* in: transaction object associated with the 01143 thread */ 01144 { 01145 srv_conc_slot_t* slot = NULL; 01146 01147 if (UNIV_LIKELY(!srv_thread_concurrency)) { 01148 01149 return; 01150 } 01151 01152 if (trx->declared_to_be_inside_innodb == FALSE) { 01153 01154 return; 01155 } 01156 01157 os_fast_mutex_lock(&srv_conc_mutex); 01158 01159 srv_conc_n_threads--; 01160 trx->declared_to_be_inside_innodb = FALSE; 01161 trx->n_tickets_to_enter_innodb = 0; 01162 01163 if (srv_conc_n_threads < (lint)srv_thread_concurrency) { 01164 /* Look for a slot where a thread is waiting and no other 01165 thread has yet released the thread */ 01166 01167 slot = UT_LIST_GET_FIRST(srv_conc_queue); 01168 01169 while (slot && slot->wait_ended == TRUE) { 01170 slot = UT_LIST_GET_NEXT(srv_conc_queue, slot); 01171 } 01172 01173 if (slot != NULL) { 01174 slot->wait_ended = TRUE; 01175 01176 /* We increment the count on behalf of the released 01177 thread */ 01178 01179 srv_conc_n_threads++; 01180 } 01181 } 01182 01183 os_fast_mutex_unlock(&srv_conc_mutex); 01184 01185 if (slot != NULL) { 01186 os_event_set(slot->event); 01187 } 01188 } 01189 01190 /************************************************************************* 01191 This must be called when a thread exits InnoDB. */ 01192 01193 void 01194 srv_conc_exit_innodb( 01195 /*=================*/ 01196 trx_t* trx) /* in: transaction object associated with the 01197 thread */ 01198 { 01199 if (trx->n_tickets_to_enter_innodb > 0) { 01200 /* We will pretend the thread is still inside InnoDB though it 01201 now leaves the InnoDB engine. In this way we save 01202 a lot of semaphore operations. srv_conc_force_exit_innodb is 01203 used to declare the thread definitely outside InnoDB. It 01204 should be called when there is a lock wait or an SQL statement 01205 ends. 
*/ 01206 01207 return; 01208 } 01209 01210 srv_conc_force_exit_innodb(trx); 01211 } 01212 01213 /*========================================================================*/ 01214 01215 /************************************************************************* 01216 Normalizes init parameter values to use units we use inside InnoDB. */ 01217 static 01218 ulint 01219 srv_normalize_init_values(void) 01220 /*===========================*/ 01221 /* out: DB_SUCCESS or error code */ 01222 { 01223 ulint n; 01224 ulint i; 01225 01226 n = srv_n_data_files; 01227 01228 for (i = 0; i < n; i++) { 01229 srv_data_file_sizes[i] = srv_data_file_sizes[i] 01230 * ((1024 * 1024) / UNIV_PAGE_SIZE); 01231 } 01232 01233 srv_last_file_size_max = srv_last_file_size_max 01234 * ((1024 * 1024) / UNIV_PAGE_SIZE); 01235 01236 srv_log_file_size = srv_log_file_size / UNIV_PAGE_SIZE; 01237 01238 srv_log_buffer_size = srv_log_buffer_size / UNIV_PAGE_SIZE; 01239 01240 srv_pool_size = srv_pool_size / (UNIV_PAGE_SIZE / 1024); 01241 01242 srv_awe_window_size = srv_awe_window_size / UNIV_PAGE_SIZE; 01243 01244 if (srv_use_awe) { 01245 /* If we are using AWE we must save memory in the 32-bit 01246 address space of the process, and cannot bind the lock 01247 table size to the real buffer pool size. */ 01248 01249 srv_lock_table_size = 20 * srv_awe_window_size; 01250 } else { 01251 srv_lock_table_size = 5 * srv_pool_size; 01252 } 01253 01254 return(DB_SUCCESS); 01255 } 01256 01257 /************************************************************************* 01258 Boots the InnoDB server. 
*/ 01259 01260 ulint 01261 srv_boot(void) 01262 /*==========*/ 01263 /* out: DB_SUCCESS or error code */ 01264 { 01265 ulint err; 01266 01267 /* Transform the init parameter values given by MySQL to 01268 use units we use inside InnoDB: */ 01269 01270 err = srv_normalize_init_values(); 01271 01272 if (err != DB_SUCCESS) { 01273 return(err); 01274 } 01275 01276 /* Initialize synchronization primitives, memory management, and thread 01277 local storage */ 01278 01279 srv_general_init(); 01280 01281 /* Initialize this module */ 01282 01283 srv_init(); 01284 01285 return(DB_SUCCESS); 01286 } 01287 01288 #ifndef UNIV_HOTBACKUP 01289 /************************************************************************* 01290 Reserves a slot in the thread table for the current MySQL OS thread. 01291 NOTE! The kernel mutex has to be reserved by the caller! */ 01292 static 01293 srv_slot_t* 01294 srv_table_reserve_slot_for_mysql(void) 01295 /*==================================*/ 01296 /* out: reserved slot */ 01297 { 01298 srv_slot_t* slot; 01299 ulint i; 01300 01301 #ifdef UNIV_SYNC_DEBUG 01302 ut_ad(mutex_own(&kernel_mutex)); 01303 #endif /* UNIV_SYNC_DEBUG */ 01304 01305 i = 0; 01306 slot = srv_mysql_table + i; 01307 01308 while (slot->in_use) { 01309 i++; 01310 01311 if (i >= OS_THREAD_MAX_N) { 01312 01313 ut_print_timestamp(stderr); 01314 01315 fprintf(stderr, 01316 " InnoDB: There appear to be %lu MySQL threads currently waiting\n" 01317 "InnoDB: inside InnoDB, which is the upper limit. Cannot continue operation.\n" 01318 "InnoDB: We intentionally generate a seg fault to print a stack trace\n" 01319 "InnoDB: on Linux. 
But first we print a list of waiting threads.\n", (ulong) i); 01320 01321 for (i = 0; i < OS_THREAD_MAX_N; i++) { 01322 01323 slot = srv_mysql_table + i; 01324 01325 fprintf(stderr, 01326 "Slot %lu: thread id %lu, type %lu, in use %lu, susp %lu, time %lu\n", 01327 (ulong) i, (ulong) os_thread_pf(slot->id), 01328 (ulong) slot->type, (ulong) slot->in_use, 01329 (ulong) slot->suspended, 01330 (ulong) difftime(ut_time(), slot->suspend_time)); 01331 } 01332 01333 ut_error; 01334 } 01335 01336 slot = srv_mysql_table + i; 01337 } 01338 01339 ut_a(slot->in_use == FALSE); 01340 01341 slot->in_use = TRUE; 01342 slot->id = os_thread_get_curr_id(); 01343 slot->handle = os_thread_get_curr(); 01344 01345 return(slot); 01346 } 01347 #endif /* !UNIV_HOTBACKUP */ 01348 01349 /******************************************************************* 01350 Puts a MySQL OS thread to wait for a lock to be released. If an error 01351 occurs during the wait trx->error_state associated with thr is 01352 != DB_SUCCESS when we return. DB_LOCK_WAIT_TIMEOUT and DB_DEADLOCK 01353 are possible errors. DB_DEADLOCK is returned if selective deadlock 01354 resolution chose this transaction as a victim. 
*/ 01355 01356 void 01357 srv_suspend_mysql_thread( 01358 /*=====================*/ 01359 que_thr_t* thr) /* in: query thread associated with the MySQL 01360 OS thread */ 01361 { 01362 #ifndef UNIV_HOTBACKUP 01363 srv_slot_t* slot; 01364 os_event_t event; 01365 double wait_time; 01366 trx_t* trx; 01367 ibool had_dict_lock = FALSE; 01368 ibool was_declared_inside_innodb = FALSE; 01369 ib_longlong start_time = 0; 01370 ib_longlong finish_time; 01371 ulint diff_time; 01372 ulint sec; 01373 ulint ms; 01374 01375 #ifdef UNIV_SYNC_DEBUG 01376 ut_ad(!mutex_own(&kernel_mutex)); 01377 #endif /* UNIV_SYNC_DEBUG */ 01378 01379 trx = thr_get_trx(thr); 01380 01381 os_event_set(srv_lock_timeout_thread_event); 01382 01383 mutex_enter(&kernel_mutex); 01384 01385 trx->error_state = DB_SUCCESS; 01386 01387 if (thr->state == QUE_THR_RUNNING) { 01388 01389 ut_ad(thr->is_active == TRUE); 01390 01391 /* The lock has already been released or this transaction 01392 was chosen as a deadlock victim: no need to suspend */ 01393 01394 if (trx->was_chosen_as_deadlock_victim) { 01395 01396 trx->error_state = DB_DEADLOCK; 01397 trx->was_chosen_as_deadlock_victim = FALSE; 01398 } 01399 01400 mutex_exit(&kernel_mutex); 01401 01402 return; 01403 } 01404 01405 ut_ad(thr->is_active == FALSE); 01406 01407 slot = srv_table_reserve_slot_for_mysql(); 01408 01409 event = slot->event; 01410 01411 slot->thr = thr; 01412 01413 os_event_reset(event); 01414 01415 slot->suspend_time = ut_time(); 01416 01417 if (thr->lock_state == QUE_THR_LOCK_ROW) { 01418 srv_n_lock_wait_count++; 01419 srv_n_lock_wait_current_count++; 01420 01421 ut_usectime(&sec, &ms); 01422 start_time = (ib_longlong)sec * 1000000 + ms; 01423 } 01424 /* Wake the lock timeout monitor thread, if it is suspended */ 01425 01426 os_event_set(srv_lock_timeout_thread_event); 01427 01428 mutex_exit(&kernel_mutex); 01429 01430 if (trx->declared_to_be_inside_innodb) { 01431 01432 was_declared_inside_innodb = TRUE; 01433 01434 /* We must declare this OS 
thread to exit InnoDB, since a 01435 possible other thread holding a lock which this thread waits 01436 for must be allowed to enter, sooner or later */ 01437 01438 srv_conc_force_exit_innodb(trx); 01439 } 01440 01441 /* Release possible foreign key check latch */ 01442 if (trx->dict_operation_lock_mode == RW_S_LATCH) { 01443 01444 had_dict_lock = TRUE; 01445 01446 row_mysql_unfreeze_data_dictionary(trx); 01447 } 01448 01449 ut_a(trx->dict_operation_lock_mode == 0); 01450 01451 /* Wait for the release */ 01452 01453 os_event_wait(event); 01454 01455 if (had_dict_lock) { 01456 01457 row_mysql_freeze_data_dictionary(trx); 01458 } 01459 01460 if (was_declared_inside_innodb) { 01461 01462 /* Return back inside InnoDB */ 01463 01464 srv_conc_force_enter_innodb(trx); 01465 } 01466 01467 mutex_enter(&kernel_mutex); 01468 01469 /* Release the slot for others to use */ 01470 01471 slot->in_use = FALSE; 01472 01473 wait_time = ut_difftime(ut_time(), slot->suspend_time); 01474 01475 if (thr->lock_state == QUE_THR_LOCK_ROW) { 01476 ut_usectime(&sec, &ms); 01477 finish_time = (ib_longlong)sec * 1000000 + ms; 01478 01479 diff_time = (ulint) (finish_time - start_time); 01480 01481 srv_n_lock_wait_current_count--; 01482 srv_n_lock_wait_time = srv_n_lock_wait_time + diff_time; 01483 if (diff_time > srv_n_lock_max_wait_time) { 01484 srv_n_lock_max_wait_time = diff_time; 01485 } 01486 } 01487 01488 if (trx->was_chosen_as_deadlock_victim) { 01489 01490 trx->error_state = DB_DEADLOCK; 01491 trx->was_chosen_as_deadlock_victim = FALSE; 01492 } 01493 01494 mutex_exit(&kernel_mutex); 01495 01496 if (srv_lock_wait_timeout < 100000000 && 01497 wait_time > (double)srv_lock_wait_timeout) { 01498 01499 trx->error_state = DB_LOCK_WAIT_TIMEOUT; 01500 } 01501 #else /* UNIV_HOTBACKUP */ 01502 /* This function depends on MySQL code that is not included in 01503 InnoDB Hot Backup builds. Besides, this function should never 01504 be called in InnoDB Hot Backup. 
*/ 01505 ut_error; 01506 #endif /* UNIV_HOTBACKUP */ 01507 } 01508 01509 /************************************************************************ 01510 Releases a MySQL OS thread waiting for a lock to be released, if the 01511 thread is already suspended. */ 01512 01513 void 01514 srv_release_mysql_thread_if_suspended( 01515 /*==================================*/ 01516 que_thr_t* thr) /* in: query thread associated with the 01517 MySQL OS thread */ 01518 { 01519 #ifndef UNIV_HOTBACKUP 01520 srv_slot_t* slot; 01521 ulint i; 01522 01523 #ifdef UNIV_SYNC_DEBUG 01524 ut_ad(mutex_own(&kernel_mutex)); 01525 #endif /* UNIV_SYNC_DEBUG */ 01526 01527 for (i = 0; i < OS_THREAD_MAX_N; i++) { 01528 01529 slot = srv_mysql_table + i; 01530 01531 if (slot->in_use && slot->thr == thr) { 01532 /* Found */ 01533 01534 os_event_set(slot->event); 01535 01536 return; 01537 } 01538 } 01539 01540 /* not found */ 01541 #else /* UNIV_HOTBACKUP */ 01542 /* This function depends on MySQL code that is not included in 01543 InnoDB Hot Backup builds. Besides, this function should never 01544 be called in InnoDB Hot Backup. */ 01545 ut_error; 01546 #endif /* UNIV_HOTBACKUP */ 01547 } 01548 01549 #ifndef UNIV_HOTBACKUP 01550 /********************************************************************** 01551 Refreshes the values used to calculate per-second averages. 
*/ 01552 static 01553 void 01554 srv_refresh_innodb_monitor_stats(void) 01555 /*==================================*/ 01556 { 01557 mutex_enter(&srv_innodb_monitor_mutex); 01558 01559 srv_last_monitor_time = time(NULL); 01560 01561 os_aio_refresh_stats(); 01562 01563 btr_cur_n_sea_old = btr_cur_n_sea; 01564 btr_cur_n_non_sea_old = btr_cur_n_non_sea; 01565 01566 log_refresh_stats(); 01567 01568 buf_refresh_io_stats(); 01569 01570 srv_n_rows_inserted_old = srv_n_rows_inserted; 01571 srv_n_rows_updated_old = srv_n_rows_updated; 01572 srv_n_rows_deleted_old = srv_n_rows_deleted; 01573 srv_n_rows_read_old = srv_n_rows_read; 01574 01575 mutex_exit(&srv_innodb_monitor_mutex); 01576 } 01577 01578 /********************************************************************** 01579 Outputs to a file the output of the InnoDB Monitor. */ 01580 01581 void 01582 srv_printf_innodb_monitor( 01583 /*======================*/ 01584 FILE* file, /* in: output stream */ 01585 ulint* trx_start, /* out: file position of the start of 01586 the list of active transactions */ 01587 ulint* trx_end) /* out: file position of the end of 01588 the list of active transactions */ 01589 { 01590 double time_elapsed; 01591 time_t current_time; 01592 ulint n_reserved; 01593 01594 mutex_enter(&srv_innodb_monitor_mutex); 01595 01596 current_time = time(NULL); 01597 01598 /* We add 0.001 seconds to time_elapsed to prevent division 01599 by zero if two users happen to call SHOW INNODB STATUS at the same 01600 time */ 01601 01602 time_elapsed = difftime(current_time, srv_last_monitor_time) 01603 + 0.001; 01604 01605 srv_last_monitor_time = time(NULL); 01606 01607 fputs("\n=====================================\n", file); 01608 01609 ut_print_timestamp(file); 01610 fprintf(file, 01611 " INNODB MONITOR OUTPUT\n" 01612 "=====================================\n" 01613 "Per second averages calculated from the last %lu seconds\n", 01614 (ulong)time_elapsed); 01615 01616 fputs("----------\n" 01617 "SEMAPHORES\n" 01618 
"----------\n", file); 01619 sync_print(file); 01620 01621 /* Conceptually, srv_innodb_monitor_mutex has a very high latching 01622 order level in sync0sync.h, while dict_foreign_err_mutex has a very 01623 low level 135. Therefore we can reserve the latter mutex here without 01624 a danger of a deadlock of threads. */ 01625 01626 mutex_enter(&dict_foreign_err_mutex); 01627 01628 if (ftell(dict_foreign_err_file) != 0L) { 01629 fputs("------------------------\n" 01630 "LATEST FOREIGN KEY ERROR\n" 01631 "------------------------\n", file); 01632 ut_copy_file(file, dict_foreign_err_file); 01633 } 01634 01635 mutex_exit(&dict_foreign_err_mutex); 01636 01637 lock_print_info_summary(file); 01638 if (trx_start) { 01639 long t = ftell(file); 01640 if (t < 0) { 01641 *trx_start = ULINT_UNDEFINED; 01642 } else { 01643 *trx_start = (ulint) t; 01644 } 01645 } 01646 lock_print_info_all_transactions(file); 01647 if (trx_end) { 01648 long t = ftell(file); 01649 if (t < 0) { 01650 *trx_end = ULINT_UNDEFINED; 01651 } else { 01652 *trx_end = (ulint) t; 01653 } 01654 } 01655 fputs("--------\n" 01656 "FILE I/O\n" 01657 "--------\n", file); 01658 os_aio_print(file); 01659 01660 fputs("-------------------------------------\n" 01661 "INSERT BUFFER AND ADAPTIVE HASH INDEX\n" 01662 "-------------------------------------\n", file); 01663 ibuf_print(file); 01664 01665 ha_print_info(file, btr_search_sys->hash_index); 01666 01667 fprintf(file, 01668 "%.2f hash searches/s, %.2f non-hash searches/s\n", 01669 (btr_cur_n_sea - btr_cur_n_sea_old) 01670 / time_elapsed, 01671 (btr_cur_n_non_sea - btr_cur_n_non_sea_old) 01672 / time_elapsed); 01673 btr_cur_n_sea_old = btr_cur_n_sea; 01674 btr_cur_n_non_sea_old = btr_cur_n_non_sea; 01675 01676 fputs("---\n" 01677 "LOG\n" 01678 "---\n", file); 01679 log_print(file); 01680 01681 fputs("----------------------\n" 01682 "BUFFER POOL AND MEMORY\n" 01683 "----------------------\n", file); 01684 fprintf(file, 01685 "Total memory allocated " ULINTPF 01686 "; in 
additional pool allocated " ULINTPF "\n", 01687 ut_total_allocated_memory, 01688 mem_pool_get_reserved(mem_comm_pool)); 01689 fprintf(file, "Dictionary memory allocated " ULINTPF "\n", 01690 dict_sys->size); 01691 01692 if (srv_use_awe) { 01693 fprintf(file, 01694 "In addition to that %lu MB of AWE memory allocated\n", 01695 (ulong) (srv_pool_size / ((1024 * 1024) / UNIV_PAGE_SIZE))); 01696 } 01697 01698 buf_print_io(file); 01699 01700 fputs("--------------\n" 01701 "ROW OPERATIONS\n" 01702 "--------------\n", file); 01703 fprintf(file, "%ld queries inside InnoDB, %lu queries in queue\n", 01704 (long) srv_conc_n_threads, 01705 (ulong) srv_conc_n_waiting_threads); 01706 01707 fprintf(file, "%lu read views open inside InnoDB\n", 01708 UT_LIST_GET_LEN(trx_sys->view_list)); 01709 01710 n_reserved = fil_space_get_n_reserved_extents(0); 01711 if (n_reserved > 0) { 01712 fprintf(file, 01713 "%lu tablespace extents now reserved for B-tree split operations\n", 01714 (ulong) n_reserved); 01715 } 01716 01717 #ifdef UNIV_LINUX 01718 fprintf(file, "Main thread process no. 
%lu, id %lu, state: %s\n", 01719 (ulong) srv_main_thread_process_no, 01720 (ulong) srv_main_thread_id, 01721 srv_main_thread_op_info); 01722 #else 01723 fprintf(file, "Main thread id %lu, state: %s\n", 01724 (ulong) srv_main_thread_id, 01725 srv_main_thread_op_info); 01726 #endif 01727 fprintf(file, 01728 "Number of rows inserted " ULINTPF 01729 ", updated " ULINTPF ", deleted " ULINTPF ", read " ULINTPF "\n", 01730 srv_n_rows_inserted, 01731 srv_n_rows_updated, 01732 srv_n_rows_deleted, 01733 srv_n_rows_read); 01734 fprintf(file, 01735 "%.2f inserts/s, %.2f updates/s, %.2f deletes/s, %.2f reads/s\n", 01736 (srv_n_rows_inserted - srv_n_rows_inserted_old) 01737 / time_elapsed, 01738 (srv_n_rows_updated - srv_n_rows_updated_old) 01739 / time_elapsed, 01740 (srv_n_rows_deleted - srv_n_rows_deleted_old) 01741 / time_elapsed, 01742 (srv_n_rows_read - srv_n_rows_read_old) 01743 / time_elapsed); 01744 01745 srv_n_rows_inserted_old = srv_n_rows_inserted; 01746 srv_n_rows_updated_old = srv_n_rows_updated; 01747 srv_n_rows_deleted_old = srv_n_rows_deleted; 01748 srv_n_rows_read_old = srv_n_rows_read; 01749 01750 fputs("----------------------------\n" 01751 "END OF INNODB MONITOR OUTPUT\n" 01752 "============================\n", file); 01753 mutex_exit(&srv_innodb_monitor_mutex); 01754 fflush(file); 01755 } 01756 01757 /********************************************************************** 01758 Function to pass InnoDB status variables to MySQL */ 01759 01760 void 01761 srv_export_innodb_status(void) 01762 { 01763 01764 mutex_enter(&srv_innodb_monitor_mutex); 01765 export_vars.innodb_data_pending_reads= os_n_pending_reads; 01766 export_vars.innodb_data_pending_writes= os_n_pending_writes; 01767 export_vars.innodb_data_pending_fsyncs= 01768 fil_n_pending_log_flushes + fil_n_pending_tablespace_flushes; 01769 export_vars.innodb_data_fsyncs= os_n_fsyncs; 01770 export_vars.innodb_data_read= srv_data_read; 01771 export_vars.innodb_data_reads= os_n_file_reads; 01772 
export_vars.innodb_data_writes= os_n_file_writes; 01773 export_vars.innodb_data_written= srv_data_written; 01774 export_vars.innodb_buffer_pool_read_requests= buf_pool->n_page_gets; 01775 export_vars.innodb_buffer_pool_write_requests= srv_buf_pool_write_requests; 01776 export_vars.innodb_buffer_pool_wait_free= srv_buf_pool_wait_free; 01777 export_vars.innodb_buffer_pool_pages_flushed= srv_buf_pool_flushed; 01778 export_vars.innodb_buffer_pool_reads= srv_buf_pool_reads; 01779 export_vars.innodb_buffer_pool_read_ahead_rnd= srv_read_ahead_rnd; 01780 export_vars.innodb_buffer_pool_read_ahead_seq= srv_read_ahead_seq; 01781 export_vars.innodb_buffer_pool_pages_data= UT_LIST_GET_LEN(buf_pool->LRU); 01782 export_vars.innodb_buffer_pool_pages_dirty= UT_LIST_GET_LEN(buf_pool->flush_list); 01783 export_vars.innodb_buffer_pool_pages_free= UT_LIST_GET_LEN(buf_pool->free); 01784 export_vars.innodb_buffer_pool_pages_latched= buf_get_latched_pages_number(); 01785 export_vars.innodb_buffer_pool_pages_total= buf_pool->curr_size; 01786 export_vars.innodb_buffer_pool_pages_misc= buf_pool->max_size - 01787 UT_LIST_GET_LEN(buf_pool->LRU) - UT_LIST_GET_LEN(buf_pool->free); 01788 export_vars.innodb_page_size= UNIV_PAGE_SIZE; 01789 export_vars.innodb_log_waits= srv_log_waits; 01790 export_vars.innodb_os_log_written= srv_os_log_written; 01791 export_vars.innodb_os_log_fsyncs= fil_n_log_flushes; 01792 export_vars.innodb_os_log_pending_fsyncs= fil_n_pending_log_flushes; 01793 export_vars.innodb_os_log_pending_writes= srv_os_log_pending_writes; 01794 export_vars.innodb_log_write_requests= srv_log_write_requests; 01795 export_vars.innodb_log_writes= srv_log_writes; 01796 export_vars.innodb_dblwr_pages_written= srv_dblwr_pages_written; 01797 export_vars.innodb_dblwr_writes= srv_dblwr_writes; 01798 export_vars.innodb_pages_created= buf_pool->n_pages_created; 01799 export_vars.innodb_pages_read= buf_pool->n_pages_read; 01800 export_vars.innodb_pages_written= buf_pool->n_pages_written; 01801 
export_vars.innodb_row_lock_waits= srv_n_lock_wait_count; 01802 export_vars.innodb_row_lock_current_waits= srv_n_lock_wait_current_count; 01803 export_vars.innodb_row_lock_time= srv_n_lock_wait_time / 10000; 01804 if (srv_n_lock_wait_count > 0) { 01805 export_vars.innodb_row_lock_time_avg = (ulint) 01806 (srv_n_lock_wait_time / 10000 / srv_n_lock_wait_count); 01807 } else { 01808 export_vars.innodb_row_lock_time_avg = 0; 01809 } 01810 export_vars.innodb_row_lock_time_max= srv_n_lock_max_wait_time / 10000; 01811 export_vars.innodb_rows_read= srv_n_rows_read; 01812 export_vars.innodb_rows_inserted= srv_n_rows_inserted; 01813 export_vars.innodb_rows_updated= srv_n_rows_updated; 01814 export_vars.innodb_rows_deleted= srv_n_rows_deleted; 01815 mutex_exit(&srv_innodb_monitor_mutex); 01816 01817 } 01818 01819 /************************************************************************* 01820 A thread which wakes up threads whose lock wait may have lasted too long. 01821 This also prints the info output by various InnoDB monitors. 
*/ 01822 01823 os_thread_ret_t 01824 srv_lock_timeout_and_monitor_thread( 01825 /*================================*/ 01826 /* out: a dummy parameter */ 01827 void* arg __attribute__((unused))) 01828 /* in: a dummy parameter required by 01829 os_thread_create */ 01830 { 01831 srv_slot_t* slot; 01832 double time_elapsed; 01833 time_t current_time; 01834 time_t last_table_monitor_time; 01835 time_t last_monitor_time; 01836 ibool some_waits; 01837 double wait_time; 01838 ulint i; 01839 01840 #ifdef UNIV_DEBUG_THREAD_CREATION 01841 fprintf(stderr, "Lock timeout thread starts, id %lu\n", 01842 os_thread_pf(os_thread_get_curr_id())); 01843 #endif 01844 UT_NOT_USED(arg); 01845 srv_last_monitor_time = time(NULL); 01846 last_table_monitor_time = time(NULL); 01847 last_monitor_time = time(NULL); 01848 loop: 01849 srv_lock_timeout_and_monitor_active = TRUE; 01850 01851 /* When someone is waiting for a lock, we wake up every second 01852 and check if a timeout has passed for a lock wait */ 01853 01854 os_thread_sleep(1000000); 01855 01856 /* In case mutex_exit is not a memory barrier, it is 01857 theoretically possible some threads are left waiting though 01858 the semaphore is already released. 
Wake up those threads: */ 01859 01860 sync_arr_wake_threads_if_sema_free(); 01861 01862 current_time = time(NULL); 01863 01864 time_elapsed = difftime(current_time, last_monitor_time); 01865 01866 if (time_elapsed > 15) { 01867 last_monitor_time = time(NULL); 01868 01869 if (srv_print_innodb_monitor) { 01870 srv_printf_innodb_monitor(stderr, NULL, NULL); 01871 } 01872 01873 if (srv_innodb_status) { 01874 mutex_enter(&srv_monitor_file_mutex); 01875 rewind(srv_monitor_file); 01876 srv_printf_innodb_monitor(srv_monitor_file, NULL, NULL); 01877 os_file_set_eof(srv_monitor_file); 01878 mutex_exit(&srv_monitor_file_mutex); 01879 } 01880 01881 if (srv_print_innodb_tablespace_monitor 01882 && difftime(current_time, last_table_monitor_time) > 60) { 01883 01884 last_table_monitor_time = time(NULL); 01885 01886 fputs("================================================\n", 01887 stderr); 01888 01889 ut_print_timestamp(stderr); 01890 01891 fputs(" INNODB TABLESPACE MONITOR OUTPUT\n" 01892 "================================================\n", 01893 stderr); 01894 01895 fsp_print(0); 01896 fputs("Validating tablespace\n", stderr); 01897 fsp_validate(0); 01898 fputs("Validation ok\n" 01899 "---------------------------------------\n" 01900 "END OF INNODB TABLESPACE MONITOR OUTPUT\n" 01901 "=======================================\n", 01902 stderr); 01903 } 01904 01905 if (srv_print_innodb_table_monitor 01906 && difftime(current_time, last_table_monitor_time) > 60) { 01907 01908 last_table_monitor_time = time(NULL); 01909 01910 fputs("===========================================\n", stderr); 01911 01912 ut_print_timestamp(stderr); 01913 01914 fputs(" INNODB TABLE MONITOR OUTPUT\n" 01915 "===========================================\n", 01916 stderr); 01917 dict_print(); 01918 01919 fputs("-----------------------------------\n" 01920 "END OF INNODB TABLE MONITOR OUTPUT\n" 01921 "==================================\n", 01922 stderr); 01923 } 01924 } 01925 01926 mutex_enter(&kernel_mutex); 
01927 01928 some_waits = FALSE; 01929 01930 /* Check of all slots if a thread is waiting there, and if it 01931 has exceeded the time limit */ 01932 01933 for (i = 0; i < OS_THREAD_MAX_N; i++) { 01934 01935 slot = srv_mysql_table + i; 01936 01937 if (slot->in_use) { 01938 some_waits = TRUE; 01939 01940 wait_time = ut_difftime(ut_time(), slot->suspend_time); 01941 01942 if (srv_lock_wait_timeout < 100000000 && 01943 (wait_time > (double) srv_lock_wait_timeout 01944 || wait_time < 0)) { 01945 01946 /* Timeout exceeded or a wrap-around in system 01947 time counter: cancel the lock request queued 01948 by the transaction and release possible 01949 other transactions waiting behind; it is 01950 possible that the lock has already been 01951 granted: in that case do nothing */ 01952 01953 if (thr_get_trx(slot->thr)->wait_lock) { 01954 lock_cancel_waiting_and_release( 01955 thr_get_trx(slot->thr)->wait_lock); 01956 } 01957 } 01958 } 01959 } 01960 01961 os_event_reset(srv_lock_timeout_thread_event); 01962 01963 mutex_exit(&kernel_mutex); 01964 01965 if (srv_shutdown_state >= SRV_SHUTDOWN_CLEANUP) { 01966 goto exit_func; 01967 } 01968 01969 if (some_waits || srv_print_innodb_monitor 01970 || srv_print_innodb_lock_monitor 01971 || srv_print_innodb_tablespace_monitor 01972 || srv_print_innodb_table_monitor) { 01973 goto loop; 01974 } 01975 01976 /* No one was waiting for a lock and no monitor was active: 01977 suspend this thread */ 01978 01979 srv_lock_timeout_and_monitor_active = FALSE; 01980 01981 #if 0 01982 /* The following synchronisation is disabled, since 01983 the InnoDB monitor output is to be updated every 15 seconds. */ 01984 os_event_wait(srv_lock_timeout_thread_event); 01985 #endif 01986 goto loop; 01987 01988 exit_func: 01989 srv_lock_timeout_and_monitor_active = FALSE; 01990 01991 /* We count the number of threads in os_thread_exit(). A created 01992 thread should always use that to exit and not use return() to exit. 
*/ 01993 01994 os_thread_exit(NULL); 01995 01996 OS_THREAD_DUMMY_RETURN; 01997 } 01998 01999 /************************************************************************* 02000 A thread which prints warnings about semaphore waits which have lasted 02001 too long. These can be used to track bugs which cause hangs. */ 02002 02003 os_thread_ret_t 02004 srv_error_monitor_thread( 02005 /*=====================*/ 02006 /* out: a dummy parameter */ 02007 void* arg __attribute__((unused))) 02008 /* in: a dummy parameter required by 02009 os_thread_create */ 02010 { 02011 /* number of successive fatal timeouts observed */ 02012 ulint fatal_cnt = 0; 02013 dulint old_lsn; 02014 dulint new_lsn; 02015 02016 old_lsn = srv_start_lsn; 02017 02018 #ifdef UNIV_DEBUG_THREAD_CREATION 02019 fprintf(stderr, "Error monitor thread starts, id %lu\n", 02020 os_thread_pf(os_thread_get_curr_id())); 02021 #endif 02022 loop: 02023 srv_error_monitor_active = TRUE; 02024 02025 /* Try to track a strange bug reported by Harald Fuchs and others, 02026 where the lsn seems to decrease at times */ 02027 02028 new_lsn = log_get_lsn(); 02029 02030 if (ut_dulint_cmp(new_lsn, old_lsn) < 0) { 02031 ut_print_timestamp(stderr); 02032 fprintf(stderr, 02033 " InnoDB: Error: old log sequence number %lu %lu was greater\n" 02034 "InnoDB: than the new log sequence number %lu %lu!\n" 02035 "InnoDB: Please send a bug report to mysql@lists.mysql.com\n", 02036 (ulong) ut_dulint_get_high(old_lsn), 02037 (ulong) ut_dulint_get_low(old_lsn), 02038 (ulong) ut_dulint_get_high(new_lsn), 02039 (ulong) ut_dulint_get_low(new_lsn)); 02040 } 02041 02042 old_lsn = new_lsn; 02043 02044 if (difftime(time(NULL), srv_last_monitor_time) > 60) { 02045 /* We referesh InnoDB Monitor values so that averages are 02046 printed from at most 60 last seconds */ 02047 02048 srv_refresh_innodb_monitor_stats(); 02049 } 02050 02051 if (sync_array_print_long_waits()) { 02052 fatal_cnt++; 02053 if (fatal_cnt > 5) { 02054 02055 fprintf(stderr, 02056 
"InnoDB: Error: semaphore wait has lasted > %lu seconds\n" 02057 "InnoDB: We intentionally crash the server, because it appears to be hung.\n", 02058 srv_fatal_semaphore_wait_threshold); 02059 02060 ut_error; 02061 } 02062 } else { 02063 fatal_cnt = 0; 02064 } 02065 02066 /* Flush stderr so that a database user gets the output 02067 to possible MySQL error file */ 02068 02069 fflush(stderr); 02070 02071 os_thread_sleep(2000000); 02072 02073 if (srv_shutdown_state < SRV_SHUTDOWN_LAST_PHASE) { 02074 02075 goto loop; 02076 } 02077 02078 srv_error_monitor_active = FALSE; 02079 02080 /* We count the number of threads in os_thread_exit(). A created 02081 thread should always use that to exit and not use return() to exit. */ 02082 02083 os_thread_exit(NULL); 02084 02085 OS_THREAD_DUMMY_RETURN; 02086 } 02087 02088 /*********************************************************************** 02089 Tells the InnoDB server that there has been activity in the database 02090 and wakes up the master thread if it is suspended (not sleeping). Used 02091 in the MySQL interface. Note that there is a small chance that the master 02092 thread stays suspended (we do not protect our operation with the kernel 02093 mutex, for performace reasons). */ 02094 02095 void 02096 srv_active_wake_master_thread(void) 02097 /*===============================*/ 02098 { 02099 srv_activity_count++; 02100 02101 if (srv_n_threads_active[SRV_MASTER] == 0) { 02102 02103 mutex_enter(&kernel_mutex); 02104 02105 srv_release_threads(SRV_MASTER, 1); 02106 02107 mutex_exit(&kernel_mutex); 02108 } 02109 } 02110 02111 /*********************************************************************** 02112 Wakes up the master thread if it is suspended or being suspended. 
*/ 02113 02114 void 02115 srv_wake_master_thread(void) 02116 /*========================*/ 02117 { 02118 srv_activity_count++; 02119 02120 mutex_enter(&kernel_mutex); 02121 02122 srv_release_threads(SRV_MASTER, 1); 02123 02124 mutex_exit(&kernel_mutex); 02125 } 02126 02127 /************************************************************************* 02128 The master thread controlling the server. */ 02129 02130 os_thread_ret_t 02131 srv_master_thread( 02132 /*==============*/ 02133 /* out: a dummy parameter */ 02134 void* arg __attribute__((unused))) 02135 /* in: a dummy parameter required by 02136 os_thread_create */ 02137 { 02138 os_event_t event; 02139 time_t last_flush_time; 02140 time_t current_time; 02141 ulint old_activity_count; 02142 ulint n_pages_purged; 02143 ulint n_bytes_merged; 02144 ulint n_pages_flushed; 02145 ulint n_bytes_archived; 02146 ulint n_tables_to_drop; 02147 ulint n_ios; 02148 ulint n_ios_old; 02149 ulint n_ios_very_old; 02150 ulint n_pend_ios; 02151 ibool skip_sleep = FALSE; 02152 ulint i; 02153 02154 #ifdef UNIV_DEBUG_THREAD_CREATION 02155 fprintf(stderr, "Master thread starts, id %lu\n", 02156 os_thread_pf(os_thread_get_curr_id())); 02157 #endif 02158 srv_main_thread_process_no = os_proc_get_number(); 02159 srv_main_thread_id = os_thread_pf(os_thread_get_curr_id()); 02160 02161 srv_table_reserve_slot(SRV_MASTER); 02162 02163 mutex_enter(&kernel_mutex); 02164 02165 srv_n_threads_active[SRV_MASTER]++; 02166 02167 mutex_exit(&kernel_mutex); 02168 02169 loop: 02170 /*****************************************************************/ 02171 /* ---- When there is database activity by users, we cycle in this 02172 loop */ 02173 02174 srv_main_thread_op_info = "reserving kernel mutex"; 02175 02176 n_ios_very_old = log_sys->n_log_ios + buf_pool->n_pages_read 02177 + buf_pool->n_pages_written; 02178 mutex_enter(&kernel_mutex); 02179 02180 /* Store the user activity counter at the start of this loop */ 02181 old_activity_count = srv_activity_count; 
02182 02183 mutex_exit(&kernel_mutex); 02184 02185 if (srv_force_recovery >= SRV_FORCE_NO_BACKGROUND) { 02186 02187 goto suspend_thread; 02188 } 02189 02190 /* ---- We run the following loop approximately once per second 02191 when there is database activity */ 02192 02193 skip_sleep = FALSE; 02194 02195 for (i = 0; i < 10; i++) { 02196 n_ios_old = log_sys->n_log_ios + buf_pool->n_pages_read 02197 + buf_pool->n_pages_written; 02198 srv_main_thread_op_info = "sleeping"; 02199 02200 if (!skip_sleep) { 02201 02202 os_thread_sleep(1000000); 02203 } 02204 02205 skip_sleep = FALSE; 02206 02207 /* ALTER TABLE in MySQL requires on Unix that the table handler 02208 can drop tables lazily after there no longer are SELECT 02209 queries to them. */ 02210 02211 srv_main_thread_op_info = "doing background drop tables"; 02212 02213 row_drop_tables_for_mysql_in_background(); 02214 02215 srv_main_thread_op_info = ""; 02216 02217 if (srv_fast_shutdown && srv_shutdown_state > 0) { 02218 02219 goto background_loop; 02220 } 02221 02222 /* We flush the log once in a second even if no commit 02223 is issued or the we have specified in my.cnf no flush 02224 at transaction commit */ 02225 02226 srv_main_thread_op_info = "flushing log"; 02227 log_buffer_flush_to_disk(); 02228 02229 srv_main_thread_op_info = "making checkpoint"; 02230 log_free_check(); 02231 02232 /* If there were less than 5 i/os during the 02233 one second sleep, we assume that there is free 02234 disk i/o capacity available, and it makes sense to 02235 do an insert buffer merge. 
*/ 02236 02237 n_pend_ios = buf_get_n_pending_ios() 02238 + log_sys->n_pending_writes; 02239 n_ios = log_sys->n_log_ios + buf_pool->n_pages_read 02240 + buf_pool->n_pages_written; 02241 if (n_pend_ios < 3 && (n_ios - n_ios_old < 5)) { 02242 srv_main_thread_op_info = "doing insert buffer merge"; 02243 ibuf_contract_for_n_pages(TRUE, 5); 02244 02245 srv_main_thread_op_info = "flushing log"; 02246 02247 log_buffer_flush_to_disk(); 02248 } 02249 02250 if (buf_get_modified_ratio_pct() > 02251 srv_max_buf_pool_modified_pct) { 02252 02253 /* Try to keep the number of modified pages in the 02254 buffer pool under the limit wished by the user */ 02255 02256 n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 100, 02257 ut_dulint_max); 02258 02259 /* If we had to do the flush, it may have taken 02260 even more than 1 second, and also, there may be more 02261 to flush. Do not sleep 1 second during the next 02262 iteration of this loop. */ 02263 02264 skip_sleep = TRUE; 02265 } 02266 02267 if (srv_activity_count == old_activity_count) { 02268 02269 /* There is no user activity at the moment, go to 02270 the background loop */ 02271 02272 goto background_loop; 02273 } 02274 } 02275 02276 /* ---- We perform the following code approximately once per 02277 10 seconds when there is database activity */ 02278 02279 #ifdef MEM_PERIODIC_CHECK 02280 /* Check magic numbers of every allocated mem block once in 10 02281 seconds */ 02282 mem_validate_all_blocks(); 02283 #endif 02284 /* If there were less than 200 i/os during the 10 second period, 02285 we assume that there is free disk i/o capacity available, and it 02286 makes sense to flush 100 pages. 
*/ 02287 02288 n_pend_ios = buf_get_n_pending_ios() + log_sys->n_pending_writes; 02289 n_ios = log_sys->n_log_ios + buf_pool->n_pages_read 02290 + buf_pool->n_pages_written; 02291 if (n_pend_ios < 3 && (n_ios - n_ios_very_old < 200)) { 02292 02293 srv_main_thread_op_info = "flushing buffer pool pages"; 02294 buf_flush_batch(BUF_FLUSH_LIST, 100, ut_dulint_max); 02295 02296 srv_main_thread_op_info = "flushing log"; 02297 log_buffer_flush_to_disk(); 02298 } 02299 02300 /* We run a batch of insert buffer merge every 10 seconds, 02301 even if the server were active */ 02302 02303 srv_main_thread_op_info = "doing insert buffer merge"; 02304 ibuf_contract_for_n_pages(TRUE, 5); 02305 02306 srv_main_thread_op_info = "flushing log"; 02307 log_buffer_flush_to_disk(); 02308 02309 /* We run a full purge every 10 seconds, even if the server 02310 were active */ 02311 02312 n_pages_purged = 1; 02313 02314 last_flush_time = time(NULL); 02315 02316 while (n_pages_purged) { 02317 02318 if (srv_fast_shutdown && srv_shutdown_state > 0) { 02319 02320 goto background_loop; 02321 } 02322 02323 srv_main_thread_op_info = "purging"; 02324 n_pages_purged = trx_purge(); 02325 02326 current_time = time(NULL); 02327 02328 if (difftime(current_time, last_flush_time) > 1) { 02329 srv_main_thread_op_info = "flushing log"; 02330 02331 log_buffer_flush_to_disk(); 02332 last_flush_time = current_time; 02333 } 02334 } 02335 02336 srv_main_thread_op_info = "flushing buffer pool pages"; 02337 02338 /* Flush a few oldest pages to make a new checkpoint younger */ 02339 02340 if (buf_get_modified_ratio_pct() > 70) { 02341 02342 /* If there are lots of modified pages in the buffer pool 02343 (> 70 %), we assume we can afford reserving the disk(s) for 02344 the time it requires to flush 100 pages */ 02345 02346 n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 100, 02347 ut_dulint_max); 02348 } else { 02349 /* Otherwise, we only flush a small number of pages so that 02350 we do not unnecessarily use much 
disk i/o capacity from 02351 other work */ 02352 02353 n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 10, 02354 ut_dulint_max); 02355 } 02356 02357 srv_main_thread_op_info = "making checkpoint"; 02358 02359 /* Make a new checkpoint about once in 10 seconds */ 02360 02361 log_checkpoint(TRUE, FALSE); 02362 02363 srv_main_thread_op_info = "reserving kernel mutex"; 02364 02365 mutex_enter(&kernel_mutex); 02366 02367 /* ---- When there is database activity, we jump from here back to 02368 the start of loop */ 02369 02370 if (srv_activity_count != old_activity_count) { 02371 mutex_exit(&kernel_mutex); 02372 goto loop; 02373 } 02374 02375 mutex_exit(&kernel_mutex); 02376 02377 /* If the database is quiet, we enter the background loop */ 02378 02379 /*****************************************************************/ 02380 background_loop: 02381 /* ---- In this loop we run background operations when the server 02382 is quiet from user activity. Also in the case of a shutdown, we 02383 loop here, flushing the buffer pool to the data files. */ 02384 02385 /* The server has been quiet for a while: start running background 02386 operations */ 02387 02388 srv_main_thread_op_info = "doing background drop tables"; 02389 02390 n_tables_to_drop = row_drop_tables_for_mysql_in_background(); 02391 02392 if (n_tables_to_drop > 0) { 02393 /* Do not monopolize the CPU even if there are tables waiting 02394 in the background drop queue. (It is essentially a bug if 02395 MySQL tries to drop a table while there are still open handles 02396 to it and we had to put it to the background drop queue.) 
*/ 02397 02398 os_thread_sleep(100000); 02399 } 02400 02401 srv_main_thread_op_info = "purging"; 02402 02403 /* Run a full purge */ 02404 02405 n_pages_purged = 1; 02406 02407 last_flush_time = time(NULL); 02408 02409 while (n_pages_purged) { 02410 if (srv_fast_shutdown && srv_shutdown_state > 0) { 02411 02412 break; 02413 } 02414 02415 srv_main_thread_op_info = "purging"; 02416 n_pages_purged = trx_purge(); 02417 02418 current_time = time(NULL); 02419 02420 if (difftime(current_time, last_flush_time) > 1) { 02421 srv_main_thread_op_info = "flushing log"; 02422 02423 log_buffer_flush_to_disk(); 02424 last_flush_time = current_time; 02425 } 02426 } 02427 02428 srv_main_thread_op_info = "reserving kernel mutex"; 02429 02430 mutex_enter(&kernel_mutex); 02431 if (srv_activity_count != old_activity_count) { 02432 mutex_exit(&kernel_mutex); 02433 goto loop; 02434 } 02435 mutex_exit(&kernel_mutex); 02436 02437 srv_main_thread_op_info = "doing insert buffer merge"; 02438 02439 if (srv_fast_shutdown && srv_shutdown_state > 0) { 02440 n_bytes_merged = 0; 02441 } else { 02442 n_bytes_merged = ibuf_contract_for_n_pages(TRUE, 20); 02443 } 02444 02445 srv_main_thread_op_info = "reserving kernel mutex"; 02446 02447 mutex_enter(&kernel_mutex); 02448 if (srv_activity_count != old_activity_count) { 02449 mutex_exit(&kernel_mutex); 02450 goto loop; 02451 } 02452 mutex_exit(&kernel_mutex); 02453 02454 flush_loop: 02455 srv_main_thread_op_info = "flushing buffer pool pages"; 02456 02457 if (srv_fast_shutdown < 2) { 02458 n_pages_flushed = 02459 buf_flush_batch(BUF_FLUSH_LIST, 100, ut_dulint_max); 02460 } else { 02461 /* In the fastest shutdown we do not flush the buffer pool 02462 to data files: we set n_pages_flushed to 0 artificially. 
*/ 02463 02464 n_pages_flushed = 0; 02465 } 02466 02467 srv_main_thread_op_info = "reserving kernel mutex"; 02468 02469 mutex_enter(&kernel_mutex); 02470 if (srv_activity_count != old_activity_count) { 02471 mutex_exit(&kernel_mutex); 02472 goto loop; 02473 } 02474 mutex_exit(&kernel_mutex); 02475 02476 srv_main_thread_op_info = "waiting for buffer pool flush to end"; 02477 buf_flush_wait_batch_end(BUF_FLUSH_LIST); 02478 02479 srv_main_thread_op_info = "flushing log"; 02480 02481 log_buffer_flush_to_disk(); 02482 02483 srv_main_thread_op_info = "making checkpoint"; 02484 02485 log_checkpoint(TRUE, FALSE); 02486 02487 if (buf_get_modified_ratio_pct() > srv_max_buf_pool_modified_pct) { 02488 02489 /* Try to keep the number of modified pages in the 02490 buffer pool under the limit wished by the user */ 02491 02492 goto flush_loop; 02493 } 02494 02495 srv_main_thread_op_info = "reserving kernel mutex"; 02496 02497 mutex_enter(&kernel_mutex); 02498 if (srv_activity_count != old_activity_count) { 02499 mutex_exit(&kernel_mutex); 02500 goto loop; 02501 } 02502 mutex_exit(&kernel_mutex); 02503 /* 02504 srv_main_thread_op_info = "archiving log (if log archive is on)"; 02505 02506 log_archive_do(FALSE, &n_bytes_archived); 02507 */ 02508 n_bytes_archived = 0; 02509 02510 /* Keep looping in the background loop if still work to do */ 02511 02512 if (srv_fast_shutdown && srv_shutdown_state > 0) { 02513 if (n_tables_to_drop + n_pages_flushed 02514 + n_bytes_archived != 0) { 02515 02516 /* If we are doing a fast shutdown (= the default) 02517 we do not do purge or insert buffer merge. But we 02518 flush the buffer pool completely to disk. 02519 In a 'very fast' shutdown we do not flush the buffer 02520 pool to data files: we have set n_pages_flushed to 02521 0 artificially. 
*/ 02522 02523 goto background_loop; 02524 } 02525 } else if (n_tables_to_drop + 02526 n_pages_purged + n_bytes_merged + n_pages_flushed 02527 + n_bytes_archived != 0) { 02528 /* In a 'slow' shutdown we run purge and the insert buffer 02529 merge to completion */ 02530 02531 goto background_loop; 02532 } 02533 02534 /* There is no work for background operations either: suspend 02535 master thread to wait for more server activity */ 02536 02537 suspend_thread: 02538 srv_main_thread_op_info = "suspending"; 02539 02540 mutex_enter(&kernel_mutex); 02541 02542 if (row_get_background_drop_list_len_low() > 0) { 02543 mutex_exit(&kernel_mutex); 02544 02545 goto loop; 02546 } 02547 02548 event = srv_suspend_thread(); 02549 02550 mutex_exit(&kernel_mutex); 02551 02552 srv_main_thread_op_info = "waiting for server activity"; 02553 02554 os_event_wait(event); 02555 02556 if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) { 02557 /* This is only extra safety, the thread should exit 02558 already when the event wait ends */ 02559 02560 os_thread_exit(NULL); 02561 } 02562 02563 /* When there is user activity, InnoDB will set the event and the 02564 main thread goes back to loop. */ 02565 02566 goto loop; 02567 } 02568 #endif /* !UNIV_HOTBACKUP */
1.4.7

