Drizzled Public API Documentation

os0file.cc
1 /*****************************************************************************
2 
3 Copyright (C) 1995, 2010, Innobase Oy. All Rights Reserved.
4 Copyright (C) 2009, Percona Inc.
5 
6 Portions of this file contain modifications contributed and copyrighted
7 by Percona Inc.. Those modifications are
8 gratefully acknowledged and are described briefly in the InnoDB
9 documentation. The contributions by Percona Inc. are incorporated with
10 their permission, and subject to the conditions contained in the file
11 COPYING.Percona.
12 
13 This program is free software; you can redistribute it and/or modify it under
14 the terms of the GNU General Public License as published by the Free Software
15 Foundation; version 2 of the License.
16 
17 This program is distributed in the hope that it will be useful, but WITHOUT
18 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
19 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
20 
21 You should have received a copy of the GNU General Public License along with
22 this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
23 St, Fifth Floor, Boston, MA 02110-1301 USA
24 
25 *****************************************************************************/
26 
27 /**************************************************/
34 #include "os0file.h"
35 
36 #ifdef UNIV_NONINL
37 #include "os0file.ic"
38 #endif
39 
40 #include "ut0mem.h"
41 #include "srv0srv.h"
42 #include "srv0start.h"
43 #include "fil0fil.h"
44 #include "buf0buf.h"
45 #include <errno.h>
46 #include <fcntl.h>
47 #include <limits.h>
48 #include <unistd.h>
49 #ifndef UNIV_HOTBACKUP
50 # include "os0sync.h"
51 # include "os0thread.h"
52 #else /* !UNIV_HOTBACKUP */
53 # ifdef __WIN__
54 /* Add includes for the _stat() call to compile on Windows */
55 # include <sys/types.h>
56 # include <sys/stat.h>
57 # endif /* __WIN__ */
58 #endif /* !UNIV_HOTBACKUP */
59 
60 #if defined(LINUX_NATIVE_AIO)
61 #include <libaio.h>
62 #endif
63 
64 /* This specifies the file permissions InnoDB uses when it creates files in
65 Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
66 my_umask */
67 
68 #ifndef __WIN__
69 
70 UNIV_INTERN ulint os_innodb_umask
71  = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
72 #else
73 
74 UNIV_INTERN ulint os_innodb_umask = 0;
75 #endif
76 
77 #ifdef UNIV_DO_FLUSH
78 /* If the following is set to TRUE, we do not call os_file_flush in every
79 os_file_write. We can set this TRUE when the doublewrite buffer is used. */
80 UNIV_INTERN ibool os_do_not_call_flush_at_each_write = FALSE;
81 #else
82 /* We do not call os_file_flush in every os_file_write. */
83 #endif /* UNIV_DO_FLUSH */
84 
85 #ifndef UNIV_HOTBACKUP
86 /* We use these mutexes to protect lseek + file i/o operation, if the
87 OS does not provide an atomic pread or pwrite, or similar */
88 #define OS_FILE_N_SEEK_MUTEXES 16
89 UNIV_INTERN os_mutex_t os_file_seek_mutexes[OS_FILE_N_SEEK_MUTEXES];
90 
91 /* In simulated aio, merge at most this many consecutive i/os */
92 #define OS_AIO_MERGE_N_CONSECUTIVE 64
93 
94 /**********************************************************************
95 
96 InnoDB AIO Implementation:
97 =========================
98 
We support native AIO for Windows and Linux. On all other platforms
we simulate AIO with dedicated I/O threads that service the I/O requests.
101 
102 Simulated AIO:
103 ==============
104 
105 In platforms where we 'simulate' AIO following is a rough explanation
106 of the high level design.
107 There are four io-threads (for ibuf, log, read, write).
108 All synchronous IO requests are serviced by the calling thread using
109 os_file_write/os_file_read. The Asynchronous requests are queued up
110 in an array (there are four such arrays) by the calling thread.
111 Later these requests are picked up by the io-thread and are serviced
112 synchronously.
113 
114 Windows native AIO:
115 ==================
116 
If srv_use_native_aio is not set, then Windows follows the same
code path as simulated AIO. If the flag is set, then the native AIO
interface is used. On Windows, one limitation is that if a file is
opened for AIO, no synchronous I/O can be done on it. Therefore we have
an extra fifth array to queue up synchronous I/O requests.
122 There are innodb_file_io_threads helper threads. These threads work
123 on the four arrays mentioned above in Simulated AIO. No thread is
124 required for the sync array.
125 If a synchronous IO request is made, it is first queued in the sync
126 array. Then the calling thread itself waits on the request, thus
127 making the call synchronous.
128 If an AIO request is made the calling thread not only queues it in the
129 array but also submits the requests. The helper thread then collects
130 the completed IO request and calls completion routine on it.
131 
132 Linux native AIO:
133 =================
134 
135 If we have libaio installed on the system and innodb_use_native_aio
136 is set to TRUE we follow the code path of native AIO, otherwise we
137 do simulated AIO.
138 There are innodb_file_io_threads helper threads. These threads work
139 on the four arrays mentioned above in Simulated AIO.
140 If a synchronous IO request is made, it is handled by calling
141 os_file_write/os_file_read.
142 If an AIO request is made the calling thread not only queues it in the
143 array but also submits the requests. The helper thread then collects
144 the completed IO request and calls completion routine on it.
145 
146 **********************************************************************/
147 
149 UNIV_INTERN ibool os_aio_print_debug = FALSE;
150 
151 #ifdef UNIV_PFS_IO
152 /* Keys to register InnoDB I/O with performance schema */
153 UNIV_INTERN mysql_pfs_key_t innodb_file_data_key;
154 UNIV_INTERN mysql_pfs_key_t innodb_file_log_key;
155 UNIV_INTERN mysql_pfs_key_t innodb_file_temp_key;
156 #endif /* UNIV_PFS_IO */
157 
159 typedef struct os_aio_slot_struct os_aio_slot_t;
160 
163  ibool is_read;
164  ulint pos;
166  ibool reserved;
168  ulint len;
170  byte* buf;
171  ulint type;
172  ulint offset;
174  ulint offset_high;
176  const char* name;
183  void* message2;
187 #ifdef WIN_ASYNC_IO
188  HANDLE handle;
190  OVERLAPPED control;
192 #elif defined(LINUX_NATIVE_AIO)
193  struct iocb control; /* Linux control block for aio */
194  int n_bytes; /* bytes written/read. */
195  int ret; /* AIO return code */
196 #endif
197 };
198 
200 typedef struct os_aio_array_struct os_aio_array_t;
201 
213  ulint n_slots;
216  ulint n_segments;
221  ulint cur_seg;
225  ulint n_reserved;
229 #ifdef __WIN__
230  HANDLE* handles;
237 #endif
238 
239 #if defined(LINUX_NATIVE_AIO)
240  io_context_t* aio_ctx;
241  /* completion queue for IO. There is
242  one such queue per segment. Each thread
243  will work on one ctx exclusively. */
244  struct io_event* aio_events;
245  /* The array to collect completed IOs.
246  There is one such event for each
247  possible pending IO. The size of the
248  array is equal to n_slots. */
249 #endif
250 };
251 
252 #if defined(LINUX_NATIVE_AIO)
253 
254 #define OS_AIO_REAP_TIMEOUT (500000000UL)
255 
257 #define OS_AIO_IO_SETUP_RETRY_SLEEP (500000UL)
258 
260 #define OS_AIO_IO_SETUP_RETRY_ATTEMPTS 5
261 #endif
262 
264 static os_event_t* os_aio_segment_wait_events = NULL;
265 
268 static os_aio_array_t* os_aio_read_array = NULL;
269 static os_aio_array_t* os_aio_write_array = NULL;
270 static os_aio_array_t* os_aio_ibuf_array = NULL;
271 static os_aio_array_t* os_aio_log_array = NULL;
272 static os_aio_array_t* os_aio_sync_array = NULL;
273 /* @} */
274 
276 static ulint os_aio_n_segments = ULINT_UNDEFINED;
277 
280 static ibool os_aio_recommend_sleep_for_read_threads = FALSE;
281 #endif /* !UNIV_HOTBACKUP */
282 
283 UNIV_INTERN ulint os_n_file_reads = 0;
284 UNIV_INTERN ulint os_bytes_read_since_printout = 0;
285 UNIV_INTERN ulint os_n_file_writes = 0;
286 UNIV_INTERN ulint os_n_fsyncs = 0;
287 UNIV_INTERN ulint os_n_file_reads_old = 0;
288 UNIV_INTERN ulint os_n_file_writes_old = 0;
289 UNIV_INTERN ulint os_n_fsyncs_old = 0;
290 UNIV_INTERN time_t os_last_printout;
291 
292 UNIV_INTERN ibool os_has_said_disk_full = FALSE;
293 
294 #ifndef UNIV_HOTBACKUP
295 
296 static os_mutex_t os_file_count_mutex;
297 #endif /* !UNIV_HOTBACKUP */
298 
299 UNIV_INTERN ulint os_file_n_pending_preads = 0;
301 UNIV_INTERN ulint os_file_n_pending_pwrites = 0;
303 UNIV_INTERN ulint os_n_pending_writes = 0;
305 UNIV_INTERN ulint os_n_pending_reads = 0;
306 
#ifdef UNIV_DEBUG
/**********************************************************************//**
Throttled front end for os_aio_validate(): the real check is run only on
every OS_AIO_VALIDATE_SKIP'th call; all other calls report success
without checking anything.
@return TRUE on a skipped call, otherwise the result of os_aio_validate() */
UNIV_INTERN
ibool
os_aio_validate_skip(void)
/*======================*/
{
/** Number of calls between two real os_aio_validate() runs */
# define OS_AIO_VALIDATE_SKIP	13

	/** Calls remaining until os_aio_validate() is run again */
	static int	calls_left = OS_AIO_VALIDATE_SKIP;

	/* The counter is updated without any synchronisation. That is
	acceptable: it only serves to make the costly os_aio_validate()
	check run less often in debug builds. */
	calls_left--;

	if (calls_left <= 0) {
		calls_left = OS_AIO_VALIDATE_SKIP;

		return(os_aio_validate());
	}

	return(TRUE);
}
#endif /* UNIV_DEBUG */
335 
#ifdef __WIN__
/***********************************************************************//**
Determines which Windows version the process is running on. Only
compiled on Windows.
@return OS_WIN31, OS_WIN95, OS_WINNT, OS_WIN2000, OS_WINXP, OS_WINVISTA
or OS_WIN7 */
UNIV_INTERN
ulint
os_get_os_version(void)
/*===================*/
{
	OSVERSIONINFO	ver;

	ver.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);

	ut_a(GetVersionEx(&ver));

	switch (ver.dwPlatformId) {
	case VER_PLATFORM_WIN32s:
		return(OS_WIN31);
	case VER_PLATFORM_WIN32_WINDOWS:
		return(OS_WIN95);
	case VER_PLATFORM_WIN32_NT:
		/* The NT family is told apart by version number below */
		break;
	default:
		ut_error;
		return(0);
	}

	if (ver.dwMajorVersion == 3 || ver.dwMajorVersion == 4) {
		return OS_WINNT;
	}

	if (ver.dwMajorVersion == 5) {
		if (ver.dwMinorVersion == 0) {
			return OS_WIN2000;
		}
		return OS_WINXP;
	}

	if (ver.dwMajorVersion == 6 && ver.dwMinorVersion == 0) {
		return OS_WINVISTA;
	}

	/* 6.x with a non-zero minor version, and any major version not
	listed above, are reported as OS_WIN7 */
	return OS_WIN7;
}
#endif /* __WIN__ */
376 
377 /***********************************************************************/
383 UNIV_INTERN
384 ulint
386 /*===================*/
387  ibool report_all_errors)
389 {
390  ulint err;
391 
392 #ifdef __WIN__
393 
394  err = (ulint) GetLastError();
395 
396  if (report_all_errors
397  || (err != ERROR_DISK_FULL && err != ERROR_FILE_EXISTS)) {
398 
399  ut_print_timestamp(stderr);
400  fprintf(stderr,
401  " InnoDB: Operating system error number %lu"
402  " in a file operation.\n", (ulong) err);
403 
404  if (err == ERROR_PATH_NOT_FOUND) {
405  fprintf(stderr,
406  "InnoDB: The error means the system"
407  " cannot find the path specified.\n");
408 
409  if (srv_is_being_started) {
410  fprintf(stderr,
411  "InnoDB: If you are installing InnoDB,"
412  " remember that you must create\n"
413  "InnoDB: directories yourself, InnoDB"
414  " does not create them.\n");
415  }
416  } else if (err == ERROR_ACCESS_DENIED) {
417  fprintf(stderr,
418  "InnoDB: The error means mysqld does not have"
419  " the access rights to\n"
420  "InnoDB: the directory. It may also be"
421  " you have created a subdirectory\n"
422  "InnoDB: of the same name as a data file.\n");
423  } else if (err == ERROR_SHARING_VIOLATION
424  || err == ERROR_LOCK_VIOLATION) {
425  fprintf(stderr,
426  "InnoDB: The error means that another program"
427  " is using InnoDB's files.\n"
428  "InnoDB: This might be a backup or antivirus"
429  " software or another instance\n"
430  "InnoDB: of MySQL."
431  " Please close it to get rid of this error.\n");
432  } else if (err == ERROR_WORKING_SET_QUOTA
433  || err == ERROR_NO_SYSTEM_RESOURCES) {
434  fprintf(stderr,
435  "InnoDB: The error means that there are no"
436  " sufficient system resources or quota to"
437  " complete the operation.\n");
438  } else if (err == ERROR_OPERATION_ABORTED) {
439  fprintf(stderr,
440  "InnoDB: The error means that the I/O"
441  " operation has been aborted\n"
442  "InnoDB: because of either a thread exit"
443  " or an application request.\n"
444  "InnoDB: Retry attempt is made.\n");
445  } else {
446  fprintf(stderr,
447  "InnoDB: Some operating system error numbers"
448  " are described at\n"
449  "InnoDB: "
450  REFMAN
451  "operating-system-error-codes.html\n");
452  }
453  }
454 
455  fflush(stderr);
456 
457  if (err == ERROR_FILE_NOT_FOUND) {
458  return(OS_FILE_NOT_FOUND);
459  } else if (err == ERROR_DISK_FULL) {
460  return(OS_FILE_DISK_FULL);
461  } else if (err == ERROR_FILE_EXISTS) {
462  return(OS_FILE_ALREADY_EXISTS);
463  } else if (err == ERROR_SHARING_VIOLATION
464  || err == ERROR_LOCK_VIOLATION) {
465  return(OS_FILE_SHARING_VIOLATION);
466  } else if (err == ERROR_WORKING_SET_QUOTA
467  || err == ERROR_NO_SYSTEM_RESOURCES) {
468  return(OS_FILE_INSUFFICIENT_RESOURCE);
469  } else if (err == ERROR_OPERATION_ABORTED) {
470  return(OS_FILE_OPERATION_ABORTED);
471  } else {
472  return(100 + err);
473  }
474 #else
475  err = (ulint) errno;
476 
477  if (report_all_errors
478  || (err != ENOSPC && err != EEXIST)) {
479 
480  ut_print_timestamp(stderr);
481  fprintf(stderr,
482  " InnoDB: Operating system error number %lu"
483  " in a file operation.\n", (ulong) err);
484 
485  if (err == ENOENT) {
486  fprintf(stderr,
487  "InnoDB: The error means the system"
488  " cannot find the path specified.\n");
489 
490  if (srv_is_being_started) {
491  fprintf(stderr,
492  "InnoDB: If you are installing InnoDB,"
493  " remember that you must create\n"
494  "InnoDB: directories yourself, InnoDB"
495  " does not create them.\n");
496  }
497  } else if (err == EACCES) {
498  fprintf(stderr,
499  "InnoDB: The error means mysqld does not have"
500  " the access rights to\n"
501  "InnoDB: the directory.\n");
502  } else {
503  if (strerror((int)err) != NULL) {
504  fprintf(stderr,
505  "InnoDB: Error number %lu"
506  " means '%s'.\n",
507  err, strerror((int)err));
508  }
509 
510  fprintf(stderr,
511  "InnoDB: Some operating system"
512  " error numbers are described at\n"
513  "InnoDB: "
514  REFMAN
515  "operating-system-error-codes.html\n");
516  }
517  }
518 
519  fflush(stderr);
520 
521  switch (err) {
522  case ENOSPC:
523  return(OS_FILE_DISK_FULL);
524  case ENOENT:
525  return(OS_FILE_NOT_FOUND);
526  case EEXIST:
527  return(OS_FILE_ALREADY_EXISTS);
528  case EXDEV:
529  case ENOTDIR:
530  case EISDIR:
531  return(OS_FILE_PATH_ERROR);
532  case EAGAIN:
533  if (srv_use_native_aio) {
534  return(OS_FILE_AIO_RESOURCES_RESERVED);
535  }
536  break;
537  case EINTR:
538  if (srv_use_native_aio) {
539  return(OS_FILE_AIO_INTERRUPTED);
540  }
541  break;
542  }
543  return(100 + err);
544 #endif
545 }
546 
547 /****************************************************************/
552 static
553 ibool
554 os_file_handle_error_cond_exit(
555 /*===========================*/
556  const char* name,
557  const char* operation,
558  ibool should_exit)
560 {
561  ulint err;
562 
563  err = os_file_get_last_error(FALSE);
564 
565  if (err == OS_FILE_DISK_FULL) {
566  /* We only print a warning about disk full once */
567 
568  if (os_has_said_disk_full) {
569 
570  return(FALSE);
571  }
572 
573  if (name) {
574  ut_print_timestamp(stderr);
575  fprintf(stderr,
576  " InnoDB: Encountered a problem with"
577  " file %s\n", name);
578  }
579 
580  ut_print_timestamp(stderr);
581  fprintf(stderr,
582  " InnoDB: Disk is full. Try to clean the disk"
583  " to free space.\n");
584 
585  os_has_said_disk_full = TRUE;
586 
587  fflush(stderr);
588 
589  return(FALSE);
590  } else if (err == OS_FILE_AIO_RESOURCES_RESERVED) {
591 
592  return(TRUE);
593  } else if (err == OS_FILE_AIO_INTERRUPTED) {
594 
595  return(TRUE);
596  } else if (err == OS_FILE_ALREADY_EXISTS
597  || err == OS_FILE_PATH_ERROR) {
598 
599  return(FALSE);
600  } else if (err == OS_FILE_SHARING_VIOLATION) {
601 
602  os_thread_sleep(10000000); /* 10 sec */
603  return(TRUE);
604  } else if (err == OS_FILE_INSUFFICIENT_RESOURCE) {
605 
606  os_thread_sleep(100000); /* 100 ms */
607  return(TRUE);
608  } else if (err == OS_FILE_OPERATION_ABORTED) {
609 
610  os_thread_sleep(100000); /* 100 ms */
611  return(TRUE);
612  } else {
613  if (name) {
614  fprintf(stderr, "InnoDB: File name %s\n", name);
615  }
616 
617  fprintf(stderr, "InnoDB: File operation call: '%s'.\n",
618  operation);
619 
620  if (should_exit) {
621  fprintf(stderr, "InnoDB: Cannot continue operation.\n");
622 
623  fflush(stderr);
624 
625  exit(1);
626  }
627  }
628 
629  return(FALSE);
630 }
631 
632 /****************************************************************/
635 static
636 ibool
637 os_file_handle_error(
638 /*=================*/
639  const char* name,
640  const char* operation)
641 {
642  /* exit in case of unknown error */
643  return(os_file_handle_error_cond_exit(name, operation, TRUE));
644 }
645 
646 /****************************************************************/
649 static
650 ibool
651 os_file_handle_error_no_exit(
652 /*=========================*/
653  const char* name,
654  const char* operation)
655 {
656  /* don't exit in case of unknown error */
657  return(os_file_handle_error_cond_exit(name, operation, FALSE));
658 }
659 
660 #undef USE_FILE_LOCK
661 #define USE_FILE_LOCK
662 #if defined(UNIV_HOTBACKUP) || defined(__WIN__)
663 /* InnoDB Hot Backup does not lock the data files.
664  * On Windows, mandatory locking is used.
665  */
666 # undef USE_FILE_LOCK
667 #endif
668 #ifdef USE_FILE_LOCK
669 /****************************************************************/
672 static
673 int
674 os_file_lock(
675 /*=========*/
676  int fd,
677  const char* name)
678 {
679  struct flock lk;
680 
681  if (srv_read_only)
682  return 0;
683 
684  lk.l_type = F_WRLCK;
685  lk.l_whence = SEEK_SET;
686  lk.l_start = lk.l_len = 0;
687  if (fcntl(fd, F_SETLK, &lk) == -1) {
688  fprintf(stderr,
689  "InnoDB: Unable to lock %s, error: %d\n", name, errno);
690 
691  if (errno == EAGAIN || errno == EACCES) {
692  fprintf(stderr,
693  "InnoDB: Check that you do not already have"
694  " another drizzled process\n"
695  "InnoDB: using the same InnoDB data"
696  " or log files.\n");
697  }
698 
699  return(-1);
700  }
701 
702  return(0);
703 }
704 #endif /* USE_FILE_LOCK */
705 
706 #ifndef UNIV_HOTBACKUP
707 /****************************************************************/
709 UNIV_INTERN
710 void
712 /*===================*/
713 {
714  ulint i;
715 
716  os_file_count_mutex = os_mutex_create();
717 
718  for (i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) {
719  os_file_seek_mutexes[i] = os_mutex_create();
720  }
721 }
722 
723 /***********************************************************************/
727 UNIV_INTERN
728 FILE*
730 /*========================*/
731 {
732  FILE* file = NULL;
733  int fd = innobase_mysql_tmpfile();
734 
735  if (fd >= 0) {
736  file = fdopen(fd, "w+b");
737  }
738 
739  if (!file) {
740  ut_print_timestamp(stderr);
741  fprintf(stderr,
742  " InnoDB: Error: unable to create temporary file;"
743  " errno: %d\n", errno);
744  if (fd >= 0) {
745  close(fd);
746  }
747  }
748 
749  return(file);
750 }
751 #endif /* !UNIV_HOTBACKUP */
752 
753 /***********************************************************************/
759 UNIV_INTERN
762 /*============*/
763  const char* dirname,
765  ibool error_is_fatal)
770 {
771  os_file_dir_t dir;
772 #ifdef __WIN__
773  LPWIN32_FIND_DATA lpFindFileData;
774  char path[OS_FILE_MAX_PATH + 3];
775 
776  ut_a(strlen(dirname) < OS_FILE_MAX_PATH);
777 
778  strcpy(path, dirname);
779  strcpy(path + strlen(path), "\\*");
780 
781  /* Note that in Windows opening the 'directory stream' also retrieves
782  the first entry in the directory. Since it is '.', that is no problem,
783  as we will skip over the '.' and '..' entries anyway. */
784 
785  lpFindFileData = ut_malloc(sizeof(WIN32_FIND_DATA));
786 
787  dir = FindFirstFile((LPCTSTR) path, lpFindFileData);
788 
789  ut_free(lpFindFileData);
790 
791  if (dir == INVALID_HANDLE_VALUE) {
792 
793  if (error_is_fatal) {
794  os_file_handle_error(dirname, "opendir");
795  }
796 
797  return(NULL);
798  }
799 
800  return(dir);
801 #else
802  dir = opendir(dirname);
803 
804  if (dir == NULL && error_is_fatal) {
805  os_file_handle_error(dirname, "opendir");
806  }
807 
808  return(dir);
809 #endif
810 }
811 
812 /***********************************************************************/
815 UNIV_INTERN
816 int
818 /*=============*/
819  os_file_dir_t dir)
820 {
821 #ifdef __WIN__
822  BOOL ret;
823 
824  ret = FindClose(dir);
825 
826  if (!ret) {
827  os_file_handle_error_no_exit(NULL, "closedir");
828 
829  return(-1);
830  }
831 
832  return(0);
833 #else
834  int ret;
835 
836  ret = closedir(dir);
837 
838  if (ret) {
839  os_file_handle_error_no_exit(NULL, "closedir");
840  }
841 
842  return(ret);
843 #endif
844 }
845 
846 /***********************************************************************/
850 UNIV_INTERN
851 int
853 /*======================*/
854  const char* dirname,
855  os_file_dir_t dir,
856  os_file_stat_t* info)
857 {
858 #ifdef __WIN__
859  LPWIN32_FIND_DATA lpFindFileData;
860  BOOL ret;
861 
862  lpFindFileData = ut_malloc(sizeof(WIN32_FIND_DATA));
863 next_file:
864  ret = FindNextFile(dir, lpFindFileData);
865 
866  if (ret) {
867  ut_a(strlen((char *) lpFindFileData->cFileName)
868  < OS_FILE_MAX_PATH);
869 
870  if (strcmp((char *) lpFindFileData->cFileName, ".") == 0
871  || strcmp((char *) lpFindFileData->cFileName, "..") == 0) {
872 
873  goto next_file;
874  }
875 
876  strcpy(info->name, (char *) lpFindFileData->cFileName);
877 
878  info->size = (ib_int64_t)(lpFindFileData->nFileSizeLow)
879  + (((ib_int64_t)(lpFindFileData->nFileSizeHigh))
880  << 32);
881 
882  if (lpFindFileData->dwFileAttributes
883  & FILE_ATTRIBUTE_REPARSE_POINT) {
884  /* TODO: test Windows symlinks */
885  /* TODO: MySQL has apparently its own symlink
886  implementation in Windows, dbname.sym can
887  redirect a database directory:
888  REFMAN "windows-symbolic-links.html" */
889  info->type = OS_FILE_TYPE_LINK;
890  } else if (lpFindFileData->dwFileAttributes
891  & FILE_ATTRIBUTE_DIRECTORY) {
892  info->type = OS_FILE_TYPE_DIR;
893  } else {
894  /* It is probably safest to assume that all other
895  file types are normal. Better to check them rather
896  than blindly skip them. */
897 
898  info->type = OS_FILE_TYPE_FILE;
899  }
900  }
901 
902  ut_free(lpFindFileData);
903 
904  if (ret) {
905  return(0);
906  } else if (GetLastError() == ERROR_NO_MORE_FILES) {
907 
908  return(1);
909  } else {
910  os_file_handle_error_no_exit(dirname,
911  "readdir_next_file");
912  return(-1);
913  }
914 #else
915  struct dirent* ent;
916  char* full_path;
917  int ret;
918  struct stat statinfo;
919 #ifdef HAVE_READDIR_R
920 #ifndef __GNU__
921  char dirent_buf[sizeof(struct dirent)
922  + _POSIX_PATH_MAX + 100];
923  /* In /mysys/my_lib.c, _POSIX_PATH_MAX + 1 is used as
924  the max file name len; but in most standards, the
925  length is NAME_MAX; we add 100 to be even safer */
926 #else
927  char dirent_buf[sizeof(struct dirent)
928  + UCHAR_MAX + 100];
932 #endif
933 #endif
934 
935 next_file:
936 
937 #ifdef HAVE_READDIR_R
938  ret = readdir_r(dir, (struct dirent*)dirent_buf, &ent);
939 
940  if (ret != 0
941 #ifdef UNIV_AIX
942  /* On AIX, only if we got non-NULL 'ent' (result) value and
943  a non-zero 'ret' (return) value, it indicates a failed
944  readdir_r() call. An NULL 'ent' with an non-zero 'ret'
945  would indicate the "end of the directory" is reached. */
946  && ent != NULL
947 #endif
948  ) {
949  fprintf(stderr,
950  "InnoDB: cannot read directory %s, error %lu\n",
951  dirname, (ulong)ret);
952 
953  return(-1);
954  }
955 
956  if (ent == NULL) {
957  /* End of directory */
958 
959  return(1);
960  }
961 
962 #ifndef __GNU__
963  ut_a(strlen(ent->d_name) < _POSIX_PATH_MAX + 100 - 1);
964 #else
965  ut_a(strlen(ent->d_name) < UCHAR_MAX + 100 - 1);
966 #endif
967 
968 #else
969  ent = readdir(dir);
970 
971  if (ent == NULL) {
972 
973  return(1);
974  }
975 #endif
976  ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH);
977 
978  if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) {
979 
980  goto next_file;
981  }
982 
983  strcpy(info->name, ent->d_name);
984 
985  full_path = static_cast<char* >(ut_malloc(strlen(dirname) + strlen(ent->d_name) + 10));
986 
987  sprintf(full_path, "%s/%s", dirname, ent->d_name);
988 
989  ret = stat(full_path, &statinfo);
990 
991  if (ret) {
992 
993  if (errno == ENOENT) {
994  /* readdir() returned a file that does not exist,
995  it must have been deleted in the meantime. Do what
996  would have happened if the file was deleted before
997  readdir() - ignore and go to the next entry.
998  If this is the last entry then info->name will still
999  contain the name of the deleted file when this
1000  function returns, but this is not an issue since the
1001  caller shouldn't be looking at info when end of
1002  directory is returned. */
1003 
1004  ut_free(full_path);
1005 
1006  goto next_file;
1007  }
1008 
1009  os_file_handle_error_no_exit(full_path, "stat");
1010 
1011  ut_free(full_path);
1012 
1013  return(-1);
1014  }
1015 
1016  info->size = (ib_int64_t)statinfo.st_size;
1017 
1018  if (S_ISDIR(statinfo.st_mode)) {
1019  info->type = OS_FILE_TYPE_DIR;
1020  } else if (S_ISLNK(statinfo.st_mode)) {
1021  info->type = OS_FILE_TYPE_LINK;
1022  } else if (S_ISREG(statinfo.st_mode)) {
1023  info->type = OS_FILE_TYPE_FILE;
1024  } else {
1025  info->type = OS_FILE_TYPE_UNKNOWN;
1026  }
1027 
1028  ut_free(full_path);
1029 
1030  return(0);
1031 #endif
1032 }
1033 
1034 /*****************************************************************/
1040 UNIV_INTERN
1041 ibool
1043 /*=====================*/
1044  const char* pathname,
1046  ibool fail_if_exists)
1048 {
1049 #ifdef __WIN__
1050  BOOL rcode;
1051 
1052  rcode = CreateDirectory((LPCTSTR) pathname, NULL);
1053  if (!(rcode != 0
1054  || (GetLastError() == ERROR_ALREADY_EXISTS
1055  && !fail_if_exists))) {
1056  /* failure */
1057  os_file_handle_error(pathname, "CreateDirectory");
1058 
1059  return(FALSE);
1060  }
1061 
1062  return (TRUE);
1063 #else
1064  int rcode;
1065 
1066  rcode = mkdir(pathname, 0770);
1067 
1068  if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) {
1069  /* failure */
1070  os_file_handle_error(pathname, "mkdir");
1071 
1072  return(FALSE);
1073  }
1074 
1075  return (TRUE);
1076 #endif
1077 }
1078 
1079 /****************************************************************/
1085 UNIV_INTERN
1086 os_file_t
1088 /*=======================*/
1089  const char* name,
1091  ulint create_mode,
1098  ulint access_type,
1100  ibool* success)
1101 {
1102 #ifdef __WIN__
1103  os_file_t file;
1104  DWORD create_flag;
1105  DWORD access;
1106  DWORD attributes = 0;
1107  ibool retry;
1108 
1109 try_again:
1110  ut_a(name);
1111 
1112  if (create_mode == OS_FILE_OPEN) {
1113  create_flag = OPEN_EXISTING;
1114  } else if (create_mode == OS_FILE_CREATE) {
1115  create_flag = CREATE_NEW;
1116  } else if (create_mode == OS_FILE_CREATE_PATH) {
1117  /* create subdirs along the path if needed */
1118  *success = os_file_create_subdirs_if_needed(name);
1119  if (!*success) {
1120  ut_error;
1121  }
1122  create_flag = CREATE_NEW;
1123  create_mode = OS_FILE_CREATE;
1124  } else {
1125  create_flag = 0;
1126  ut_error;
1127  }
1128 
1129  if (access_type == OS_FILE_READ_ONLY) {
1130  access = GENERIC_READ;
1131  } else if (access_type == OS_FILE_READ_WRITE) {
1132  access = GENERIC_READ | GENERIC_WRITE;
1133  } else {
1134  access = 0;
1135  ut_error;
1136  }
1137 
1138  file = CreateFile((LPCTSTR) name,
1139  access,
1140  FILE_SHARE_READ | FILE_SHARE_WRITE,
1141  /* file can be read and written also
1142  by other processes */
1143  NULL, /* default security attributes */
1144  create_flag,
1145  attributes,
1146  NULL);
1148  if (file == INVALID_HANDLE_VALUE) {
1149  *success = FALSE;
1150 
1151  retry = os_file_handle_error(name,
1152  create_mode == OS_FILE_OPEN ?
1153  "open" : "create");
1154  if (retry) {
1155  goto try_again;
1156  }
1157  } else {
1158  *success = TRUE;
1159  }
1160 
1161  return(file);
1162 #else /* __WIN__ */
1163  os_file_t file;
1164  int create_flag;
1165  ibool retry;
1166 
1167 try_again:
1168  ut_a(name);
1169 
1170  if (create_mode == OS_FILE_OPEN) {
1171  if (access_type == OS_FILE_READ_ONLY) {
1172  create_flag = O_RDONLY;
1173  } else {
1174  create_flag = O_RDWR;
1175  }
1176  } else if (create_mode == OS_FILE_CREATE) {
1177  create_flag = O_RDWR | O_CREAT | O_EXCL;
1178  } else if (create_mode == OS_FILE_CREATE_PATH) {
1179  /* create subdirs along the path if needed */
1180  *success = os_file_create_subdirs_if_needed(name);
1181  if (!*success) {
1182  return (-1);
1183  }
1184  create_flag = O_RDWR | O_CREAT | O_EXCL;
1185  create_mode = OS_FILE_CREATE;
1186  } else {
1187  create_flag = 0;
1188  ut_error;
1189  }
1190 
1191  if (create_mode == OS_FILE_CREATE) {
1192  file = open(name, create_flag, S_IRUSR | S_IWUSR
1193  | S_IRGRP | S_IWGRP);
1194  } else {
1195  file = open(name, create_flag);
1196  }
1197 
1198  if (file == -1) {
1199  *success = FALSE;
1200 
1201  retry = os_file_handle_error(name,
1202  create_mode == OS_FILE_OPEN ?
1203  "open" : "create");
1204  if (retry) {
1205  goto try_again;
1206  }
1207 #ifdef USE_FILE_LOCK
1208  } else if (access_type == OS_FILE_READ_WRITE
1209  && os_file_lock(file, name)) {
1210  *success = FALSE;
1211  close(file);
1212  file = -1;
1213 #endif
1214  } else {
1215  *success = TRUE;
1216  }
1217 
1218  return(file);
1219 #endif /* __WIN__ */
1220 }
1221 
1222 /****************************************************************/
1228 UNIV_INTERN
1229 os_file_t
1231 /*=========================================*/
1232  const char* name,
1234  ulint create_mode,
1238  ulint access_type,
1242  ibool* success)
1243 {
1244 #ifdef __WIN__
1245  os_file_t file;
1246  DWORD create_flag;
1247  DWORD access;
1248  DWORD attributes = 0;
1249  DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE;
1250 
1251  ut_a(name);
1252 
1253  if (create_mode == OS_FILE_OPEN) {
1254  create_flag = OPEN_EXISTING;
1255  } else if (create_mode == OS_FILE_CREATE) {
1256  create_flag = CREATE_NEW;
1257  } else {
1258  create_flag = 0;
1259  ut_error;
1260  }
1261 
1262  if (access_type == OS_FILE_READ_ONLY) {
1263  access = GENERIC_READ;
1264  } else if (access_type == OS_FILE_READ_WRITE) {
1265  access = GENERIC_READ | GENERIC_WRITE;
1266  } else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
1267  access = GENERIC_READ;
1268  share_mode = FILE_SHARE_DELETE | FILE_SHARE_READ
1269  | FILE_SHARE_WRITE;
1273  } else {
1274  access = 0;
1275  ut_error;
1276  }
1277 
1278  file = CreateFile((LPCTSTR) name,
1279  access,
1280  share_mode,
1281  NULL, /* default security attributes */
1282  create_flag,
1283  attributes,
1284  NULL);
1286  if (file == INVALID_HANDLE_VALUE) {
1287  *success = FALSE;
1288  } else {
1289  *success = TRUE;
1290  }
1291 
1292  return(file);
1293 #else /* __WIN__ */
1294  os_file_t file;
1295  int create_flag;
1296 
1297  ut_a(name);
1298 
1299  if (create_mode == OS_FILE_OPEN) {
1300  if (access_type == OS_FILE_READ_ONLY) {
1301  create_flag = O_RDONLY;
1302  } else {
1303  create_flag = O_RDWR;
1304  }
1305  } else if (create_mode == OS_FILE_CREATE) {
1306  create_flag = O_RDWR | O_CREAT | O_EXCL;
1307  } else {
1308  create_flag = 0;
1309  ut_error;
1310  }
1311 
1312  if (create_mode == OS_FILE_CREATE) {
1313  file = open(name, create_flag, S_IRUSR | S_IWUSR
1314  | S_IRGRP | S_IWGRP);
1315  } else {
1316  file = open(name, create_flag);
1317  }
1318 
1319  if (file == -1) {
1320  *success = FALSE;
1321 #ifdef USE_FILE_LOCK
1322  } else if (access_type == OS_FILE_READ_WRITE
1323  && os_file_lock(file, name)) {
1324  *success = FALSE;
1325  close(file);
1326  file = -1;
1327 #endif
1328  } else {
1329  *success = TRUE;
1330  }
1331 
1332  return(file);
1333 #endif /* __WIN__ */
1334 }
1335 
1336 /****************************************************************/
1338 UNIV_INTERN
1339 void
1341 /*================*/
1342  int fd,
1343  const char* file_name,
1344  const char* operation_name)
1347 {
1348  /* some versions of Solaris may not have DIRECTIO_ON */
1349 #if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)
1350  if (directio(fd, DIRECTIO_ON) == -1) {
1351  int errno_save;
1352  errno_save = (int)errno;
1353  ut_print_timestamp(stderr);
1354  fprintf(stderr,
1355  " InnoDB: Failed to set DIRECTIO_ON "
1356  "on file %s: %s: %s, continuing anyway\n",
1357  file_name, operation_name, strerror(errno_save));
1358  }
1359 #elif defined(O_DIRECT)
1360  if (fcntl(fd, F_SETFL, O_DIRECT) == -1) {
1361  int errno_save;
1362  errno_save = (int)errno;
1363  ut_print_timestamp(stderr);
1364  fprintf(stderr,
1365  " InnoDB: Failed to set O_DIRECT "
1366  "on file %s: %s: %s, continuing anyway\n",
1367  file_name, operation_name, strerror(errno_save));
1368  if (errno_save == EINVAL) {
1369  ut_print_timestamp(stderr);
1370  fprintf(stderr,
1371  " InnoDB: O_DIRECT is known to result in "
1372  "'Invalid argument' on Linux on tmpfs, "
1373  "see MySQL Bug#26662\n");
1374  }
1375  }
1376 #else /* Required for OSX */
1377  (void)fd;
1378  (void)file_name;
1379  (void)operation_name;
1380 #endif
1381 }
1382 
1383 /****************************************************************/
/* Opens an existing file or creates a new one.
NOTE(review): the function-name line is absent from this listing; judging by
the underline length it is presumably os_file_create_func -- confirm.

Parameters, as they are used in the body below:
  name         path handed to CreateFile() / open(); asserted non-NULL
  create_mode  OS_FILE_OPEN, OS_FILE_OPEN_RAW, OS_FILE_OPEN_RETRY,
               OS_FILE_CREATE or OS_FILE_OVERWRITE; any other value
               reaches ut_error
  purpose      OS_FILE_AIO or OS_FILE_NORMAL
  type         OS_LOG_FILE or OS_DATA_FILE
  success      out: set to TRUE when a usable handle is returned, FALSE
               otherwise

Returns the file handle. On failure the Windows branch returns
INVALID_HANDLE_VALUE and the other branch returns -1. */
1389 UNIV_INTERN
1390 os_file_t
1392 /*================*/
1393  const char* name,
1395  ulint create_mode,
1403  ulint purpose,
1410  ulint type,
1411  ibool* success)
1412 {
1413 #ifdef __WIN__
1414  os_file_t file;
1415  DWORD share_mode = FILE_SHARE_READ;
1416  DWORD create_flag;
1417  DWORD attributes;
1418  ibool retry;
1419 try_again:
1420  ut_a(name);
1421 
 /* Translate create_mode into the CreateFile() creation disposition. */
1422  if (create_mode == OS_FILE_OPEN_RAW) {
1423  create_flag = OPEN_EXISTING;
1424  share_mode = FILE_SHARE_WRITE;
1425  } else if (create_mode == OS_FILE_OPEN
1426  || create_mode == OS_FILE_OPEN_RETRY) {
1427  create_flag = OPEN_EXISTING;
1428  } else if (create_mode == OS_FILE_CREATE) {
1429  create_flag = CREATE_NEW;
1430  } else if (create_mode == OS_FILE_OVERWRITE) {
1431  create_flag = CREATE_ALWAYS;
1432  } else {
1433  create_flag = 0;
1434  ut_error;
1435  }
1436 
 /* Choose the CreateFile() attribute flags. The two branches below differ
 only in that OS_FILE_AIO may add FILE_FLAG_OVERLAPPED. */
1437  if (purpose == OS_FILE_AIO) {
1438  /* If specified, use asynchronous (overlapped) io and no
1439  buffering of writes in the OS */
1440  attributes = 0;
1441 #ifdef WIN_ASYNC_IO
1442  if (srv_use_native_aio) {
1443  attributes = attributes | FILE_FLAG_OVERLAPPED;
1444  }
1445 #endif
1446 #ifdef UNIV_NON_BUFFERED_IO
1447 # ifndef UNIV_HOTBACKUP
1448  if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
1449  /* Do not use unbuffered i/o to log files because
1450  value 2 denotes that we do not flush the log at every
1451  commit, but only once per second */
1452  } else if (srv_win_file_flush_method
1453  == SRV_WIN_IO_UNBUFFERED) {
1454  attributes = attributes | FILE_FLAG_NO_BUFFERING;
1455  }
1456 # else /* !UNIV_HOTBACKUP */
1457  attributes = attributes | FILE_FLAG_NO_BUFFERING;
1458 # endif /* !UNIV_HOTBACKUP */
1459 #endif /* UNIV_NON_BUFFERED_IO */
1460  } else if (purpose == OS_FILE_NORMAL) {
1461  attributes = 0;
1462 #ifdef UNIV_NON_BUFFERED_IO
1463 # ifndef UNIV_HOTBACKUP
1464  if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
1465  /* Do not use unbuffered i/o to log files because
1466  value 2 denotes that we do not flush the log at every
1467  commit, but only once per second */
1468  } else if (srv_win_file_flush_method
1469  == SRV_WIN_IO_UNBUFFERED) {
1470  attributes = attributes | FILE_FLAG_NO_BUFFERING;
1471  }
1472 # else /* !UNIV_HOTBACKUP */
1473  attributes = attributes | FILE_FLAG_NO_BUFFERING;
1474 # endif /* !UNIV_HOTBACKUP */
1475 #endif /* UNIV_NON_BUFFERED_IO */
1476  } else {
1477  attributes = 0;
1478  ut_error;
1479  }
1480 
1481  file = CreateFile((LPCTSTR) name,
1482  GENERIC_READ | GENERIC_WRITE, /* read and write
1483  access */
1484  share_mode, /* File can be read also by other
1485  processes; we must give the read
1486  permission because of ibbackup. We do
1487  not give the write permission to
1488  others because if one would succeed to
1489  start 2 instances of mysqld on the
1490  SAME files, that could cause severe
1491  database corruption! When opening
1492  raw disk partitions, Microsoft manuals
1493  say that we must give also the write
1494  permission. */
1495  NULL, /* default security attributes */
1496  create_flag,
1497  attributes,
1498  NULL);
1500  if (file == INVALID_HANDLE_VALUE) {
1501  *success = FALSE;
1502 
1503  /* When srv_file_per_table is on, file creation failure may not
1504  be critical to the whole instance. Do not crash the server in
1505  case of unknown errors.
1506  Please note "srv_file_per_table" is a global variable with
1507  no explicit synchronization protection. It could be
1508  changed during this execution path. It might not have the
1509  same value as the one when building the table definition */
 /* The operation label passed to the handler is "create" only for
 OS_FILE_CREATE; every other mode, including OS_FILE_OVERWRITE, is
 reported as "open". */
1510  if (srv_file_per_table) {
1511  retry = os_file_handle_error_no_exit(name,
1512  create_mode == OS_FILE_CREATE ?
1513  "create" : "open");
1514  } else {
1515  retry = os_file_handle_error(name,
1516  create_mode == OS_FILE_CREATE ?
1517  "create" : "open");
1518  }
1519 
 /* The error handler decides whether the whole attempt is repeated. */
1520  if (retry) {
1521  goto try_again;
1522  }
1523  } else {
1524  *success = TRUE;
1525  }
1526 
1527  return(file);
1528 #else /* __WIN__ */
1529  os_file_t file;
1530  int create_flag;
1531  ibool retry;
 /* Label later handed to os_file_set_nocache() for its diagnostics. */
1532  const char* mode_str = NULL;
1533 
1534 try_again:
1535  ut_a(name);
1536 
 /* Translate create_mode into open() flags. Existing files are opened
 read-only when srv_read_only is set, read-write otherwise. */
1537  if (create_mode == OS_FILE_OPEN || create_mode == OS_FILE_OPEN_RAW
1538  || create_mode == OS_FILE_OPEN_RETRY) {
1539  mode_str = "OPEN";
1540  if (srv_read_only)
1541  create_flag = O_RDONLY;
1542  else
1543  create_flag = O_RDWR;
1544  } else if (create_mode == OS_FILE_CREATE) {
1545  mode_str = "CREATE";
1546  create_flag = O_RDWR | O_CREAT | O_EXCL;
1547  } else if (create_mode == OS_FILE_OVERWRITE) {
1548  mode_str = "OVERWRITE";
1549  create_flag = O_RDWR | O_CREAT | O_TRUNC;
1550  } else {
1551  create_flag = 0;
1552  ut_error;
1553  }
1554 
1555  ut_a(type == OS_LOG_FILE || type == OS_DATA_FILE);
1556  ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL);
1557 
1558 #ifdef O_SYNC
1559  /* We let O_SYNC only affect log files; note that we map O_DSYNC to
1560  O_SYNC because the datasync options seemed to corrupt files in 2001
1561  in both Linux and Solaris */
1562  if (type == OS_LOG_FILE
1563  && srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
1564 
1565 # if 0
1566  fprintf(stderr, "Using O_SYNC for file %s\n", name);
1567 # endif
1568 
1569  create_flag = create_flag | O_SYNC;
1570  }
1571 #endif /* O_SYNC */
1572 
 /* The permission argument is os_innodb_umask; open() only consults it
 when O_CREAT is part of create_flag. */
1573  file = open(name, create_flag, os_innodb_umask);
1574 
1575  if (file == -1) {
1576  *success = FALSE;
1577 
1578  /* When srv_file_per_table is on, file creation failure may not
1579  be critical to the whole instance. Do not crash the server in
1580  case of unknown errors.
1581  Please note "srv_file_per_table" is a global variable with
1582  no explicit synchronization protection. It could be
1583  changed during this execution path. It might not have the
1584  same value as the one when building the table definition */
 /* As in the Windows branch, only OS_FILE_CREATE is reported as
 "create"; all other modes are reported as "open". */
1585  if (srv_file_per_table) {
1586  retry = os_file_handle_error_no_exit(name,
1587  create_mode == OS_FILE_CREATE ?
1588  "create" : "open");
1589  } else {
1590  retry = os_file_handle_error(name,
1591  create_mode == OS_FILE_CREATE ?
1592  "create" : "open");
1593  }
1594 
1595  if (retry) {
1596  goto try_again;
1597  } else {
1598  return(file /* -1 */);
1599  }
1600  }
1601  /* else */
1602 
1603  *success = TRUE;
1604 
1605  /* We disable OS caching (O_DIRECT) only on data files */
1606  if (type != OS_LOG_FILE
1607  && srv_unix_file_flush_method == SRV_UNIX_O_DIRECT) {
1608 
1609  os_file_set_nocache(file, name, mode_str);
1610  }
1611 
1612  /* With ALL_O_DIRECT we disable OS caching for trx log file too */
1613  if (srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT) {
1614  os_file_set_nocache(file, name, mode_str);
1615  }
1616 
1617 #ifdef USE_FILE_LOCK
 /* Raw opens are never locked. For any other mode a non-zero result from
 os_file_lock() is treated as failure: the descriptor is closed and -1 is
 returned, except that OS_FILE_OPEN_RETRY first retries the lock up to
 100 times with a sleep between attempts. */
1618  if (create_mode != OS_FILE_OPEN_RAW && os_file_lock(file, name)) {
1619 
1620  if (create_mode == OS_FILE_OPEN_RETRY) {
1621  int i;
1622  ut_print_timestamp(stderr);
1623  fputs(" InnoDB: Retrying to lock"
1624  " the first data file\n",
1625  stderr);
1626  for (i = 0; i < 100; i++) {
1627  os_thread_sleep(1000000);
1628  if (!os_file_lock(file, name)) {
1629  *success = TRUE;
1630  return(file);
1631  }
1632  }
1633  ut_print_timestamp(stderr);
1634  fputs(" InnoDB: Unable to open the first data file\n",
1635  stderr);
1636  }
1637 
1638  *success = FALSE;
1639  close(file);
1640  file = -1;
1641  }
1642 #endif /* USE_FILE_LOCK */
1643 
1644  return(file);
1645 #endif /* __WIN__ */
1646 }
1647 
1648 /***********************************************************************/
1651 UNIV_INTERN
1652 ibool
1654 /*=====================*/
1655  const char* name)
1656 {
1657 #ifdef __WIN__
1658  BOOL ret;
1659  ulint count = 0;
1660 loop:
1661  /* In Windows, deleting an .ibd file may fail if ibbackup is copying
1662  it */
1663 
1664  ret = DeleteFile((LPCTSTR)name);
1665 
1666  if (ret) {
1667  return(TRUE);
1668  }
1669 
1670  if (GetLastError() == ERROR_FILE_NOT_FOUND) {
1671  /* the file does not exist, this not an error */
1672 
1673  return(TRUE);
1674  }
1675 
1676  count++;
1677 
1678  if (count > 100 && 0 == (count % 10)) {
1679  fprintf(stderr,
1680  "InnoDB: Warning: cannot delete file %s\n"
1681  "InnoDB: Are you running ibbackup"
1682  " to back up the file?\n", name);
1683 
1684  os_file_get_last_error(TRUE); /* print error information */
1685  }
1686 
1687  os_thread_sleep(1000000); /* sleep for a second */
1688 
1689  if (count > 2000) {
1690 
1691  return(FALSE);
1692  }
1693 
1694  goto loop;
1695 #else
1696  int ret;
1697 
1698  ret = unlink(name);
1699 
1700  if (ret != 0 && errno != ENOENT) {
1701  os_file_handle_error_no_exit(name, "delete");
1702 
1703  return(FALSE);
1704  }
1705 
1706  return(TRUE);
1707 #endif
1708 }
1709 
1710 /***********************************************************************/
1713 UNIV_INTERN
1714 ibool
1716 /*===========*/
1717  const char* name)
1718 {
1719 #ifdef __WIN__
1720  BOOL ret;
1721  ulint count = 0;
1722 loop:
1723  /* In Windows, deleting an .ibd file may fail if ibbackup is copying
1724  it */
1725 
1726  ret = DeleteFile((LPCTSTR)name);
1727 
1728  if (ret) {
1729  return(TRUE);
1730  }
1731 
1732  if (GetLastError() == ERROR_FILE_NOT_FOUND) {
1733  /* If the file does not exist, we classify this as a 'mild'
1734  error and return */
1735 
1736  return(FALSE);
1737  }
1738 
1739  count++;
1740 
1741  if (count > 100 && 0 == (count % 10)) {
1742  fprintf(stderr,
1743  "InnoDB: Warning: cannot delete file %s\n"
1744  "InnoDB: Are you running ibbackup"
1745  " to back up the file?\n", name);
1746 
1747  os_file_get_last_error(TRUE); /* print error information */
1748  }
1749 
1750  os_thread_sleep(1000000); /* sleep for a second */
1751 
1752  if (count > 2000) {
1753 
1754  return(FALSE);
1755  }
1756 
1757  goto loop;
1758 #else
1759  int ret;
1760 
1761  ret = unlink(name);
1762 
1763  if (ret != 0) {
1764  os_file_handle_error_no_exit(name, "delete");
1765 
1766  return(FALSE);
1767  }
1768 
1769  return(TRUE);
1770 #endif
1771 }
1772 
1773 /***********************************************************************/
1778 UNIV_INTERN
1779 ibool
1781 /*================*/
1782  const char* oldpath,
1784  const char* newpath)
1785 {
1786 #ifdef __WIN__
1787  BOOL ret;
1788 
1789  ret = MoveFile((LPCTSTR)oldpath, (LPCTSTR)newpath);
1790 
1791  if (ret) {
1792  return(TRUE);
1793  }
1794 
1795  os_file_handle_error_no_exit(oldpath, "rename");
1796 
1797  return(FALSE);
1798 #else
1799  int ret;
1800 
1801  ret = rename(oldpath, newpath);
1802 
1803  if (ret != 0) {
1804  os_file_handle_error_no_exit(oldpath, "rename");
1805 
1806  return(FALSE);
1807  }
1808 
1809  return(TRUE);
1810 #endif
1811 }
1812 
1813 /***********************************************************************/
1818 UNIV_INTERN
1819 ibool
1821 /*===============*/
1822  os_file_t file)
1823 {
1824 #ifdef __WIN__
1825  BOOL ret;
1826 
1827  ut_a(file);
1828 
1829  ret = CloseHandle(file);
1830 
1831  if (ret) {
1832  return(TRUE);
1833  }
1834 
1835  os_file_handle_error(NULL, "close");
1836 
1837  return(FALSE);
1838 #else
1839  int ret;
1840 
1841  ret = close(file);
1842 
1843  if (ret == -1) {
1844  os_file_handle_error(NULL, "close");
1845 
1846  return(FALSE);
1847  }
1848 
1849  return(TRUE);
1850 #endif
1851 }
1852 
1853 #ifdef UNIV_HOTBACKUP
1854 /***********************************************************************/
1857 UNIV_INTERN
1858 ibool
1859 os_file_close_no_error_handling(
1860 /*============================*/
1861  os_file_t file)
1862 {
1863 #ifdef __WIN__
1864  BOOL ret;
1865 
1866  ut_a(file);
1867 
1868  ret = CloseHandle(file);
1869 
1870  if (ret) {
1871  return(TRUE);
1872  }
1873 
1874  return(FALSE);
1875 #else
1876  int ret;
1877 
1878  ret = close(file);
1879 
1880  if (ret == -1) {
1881 
1882  return(FALSE);
1883  }
1884 
1885  return(TRUE);
1886 #endif
1887 }
1888 #endif /* UNIV_HOTBACKUP */
1889 
1890 /***********************************************************************/
1893 UNIV_INTERN
1894 ibool
1896 /*=============*/
1897  os_file_t file,
1898  ulint* size,
1900  ulint* size_high)
1901 {
1902 #ifdef __WIN__
1903  DWORD high;
1904  DWORD low;
1905 
1906  low = GetFileSize(file, &high);
1907 
1908  if ((low == 0xFFFFFFFF) && (GetLastError() != NO_ERROR)) {
1909  return(FALSE);
1910  }
1911 
1912  *size = low;
1913  *size_high = high;
1914 
1915  return(TRUE);
1916 #else
1917  off_t offs;
1918 
1919  offs = lseek(file, 0, SEEK_END);
1920 
1921  if (offs == ((off_t)-1)) {
1922 
1923  return(FALSE);
1924  }
1925 
1926  if (sizeof(off_t) > 4) {
1927  *size = (ulint)(offs & 0xFFFFFFFFUL);
1928  *size_high = (ulint)(offs >> 32);
1929  } else {
1930  *size = (ulint) offs;
1931  *size_high = 0;
1932  }
1933 
1934  return(TRUE);
1935 #endif
1936 }
1937 
1938 /***********************************************************************/
1941 UNIV_INTERN
1942 ib_int64_t
1944 /*===========================*/
1945  os_file_t file)
1946 {
1947  ulint size;
1948  ulint size_high;
1949  ibool success;
1950 
1951  success = os_file_get_size(file, &size, &size_high);
1952 
1953  if (!success) {
1954 
1955  return(-1);
1956  }
1957 
1958  return((((ib_int64_t)size_high) << 32) + (ib_int64_t)size);
1959 }
1960 
1961 /***********************************************************************/
1964 UNIV_INTERN
1965 ibool
1967 /*=============*/
1968  const char* name,
1970  os_file_t file,
1971  ulint size,
1973  ulint size_high)
1974 {
1975  ib_int64_t current_size;
1976  ib_int64_t desired_size;
1977  ibool ret;
1978  byte* buf;
1979  byte* buf2;
1980  ulint buf_size;
1981 
1982  ut_a(size == (size & 0xFFFFFFFF));
1983 
1984  current_size = 0;
1985  desired_size = (ib_int64_t)size + (((ib_int64_t)size_high) << 32);
1986 
1987  /* Write up to 1 megabyte at a time. */
1988  buf_size = ut_min(64, (ulint) (desired_size / UNIV_PAGE_SIZE))
1989  * UNIV_PAGE_SIZE;
1990  buf2 = static_cast<unsigned char *>(ut_malloc(buf_size + UNIV_PAGE_SIZE));
1991 
1992  /* Align the buffer for possible raw i/o */
1993  buf = static_cast<unsigned char *>(ut_align(buf2, UNIV_PAGE_SIZE));
1994 
1995  /* Write buffer full of zeros */
1996  memset(buf, 0, buf_size);
1997 
1998  if (desired_size >= (ib_int64_t)(100 * 1024 * 1024)) {
1999 
2000  fprintf(stderr, "InnoDB: Progress in MB:");
2001  }
2002 
2003  while (current_size < desired_size) {
2004  ulint n_bytes;
2005 
2006  if (desired_size - current_size < (ib_int64_t) buf_size) {
2007  n_bytes = (ulint) (desired_size - current_size);
2008  } else {
2009  n_bytes = buf_size;
2010  }
2011 
2012  ret = os_file_write(name, file, buf,
2013  (ulint)(current_size & 0xFFFFFFFF),
2014  (ulint)(current_size >> 32),
2015  n_bytes);
2016  if (!ret) {
2017  ut_free(buf2);
2018  goto error_handling;
2019  }
2020 
2021  /* Print about progress for each 100 MB written */
2022  if ((ib_int64_t) (current_size + n_bytes) / (ib_int64_t)(100 * 1024 * 1024)
2023  != current_size / (ib_int64_t)(100 * 1024 * 1024)) {
2024 
2025  fprintf(stderr, " %lu00",
2026  (ulong) ((current_size + n_bytes)
2027  / (ib_int64_t)(100 * 1024 * 1024)));
2028  }
2029 
2030  current_size += n_bytes;
2031  }
2032 
2033  if (desired_size >= (ib_int64_t)(100 * 1024 * 1024)) {
2034 
2035  fprintf(stderr, "\n");
2036  }
2037 
2038  ut_free(buf2);
2039 
2040  ret = os_file_flush(file);
2041 
2042  if (ret) {
2043  return(TRUE);
2044  }
2045 
2046 error_handling:
2047  return(FALSE);
2048 }
2049 
2050 /***********************************************************************/
2053 UNIV_INTERN
2054 ibool
2056 /*============*/
2057  FILE* file)
2058 {
2059 #ifdef __WIN__
2060  HANDLE h = (HANDLE) _get_osfhandle(fileno(file));
2061  return(SetEndOfFile(h));
2062 #else /* __WIN__ */
2063  return(!ftruncate(fileno(file), ftell(file)));
2064 #endif /* __WIN__ */
2065 }
2066 
2067 #ifndef __WIN__
2068 /***********************************************************************/
2074 static
2075 int
2076 os_file_fsync(
2077 /*==========*/
2078  os_file_t file)
2079 {
2080  int ret;
2081  int failures;
2082  ibool retry;
2083 
2084  failures = 0;
2085 
2086  do {
2087  ret = fsync(file);
2088 
2089  os_n_fsyncs++;
2090 
2091  if (ret == -1 && errno == ENOLCK) {
2092 
2093  if (failures % 100 == 0) {
2094 
2095  ut_print_timestamp(stderr);
2096  fprintf(stderr,
2097  " InnoDB: fsync(): "
2098  "No locks available; retrying\n");
2099  }
2100 
2101  os_thread_sleep(200000 /* 0.2 sec */);
2102 
2103  failures++;
2104 
2105  retry = TRUE;
2106  } else {
2107 
2108  retry = FALSE;
2109  }
2110  } while (retry);
2111 
2112  return(ret);
2113 }
2114 #endif /* !__WIN__ */
2115 
2116 /***********************************************************************/
2120 UNIV_INTERN
2121 ibool
2123 /*===============*/
2124  os_file_t file)
2125 {
2126 #ifdef __WIN__
2127  BOOL ret;
2128 
2129  ut_a(file);
2130 
2131  os_n_fsyncs++;
2132 
2133  ret = FlushFileBuffers(file);
2134 
2135  if (ret) {
2136  return(TRUE);
2137  }
2138 
2139  /* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
2140  actually a raw device, we choose to ignore that error if we are using
2141  raw disks */
2142 
2143  if (srv_start_raw_disk_in_use && GetLastError()
2144  == ERROR_INVALID_FUNCTION) {
2145  return(TRUE);
2146  }
2147 
2148  os_file_handle_error(NULL, "flush");
2149 
2150  /* It is a fatal error if a file flush does not succeed, because then
2151  the database can get corrupt on disk */
2152  ut_error;
2153 
2154  return(FALSE);
2155 #else
2156  int ret;
2157 
2158 #if defined(HAVE_DARWIN_THREADS)
2159 # ifndef F_FULLFSYNC
2160  /* The following definition is from the Mac OS X 10.3 <sys/fcntl.h> */
2161 # define F_FULLFSYNC 51 /* fsync + ask the drive to flush to the media */
2162 # elif F_FULLFSYNC != 51
2163 # error "F_FULLFSYNC != 51: ABI incompatibility with Mac OS X 10.3"
2164 # endif
2165  /* Apple has disabled fsync() for internal disk drives in OS X. That
2166  caused corruption for a user when he tested a power outage. Let us in
2167  OS X use a nonstandard flush method recommended by an Apple
2168  engineer. */
2169 
2170  if (!srv_have_fullfsync) {
2171  /* If we are not on an operating system that supports this,
2172  then fall back to a plain fsync. */
2173 
2174  ret = os_file_fsync(file);
2175  } else {
2176  ret = fcntl(file, F_FULLFSYNC, NULL);
2177 
2178  if (ret) {
2179  /* If we are not on a file system that supports this,
2180  then fall back to a plain fsync. */
2181  ret = os_file_fsync(file);
2182  }
2183  }
2184 #else
2185  ret = os_file_fsync(file);
2186 #endif
2187 
2188  if (ret == 0) {
2189  return(TRUE);
2190  }
2191 
2192  /* Since Linux returns EINVAL if the 'file' is actually a raw device,
2193  we choose to ignore that error if we are using raw disks */
2194 
2195  if (srv_start_raw_disk_in_use && errno == EINVAL) {
2196 
2197  return(TRUE);
2198  }
2199 
2200  ut_print_timestamp(stderr);
2201 
2202  fprintf(stderr,
2203  " InnoDB: Error: the OS said file flush did not succeed\n");
2204 
2205  os_file_handle_error(NULL, "flush");
2206 
2207  /* It is a fatal error if a file flush does not succeed, because then
2208  the database can get corrupt on disk */
2209  ut_error;
2210 
2211  return(FALSE);
2212 #endif
2213 }
2214 
2215 #ifndef __WIN__
2216 /*******************************************************************/
/* Reads n bytes from a file at the 64-bit offset given as two 32-bit
halves, using pread() where available and lseek() + read() otherwise.
  file         handle to read from
  buf          destination buffer
  n            number of bytes requested
  offset       least significant 32 bits of the offset; asserted to fit in
               32 bits
  offset_high  most significant 32 bits of the offset
Returns whatever pread()/read() returned, or -1 if the fallback lseek()
failed. Each call increments os_n_file_reads. */
2219 static
2220 ssize_t
2221 os_file_pread(
2222 /*==========*/
2223  os_file_t file,
2224  void* buf,
2225  ulint n,
2226  ulint offset,
2228  ulint offset_high)
2230 {
2231  off_t offs;
2232 #if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD)
2233  ssize_t n_bytes;
2234 #endif /* HAVE_PREAD && !HAVE_BROKEN_PREAD */
2235 
2236  ut_a((offset & 0xFFFFFFFFUL) == offset);
2237 
2238  /* If off_t is > 4 bytes in size, then we assume we can pass a
2239  64-bit address */
2240 
2241  if (sizeof(off_t) > 4) {
2242  offs = (off_t)offset + (((off_t)offset_high) << 32);
2243 
2244  } else {
2245  offs = (off_t)offset;
2246 
 /* NOTE(review): the condition is only reported; execution continues and
 the read is issued at the truncated 32-bit offset. */
2247  if (offset_high > 0) {
2248  fprintf(stderr,
2249  "InnoDB: Error: file read at offset > 4 GB\n");
2250  }
2251  }
2252 
2253  os_n_file_reads++;
2254 
2255 #if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD)
 /* NOTE(review): nothing is executed between the enter/exit pairs in this
 listing (the embedded numbering skips lines); the statements guarded by
 os_file_count_mutex appear to have been lost -- confirm against the
 original file. */
2256  os_mutex_enter(os_file_count_mutex);
2259  os_mutex_exit(os_file_count_mutex);
2260 
2261  n_bytes = pread(file, buf, (ssize_t)n, offs);
2262 
2263  os_mutex_enter(os_file_count_mutex);
2266  os_mutex_exit(os_file_count_mutex);
2267 
2268  return(n_bytes);
2269 #else
2270  {
2271  off_t ret_offset;
2272  ssize_t ret;
2273 #ifndef UNIV_HOTBACKUP
2274  ulint i;
2275 #endif /* !UNIV_HOTBACKUP */
2276 
 /* NOTE(review): empty critical section in this listing, see the note in
 the pread() branch above. */
2277  os_mutex_enter(os_file_count_mutex);
2279  os_mutex_exit(os_file_count_mutex);
2280 
2281 #ifndef UNIV_HOTBACKUP
2282  /* Protect the seek / read operation with a mutex */
 /* The mutex is picked from a fixed array by the handle value modulo
 OS_FILE_N_SEEK_MUTEXES, so different handles may share one mutex. */
2283  i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2284 
2285  os_mutex_enter(os_file_seek_mutexes[i]);
2286 #endif /* !UNIV_HOTBACKUP */
2287 
2288  ret_offset = lseek(file, offs, SEEK_SET);
2289 
2290  if (ret_offset < 0) {
2291  ret = -1;
2292  } else {
2293  ret = read(file, buf, (ssize_t)n);
2294  }
2295 
2296 #ifndef UNIV_HOTBACKUP
2297  os_mutex_exit(os_file_seek_mutexes[i]);
2298 #endif /* !UNIV_HOTBACKUP */
2299 
2300  os_mutex_enter(os_file_count_mutex);
2302  os_mutex_exit(os_file_count_mutex);
2303 
2304  return(ret);
2305  }
2306 #endif
2307 }
2308 
2309 /*******************************************************************/
2312 static
2313 ssize_t
2314 os_file_pwrite(
2315 /*===========*/
2316  os_file_t file,
2317  const void* buf,
2318  ulint n,
2319  ulint offset,
2321  ulint offset_high)
2323 {
2324  ssize_t ret;
2325  off_t offs;
2326 
2327  ut_a((offset & 0xFFFFFFFFUL) == offset);
2328 
2329  /* If off_t is > 4 bytes in size, then we assume we can pass a
2330  64-bit address */
2331 
2332  if (sizeof(off_t) > 4) {
2333  offs = (off_t)offset + (((off_t)offset_high) << 32);
2334  } else {
2335  offs = (off_t)offset;
2336 
2337  if (offset_high > 0) {
2338  fprintf(stderr,
2339  "InnoDB: Error: file write"
2340  " at offset > 4 GB\n");
2341  }
2342  }
2343 
2344  if (srv_fake_write)
2345  return(TRUE);
2346 
2347  os_n_file_writes++;
2348 
2349 #if defined(HAVE_PWRITE) && !defined(HAVE_BROKEN_PREAD)
2350  os_mutex_enter(os_file_count_mutex);
2353  os_mutex_exit(os_file_count_mutex);
2354 
2355  ret = pwrite(file, buf, (ssize_t)n, offs);
2356 
2357  os_mutex_enter(os_file_count_mutex);
2360  os_mutex_exit(os_file_count_mutex);
2361 
2362 # ifdef UNIV_DO_FLUSH
2363  if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC
2364  && srv_unix_file_flush_method != SRV_UNIX_NOSYNC
2365  && !os_do_not_call_flush_at_each_write) {
2366 
2367  /* Always do fsync to reduce the probability that when
2368  the OS crashes, a database page is only partially
2369  physically written to disk. */
2370 
2371  ut_a(TRUE == os_file_flush(file));
2372  }
2373 # endif /* UNIV_DO_FLUSH */
2374 
2375  return(ret);
2376 #else
2377  {
2378  off_t ret_offset;
2379 # ifndef UNIV_HOTBACKUP
2380  ulint i;
2381 # endif /* !UNIV_HOTBACKUP */
2382 
2383  os_mutex_enter(os_file_count_mutex);
2385  os_mutex_exit(os_file_count_mutex);
2386 
2387 # ifndef UNIV_HOTBACKUP
2388  /* Protect the seek / write operation with a mutex */
2389  i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2390 
2391  os_mutex_enter(os_file_seek_mutexes[i]);
2392 # endif /* UNIV_HOTBACKUP */
2393 
2394  ret_offset = lseek(file, offs, SEEK_SET);
2395 
2396  if (ret_offset < 0) {
2397  ret = -1;
2398 
2399  goto func_exit;
2400  }
2401 
2402  ret = write(file, buf, (ssize_t)n);
2403 
2404 # ifdef UNIV_DO_FLUSH
2405  if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC
2406  && srv_unix_file_flush_method != SRV_UNIX_NOSYNC
2407  && !os_do_not_call_flush_at_each_write) {
2408 
2409  /* Always do fsync to reduce the probability that when
2410  the OS crashes, a database page is only partially
2411  physically written to disk. */
2412 
2413  ut_a(TRUE == os_file_flush(file));
2414  }
2415 # endif /* UNIV_DO_FLUSH */
2416 
2417 func_exit:
2418 # ifndef UNIV_HOTBACKUP
2419  os_mutex_exit(os_file_seek_mutexes[i]);
2420 # endif /* !UNIV_HOTBACKUP */
2421 
2422  os_mutex_enter(os_file_count_mutex);
2424  os_mutex_exit(os_file_count_mutex);
2425 
2426  return(ret);
2427  }
2428 #endif
2429 }
2430 #endif
2431 
2432 /*******************************************************************/
/* Reads n bytes from a file at the given offset; a failed or short read
that the error handler does not ask to retry is fatal.
NOTE(review): the function-name line is absent from this listing; judging by
the underline length it is presumably os_file_read_func -- confirm.
  file         handle to read from
  buf          destination buffer
  offset       least significant 32 bits of the offset
  offset_high  most significant 32 bits of the offset
  n            number of bytes to read
Returns TRUE when exactly n bytes were read. Otherwise
os_file_handle_error() is consulted: if it asks for a retry the read is
repeated, and if not the function prints a fatal message and hits ut_error. */
2437 UNIV_INTERN
2438 ibool
2440 /*==============*/
2441  os_file_t file,
2442  void* buf,
2443  ulint offset,
2445  ulint offset_high,
2447  ulint n)
2448 {
2449 #ifdef __WIN__
2450  BOOL ret;
2451  DWORD len;
2452  DWORD ret2;
2453  DWORD low;
2454  DWORD high;
2455  ibool retry;
2456 #ifndef UNIV_HOTBACKUP
2457  ulint i;
2458 #endif /* !UNIV_HOTBACKUP */
2459 
2460  /* On 64-bit Windows, ulint is 64 bits. But offset and n should be
2461  no more than 32 bits. */
2462  ut_a((offset & 0xFFFFFFFFUL) == offset);
2463  ut_a((n & 0xFFFFFFFFUL) == n);
2464 
 /* These statistics are updated once per call, before the retry label. */
2465  os_n_file_reads++;
2466  os_bytes_read_since_printout += n;
2467 
2468 try_again:
2469  ut_ad(file);
2470  ut_ad(buf);
2471  ut_ad(n > 0);
2472 
2473  low = (DWORD) offset;
2474  high = (DWORD) offset_high;
2475 
 /* NOTE(review): nothing is executed between the enter/exit pairs of
 os_file_count_mutex in this listing (the embedded numbering skips lines);
 the guarded statements appear to have been lost -- confirm against the
 original file. */
2476  os_mutex_enter(os_file_count_mutex);
2478  os_mutex_exit(os_file_count_mutex);
2479 
2480 #ifndef UNIV_HOTBACKUP
2481  /* Protect the seek / read operation with a mutex */
2482  i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2483 
2484  os_mutex_enter(os_file_seek_mutexes[i]);
2485 #endif /* !UNIV_HOTBACKUP */
2486 
2487  ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
2488 
 /* 0xFFFFFFFF is only treated as a failure when GetLastError() reports an
 error as well. */
2489  if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
2490 
2491 #ifndef UNIV_HOTBACKUP
2492  os_mutex_exit(os_file_seek_mutexes[i]);
2493 #endif /* !UNIV_HOTBACKUP */
2494 
2495  os_mutex_enter(os_file_count_mutex);
2497  os_mutex_exit(os_file_count_mutex);
2498 
2499  goto error_handling;
2500  }
2501 
2502  ret = ReadFile(file, buf, (DWORD) n, &len, NULL);
2503 
2504 #ifndef UNIV_HOTBACKUP
2505  os_mutex_exit(os_file_seek_mutexes[i]);
2506 #endif /* !UNIV_HOTBACKUP */
2507 
2508  os_mutex_enter(os_file_count_mutex);
2510  os_mutex_exit(os_file_count_mutex);
2511 
2512  if (ret && len == n) {
2513  return(TRUE);
2514  }
2515 #else /* __WIN__ */
2516  ibool retry;
2517  ssize_t ret;
2518 
2519  os_bytes_read_since_printout += n;
2520 
2521 try_again:
2522  ret = os_file_pread(file, buf, n, offset, offset_high);
2523 
 /* A return value of -1 never equals n after the cast, so errors and short
 reads both fall through to the error path. */
2524  if ((ulint)ret == n) {
2525 
2526  return(TRUE);
2527  }
2528 
 /* The offset is printed as two numbers: high word first, then low word. */
2529  fprintf(stderr,
2530  "InnoDB: Error: tried to read %lu bytes at offset %lu %lu.\n"
2531  "InnoDB: Was only able to read %ld.\n",
2532  (ulong)n, (ulong)offset_high,
2533  (ulong)offset, (long)ret);
2534 #endif /* __WIN__ */
 /* Common tail for both platforms; only the Windows branch jumps here via
 a label, the other branch simply falls through. */
2535 #ifdef __WIN__
2536 error_handling:
2537 #endif
2538  retry = os_file_handle_error(NULL, "read");
2539 
2540  if (retry) {
2541  goto try_again;
2542  }
2543 
2544  fprintf(stderr,
2545  "InnoDB: Fatal error: cannot read from file."
2546  " OS error number %lu.\n",
2547 #ifdef __WIN__
2548  (ulong) GetLastError()
2549 #else
2550  (ulong) errno
2551 #endif
2552  );
2553  fflush(stderr);
2554 
2555  ut_error;
2556 
2557  return(FALSE);
2558 }
2559 
2560 /*******************************************************************/
/* Reads n bytes from a file at the given offset; unlike the fatal variant
of this routine, a failure is reported to the caller through the return
value instead of stopping the process.
NOTE(review): the function-name line is absent from this listing; judging by
the underline length it is presumably os_file_read_no_error_handling_func
-- confirm.
  file         handle to read from
  buf          destination buffer
  offset       least significant 32 bits of the offset
  offset_high  most significant 32 bits of the offset
  n            number of bytes to read
Returns TRUE when exactly n bytes were read. Otherwise
os_file_handle_error_no_exit() is consulted: if it asks for a retry the
read is repeated, and if not FALSE is returned. */
2566 UNIV_INTERN
2567 ibool
2569 /*================================*/
2570  os_file_t file,
2571  void* buf,
2572  ulint offset,
2574  ulint offset_high,
2576  ulint n)
2577 {
2578 #ifdef __WIN__
2579  BOOL ret;
2580  DWORD len;
2581  DWORD ret2;
2582  DWORD low;
2583  DWORD high;
2584  ibool retry;
2585 #ifndef UNIV_HOTBACKUP
2586  ulint i;
2587 #endif /* !UNIV_HOTBACKUP */
2588 
2589  /* On 64-bit Windows, ulint is 64 bits. But offset and n should be
2590  no more than 32 bits. */
2591  ut_a((offset & 0xFFFFFFFFUL) == offset);
2592  ut_a((n & 0xFFFFFFFFUL) == n);
2593 
 /* These statistics are updated once per call, before the retry label. */
2594  os_n_file_reads++;
2595  os_bytes_read_since_printout += n;
2596 
2597 try_again:
2598  ut_ad(file);
2599  ut_ad(buf);
2600  ut_ad(n > 0);
2601 
2602  low = (DWORD) offset;
2603  high = (DWORD) offset_high;
2604 
 /* NOTE(review): nothing is executed between the enter/exit pairs of
 os_file_count_mutex in this listing (the embedded numbering skips lines);
 the guarded statements appear to have been lost -- confirm against the
 original file. */
2605  os_mutex_enter(os_file_count_mutex);
2607  os_mutex_exit(os_file_count_mutex);
2608 
2609 #ifndef UNIV_HOTBACKUP
2610  /* Protect the seek / read operation with a mutex */
2611  i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2612 
2613  os_mutex_enter(os_file_seek_mutexes[i]);
2614 #endif /* !UNIV_HOTBACKUP */
2615 
2616  ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
2617 
 /* 0xFFFFFFFF is only treated as a failure when GetLastError() reports an
 error as well. */
2618  if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
2619 
2620 #ifndef UNIV_HOTBACKUP
2621  os_mutex_exit(os_file_seek_mutexes[i]);
2622 #endif /* !UNIV_HOTBACKUP */
2623 
2624  os_mutex_enter(os_file_count_mutex);
2626  os_mutex_exit(os_file_count_mutex);
2627 
2628  goto error_handling;
2629  }
2630 
2631  ret = ReadFile(file, buf, (DWORD) n, &len, NULL);
2632 
2633 #ifndef UNIV_HOTBACKUP
2634  os_mutex_exit(os_file_seek_mutexes[i]);
2635 #endif /* !UNIV_HOTBACKUP */
2636 
2637  os_mutex_enter(os_file_count_mutex);
2639  os_mutex_exit(os_file_count_mutex);
2640 
2641  if (ret && len == n) {
2642  return(TRUE);
2643  }
2644 #else /* __WIN__ */
2645  ibool retry;
2646  ssize_t ret;
2647 
2648  os_bytes_read_since_printout += n;
2649 
2650 try_again:
2651  ret = os_file_pread(file, buf, n, offset, offset_high);
2652 
 /* A return value of -1 never equals n after the cast, so errors and short
 reads both fall through to the error path. */
2653  if ((ulint)ret == n) {
2654 
2655  return(TRUE);
2656  }
2657 #endif /* __WIN__ */
 /* Common tail for both platforms; only the Windows branch jumps here via
 a label, the other branch simply falls through. */
2658 #ifdef __WIN__
2659 error_handling:
2660 #endif
2661  retry = os_file_handle_error_no_exit(NULL, "read");
2662 
2663  if (retry) {
2664  goto try_again;
2665  }
2666 
2667  return(FALSE);
2668 }
2669 
2670 /*******************************************************************/
2674 UNIV_INTERN
2675 void
2677 /*================*/
2678  FILE* file,
2679  char* str,
2680  ulint size)
2681 {
2682  size_t flen;
2683 
2684  if (size == 0) {
2685  return;
2686  }
2687 
2688  rewind(file);
2689  flen = fread(str, 1, size - 1, file);
2690  str[flen] = '\0';
2691 }
2692 
2693 /*******************************************************************/
2698 UNIV_INTERN
2699 ibool
2701 /*===============*/
2702  const char* name,
2704  os_file_t file,
2705  const void* buf,
2706  ulint offset,
2708  ulint offset_high,
2710  ulint n)
2711 {
2712 #ifdef __WIN__
2713  BOOL ret;
2714  DWORD len;
2715  DWORD ret2;
2716  DWORD low;
2717  DWORD high;
2718  ulint n_retries = 0;
2719  ulint err;
2720 #ifndef UNIV_HOTBACKUP
2721  ulint i;
2722 #endif /* !UNIV_HOTBACKUP */
2723 
2724  /* On 64-bit Windows, ulint is 64 bits. But offset and n should be
2725  no more than 32 bits. */
2726  ut_a((offset & 0xFFFFFFFFUL) == offset);
2727  ut_a((n & 0xFFFFFFFFUL) == n);
2728 
2729  if (srv_fake_write)
2730  return(TRUE);
2731 
2732  os_n_file_writes++;
2733 
2734  ut_ad(file);
2735  ut_ad(buf);
2736  ut_ad(n > 0);
2737 retry:
2738  low = (DWORD) offset;
2739  high = (DWORD) offset_high;
2740 
2741  os_mutex_enter(os_file_count_mutex);
2743  os_mutex_exit(os_file_count_mutex);
2744 
2745 #ifndef UNIV_HOTBACKUP
2746  /* Protect the seek / write operation with a mutex */
2747  i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
2748 
2749  os_mutex_enter(os_file_seek_mutexes[i]);
2750 #endif /* !UNIV_HOTBACKUP */
2751 
2752  ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
2753 
2754  if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
2755 
2756 #ifndef UNIV_HOTBACKUP
2757  os_mutex_exit(os_file_seek_mutexes[i]);
2758 #endif /* !UNIV_HOTBACKUP */
2759 
2760  os_mutex_enter(os_file_count_mutex);
2762  os_mutex_exit(os_file_count_mutex);
2763 
2764  ut_print_timestamp(stderr);
2765 
2766  fprintf(stderr,
2767  " InnoDB: Error: File pointer positioning to"
2768  " file %s failed at\n"
2769  "InnoDB: offset %lu %lu. Operating system"
2770  " error number %lu.\n"
2771  "InnoDB: Some operating system error numbers"
2772  " are described at\n"
2773  "InnoDB: "
2774  REFMAN "operating-system-error-codes.html\n",
2775  name, (ulong) offset_high, (ulong) offset,
2776  (ulong) GetLastError());
2777 
2778  return(FALSE);
2779  }
2780 
2781  ret = WriteFile(file, buf, (DWORD) n, &len, NULL);
2782 
2783  /* Always do fsync to reduce the probability that when the OS crashes,
2784  a database page is only partially physically written to disk. */
2785 
2786 # ifdef UNIV_DO_FLUSH
2787  if (!os_do_not_call_flush_at_each_write) {
2788  ut_a(TRUE == os_file_flush(file));
2789  }
2790 # endif /* UNIV_DO_FLUSH */
2791 
2792 #ifndef UNIV_HOTBACKUP
2793  os_mutex_exit(os_file_seek_mutexes[i]);
2794 #endif /* !UNIV_HOTBACKUP */
2795 
2796  os_mutex_enter(os_file_count_mutex);
2798  os_mutex_exit(os_file_count_mutex);
2799 
2800  if (ret && len == n) {
2801 
2802  return(TRUE);
2803  }
2804 
2805  /* If some background file system backup tool is running, then, at
2806  least in Windows 2000, we may get here a specific error. Let us
2807  retry the operation 100 times, with 1 second waits. */
2808 
2809  if (GetLastError() == ERROR_LOCK_VIOLATION && n_retries < 100) {
2810 
2811  os_thread_sleep(1000000);
2812 
2813  n_retries++;
2814 
2815  goto retry;
2816  }
2817 
2818  if (!os_has_said_disk_full) {
2819 
2820  err = (ulint)GetLastError();
2821 
2822  ut_print_timestamp(stderr);
2823 
2824  fprintf(stderr,
2825  " InnoDB: Error: Write to file %s failed"
2826  " at offset %lu %lu.\n"
2827  "InnoDB: %lu bytes should have been written,"
2828  " only %lu were written.\n"
2829  "InnoDB: Operating system error number %lu.\n"
2830  "InnoDB: Check that your OS and file system"
2831  " support files of this size.\n"
2832  "InnoDB: Check also that the disk is not full"
2833  " or a disk quota exceeded.\n",
2834  name, (ulong) offset_high, (ulong) offset,
2835  (ulong) n, (ulong) len, (ulong) err);
2836 
2837  if (strerror((int)err) != NULL) {
2838  fprintf(stderr,
2839  "InnoDB: Error number %lu means '%s'.\n",
2840  (ulong) err, strerror((int)err));
2841  }
2842 
2843  fprintf(stderr,
2844  "InnoDB: Some operating system error numbers"
2845  " are described at\n"
2846  "InnoDB: "
2847  REFMAN "operating-system-error-codes.html\n");
2848 
2849  os_has_said_disk_full = TRUE;
2850  }
2851 
2852  return(FALSE);
2853 #else
2854  ssize_t ret;
2855 
2856  ret = os_file_pwrite(file, buf, n, offset, offset_high);
2857 
2858  if ((ulint)ret == n) {
2859 
2860  return(TRUE);
2861  }
2862 
2863  if (!os_has_said_disk_full) {
2864 
2865  ut_print_timestamp(stderr);
2866 
2867  fprintf(stderr,
2868  " InnoDB: Error: Write to file %s failed"
2869  " at offset %lu %lu.\n"
2870  "InnoDB: %lu bytes should have been written,"
2871  " only %ld were written.\n"
2872  "InnoDB: Operating system error number %lu.\n"
2873  "InnoDB: Check that your OS and file system"
2874  " support files of this size.\n"
2875  "InnoDB: Check also that the disk is not full"
2876  " or a disk quota exceeded.\n",
2877  name, offset_high, offset, n, (long int)ret,
2878  (ulint)errno);
2879  if (strerror(errno) != NULL) {
2880  fprintf(stderr,
2881  "InnoDB: Error number %lu means '%s'.\n",
2882  (ulint)errno, strerror(errno));
2883  }
2884 
2885  fprintf(stderr,
2886  "InnoDB: Some operating system error numbers"
2887  " are described at\n"
2888  "InnoDB: "
2889  REFMAN "operating-system-error-codes.html\n");
2890 
2891  os_has_said_disk_full = TRUE;
2892  }
2893 
2894  return(FALSE);
2895 #endif
2896 }
2897 
2898 /*******************************************************************/
2901 UNIV_INTERN
2902 ibool
2904 /*===========*/
2905  const char* path,
2906  ibool* exists,
2907  os_file_type_t* type)
2908 {
2909 #ifdef __WIN__
2910  int ret;
2911  struct _stat statinfo;
2912 
2913  ret = _stat(path, &statinfo);
2914  if (ret && (errno == ENOENT || errno == ENOTDIR)) {
2915  /* file does not exist */
2916  *exists = FALSE;
2917  return(TRUE);
2918  } else if (ret) {
2919  /* file exists, but stat call failed */
2920 
2921  os_file_handle_error_no_exit(path, "stat");
2922 
2923  return(FALSE);
2924  }
2925 
2926  if (_S_IFDIR & statinfo.st_mode) {
2927  *type = OS_FILE_TYPE_DIR;
2928  } else if (_S_IFREG & statinfo.st_mode) {
2929  *type = OS_FILE_TYPE_FILE;
2930  } else {
2931  *type = OS_FILE_TYPE_UNKNOWN;
2932  }
2933 
2934  *exists = TRUE;
2935 
2936  return(TRUE);
2937 #else
2938  int ret;
2939  struct stat statinfo;
2940 
2941  ret = stat(path, &statinfo);
2942  if (ret && (errno == ENOENT || errno == ENOTDIR)) {
2943  /* file does not exist */
2944  *exists = FALSE;
2945  return(TRUE);
2946  } else if (ret) {
2947  /* file exists, but stat call failed */
2948 
2949  os_file_handle_error_no_exit(path, "stat");
2950 
2951  return(FALSE);
2952  }
2953 
2954  if (S_ISDIR(statinfo.st_mode)) {
2955  *type = OS_FILE_TYPE_DIR;
2956  } else if (S_ISLNK(statinfo.st_mode)) {
2957  *type = OS_FILE_TYPE_LINK;
2958  } else if (S_ISREG(statinfo.st_mode)) {
2959  *type = OS_FILE_TYPE_FILE;
2960  } else {
2961  *type = OS_FILE_TYPE_UNKNOWN;
2962  }
2963 
2964  *exists = TRUE;
2965 
2966  return(TRUE);
2967 #endif
2968 }
2969 
2970 /*******************************************************************/
2973 UNIV_INTERN
2974 ibool
2976 /*===============*/
2977  const char* path,
2978  os_file_stat_t* stat_info)
2980 {
2981 #ifdef __WIN__
2982  int ret;
2983  struct _stat statinfo;
2984 
2985  ret = _stat(path, &statinfo);
2986  if (ret && (errno == ENOENT || errno == ENOTDIR)) {
2987  /* file does not exist */
2988 
2989  return(FALSE);
2990  } else if (ret) {
2991  /* file exists, but stat call failed */
2992 
2993  os_file_handle_error_no_exit(path, "stat");
2994 
2995  return(FALSE);
2996  }
2997  if (_S_IFDIR & statinfo.st_mode) {
2998  stat_info->type = OS_FILE_TYPE_DIR;
2999  } else if (_S_IFREG & statinfo.st_mode) {
3000  stat_info->type = OS_FILE_TYPE_FILE;
3001  } else {
3002  stat_info->type = OS_FILE_TYPE_UNKNOWN;
3003  }
3004 
3005  stat_info->ctime = statinfo.st_ctime;
3006  stat_info->atime = statinfo.st_atime;
3007  stat_info->mtime = statinfo.st_mtime;
3008  stat_info->size = statinfo.st_size;
3009 
3010  return(TRUE);
3011 #else
3012  int ret;
3013  struct stat statinfo;
3014 
3015  ret = stat(path, &statinfo);
3016 
3017  if (ret && (errno == ENOENT || errno == ENOTDIR)) {
3018  /* file does not exist */
3019 
3020  return(FALSE);
3021  } else if (ret) {
3022  /* file exists, but stat call failed */
3023 
3024  os_file_handle_error_no_exit(path, "stat");
3025 
3026  return(FALSE);
3027  }
3028 
3029  if (S_ISDIR(statinfo.st_mode)) {
3030  stat_info->type = OS_FILE_TYPE_DIR;
3031  } else if (S_ISLNK(statinfo.st_mode)) {
3032  stat_info->type = OS_FILE_TYPE_LINK;
3033  } else if (S_ISREG(statinfo.st_mode)) {
3034  stat_info->type = OS_FILE_TYPE_FILE;
3035  } else {
3036  stat_info->type = OS_FILE_TYPE_UNKNOWN;
3037  }
3038 
3039  stat_info->ctime = statinfo.st_ctime;
3040  stat_info->atime = statinfo.st_atime;
3041  stat_info->mtime = statinfo.st_mtime;
3042  stat_info->size = statinfo.st_size;
3043 
3044  return(TRUE);
3045 #endif
3046 }
3047 
/* Path name separator character: a backslash when __WIN__ is defined,
a forward slash otherwise. */
#ifdef __WIN__
# define OS_FILE_PATH_SEPARATOR	'\\'
#else
# define OS_FILE_PATH_SEPARATOR	'/'
#endif
3054 
3055 /****************************************************************/
3083 UNIV_INTERN
3084 char*
3086 /*============*/
3087  const char* path)
3088 {
3089  /* Find the offset of the last slash */
3090  const char* last_slash = strrchr(path, OS_FILE_PATH_SEPARATOR);
3091  if (!last_slash) {
3092  /* No slash in the path, return "." */
3093 
3094  return(mem_strdup("."));
3095  }
3096 
3097  /* Ok, there is a slash */
3098 
3099  if (last_slash == path) {
3100  /* last slash is the first char of the path */
3101 
3102  return(mem_strdup("/"));
3103  }
3104 
3105  /* Non-trivial directory component */
3106 
3107  return(mem_strdupl(path, last_slash - path));
3108 }
3109 
3110 /****************************************************************/
3113 UNIV_INTERN
3114 ibool
3116 /*=============================*/
3117  const char* path)
3118 {
3119  char* subdir;
3120  ibool success, subdir_exists;
3121  os_file_type_t type;
3122 
3123  subdir = os_file_dirname(path);
3124  if (strlen(subdir) == 1
3125  && (*subdir == OS_FILE_PATH_SEPARATOR || *subdir == '.')) {
3126  /* subdir is root or cwd, nothing to do */
3127  mem_free(subdir);
3128 
3129  return(TRUE);
3130  }
3131 
3132  /* Test if subdir exists */
3133  success = os_file_status(subdir, &subdir_exists, &type);
3134  if (success && !subdir_exists) {
3135  /* subdir does not exist, create it */
3136  success = os_file_create_subdirs_if_needed(subdir);
3137  if (!success) {
3138  mem_free(subdir);
3139 
3140  return(FALSE);
3141  }
3142  success = os_file_create_directory(subdir, FALSE);
3143  }
3144 
3145  mem_free(subdir);
3146 
3147  return(success);
3148 }
3149 
3150 #ifndef UNIV_HOTBACKUP
3151 /****************************************************************/
3154 static
3156 os_aio_array_get_nth_slot(
3157 /*======================*/
3158  os_aio_array_t* array,
3159  ulint index)
3160 {
3161  ut_a(index < array->n_slots);
3162 
3163  return((array->slots) + index);
3164 }
3165 
#if defined(LINUX_NATIVE_AIO)
/******************************************************************//**
Creates one Linux native AIO context with io_setup().  When io_setup()
fails with EAGAIN the call is repeated up to
OS_AIO_IO_SETUP_RETRY_ATTEMPTS times, sleeping
OS_AIO_IO_SETUP_RETRY_SLEEP between attempts.  Every failure is reported
on stderr.
@param max_events	number of pending requests the context must handle
@param io_ctx		out: the context; zeroed before every attempt
@return TRUE if the context was created, FALSE otherwise */
static
ibool
os_aio_linux_create_io_ctx(
/*=======================*/
	ulint		max_events,
	io_context_t*	io_ctx)
{
	int	ret;
	ulint	retries = 0;

	for (;;) {
		memset(io_ctx, 0x0, sizeof(*io_ctx));

		/* Initialize the io_ctx. Tell it how many pending
		IO requests this context will handle. */

		ret = io_setup(max_events, io_ctx);

		if (ret == 0) {
#if defined(UNIV_AIO_DEBUG)
			fprintf(stderr,
				"InnoDB: Linux native AIO:"
				" initialized io_ctx for segment\n");
#endif
			/* Success. Return now. */
			return(TRUE);
		}

		/* Only EAGAIN is worth retrying. */
		if (ret != -EAGAIN) {
			break;
		}

		if (retries == 0) {
			/* First time around. */
			ut_print_timestamp(stderr);
			fprintf(stderr,
				" InnoDB: Warning: io_setup() failed"
				" with EAGAIN. Will make %d attempts"
				" before giving up.\n",
				OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
		}

		if (!(retries < OS_AIO_IO_SETUP_RETRY_ATTEMPTS)) {
			/* Have tried enough. Better call it a day. */
			break;
		}

		++retries;
		fprintf(stderr,
			"InnoDB: Warning: io_setup() attempt"
			" %lu failed.\n",
			retries);
		os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP);
	}

	/* Every failure report starts with a timestamp. */
	ut_print_timestamp(stderr);

	switch (ret) {
	case -EAGAIN:
		fprintf(stderr,
			" InnoDB: Error: io_setup() failed"
			" with EAGAIN after %d attempts.\n",
			OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
		break;

	case -ENOSYS:
		fprintf(stderr,
			" InnoDB: Error: Linux Native AIO interface"
			" is not supported on this platform. Please"
			" check your OS documentation and install"
			" appropriate binary of InnoDB.\n");
		break;

	default:
		fprintf(stderr,
			" InnoDB: Error: Linux Native AIO setup"
			" returned following error[%d]\n", -ret);
		break;
	}

	fprintf(stderr,
		"InnoDB: You can disable Linux Native AIO by"
		" setting innodb_native_aio = off in my.cnf\n");

	return(FALSE);
}
#endif /* LINUX_NATIVE_AIO */
3253 
3254 /******************************************************************/
3259 static
3261 os_aio_array_create(
3262 /*================*/
3263  ulint n,
3266  ulint n_segments)
3267 {
3268  os_aio_array_t* array;
3269  ulint i;
3270  os_aio_slot_t* slot;
3271 #ifdef WIN_ASYNC_IO
3272  OVERLAPPED* over;
3273 #elif defined(LINUX_NATIVE_AIO)
3274  struct io_event* aio_event = NULL;
3275 #endif
3276  ut_a(n > 0);
3277  ut_a(n_segments > 0);
3278 
3279  array = static_cast<os_aio_array_t *>(ut_malloc(sizeof(os_aio_array_t)));
3280 
3281  array->mutex = os_mutex_create();
3282  array->not_full = os_event_create(NULL);
3283  array->is_empty = os_event_create(NULL);
3284 
3285  os_event_set(array->is_empty);
3286 
3287  array->n_slots = n;
3288  array->n_segments = n_segments;
3289  array->n_reserved = 0;
3290  array->cur_seg = 0;
3291  array->slots = static_cast<os_aio_slot_t *>(ut_malloc(n * sizeof(os_aio_slot_t)));
3292 #ifdef __WIN__
3293  array->handles = ut_malloc(n * sizeof(HANDLE));
3294 #endif
3295 
3296 #if defined(LINUX_NATIVE_AIO)
3297  array->aio_ctx = NULL;
3298  array->aio_events = NULL;
3299 
3300  /* If we are not using native aio interface then skip this
3301  part of initialization. */
3302  if (!srv_use_native_aio) {
3303  goto skip_native_aio;
3304  }
3305 
3306  /* Initialize the io_context array. One io_context
3307  per segment in the array. */
3308 
3309  array->aio_ctx = (io_context**) ut_malloc(n_segments *
3310  sizeof(*array->aio_ctx));
3311  for (i = 0; i < n_segments; ++i) {
3312  if (!os_aio_linux_create_io_ctx(n/n_segments,
3313  &array->aio_ctx[i])) {
3314  /* If something bad happened during aio setup
3315  we should call it a day and return right away.
3316  We don't care about any leaks because a failure
3317  to initialize the io subsystem means that the
3318  server (or atleast the innodb storage engine)
3319  is not going to startup. */
3320  return(NULL);
3321  }
3322  }
3323 
3324  /* Initialize the event array. One event per slot. */
3325  aio_event = (io_event*) ut_malloc(n * sizeof(io_event));
3326  memset(aio_event, 0x0, sizeof(io_event) * n);
3327  array->aio_events = aio_event;
3328 
3329 skip_native_aio:
3330 #endif /* LINUX_NATIVE_AIO */
3331  for (i = 0; i < n; i++) {
3332  slot = os_aio_array_get_nth_slot(array, i);
3333 
3334  slot->pos = i;
3335  slot->reserved = FALSE;
3336 #ifdef WIN_ASYNC_IO
3337  slot->handle = CreateEvent(NULL,TRUE, FALSE, NULL);
3338 
3339  over = &(slot->control);
3340 
3341  over->hEvent = slot->handle;
3342 
3343  *((array->handles) + i) = over->hEvent;
3344 
3345 #elif defined(LINUX_NATIVE_AIO)
3346 
3347  memset(&slot->control, 0x0, sizeof(slot->control));
3348  slot->n_bytes = 0;
3349  slot->ret = 0;
3350 #endif
3351  }
3352 
3353  return(array);
3354 }
3355 
3356 /************************************************************************/
3358 static
3359 void
3360 os_aio_array_free(
3361 /*==============*/
3362  os_aio_array_t* array)
3363 {
3364 #ifdef WIN_ASYNC_IO
3365  ulint i;
3366 
3367  for (i = 0; i < array->n_slots; i++) {
3368  os_aio_slot_t* slot = os_aio_array_get_nth_slot(array, i);
3369  CloseHandle(slot->handle);
3370  }
3371 #endif /* WIN_ASYNC_IO */
3372 
3373 #ifdef __WIN__
3374  ut_free(array->handles);
3375 #endif /* __WIN__ */
3376  os_mutex_free(array->mutex);
3377  os_event_free(array->not_full);
3378  os_event_free(array->is_empty);
3379 
3380 #if defined(LINUX_NATIVE_AIO)
3381  if (srv_use_native_aio) {
3382  ut_free(array->aio_events);
3383  ut_free(array->aio_ctx);
3384  }
3385 #endif /* LINUX_NATIVE_AIO */
3386 
3387  ut_free(array->slots);
3388  ut_free(array);
3389 }
3390 
3391 /***********************************************************************
3392 Initializes the asynchronous io system. Creates one array each for ibuf
3393 and log i/o. Also creates one array each for read and write where each
3394 array is divided logically into n_read_segs and n_write_segs
3395 respectively. The caller must create an i/o handler thread for each
3396 segment in these arrays. This function also creates the sync array.
3397 No i/o handler thread needs to be created for that */
3398 UNIV_INTERN
3399 ibool
3400 os_aio_init(
3401 /*========*/
3402  ulint n_per_seg, /*<! in: maximum number of pending aio
3403  operations allowed per segment */
3404  ulint n_read_segs, /*<! in: number of reader threads */
3405  ulint n_write_segs, /*<! in: number of writer threads */
3406  ulint n_slots_sync) /*<! in: number of slots in the sync aio
3407  array */
3408 {
3409  ulint i;
3410  ulint n_segments = 2 + n_read_segs + n_write_segs;
3411 
3412  ut_ad(n_segments >= 4);
3413 
3415 
3416  for (i = 0; i < n_segments; i++) {
3417  srv_set_io_thread_op_info(i, "not started yet");
3418  }
3419 
3420 
3421  /* fprintf(stderr, "Array n per seg %lu\n", n_per_seg); */
3422 
3423  os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1);
3424  if (os_aio_ibuf_array == NULL) {
3425  goto err_exit;
3426  }
3427 
3428  srv_io_thread_function[0] = "insert buffer thread";
3429 
3430  os_aio_log_array = os_aio_array_create(n_per_seg, 1);
3431  if (os_aio_log_array == NULL) {
3432  goto err_exit;
3433  }
3434 
3435  srv_io_thread_function[1] = "log thread";
3436 
3437  os_aio_read_array = os_aio_array_create(n_read_segs * n_per_seg,
3438  n_read_segs);
3439  if (os_aio_read_array == NULL) {
3440  goto err_exit;
3441  }
3442 
3443  for (i = 2; i < 2 + n_read_segs; i++) {
3444  ut_a(i < SRV_MAX_N_IO_THREADS);
3445  srv_io_thread_function[i] = "read thread";
3446  }
3447 
3448  os_aio_write_array = os_aio_array_create(n_write_segs * n_per_seg,
3449  n_write_segs);
3450  if (os_aio_write_array == NULL) {
3451  goto err_exit;
3452  }
3453 
3454  for (i = 2 + n_read_segs; i < n_segments; i++) {
3455  ut_a(i < SRV_MAX_N_IO_THREADS);
3456  srv_io_thread_function[i] = "write thread";
3457  }
3458 
3459  os_aio_sync_array = os_aio_array_create(n_slots_sync, 1);
3460  if (os_aio_sync_array == NULL) {
3461  goto err_exit;
3462  }
3463 
3464 
3465  os_aio_n_segments = n_segments;
3466 
3467  os_aio_validate();
3468 
3469  os_aio_segment_wait_events = static_cast<os_event_t *>(ut_malloc(n_segments * sizeof(void*)));
3470 
3471  for (i = 0; i < n_segments; i++) {
3472  os_aio_segment_wait_events[i] = os_event_create(NULL);
3473  }
3474 
3475  os_last_printout = time(NULL);
3476 
3477  return(TRUE);
3478 
3479 err_exit:
3480  return(FALSE);
3481 
3482 }
3483 
3484 /***********************************************************************
3485 Frees the asynchronous io system. */
3486 UNIV_INTERN
3487 void
3488 os_aio_free(void)
3489 /*=============*/
3490 {
3491  ulint i;
3492 
3493  os_aio_array_free(os_aio_ibuf_array);
3494  os_aio_ibuf_array = NULL;
3495  os_aio_array_free(os_aio_log_array);
3496  os_aio_log_array = NULL;
3497  os_aio_array_free(os_aio_read_array);
3498  os_aio_read_array = NULL;
3499  os_aio_array_free(os_aio_write_array);
3500  os_aio_write_array = NULL;
3501  os_aio_array_free(os_aio_sync_array);
3502  os_aio_sync_array = NULL;
3503 
3504  for (i = 0; i < os_aio_n_segments; i++) {
3505  os_event_free(os_aio_segment_wait_events[i]);
3506  }
3507 
3508  ut_free(os_aio_segment_wait_events);
3509  os_aio_segment_wait_events = 0;
3510  os_aio_n_segments = 0;
3511 }
3512 
#ifdef WIN_ASYNC_IO
/************************************************************************//**
Signals the event handle of every slot in an aio array.
@param array	aio array whose slot events are set */
static
void
os_aio_array_wake_win_aio_at_shutdown(
/*==================================*/
	os_aio_array_t*	array)
{
	ulint	pos;

	for (pos = 0; pos < array->n_slots; pos++) {

		SetEvent(array->slots[pos].handle);
	}
}
#endif
3531 
3532 /************************************************************************/
3535 UNIV_INTERN
3536 void
3538 /*=====================================*/
3539 {
3540  ulint i;
3541 
3542 #ifdef WIN_ASYNC_IO
3543  /* This code wakes up all ai/o threads in Windows native aio */
3544  os_aio_array_wake_win_aio_at_shutdown(os_aio_read_array);
3545  os_aio_array_wake_win_aio_at_shutdown(os_aio_write_array);
3546  os_aio_array_wake_win_aio_at_shutdown(os_aio_ibuf_array);
3547  os_aio_array_wake_win_aio_at_shutdown(os_aio_log_array);
3548 
3549 #elif defined(LINUX_NATIVE_AIO)
3550 
3551  /* When using native AIO interface the io helper threads
3552  wait on io_getevents with a timeout value of 500ms. At
3553  each wake up these threads check the server status.
3554  No need to do anything to wake them up. */
3555 
3556  if (srv_use_native_aio) {
3557  return;
3558  }
3559  /* Fall through to simulated AIO handler wakeup if we are
3560  not using native AIO. */
3561 #endif
3562  /* This loop wakes up all simulated ai/o threads */
3563 
3564  for (i = 0; i < os_aio_n_segments; i++) {
3565 
3566  os_event_set(os_aio_segment_wait_events[i]);
3567  }
3568 }
3569 
3570 /************************************************************************/
3573 UNIV_INTERN
3574 void
3576 /*=====================================*/
3577 {
3578  os_event_wait(os_aio_write_array->is_empty);
3579 }
3580 
3581 /**********************************************************************/
3585 static
3586 ulint
3587 os_aio_get_segment_no_from_slot(
3588 /*============================*/
3589  os_aio_array_t* array,
3590  os_aio_slot_t* slot)
3591 {
3592  ulint segment;
3593  ulint seg_len;
3594 
3595  if (array == os_aio_ibuf_array) {
3596  segment = 0;
3597 
3598  } else if (array == os_aio_log_array) {
3599  segment = 1;
3600 
3601  } else if (array == os_aio_read_array) {
3602  seg_len = os_aio_read_array->n_slots
3603  / os_aio_read_array->n_segments;
3604 
3605  segment = 2 + slot->pos / seg_len;
3606  } else {
3607  ut_a(array == os_aio_write_array);
3608  seg_len = os_aio_write_array->n_slots
3609  / os_aio_write_array->n_segments;
3610 
3611  segment = os_aio_read_array->n_segments + 2
3612  + slot->pos / seg_len;
3613  }
3614 
3615  return(segment);
3616 }
3617 
3618 /**********************************************************************/
3621 static
3622 ulint
3623 os_aio_get_array_and_local_segment(
3624 /*===============================*/
3625  os_aio_array_t** array,
3626  ulint global_segment)
3627 {
3628  ulint segment;
3629 
3630  ut_a(global_segment < os_aio_n_segments);
3631 
3632  if (global_segment == 0) {
3633  *array = os_aio_ibuf_array;
3634  segment = 0;
3635 
3636  } else if (global_segment == 1) {
3637  *array = os_aio_log_array;
3638  segment = 0;
3639 
3640  } else if (global_segment < os_aio_read_array->n_segments + 2) {
3641  *array = os_aio_read_array;
3642 
3643  segment = global_segment - 2;
3644  } else {
3645  *array = os_aio_write_array;
3646 
3647  segment = global_segment - (os_aio_read_array->n_segments + 2);
3648  }
3649 
3650  return(segment);
3651 }
3652 
3653 /*******************************************************************/
3657 static
3659 os_aio_array_reserve_slot(
3660 /*======================*/
3661  ulint type,
3662  os_aio_array_t* array,
3663  fil_node_t* message1,
3665  void* message2,
3667  os_file_t file,
3668  const char* name,
3670  void* buf,
3672  ulint offset,
3674  ulint offset_high,
3676  ulint len)
3677 {
3678  os_aio_slot_t* slot = NULL;
3679 #ifdef WIN_ASYNC_IO
3680  OVERLAPPED* control;
3681 
3682 #elif defined(LINUX_NATIVE_AIO)
3683 
3684  struct iocb* iocb;
3685  off_t aio_offset;
3686 
3687 #endif
3688  ulint i;
3689  ulint counter;
3690  ulint slots_per_seg;
3691  ulint local_seg;
3692 
3693 #ifdef WIN_ASYNC_IO
3694  ut_a((len & 0xFFFFFFFFUL) == len);
3695 #endif
3696 
3697  /* No need of a mutex. Only reading constant fields */
3698  slots_per_seg = array->n_slots / array->n_segments;
3699 
3700  /* We attempt to keep adjacent blocks in the same local
3701  segment. This can help in merging IO requests when we are
3702  doing simulated AIO */
3703  local_seg = (offset >> (UNIV_PAGE_SIZE_SHIFT + 6))
3704  % array->n_segments;
3705 
3706 loop:
3707  os_mutex_enter(array->mutex);
3708 
3709  if (array->n_reserved == array->n_slots) {
3710  os_mutex_exit(array->mutex);
3711 
3712  if (!srv_use_native_aio) {
3713  /* If the handler threads are suspended, wake them
3714  so that we get more slots */
3715 
3717  }
3718 
3719  os_event_wait(array->not_full);
3720 
3721  goto loop;
3722  }
3723 
3724  /* We start our search for an available slot from our preferred
3725  local segment and do a full scan of the array. We are
3726  guaranteed to find a slot in full scan. */
3727  for (i = local_seg * slots_per_seg, counter = 0;
3728  counter < array->n_slots; i++, counter++) {
3729 
3730  i %= array->n_slots;
3731  slot = os_aio_array_get_nth_slot(array, i);
3732 
3733  if (slot->reserved == FALSE) {
3734  goto found;
3735  }
3736  }
3737 
3738  /* We MUST always be able to get hold of a reserved slot. */
3739  ut_error;
3740 
3741 found:
3742  ut_a(slot->reserved == FALSE);
3743  array->n_reserved++;
3744 
3745  if (array->n_reserved == 1) {
3746  os_event_reset(array->is_empty);
3747  }
3748 
3749  if (array->n_reserved == array->n_slots) {
3750  os_event_reset(array->not_full);
3751  }
3752 
3753  slot->reserved = TRUE;
3754  slot->reservation_time = time(NULL);
3755  slot->message1 = message1;
3756  slot->message2 = message2;
3757  slot->file = file;
3758  slot->name = name;
3759  slot->len = len;
3760  slot->type = type;
3761  slot->buf = static_cast<unsigned char *>(buf);
3762  slot->offset = offset;
3763  slot->offset_high = offset_high;
3764  slot->io_already_done = FALSE;
3765 
3766 #ifdef WIN_ASYNC_IO
3767  control = &(slot->control);
3768  control->Offset = (DWORD)offset;
3769  control->OffsetHigh = (DWORD)offset_high;
3770  ResetEvent(slot->handle);
3771 
3772 #elif defined(LINUX_NATIVE_AIO)
3773 
3774  /* If we are not using native AIO skip this part. */
3775  if (!srv_use_native_aio) {
3776  goto skip_native_aio;
3777  }
3778 
3779  /* Check if we are dealing with 64 bit arch.
3780  If not then make sure that offset fits in 32 bits. */
3781  if (sizeof(aio_offset) == 8) {
3782  aio_offset = offset_high;
3783  aio_offset <<= 32;
3784  aio_offset += offset;
3785  } else {
3786  ut_a(offset_high == 0);
3787  aio_offset = offset;
3788  }
3789 
3790  iocb = &slot->control;
3791 
3792  if (type == OS_FILE_READ) {
3793  io_prep_pread(iocb, file, buf, len, aio_offset);
3794  } else {
3795  ut_a(type == OS_FILE_WRITE);
3796  io_prep_pwrite(iocb, file, buf, len, aio_offset);
3797  }
3798 
3799  iocb->data = (void*)slot;
3800  slot->n_bytes = 0;
3801  slot->ret = 0;
3802  /*fprintf(stderr, "Filled up Linux native iocb.\n");*/
3803 
3804 
3805 skip_native_aio:
3806 #endif /* LINUX_NATIVE_AIO */
3807  os_mutex_exit(array->mutex);
3808 
3809  return(slot);
3810 }
3811 
3812 /*******************************************************************/
3814 static
3815 void
3816 os_aio_array_free_slot(
3817 /*===================*/
3818  os_aio_array_t* array,
3819  os_aio_slot_t* slot)
3820 {
3821  ut_ad(array);
3822  ut_ad(slot);
3823 
3824  os_mutex_enter(array->mutex);
3825 
3826  ut_ad(slot->reserved);
3827 
3828  slot->reserved = FALSE;
3829 
3830  array->n_reserved--;
3831 
3832  if (array->n_reserved == array->n_slots - 1) {
3833  os_event_set(array->not_full);
3834  }
3835 
3836  if (array->n_reserved == 0) {
3837  os_event_set(array->is_empty);
3838  }
3839 
3840 #ifdef WIN_ASYNC_IO
3841 
3842  ResetEvent(slot->handle);
3843 
3844 #elif defined(LINUX_NATIVE_AIO)
3845 
3846  if (srv_use_native_aio) {
3847  memset(&slot->control, 0x0, sizeof(slot->control));
3848  slot->n_bytes = 0;
3849  slot->ret = 0;
3850  /*fprintf(stderr, "Freed up Linux native slot.\n");*/
3851  } else {
3852  /* These fields should not be used if we are not
3853  using native AIO. */
3854  ut_ad(slot->n_bytes == 0);
3855  ut_ad(slot->ret == 0);
3856  }
3857 
3858 #endif
3859  os_mutex_exit(array->mutex);
3860 }
3861 
3862 /**********************************************************************/
3864 static
3865 void
3866 os_aio_simulated_wake_handler_thread(
3867 /*=================================*/
3868  ulint global_segment)
3870 {
3871  os_aio_array_t* array;
3872  os_aio_slot_t* slot;
3873  ulint segment;
3874  ulint n;
3875  ulint i;
3876 
3877  ut_ad(!srv_use_native_aio);
3878 
3879  segment = os_aio_get_array_and_local_segment(&array, global_segment);
3880 
3881  n = array->n_slots / array->n_segments;
3882 
3883  /* Look through n slots after the segment * n'th slot */
3884 
3885  os_mutex_enter(array->mutex);
3886 
3887  for (i = 0; i < n; i++) {
3888  slot = os_aio_array_get_nth_slot(array, i + segment * n);
3889 
3890  if (slot->reserved) {
3891  /* Found an i/o request */
3892 
3893  break;
3894  }
3895  }
3896 
3897  os_mutex_exit(array->mutex);
3898 
3899  if (i < n) {
3900  os_event_set(os_aio_segment_wait_events[global_segment]);
3901  }
3902 }
3903 
3904 /**********************************************************************/
3906 UNIV_INTERN
3907 void
3909 /*=======================================*/
3910 {
3911  ulint i;
3912 
3913  if (srv_use_native_aio) {
3914  /* We do not use simulated aio: do nothing */
3915 
3916  return;
3917  }
3918 
3919  os_aio_recommend_sleep_for_read_threads = FALSE;
3920 
3921  for (i = 0; i < os_aio_n_segments; i++) {
3922  os_aio_simulated_wake_handler_thread(i);
3923  }
3924 }
3925 
3926 /**********************************************************************/
3931 UNIV_INTERN
3932 void
3934 /*============================================*/
3935 {
3936 
3937 /* The idea of putting background IO threads to sleep is only for
3938 Windows when using simulated AIO. Windows XP seems to schedule
3939 background threads too eagerly to allow for coalescing during
3940 readahead requests. */
3941 #ifdef __WIN__
3942  os_aio_array_t* array;
3943  ulint g;
3944 
3945  if (srv_use_native_aio) {
3946  /* We do not use simulated aio: do nothing */
3947 
3948  return;
3949  }
3950 
3951  os_aio_recommend_sleep_for_read_threads = TRUE;
3952 
3953  for (g = 0; g < os_aio_n_segments; g++) {
3954  os_aio_get_array_and_local_segment(&array, g);
3955 
3956  if (array == os_aio_read_array) {
3957 
3958  os_event_reset(os_aio_segment_wait_events[g]);
3959  }
3960  }
3961 #endif /* __WIN__ */
3962 }
3963 
#if defined(LINUX_NATIVE_AIO)
/*******************************************************************//**
Submits the iocb stored in a reserved slot with io_submit(), using the
io context of the segment the slot lies in.
@param array	aio array that owns the slot
@param slot	reserved slot whose control block is submitted
@return TRUE if the request was queued; FALSE otherwise, with errno set
to the negated return value of io_submit() */
static
ibool
os_aio_linux_dispatch(
/*==================*/
	os_aio_array_t*	array,
	os_aio_slot_t*	slot)
{
	int		ret;
	ulint		io_ctx_index;
	struct iocb*	iocb;

	ut_ad(slot != NULL);
	ut_ad(array);

	ut_a(slot->reserved);

	/* Find out what we are going to work with.
	The iocb struct is directly in the slot.
	The io_context is one per segment. */

	iocb = &slot->control;
	io_ctx_index = (slot->pos * array->n_segments) / array->n_slots;

	ret = io_submit(array->aio_ctx[io_ctx_index], 1, &iocb);

#if defined(UNIV_AIO_DEBUG)
	/* %p expects void* arguments, hence the casts. */
	fprintf(stderr,
		"io_submit[%c] ret[%d]: slot[%p] ctx[%p] seg[%lu]\n",
		(slot->type == OS_FILE_WRITE) ? 'w' : 'r', ret,
		(void*) slot, (void*) array->aio_ctx[io_ctx_index],
		(ulong)io_ctx_index);
#endif

	/* io_submit returns number of successfully
	queued requests or -errno. */
	if (UNIV_UNLIKELY(ret != 1)) {
		errno = -ret;

		return(FALSE);
	}

	return(TRUE);
}
#endif /* LINUX_NATIVE_AIO */
4010 
4011 
4012 /*******************************************************************/
/* Requests a read or write on a file, either through one of the aio
arrays or, for OS_AIO_SYNC without Windows native aio, as an immediate
synchronous os_file_read / os_file_write.
NOTE(review): the declarator line that carries the function name is absent
from this listing (the embedded numbering jumps from 4017 to 4019);
confirm against the original source.
Parameters:
type		OS_FILE_READ or OS_FILE_WRITE; any other value hits ut_error
mode		OS_AIO_NORMAL, OS_AIO_IBUF, OS_AIO_LOG or OS_AIO_SYNC,
		optionally combined with OS_AIO_SIMULATED_WAKE_LATER, which
		is masked off before the mode is examined
name		file name; passed to os_file_write, to the slot reservation
		and to os_file_handle_error
file		handle of the file
buf		buffer to read into or write from
offset		low part of the file offset; debug-asserted to be a
		multiple of OS_FILE_LOG_BLOCK_SIZE
offset_high	high part of the file offset
n		number of bytes; debug-asserted to be > 0 and a multiple of
		OS_FILE_LOG_BLOCK_SIZE
message1	stored in the reserved slot
message2	stored in the reserved slot
Returns TRUE when the request was queued (or, for the synchronous paths,
the result of the synchronous operation); FALSE when queuing failed and
os_file_handle_error did not ask for a retry. */
4016 UNIV_INTERN
4017 ibool
4019 /*========*/
4020  ulint type,
4021  ulint mode,
4034  const char* name,
4036  os_file_t file,
4037  void* buf,
4039  ulint offset,
4041  ulint offset_high,
4043  ulint n,
4044  fil_node_t* message1,
4048  void* message2)
4052 {
4053  os_aio_array_t* array;
4054  os_aio_slot_t* slot;
4055 #ifdef WIN_ASYNC_IO
4056  ibool retval;
4057  BOOL ret = TRUE;
4058  DWORD len = (DWORD) n;
4059  struct fil_node_struct * dummy_mess1;
4060  void* dummy_mess2;
4061  ulint dummy_type;
4062 #endif /* WIN_ASYNC_IO */
4063 #if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO
4064  ibool retry;
4065 #endif
4066  ulint wake_later;
4067 
4068  ut_ad(file);
4069  ut_ad(buf);
4070  ut_ad(n > 0);
4071  ut_ad(n % OS_FILE_LOG_BLOCK_SIZE == 0);
4072  ut_ad(offset % OS_FILE_LOG_BLOCK_SIZE == 0);
4073  ut_ad(os_aio_validate_skip());
4074 #ifdef WIN_ASYNC_IO
4075  ut_ad((n & 0xFFFFFFFFUL) == n);
4076 #endif
4077 
/* Split the wake-later flag off from the mode proper. */
4078  wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
4079  mode = mode & (~OS_AIO_SIMULATED_WAKE_LATER);
4080 
4081  if (mode == OS_AIO_SYNC
4082 #ifdef WIN_ASYNC_IO
4083  && !srv_use_native_aio
4084 #endif /* WIN_ASYNC_IO */
4085  ) {
4086  /* This is actually an ordinary synchronous read or write:
4087  no need to use an i/o-handler thread. NOTE that if we use
4088  Windows async i/o, Windows does not allow us to use
4089  ordinary synchronous os_file_read etc. on the same file,
4090  therefore we have built a special mechanism for synchronous
4091  wait in the Windows case. */
4092 
4093  if (type == OS_FILE_READ) {
4094  return(os_file_read(file, buf, offset,
4095  offset_high, n));
4096  }
4097 
4098  ut_a(type == OS_FILE_WRITE);
4099 
4100  return(os_file_write(name, file, buf, offset, offset_high, n));
4101  }
4102 
/* Re-entered from err_exit when os_file_handle_error asks for a retry. */
4103 #if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO
4104 try_again:
4105 #endif
/* Choose the aio array that matches the mode (and, for OS_AIO_NORMAL,
the direction of the transfer). */
4106  if (mode == OS_AIO_NORMAL) {
4107  if (type == OS_FILE_READ) {
4108  array = os_aio_read_array;
4109  } else {
4110  array = os_aio_write_array;
4111  }
4112  } else if (mode == OS_AIO_IBUF) {
4113  ut_ad(type == OS_FILE_READ);
4114  /* Reduce probability of deadlock bugs in connection with ibuf:
4115  do not let the ibuf i/o handler sleep */
4116 
4117  wake_later = FALSE;
4118 
4119  array = os_aio_ibuf_array;
4120  } else if (mode == OS_AIO_LOG) {
4121 
4122  array = os_aio_log_array;
4123  } else if (mode == OS_AIO_SYNC) {
4124  array = os_aio_sync_array;
4125 
4126 #if defined(LINUX_NATIVE_AIO)
4127  /* In Linux native AIO we don't use sync IO array. */
4128  ut_a(!srv_use_native_aio);
4129 #endif /* LINUX_NATIVE_AIO */
4130  } else {
4131  array = NULL; /* Eliminate compiler warning */
4132  ut_error;
4133  }
4134 
4135  slot = os_aio_array_reserve_slot(type, array, message1, message2, file,
4136  name, buf, offset, offset_high, n);
/* With native aio the request is submitted right here; with simulated
aio the handler thread of the slot's segment is woken unless the caller
asked for a later wake-up. */
4137  if (type == OS_FILE_READ) {
4138  if (srv_use_native_aio) {
4139  os_n_file_reads++;
4140  os_bytes_read_since_printout += n;
4141 #ifdef WIN_ASYNC_IO
4142  ret = ReadFile(file, buf, (DWORD)n, &len,
4143  &(slot->control));
4144 
4145 #elif defined(LINUX_NATIVE_AIO)
4146  if (!os_aio_linux_dispatch(array, slot)) {
4147  goto err_exit;
4148  }
4149 #endif
4150  } else {
4151  if (!wake_later) {
4152  os_aio_simulated_wake_handler_thread(
4153  os_aio_get_segment_no_from_slot(
4154  array, slot));
4155  }
4156  }
4157  } else if (type == OS_FILE_WRITE) {
4158  if (srv_use_native_aio) {
4159  os_n_file_writes++;
4160 #ifdef WIN_ASYNC_IO
4161  ret = WriteFile(file, buf, (DWORD)n, &len,
4162  &(slot->control));
4163 
4164 #elif defined(LINUX_NATIVE_AIO)
4165  if (!os_aio_linux_dispatch(array, slot)) {
4166  goto err_exit;
4167  }
4168 #endif
4169  } else {
4170  if (!wake_later) {
4171  os_aio_simulated_wake_handler_thread(
4172  os_aio_get_segment_no_from_slot(
4173  array, slot));
4174  }
4175  }
4176  } else {
4177  ut_error;
4178  }
4179 
4180 #ifdef WIN_ASYNC_IO
4181  if (srv_use_native_aio) {
4182  if ((ret && len == n)
4183  || (!ret && GetLastError() == ERROR_IO_PENDING)) {
4184  /* aio was queued successfully! */
4185 
4186  if (mode == OS_AIO_SYNC) {
4187  /* We want a synchronous i/o operation on a
4188  file where we also use async i/o: in Windows
4189  we must use the same wait mechanism as for
4190  async i/o */
4191 
4192  retval = os_aio_windows_handle(ULINT_UNDEFINED,
4193  slot->pos,
4194  &dummy_mess1,
4195  &dummy_mess2,
4196  &dummy_type);
4197 
4198  return(retval);
4199  }
4200 
4201  return(TRUE);
4202  }
4203 
4204  goto err_exit;
4205  }
4206 #endif /* WIN_ASYNC_IO */
4207  /* aio was queued successfully! */
4208  return(TRUE);
4209 
4210 #if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO
4211 err_exit:
/* NOTE(review): the slot is released before os_file_handle_error is
consulted; whether the release can disturb errno / GetLastError is not
visible from here - confirm. */
4212  os_aio_array_free_slot(array, slot);
4213 
4214  retry = os_file_handle_error(name,
4215  type == OS_FILE_READ
4216  ? "aio read" : "aio write");
4217  if (retry) {
4218 
4219  goto try_again;
4220  }
4221 
4222  return(FALSE);
4223 #endif /* LINUX_NATIVE_AIO || WIN_ASYNC_IO */
4224 }
4225 
4226 #ifdef WIN_ASYNC_IO
4227 /**********************************************************************/
4235 UNIV_INTERN
4236 ibool
4237 os_aio_windows_handle(
4238 /*==================*/
4239  ulint segment,
4247  ulint pos,
4249  fil_node_t**message1,
4254  void** message2,
4255  ulint* type)
4256 {
4257  ulint orig_seg = segment;
4258  os_aio_array_t* array;
4259  os_aio_slot_t* slot;
4260  ulint n;
4261  ulint i;
4262  ibool ret_val;
4263  BOOL ret;
4264  DWORD len;
4265  BOOL retry = FALSE;
4266 
4267  if (segment == ULINT_UNDEFINED) {
4268  array = os_aio_sync_array;
4269  segment = 0;
4270  } else {
4271  segment = os_aio_get_array_and_local_segment(&array, segment);
4272  }
4273 
4274  /* NOTE! We only access constant fields in os_aio_array. Therefore
4275  we do not have to acquire the protecting mutex yet */
4276 
4277  ut_ad(os_aio_validate_skip());
4278  ut_ad(segment < array->n_segments);
4279 
4280  n = array->n_slots / array->n_segments;
4281 
4282  if (array == os_aio_sync_array) {
4283  WaitForSingleObject(
4284  os_aio_array_get_nth_slot(array, pos)->handle,
4285  INFINITE);
4286  i = pos;
4287  } else {
4288  srv_set_io_thread_op_info(orig_seg, "wait Windows aio");
4289  i = WaitForMultipleObjects((DWORD) n,
4290  array->handles + segment * n,
4291  FALSE,
4292  INFINITE);
4293  }
4294 
4296  os_thread_exit(NULL);
4297  }
4298 
4299  os_mutex_enter(array->mutex);
4300 
4301  slot = os_aio_array_get_nth_slot(array, i + segment * n);
4302 
4303  ut_a(slot->reserved);
4304 
4305  if (orig_seg != ULINT_UNDEFINED) {
4306  srv_set_io_thread_op_info(orig_seg,
4307  "get windows aio return value");
4308  }
4309 
4310  ret = GetOverlappedResult(slot->file, &(slot->control), &len, TRUE);
4311 
4312  *message1 = slot->message1;
4313  *message2 = slot->message2;
4314 
4315  *type = slot->type;
4316 
4317  if (ret && len == slot->len) {
4318  ret_val = TRUE;
4319 
4320 #ifdef UNIV_DO_FLUSH
4321  if (slot->type == OS_FILE_WRITE
4322  && !os_do_not_call_flush_at_each_write) {
4323  if (!os_file_flush(slot->file)) {
4324  ut_error;
4325  }
4326  }
4327 #endif /* UNIV_DO_FLUSH */
4328  } else if (os_file_handle_error(slot->name, "Windows aio")) {
4329 
4330  retry = TRUE;
4331  } else {
4332 
4333  ret_val = FALSE;
4334  }
4335 
4336  os_mutex_exit(array->mutex);
4337 
4338  if (retry) {
4339  /* retry failed read/write operation synchronously.
4340  No need to hold array->mutex. */
4341 
4342 #ifdef UNIV_PFS_IO
4343  /* This read/write does not go through os_file_read
4344  and os_file_write APIs, need to register with
4345  performance schema explicitly here. */
4346  struct PSI_file_locker* locker = NULL;
4347  register_pfs_file_io_begin(locker, slot->file, slot->len,
4348  (slot->type == OS_FILE_WRITE)
4349  ? PSI_FILE_WRITE
4350  : PSI_FILE_READ,
4351  __FILE__, __LINE__);
4352 #endif
4353 
4354  ut_a((slot->len & 0xFFFFFFFFUL) == slot->len);
4355 
4356  switch (slot->type) {
4357  case OS_FILE_WRITE:
4358  ret = WriteFile(slot->file, slot->buf,
4359  (DWORD) slot->len, &len,
4360  &(slot->control));
4361 
4362  break;
4363  case OS_FILE_READ:
4364  ret = ReadFile(slot->file, slot->buf,
4365  (DWORD) slot->len, &len,
4366  &(slot->control));
4367 
4368  break;
4369  default:
4370  ut_error;
4371  }
4372 
4373 #ifdef UNIV_PFS_IO
4374  register_pfs_file_io_end(locker, len);
4375 #endif
4376 
4377  if (!ret && GetLastError() == ERROR_IO_PENDING) {
4378  /* aio was queued successfully!
4379  We want a synchronous i/o operation on a
4380  file where we also use async i/o: in Windows
4381  we must use the same wait mechanism as for
4382  async i/o */
4383 
4384  ret = GetOverlappedResult(slot->file,
4385  &(slot->control),
4386  &len, TRUE);
4387  }
4388 
4389  ret_val = ret && len == slot->len;
4390  }
4391 
4392  os_aio_array_free_slot(array, slot);
4393 
4394  return(ret_val);
4395 }
4396 #endif
4397 
4398 #if defined(LINUX_NATIVE_AIO)
4399 /******************************************************************/
4410 static
4411 void
4412 os_aio_linux_collect(
4413 /*=================*/
4414  os_aio_array_t* array,
4415  ulint segment,
4416  ulint seg_size)
4417 {
4418  int i;
4419  int ret;
4420  ulint start_pos;
4421  ulint end_pos;
4422  struct timespec timeout;
4423  struct io_event* events;
4424  struct io_context* io_ctx;
4425 
4426  /* sanity checks. */
4427  ut_ad(array != NULL);
4428  ut_ad(seg_size > 0);
4429  ut_ad(segment < array->n_segments);
4430 
4431  /* Which part of event array we are going to work on. */
4432  events = &array->aio_events[segment * seg_size];
4433 
4434  /* Which io_context we are going to use. */
4435  io_ctx = array->aio_ctx[segment];
4436 
4437  /* Starting point of the segment we will be working on. */
4438  start_pos = segment * seg_size;
4439 
4440  /* End point. */
4441  end_pos = start_pos + seg_size;
4442 
4443 retry:
4444 
4445  /* Go down if we are in shutdown mode.
4446  In case of srv_fast_shutdown == 2, there may be pending
4447  IO requests but that should be OK as we essentially treat
4448  that as a crash of InnoDB. */
4450  os_thread_exit(NULL);
4451  }
4452 
4453  /* Initialize the events. The timeout value is arbitrary.
4454  We probably need to experiment with it a little. */
4455  memset(events, 0, sizeof(*events) * seg_size);
4456  timeout.tv_sec = 0;
4457  timeout.tv_nsec = OS_AIO_REAP_TIMEOUT;
4458 
4459  ret = io_getevents(io_ctx, 1, seg_size, events, &timeout);
4460 
4461  /* This error handling is for any error in collecting the
4462  IO requests. The errors, if any, for any particular IO
4463  request are simply passed on to the calling routine. */
4464 
4465  /* Not enough resources! Try again. */
4466  if (ret == -EAGAIN) {
4467  goto retry;
4468  }
4469 
4470  /* Interrupted! I have tested the behaviour in case of an
4471  interrupt. If we have some completed IOs available then
4472  the return code will be the number of IOs. We get EINTR only
4473  if there are no completed IOs and we have been interrupted. */
4474  if (ret == -EINTR) {
4475  goto retry;
4476  }
4477 
4478  /* No pending request! Go back and check again. */
4479  if (ret == 0) {
4480  goto retry;
4481  }
4482 
4483  /* All other errors! should cause a trap for now. */
4484  if (UNIV_UNLIKELY(ret < 0)) {
4485  ut_print_timestamp(stderr);
4486  fprintf(stderr,
4487  " InnoDB: unexpected ret_code[%d] from"
4488  " io_getevents()!\n", ret);
4489  ut_error;
4490  }
4491 
4492  ut_a(ret > 0);
4493 
4494  for (i = 0; i < ret; i++) {
4495  os_aio_slot_t* slot;
4496  struct iocb* control;
4497 
4498  control = (struct iocb *)events[i].obj;
4499  ut_a(control != NULL);
4500 
4501  slot = (os_aio_slot_t *) control->data;
4502 
4503  /* Some sanity checks. */
4504  ut_a(slot != NULL);
4505  ut_a(slot->reserved);
4506 
4507 #if defined(UNIV_AIO_DEBUG)
4508  fprintf(stderr,
4509  "io_getevents[%c]: slot[%p] ctx[%p]"
4510  " seg[%lu]\n",
4511  (slot->type == OS_FILE_WRITE) ? 'w' : 'r',
4512  slot, io_ctx, segment);
4513 #endif
4514 
4515  /* We are not scribbling previous segment. */
4516  ut_a(slot->pos >= start_pos);
4517 
4518  /* We have not overstepped to next segment. */
4519  ut_a(slot->pos < end_pos);
4520 
4521  /* Mark this request as completed. The error handling
4522  will be done in the calling function. */
4523  os_mutex_enter(array->mutex);
4524  slot->n_bytes = events[i].res;
4525  slot->ret = events[i].res2;
4526  slot->io_already_done = TRUE;
4527  os_mutex_exit(array->mutex);
4528  }
4529 
4530  return;
4531 }
4532 
4533 /**********************************************************************/
4541 UNIV_INTERN
4542 ibool
4543 os_aio_linux_handle(
4544 /*================*/
4545  ulint global_seg,
4551  fil_node_t**message1,
4552  void** message2,
4556  ulint* type)
4557 {
4558  ulint segment;
4559  os_aio_array_t* array;
4560  os_aio_slot_t* slot;
4561  ulint n;
4562  ulint i;
4563  ibool ret = FALSE;
4564 
4565  /* Should never be doing Sync IO here. */
4566  ut_a(global_seg != ULINT_UNDEFINED);
4567 
4568  /* Find the array and the local segment. */
4569  segment = os_aio_get_array_and_local_segment(&array, global_seg);
4570  n = array->n_slots / array->n_segments;
4571 
4572  /* Loop until we have found a completed request. */
4573  for (;;) {
4574  os_mutex_enter(array->mutex);
4575  for (i = 0; i < n; ++i) {
4576  slot = os_aio_array_get_nth_slot(
4577  array, i + segment * n);
4578  if (slot->reserved && slot->io_already_done) {
4579  /* Something for us to work on. */
4580  goto found;
4581  }
4582  }
4583 
4584  os_mutex_exit(array->mutex);
4585 
4586  /* We don't have any completed request.
4587  Wait for some request. Note that we return
4588  from wait iff we have found a request. */
4589 
4590  srv_set_io_thread_op_info(global_seg,
4591  "waiting for completed aio requests");
4592  os_aio_linux_collect(array, segment, n);
4593  }
4594 
4595 found:
4596  /* Note that it may be that there are more then one completed
4597  IO requests. We process them one at a time. We may have a case
4598  here to improve the performance slightly by dealing with all
4599  requests in one sweep. */
4600  srv_set_io_thread_op_info(global_seg,
4601  "processing completed aio requests");
4602 
4603  /* Ensure that we are scribbling only our segment. */
4604  ut_a(i < n);
4605 
4606  ut_ad(slot != NULL);
4607  ut_ad(slot->reserved);
4608  ut_ad(slot->io_already_done);
4609 
4610  *message1 = slot->message1;
4611  *message2 = slot->message2;
4612 
4613  *type = slot->type;
4614 
4615  if ((slot->ret == 0) && (slot->n_bytes == (long)slot->len)) {
4616  ret = TRUE;
4617 
4618 #ifdef UNIV_DO_FLUSH
4619  if (slot->type == OS_FILE_WRITE
4620  && !os_do_not_call_flush_at_each_write)
4621  && !os_file_flush(slot->file) {
4622  ut_error;
4623  }
4624 #endif /* UNIV_DO_FLUSH */
4625  } else {
4626  errno = -slot->ret;
4627 
4628  /* os_file_handle_error does tell us if we should retry
4629  this IO. As it stands now, we don't do this retry when
4630  reaping requests from a different context than
4631  the dispatcher. This non-retry logic is the same for
4632  windows and linux native AIO.
4633  We should probably look into this to transparently
4634  re-submit the IO. */
4635  os_file_handle_error(slot->name, "Linux aio");
4636 
4637  ret = FALSE;
4638  }
4639 
4640  os_mutex_exit(array->mutex);
4641 
4642  os_aio_array_free_slot(array, slot);
4643 
4644  return(ret);
4645 }
4646 #endif /* LINUX_NATIVE_AIO */
4647 
4648 /**********************************************************************/
/* Serves the simulated-aio requests of one segment. Each pass:
1) if a reserved slot of the segment is already marked io_already_done,
   its messages are returned at once;
2) otherwise a request is chosen (the oldest one if it is at least 2
   seconds old, else the one at the lowest offset), up to
   OS_AIO_MERGE_N_CONSECUTIVE adjacent requests on the same file and of
   the same type are chained onto it, the i/o is performed with
   os_file_read / os_file_write, and all chained slots are marked done;
3) if nothing is pending, the segment's wait event is reset and waited on,
   then the search restarts.
Only the first slot of a chain is released per call; the others are
picked up by step 1 of later calls.
NOTE(review): the declarator line that carries the function name is absent
from this listing (the embedded numbering jumps from 4653 to 4655);
confirm against the original source.
Parameters:
global_segment	global segment number of the calling handler thread
message1	out: message1 stored in the returned slot
message2	out: message2 stored in the returned slot
type		out: type of the returned request
Returns 'ret', which is TRUE on both return paths reachable here: it is
set to TRUE for an already-done slot, and asserted with ut_a() after the
i/o. */
4652 UNIV_INTERN
4653 ibool
4655 /*====================*/
4656  ulint global_segment,
4661  fil_node_t**message1,
4666  void** message2,
4667  ulint* type)
4668 {
4669  os_aio_array_t* array;
4670  ulint segment;
4671  os_aio_slot_t* slot;
4672  os_aio_slot_t* slot2;
4673  os_aio_slot_t* consecutive_ios[OS_AIO_MERGE_N_CONSECUTIVE];
4674  ulint n_consecutive;
4675  ulint total_len;
4676  ulint offs;
4677  ulint lowest_offset;
4678  ulint biggest_age;
4679  ulint age;
4680  byte* combined_buf;
4681  byte* combined_buf2;
4682  ibool ret;
4683  ulint n;
4684  ulint i;
4685 
4686  /* Fix compiler warning */
4687  *consecutive_ios = NULL;
4688 
4689  memset(consecutive_ios, 0, sizeof(os_aio_slot_t*) * OS_AIO_MERGE_N_CONSECUTIVE);
4690  segment = os_aio_get_array_and_local_segment(&array, global_segment);
4691 
4692 restart:
4693  /* NOTE! We only access constant fields in os_aio_array. Therefore
4694  we do not have to acquire the protecting mutex yet */
4695 
4696  srv_set_io_thread_op_info(global_segment,
4697  "looking for i/o requests (a)");
4698  ut_ad(os_aio_validate_skip());
4699  ut_ad(segment < array->n_segments);
4700 
/* Number of slots per segment. */
4701  n = array->n_slots / array->n_segments;
4702 
4703  /* Look through n slots after the segment * n'th slot */
4704 
4705  if (array == os_aio_read_array
4706  && os_aio_recommend_sleep_for_read_threads) {
4707 
4708  /* Give other threads chance to add several i/os to the array
4709  at once. */
4710 
4711  goto recommended_sleep;
4712  }
4713 
4714  os_mutex_enter(array->mutex);
4715 
4716  srv_set_io_thread_op_info(global_segment,
4717  "looking for i/o requests (b)");
4718 
4719  /* Check if there is a slot for which the i/o has already been
4720  done */
4721 
4722  for (i = 0; i < n; i++) {
4723  slot = os_aio_array_get_nth_slot(array, i + segment * n);
4724 
4725  if (slot->reserved && slot->io_already_done) {
4726 
4727  if (os_aio_print_debug) {
4728  fprintf(stderr,
4729  "InnoDB: i/o for slot %lu"
4730  " already done, returning\n",
4731  (ulong) i);
4732  }
4733 
4734  ret = TRUE;
4735 
4736  goto slot_io_done;
4737  }
4738  }
4739 
4740  n_consecutive = 0;
4741 
4742  /* If there are at least 2 seconds old requests, then pick the oldest
4743  one to prevent starvation. If several requests have the same age,
4744  then pick the one at the lowest offset. */
4745 
4746  biggest_age = 0;
4747  lowest_offset = ULINT_MAX;
4748 
4749  for (i = 0; i < n; i++) {
4750  slot = os_aio_array_get_nth_slot(array, i + segment * n);
4751 
4752  if (slot->reserved) {
4753  age = (ulint)difftime(time(NULL),
4754  slot->reservation_time);
4755 
4756  if ((age >= 2 && age > biggest_age)
4757  || (age >= 2 && age == biggest_age
4758  && slot->offset < lowest_offset)) {
4759 
4760  /* Found an i/o request */
4761  consecutive_ios[0] = slot;
4762 
4763  n_consecutive = 1;
4764 
4765  biggest_age = age;
4766  lowest_offset = slot->offset;
4767  }
4768  }
4769  }
4770 
4771  if (n_consecutive == 0) {
4772  /* There were no old requests. Look for an i/o request at the
4773  lowest offset in the array (we ignore the high 32 bits of the
4774  offset in these heuristics) */
4775 
4776  lowest_offset = ULINT_MAX;
4777 
4778  for (i = 0; i < n; i++) {
4779  slot = os_aio_array_get_nth_slot(array,
4780  i + segment * n);
4781 
4782  if (slot->reserved && slot->offset < lowest_offset) {
4783 
4784  /* Found an i/o request */
4785  consecutive_ios[0] = slot;
4786 
4787  n_consecutive = 1;
4788 
4789  lowest_offset = slot->offset;
4790  }
4791  }
4792  }
4793 
4794  if (n_consecutive == 0) {
4795 
4796  /* No i/o requested at the moment */
4797 
4798  goto wait_for_io;
4799  }
4800 
4801  /* if n_consecutive != 0, then we have assigned
4802  something valid to consecutive_ios[0] */
4803  ut_ad(n_consecutive != 0);
4804  ut_ad(consecutive_ios[0] != NULL);
4805 
4806  slot = consecutive_ios[0];
4807 
4808  /* Check if there are several consecutive blocks to read or write */
4809 
/* Each time a follower is found the scan restarts from the first slot,
now looking for the request that begins where the follower ends. */
4810 consecutive_loop:
4811  for (i = 0; i < n; i++) {
4812  slot2 = os_aio_array_get_nth_slot(array, i + segment * n);
4813 
4814  if (slot2->reserved && slot2 != slot
4815  && slot2->offset == slot->offset + slot->len
4816  /* check that sum does not wrap over */
4817  && slot->offset + slot->len > slot->offset
4818  && slot2->offset_high == slot->offset_high
4819  && slot2->type == slot->type
4820  && slot2->file == slot->file) {
4821 
4822  /* Found a consecutive i/o request */
4823 
4824  consecutive_ios[n_consecutive] = slot2;
4825  n_consecutive++;
4826 
4827  slot = slot2;
4828 
4829  if (n_consecutive < OS_AIO_MERGE_N_CONSECUTIVE) {
4830 
4831  goto consecutive_loop;
4832  } else {
4833  break;
4834  }
4835  }
4836  }
4837 
4838  srv_set_io_thread_op_info(global_segment, "consecutive i/o requests");
4839 
4840  /* We have now collected n_consecutive i/o requests in the array;
4841  allocate a single buffer which can hold all data, and perform the
4842  i/o */
4843 
4844  total_len = 0;
4845  slot = consecutive_ios[0];
4846 
4847  for (i = 0; i < n_consecutive; i++) {
4848  total_len += consecutive_ios[i]->len;
4849  }
4850 
4851  if (n_consecutive == 1) {
4852  /* We can use the buffer of the i/o request */
4853  combined_buf = slot->buf;
4854  combined_buf2 = NULL;
4855  } else {
/* One extra UNIV_PAGE_SIZE is allocated so that the working pointer can
be aligned with ut_align(); combined_buf2 keeps the pointer to free. */
4856  combined_buf2 = static_cast<unsigned char *>(ut_malloc(total_len + UNIV_PAGE_SIZE));
4857 
4858  ut_a(combined_buf2);
4859 
4860  combined_buf = static_cast<unsigned char *>(ut_align(combined_buf2, UNIV_PAGE_SIZE));
4861  }
4862 
4863  /* We release the array mutex for the time of the i/o: NOTE that
4864  this assumes that there is just one i/o-handler thread serving
4865  a single segment of slots! */
4866 
4867  os_mutex_exit(array->mutex);
4868 
4869  if (slot->type == OS_FILE_WRITE && n_consecutive > 1) {
4870  /* Copy the buffers to the combined buffer */
4871  offs = 0;
4872 
4873  for (i = 0; i < n_consecutive; i++) {
4874 
4875  ut_memcpy(combined_buf + offs, consecutive_ios[i]->buf,
4876  consecutive_ios[i]->len);
4877  offs += consecutive_ios[i]->len;
4878  }
4879  }
4880 
4881  srv_set_io_thread_op_info(global_segment, "doing file i/o");
4882 
4883  if (os_aio_print_debug) {
4884  fprintf(stderr,
4885  "InnoDB: doing i/o of type %lu at offset %lu %lu,"
4886  " length %lu\n",
4887  (ulong) slot->type, (ulong) slot->offset_high,
4888  (ulong) slot->offset, (ulong) total_len);
4889  }
4890 
4891  /* Do the i/o with ordinary, synchronous i/o functions: */
4892  if (slot->type == OS_FILE_WRITE) {
4893  ret = os_file_write(slot->name, slot->file, combined_buf,
4894  slot->offset, slot->offset_high,
4895  total_len);
4896  } else {
4897  ret = os_file_read(slot->file, combined_buf,
4898  slot->offset, slot->offset_high, total_len);
4899  }
4900 
/* A failed read or write is treated as fatal here. */
4901  ut_a(ret);
4902  srv_set_io_thread_op_info(global_segment, "file i/o done");
4903 
4904 #if 0
4905  fprintf(stderr,
4906  "aio: %lu consecutive %lu:th segment, first offs %lu blocks\n",
4907  n_consecutive, global_segment, slot->offset / UNIV_PAGE_SIZE);
4908 #endif
4909 
4910  if (slot->type == OS_FILE_READ && n_consecutive > 1) {
4911  /* Copy the combined buffer to individual buffers */
4912  offs = 0;
4913 
4914  for (i = 0; i < n_consecutive; i++) {
4915 
4916  ut_memcpy(consecutive_ios[i]->buf, combined_buf + offs,
4917  consecutive_ios[i]->len);
4918  offs += consecutive_ios[i]->len;
4919  }
4920  }
4921 
4922  if (combined_buf2) {
4923  ut_free(combined_buf2);
4924  }
4925 
4926  os_mutex_enter(array->mutex);
4927 
4928  /* Mark the i/os done in slots */
4929 
4930  for (i = 0; i < n_consecutive; i++) {
4931  consecutive_ios[i]->io_already_done = TRUE;
4932  }
4933 
4934  /* We return the messages for the first slot now, and if there were
4935  several slots, the messages will be returned with subsequent calls
4936  of this function */
4937 
/* Reached with array->mutex held, both from the early "already done"
scan and from the normal i/o path. */
4938 slot_io_done:
4939 
4940  ut_a(slot->reserved);
4941 
4942  *message1 = slot->message1;
4943  *message2 = slot->message2;
4944 
4945  *type = slot->type;
4946 
4947  os_mutex_exit(array->mutex);
4948 
4949  os_aio_array_free_slot(array, slot);
4950 
4951  return(ret);
4952 
4953 wait_for_io:
4954  srv_set_io_thread_op_info(global_segment, "resetting wait event");
4955 
4956  /* We wait here until there again can be i/os in the segment
4957  of this thread */
4958 
4959  os_event_reset(os_aio_segment_wait_events[global_segment]);
4960 
4961  os_mutex_exit(array->mutex);
4962 
/* Entered directly (without the mutex) when read threads were recommended
to sleep. */
4963 recommended_sleep:
4964  srv_set_io_thread_op_info(global_segment, "waiting for i/o request");
4965 
4966  os_event_wait(os_aio_segment_wait_events[global_segment]);
4967 
4968  if (os_aio_print_debug) {
4969  fprintf(stderr,
4970  "InnoDB: i/o handler thread for i/o"
4971  " segment %lu wakes up\n",
4972  (ulong) global_segment);
4973  }
4974 
4975  goto restart;
4976 }
4977 
4978 /**********************************************************************/
4981 static
4982 ibool
4983 os_aio_array_validate(
4984 /*==================*/
4985  os_aio_array_t* array)
4986 {
4987  os_aio_slot_t* slot;
4988  ulint n_reserved = 0;
4989  ulint i;
4990 
4991  ut_a(array);
4992 
4993  os_mutex_enter(array->mutex);
4994 
4995  ut_a(array->n_slots > 0);
4996  ut_a(array->n_segments > 0);
4997 
4998  for (i = 0; i < array->n_slots; i++) {
4999  slot = os_aio_array_get_nth_slot(array, i);
5000 
5001  if (slot->reserved) {
5002  n_reserved++;
5003  ut_a(slot->len > 0);
5004  }
5005  }
5006 
5007  ut_a(array->n_reserved == n_reserved);
5008 
5009  os_mutex_exit(array->mutex);
5010 
5011  return(TRUE);
5012 }
5013 
5014 /**********************************************************************/
/* Runs os_aio_array_validate() on the read, write, ibuf, log and sync aio
arrays. The individual return values are not inspected; this function
returns TRUE unconditionally once all five calls have returned.
NOTE(review): the declarator line that carries the function name is absent
from this listing (the embedded numbering jumps from 5018 to 5020);
confirm against the original source. */
5017 UNIV_INTERN
5018 ibool
5020 /*=================*/
5021 {
5022  os_aio_array_validate(os_aio_read_array);
5023  os_aio_array_validate(os_aio_write_array);
5024  os_aio_array_validate(os_aio_ibuf_array);
5025  os_aio_array_validate(os_aio_log_array);
5026  os_aio_array_validate(os_aio_sync_array);
5027 
5028  return(TRUE);
5029 }
5030 
5031 /**********************************************************************/
5036 static
5037 void
5038 os_aio_print_segment_info(
5039 /*======================*/
5040  FILE* file,
5041  ulint* n_seg,
5042  os_aio_array_t* array)
5043 {
5044  ulint i;
5045 
5046  ut_ad(array);
5047  ut_ad(n_seg);
5048  ut_ad(array->n_segments > 0);
5049 
5050  if (array->n_segments == 1) {
5051  return;
5052  }
5053 
5054  fprintf(file, " [");
5055  for (i = 0; i < array->n_segments; i++) {
5056  if (i != 0) {
5057  fprintf(file, ", ");
5058  }
5059 
5060  fprintf(file, "%lu", n_seg[i]);
5061  }
5062  fprintf(file, "] ");
5063 }
5064 
5065 /**********************************************************************/
/* Prints i/o statistics to a stream: the state of every i/o handler
thread, the number of reserved slots in each aio array (with a per-segment
breakdown), pending flush counts, cumulative read/write/fsync counters and
per-second rates since the previous printout. The "old" counters and
os_last_printout are refreshed at the end so that the next call reports a
fresh interval.
NOTE(review): the declarator line that carries the function name is absent
from this listing (the embedded numbering jumps from 5068 to 5070);
confirm against the original source.
Parameters:
file	stream to print to */
5067 UNIV_INTERN
5068 void
5070 /*=========*/
5071  FILE* file)
5072 {
5073  os_aio_array_t* array;
5074  os_aio_slot_t* slot;
5075  ulint n_reserved;
5076  ulint n_res_seg[SRV_MAX_N_IO_THREADS];
5077  time_t current_time;
5078  double time_elapsed;
5079  double avg_bytes_read;
5080  ulint i;
5081 
5082  for (i = 0; i < srv_n_file_io_threads; i++) {
5083  fprintf(file, "I/O thread %lu state: %s (%s)", (ulong) i,
5084  srv_io_thread_op_info[i],
5085  srv_io_thread_function[i]);
5086 
5087 #ifndef __WIN__
5088  if (os_aio_segment_wait_events[i]->is_set) {
5089  fprintf(file, " ev set");
5090  }
5091 #endif
5092 
5093  fprintf(file, "\n");
5094  }
5095 
5096  fputs("Pending normal aio reads:", file);
5097 
5098  array = os_aio_read_array;
/* The section below runs once per array, in the order read, write, ibuf,
log, sync; the chain of "goto loop" further down selects the next array. */
5099 loop:
5100  ut_a(array);
5101 
5102  os_mutex_enter(array->mutex);
5103 
5104  ut_a(array->n_slots > 0);
5105  ut_a(array->n_segments > 0);
5106 
5107  n_reserved = 0;
5108 
5109  memset(n_res_seg, 0x0, sizeof(n_res_seg));
5110 
5111  for (i = 0; i < array->n_slots; i++) {
5112  ulint seg_no;
5113 
5114  slot = os_aio_array_get_nth_slot(array, i);
5115 
/* Segment that slot i belongs to. */
5116  seg_no = (i * array->n_segments) / array->n_slots;
5117  if (slot->reserved) {
5118  n_reserved++;
5119  n_res_seg[seg_no]++;
5120 #if 0
5121  fprintf(stderr, "Reserved slot, messages %p %p\n",
5122  (void*) slot->message1,
5123  (void*) slot->message2);
5124 #endif
5125  ut_a(slot->len > 0);
5126  }
5127  }
5128 
5129  ut_a(array->n_reserved == n_reserved);
5130 
5131  fprintf(file, " %lu", (ulong) n_reserved);
5132 
5133  os_aio_print_segment_info(file, n_res_seg, array);
5134 
5135  os_mutex_exit(array->mutex);
5136 
5137  if (array == os_aio_read_array) {
5138  fputs(", aio writes:", file);
5139 
5140  array = os_aio_write_array;
5141 
5142  goto loop;
5143  }
5144 
5145  if (array == os_aio_write_array) {
5146  fputs(",\n ibuf aio reads:", file);
5147  array = os_aio_ibuf_array;
5148 
5149  goto loop;
5150  }
5151 
5152  if (array == os_aio_ibuf_array) {
5153  fputs(", log i/o's:", file);
5154  array = os_aio_log_array;
5155 
5156  goto loop;
5157  }
5158 
5159  if (array == os_aio_log_array) {
5160  fputs(", sync i/o's:", file);
5161  array = os_aio_sync_array;
5162 
5163  goto loop;
5164  }
5165 
5166  putc('\n', file);
5167  current_time = time(NULL);
/* The 0.001 keeps the divisions below away from zero when two printouts
fall within the same second. */
5168  time_elapsed = 0.001 + difftime(current_time, os_last_printout);
5169 
5170  fprintf(file,
5171  "Pending flushes (fsync) log: %lu; buffer pool: %lu\n"
5172  "%lu OS file reads, %lu OS file writes, %lu OS fsyncs\n",
5173  (ulong) fil_n_pending_log_flushes,
/* NOTE(review): the format string has five %lu conversions but only four
arguments appear in this listing (embedded numbering jumps from 5173 to
5175); the argument for "buffer pool: %lu" seems to have been lost -
confirm against the original source. */
5175  (ulong) os_n_file_reads, (ulong) os_n_file_writes,
5176  (ulong) os_n_fsyncs);
5177 
/* NOTE(review): the closing brace after this fprintf has no visible
opening statement (embedded numbering jumps from 5177 to 5179); a guarding
"if (...) {" line seems to have been lost - confirm against the original
source. */
5179  fprintf(file,
5180  "%lu pending preads, %lu pending pwrites\n",
5181  (ulong) os_file_n_pending_preads,
5182  (ulong) os_file_n_pending_pwrites);
5183  }
5184 
/* Avoid dividing by zero when no reads happened in the interval. */
5185  if (os_n_file_reads == os_n_file_reads_old) {
5186  avg_bytes_read = 0.0;
5187  } else {
5188  avg_bytes_read = (double) os_bytes_read_since_printout
5189  / (os_n_file_reads - os_n_file_reads_old);
5190  }
5191 
5192  fprintf(file,
5193  "%.2f reads/s, %lu avg bytes/read,"
5194  " %.2f writes/s, %.2f fsyncs/s\n",
5195  (os_n_file_reads - os_n_file_reads_old)
5196  / time_elapsed,
5197  (ulong)avg_bytes_read,
5198  (os_n_file_writes - os_n_file_writes_old)
5199  / time_elapsed,
5200  (os_n_fsyncs - os_n_fsyncs_old)
5201  / time_elapsed);
5202 
/* Start a new measurement interval. */
5203  os_n_file_reads_old = os_n_file_reads;
5204  os_n_file_writes_old = os_n_file_writes;
5205  os_n_fsyncs_old = os_n_fsyncs;
5206  os_bytes_read_since_printout = 0;
5207 
5208  os_last_printout = current_time;
5209 }
5210 
5211 /**********************************************************************/
/* Starts a new statistics interval: copies the current read, write and
fsync counters into their "_old" counterparts, zeroes
os_bytes_read_since_printout and sets os_last_printout to the current
time.
NOTE(review): the declarator line that carries the function name is absent
from this listing (the embedded numbering jumps from 5214 to 5216);
confirm against the original source. */
5213 UNIV_INTERN
5214 void
5216 /*======================*/
5217 {
5218  os_n_file_reads_old = os_n_file_reads;
5219  os_n_file_writes_old = os_n_file_writes;
5220  os_n_fsyncs_old = os_n_fsyncs;
5221  os_bytes_read_since_printout = 0;
5222 
5223  os_last_printout = time(NULL);
5224 }
5225 
5226 #ifdef UNIV_DEBUG
5227 /**********************************************************************/
5231 UNIV_INTERN
5232 ibool
5233 os_aio_all_slots_free(void)
5234 /*=======================*/
5235 {
5236  os_aio_array_t* array;
5237  ulint n_res = 0;
5238 
5239  array = os_aio_read_array;
5240 
5241  os_mutex_enter(array->mutex);
5242 
5243  n_res += array->n_reserved;
5244 
5245  os_mutex_exit(array->mutex);
5246 
5247  array = os_aio_write_array;
5248 
5249  os_mutex_enter(array->mutex);
5250 
5251  n_res += array->n_reserved;
5252 
5253  os_mutex_exit(array->mutex);
5254 
5255  array = os_aio_ibuf_array;
5256 
5257  os_mutex_enter(array->mutex);
5258 
5259  n_res += array->n_reserved;
5260 
5261  os_mutex_exit(array->mutex);
5262 
5263  array = os_aio_log_array;
5264 
5265  os_mutex_enter(array->mutex);
5266 
5267  n_res += array->n_reserved;
5268 
5269  os_mutex_exit(array->mutex);
5270 
5271  array = os_aio_sync_array;
5272 
5273  os_mutex_enter(array->mutex);
5274 
5275  n_res += array->n_reserved;
5276 
5277  os_mutex_exit(array->mutex);
5278 
5279  if (n_res == 0) {
5280 
5281  return(TRUE);
5282  }
5283 
5284  return(FALSE);
5285 }
5286 #endif /* UNIV_DEBUG */
5287 
5288 #endif /* !UNIV_HOTBACKUP */
int os_file_t
Definition: os0file.h:87
UNIV_INTERN ibool os_file_close_func(os_file_t file)
Definition: os0file.cc:1820
FILE * os_file_create_tmpfile(void)
Definition: os0file.cc:729
#define OS_AIO_IBUF
Definition: os0file.h:158
ibool io_already_done
Definition: os0file.cc:177
ulint os_file_n_pending_pwrites
Definition: os0file.cc:301
UNIV_INTERN ibool os_file_create_directory(const char *pathname, ibool fail_if_exists)
Definition: os0file.cc:1042
UNIV_INTERN void os_aio_simulated_put_read_threads_to_sleep(void)
Definition: os0file.cc:3933
ulint os_file_n_pending_preads
Definition: os0file.cc:299
os_event_t is_empty
Definition: os0file.cc:209
UNIV_INTERN void os_mutex_free(os_mutex_t mutex)
Definition: os0sync.cc:840
UNIV_INTERN void os_file_set_nocache(int fd, const char *file_name, const char *operation_name)
Definition: os0file.cc:1340
UNIV_INTERN ib_int64_t os_file_get_size_as_iblonglong(os_file_t file)
Definition: os0file.cc:1943
UNIV_INTERN void os_io_init_simple(void)
Definition: os0file.cc:711
UNIV_INTERN ibool os_file_get_size(os_file_t file, ulint *size, ulint *size_high)
Definition: os0file.cc:1895
UNIV_INTERN void srv_set_io_thread_op_info(ulint i, const char *str)
Definition: srv0srv.cc:841
#define mem_free(PTR)
Definition: mem0mem.h:249
UNIV_INTERN ibool os_file_read_no_error_handling_func(os_file_t file, void *buf, ulint offset, ulint offset_high, ulint n)
Definition: os0file.cc:2568
#define OS_AIO_SYNC
Definition: os0file.h:162
UNIV_INTERN ibool os_file_create_subdirs_if_needed(const char *path)
Definition: os0file.cc:3115
UNIV_INLINE void * ut_memcpy(void *dest, const void *sour, ulint n)
#define OS_WINVISTA
Definition: os0file.h:200
UNIV_INTERN os_event_t os_event_create(const char *name)
Definition: os0sync.cc:365
UNIV_INTERN void * ut_malloc(ulint n)
Definition: ut0mem.cc:235
UNIV_INTERN ibool os_file_delete_if_exists(const char *name)
Definition: os0file.cc:1653
const char * name
Definition: os0file.cc:176
#define OS_WIN31
Definition: os0file.h:193
os_file_type_t type
Definition: os0file.h:384
#define OS_WIN95
Definition: os0file.h:194
os_aio_slot_t * slots
Definition: os0file.cc:228
UNIV_INTERN void os_aio_wake_all_threads_at_shutdown(void)
Definition: os0file.cc:3537
UNIV_INTERN os_file_t os_file_create_simple_no_error_handling_func(const char *name, ulint create_mode, ulint access_type, ibool *success)
Definition: os0file.cc:1230
UNIV_INTERN void os_mutex_enter(os_mutex_t mutex)
Definition: os0sync.cc:809
#define OS_FILE_READ
Definition: os0file.h:144
UNIV_INTERN ibool os_file_read_func(os_file_t file, void *buf, ulint offset, ulint offset_high, ulint n)
Definition: os0file.cc:2439
UNIV_INTERN void os_aio_print(FILE *file)
Definition: os0file.cc:5069
UNIV_INTERN int os_file_readdir_next_file(const char *dirname, os_file_dir_t dir, os_file_stat_t *info)
Definition: os0file.cc:852
UNIV_INTERN char * os_file_dirname(const char *path)
Definition: os0file.cc:3085
UNIV_INTERN ibool os_file_rename_func(const char *oldpath, const char *newpath)
Definition: os0file.cc:1780
UNIV_INTERN os_file_dir_t os_file_opendir(const char *dirname, ibool error_is_fatal)
Definition: os0file.cc:761
UNIV_INTERN ibool os_aio_simulated_handle(ulint segment, fil_node_t **message1, void **message2, ulint *type)
Definition: os0file.cc:4654
UNIV_INLINE ulint ut_min(ulint n1, ulint n2)
UNIV_INTERN ibool os_aio_validate(void)
Definition: os0file.cc:5019
time_t reservation_time
Definition: os0file.cc:167
#define OS_FILE_LOG_BLOCK_SIZE
Definition: os0file.h:104
UNIV_INTERN ibool os_file_write_func(const char *name, os_file_t file, const void *buf, ulint offset, ulint offset_high, ulint n)
Definition: os0file.cc:2700
#define OS_WINNT
Definition: os0file.h:195
UNIV_INTERN ibool os_file_get_status(const char *path, os_file_stat_t *stat_info)
Definition: os0file.cc:2975
UNIV_INTERN os_file_t os_file_create_simple_func(const char *name, ulint create_mode, ulint access_type, ibool *success)
Definition: os0file.cc:1087
fil_node_t * message1
Definition: os0file.cc:182
ibool srv_start_raw_disk_in_use
Definition: srv0start.cc:110
UNIV_INTERN ibool os_file_set_eof(FILE *file)
Definition: os0file.cc:2055
UNIV_INTERN os_file_t os_file_create_func(const char *name, ulint create_mode, ulint purpose, ulint type, ibool *success)
Definition: os0file.cc:1391
UNIV_INLINE void * ut_align(const void *ptr, ulint align_no)
os_mutex_t mutex
Definition: os0file.cc:204
UNIV_INTERN void os_mutex_exit(os_mutex_t mutex)
Definition: os0sync.cc:824
UNIV_INTERN void os_event_set(os_event_t event)
Definition: os0sync.cc:434
UNIV_INTERN int os_file_closedir(os_file_dir_t dir)
Definition: os0file.cc:817
UNIV_INTERN void os_thread_sleep(ulint tm)
Definition: os0thread.cc:265
#define OS_FILE_NOT_FOUND
Definition: os0file.h:130
UNIV_INTERN ib_int64_t os_event_reset(os_event_t event)
Definition: os0sync.cc:472
UNIV_INTERN void os_file_read_string(FILE *file, char *str, ulint size)
Definition: os0file.cc:2676
#define ut_a(EXPR)
Definition: ut0dbg.h:105
UNIV_INTERN void os_aio_wait_until_no_pending_writes(void)
Definition: os0file.cc:3575
ib_int64_t size
Definition: os0file.h:385
UNIV_INTERN ulint os_file_get_last_error(ibool report_all_errors)
Definition: os0file.cc:385
#define OS_WIN2000
Definition: os0file.h:196
#define OS_AIO_NORMAL
Definition: os0file.h:155
UNIV_INTERN int innobase_mysql_tmpfile(void)
Definition: ha_innodb.cc:1479
UNIV_INTERN os_mutex_t os_mutex_create(void)
Definition: os0sync.cc:774
UNIV_INTERN ibool os_file_delete(const char *name)
Definition: os0file.cc:1715
my_bool srv_file_per_table
Definition: srv0srv.cc:125
#define OS_DATA_FILE
Definition: os0file.h:125
#define ut_ad(EXPR)
Definition: ut0dbg.h:127
os_file_t file
Definition: os0file.cc:175
ulint fil_n_pending_tablespace_flushes
Definition: fil0fil.cc:120
os_event_t not_full
Definition: os0file.cc:205
UNIV_INTERN ibool os_file_set_size(const char *name, os_file_t file, ulint size, ulint size_high)
Definition: os0file.cc:1966
UNIV_INTERN void ut_free(void *ptr)
Definition: ut0mem.cc:294
#define ut_error
Definition: ut0dbg.h:115
UNIV_INLINE char * mem_strdupl(const char *str, ulint len)
#define OS_WIN7
Definition: os0file.h:203
UNIV_INLINE char * mem_strdup(const char *str)
UNIV_INTERN void os_event_free(os_event_t event)
Definition: os0sync.cc:535
DIR * os_file_dir_t
Definition: os0file.h:395
UNIV_INTERN void os_aio_refresh_stats(void)
Definition: os0file.cc:5215
ibool os_aio_print_debug
Definition: os0file.cc:149
ulint os_innodb_umask
Definition: os0file.cc:71
UNIV_INTERN ibool os_file_flush_func(os_file_t file)
Definition: os0file.cc:2122
UNIV_INTERN void ut_print_timestamp(FILE *file)
Definition: ut0ut.cc:247
UNIV_INTERN void os_aio_simulated_wake_handler_threads(void)
Definition: os0file.cc:3908
#define OS_FILE_OPEN
Definition: os0file.h:107
#define OS_WINXP
Definition: os0file.h:197
srv_shutdown_state
Definition: srv0start.h:113
UNIV_INTERN void os_thread_exit(void *exit_value)
Definition: os0thread.cc:199
ibool srv_is_being_started
Definition: srv0start.cc:116
ulint os_n_pending_reads
Definition: os0file.cc:305
UNIV_INTERN ibool os_file_status(const char *path, ibool *exists, os_file_type_t *type)
Definition: os0file.cc:2903
os_file_t handle
Definition: fil0fil.cc:141
#define OS_AIO_SIMULATED_WAKE_LATER
Definition: os0file.h:180
char name[OS_FILE_MAX_PATH]
Definition: os0file.h:383
UNIV_INTERN ibool os_aio_func(ulint type, ulint mode, const char *name, os_file_t file, void *buf, ulint offset, ulint offset_high, ulint n, fil_node_t *message1, void *message2)
Definition: os0file.cc:4018
#define OS_AIO_LOG
Definition: os0file.h:161
ulint os_n_pending_writes
Definition: os0file.cc:303
ulint fil_n_pending_log_flushes
Definition: fil0fil.cc:118