1 /*****************************************************************************
2 * RRDtool 1.4.3 Copyright by Tobi Oetiker, 1997-2010
3 *****************************************************************************
4 * rrd_open.c Open an RRD File
5 *****************************************************************************
7 *****************************************************************************/
19 #ifdef HAVE_BROKEN_MS_ASYNC
20 #include <sys/types.h>
27 #define _LK_UNLCK 0 /* Unlock */
28 #define _LK_LOCK 1 /* Lock */
29 #define _LK_NBLCK 2 /* Non-blocking lock */
30 #define _LK_RLCK 3 /* Lock for read only */
31 #define _LK_NBRLCK 4 /* Non-blocking lock for read only */
34 #define LK_UNLCK _LK_UNLCK
35 #define LK_LOCK _LK_LOCK
36 #define LK_NBLCK _LK_NBLCK
37 #define LK_RLCK _LK_RLCK
38 #define LK_NBRLCK _LK_NBRLCK
41 /* DEBUG 2 prints information obtained via mincore(2) */
43 /* do not calculate exact madvise hints but assume 1 page for headers and
44 * set DONTNEED for the rest, which is assumed to be data */
45 /* Avoid calling madvise on areas that were already hinted. May be beneficial if
46 * your syscalls are very slow */
49 /* the cast to void* is there to avoid this warning seen on ia64 with certain
50 versions of gcc: 'cast increases required alignment of target type'
52 #define __rrd_read(dst, dst_t, cnt) { \
53 size_t wanted = sizeof(dst_t)*(cnt); \
54 if (offset + wanted > rrd_file->file_len) { \
55 rrd_set_error("reached EOF while loading header " #dst); \
56 goto out_nullify_head; \
58 (dst) = (dst_t*)(void*) (data + offset); \
62 #define __rrd_read(dst, dst_t, cnt) { \
63 size_t wanted = sizeof(dst_t)*(cnt); \
65 if ((dst = (dst_t*)malloc(wanted)) == NULL) { \
66 rrd_set_error(#dst " malloc"); \
67 goto out_nullify_head; \
69 got = read (rrd_simple_file->fd, dst, wanted); \
70 if (got != wanted) { \
71 rrd_set_error("short read while reading header " #dst); \
72 goto out_nullify_head; \
78 /* get the address of the start of this page */
79 #if defined USE_MADVISE || defined HAVE_POSIX_FADVISE
81 #define PAGE_START(addr) ((addr)&(~(_page_size-1)))
85 /* Open a database file, return its header and an open filehandle,
86 * positioned to the first cdp in the first rra.
87 * In the error path of rrd_open, only rrd_free(&rrd) has to be called
88 * before returning an error. Do not call rrd_close upon failure of rrd_open.
89 * If creating a new file, the parameter rrd must be initialised with
90 * details of the file content.
91 * If opening an existing file, then rrd must be initialised by
92 * rrd_init(rrd) prior to invoking rrd_open
96 const char *const file_name,
105 ssize_t _page_size = sysconf(_SC_PAGESIZE);
106 char *data = MAP_FAILED;
110 rrd_file_t *rrd_file = NULL;
111 rrd_simple_file_t *rrd_simple_file = NULL;
112 size_t newfile_size = 0;
113 size_t header_len, value_cnt, data_len;
115 /* Are we creating a new file? */
116 if((rdwr & RRD_CREAT) && (rrd->stat_head != NULL))
118 header_len = rrd_get_header_size(rrd);
121 for (ui = 0; ui < rrd->stat_head->rra_cnt; ui++)
122 value_cnt += rrd->stat_head->ds_cnt * rrd->rra_def[ui].row_cnt;
124 data_len = sizeof(rrd_value_t) * value_cnt;
126 newfile_size = header_len + data_len;
129 rrd_file = (rrd_file_t*)malloc(sizeof(rrd_file_t));
130 if (rrd_file == NULL) {
131 rrd_set_error("allocating rrd_file descriptor for '%s'", file_name);
134 memset(rrd_file, 0, sizeof(rrd_file_t));
136 rrd_file->pvt = malloc(sizeof(rrd_simple_file_t));
137 if(rrd_file->pvt == NULL) {
138 rrd_set_error("allocating rrd_simple_file for '%s'", file_name);
141 memset(rrd_file->pvt, 0, sizeof(rrd_simple_file_t));
142 rrd_simple_file = (rrd_simple_file_t *)rrd_file->pvt;
145 if ((rdwr & (RRD_READONLY | RRD_READWRITE)) ==
146 (RRD_READONLY | RRD_READWRITE)) {
147 /* Both READONLY and READWRITE were given, which is invalid. */
148 rrd_set_error("in read/write request mask");
154 rrd_simple_file->mm_prot = PROT_READ;
155 rrd_simple_file->mm_flags = 0;
158 if (rdwr & RRD_READONLY) {
162 rrd_simple_file->mm_flags = MAP_PRIVATE;
164 # ifdef MAP_NORESERVE
165 rrd_simple_file->mm_flags |= MAP_NORESERVE; /* readonly, so no swap backing needed */
169 if (rdwr & RRD_READWRITE) {
172 rrd_simple_file->mm_flags = MAP_SHARED;
173 rrd_simple_file->mm_prot |= PROT_WRITE;
176 if (rdwr & RRD_CREAT) {
177 flags |= (O_CREAT | O_TRUNC);
179 if (rdwr & RRD_EXCL) {
183 if (rdwr & RRD_READAHEAD) {
185 rrd_simple_file->mm_flags |= MAP_POPULATE; /* populate ptes and data */
187 #if defined MAP_NONBLOCK
188 rrd_simple_file->mm_flags |= MAP_NONBLOCK; /* just populate ptes */
191 #if defined(_WIN32) && !defined(__CYGWIN__) && !defined(__CYGWIN32__)
195 if ((rrd_simple_file->fd = open(file_name, flags, 0666)) < 0) {
196 rrd_set_error("opening '%s': %s", file_name, rrd_strerror(errno));
201 #ifdef HAVE_BROKEN_MS_ASYNC
202 if (rdwr & RRD_READWRITE) {
203 /* On some unices, the file's mtime does not get updated
204 on msync MS_ASYNC; in order to help them,
205 we update the timestamp at this point.
206 The thing happens pretty 'close' to the open
207 call so the chances of a race should be minimal.
209 Maybe ask your vendor to fix your OS ... */
210 utime(file_name,NULL);
215 /* Better try to avoid seeks as much as possible. stat may be heavy but
216 * many concurrent seeks are even worse. */
217 if (newfile_size == 0 && ((fstat(rrd_simple_file->fd, &statb)) < 0)) {
218 rrd_set_error("fstat '%s': %s", file_name, rrd_strerror(errno));
221 if (newfile_size == 0) {
222 rrd_file->file_len = statb.st_size;
224 rrd_file->file_len = newfile_size;
225 #ifdef HAVE_POSIX_FALLOCATE
226 if (posix_fallocate(rrd_simple_file->fd, 0, newfile_size) == -1) {
227 rrd_set_error("posix_fallocate '%s': %s", file_name,
228 rrd_strerror(errno));
232 lseek(rrd_simple_file->fd, newfile_size - 1, SEEK_SET);
233 if ( write(rrd_simple_file->fd, "\0", 1) == -1){ /* poke */
234 rrd_set_error("write '%s': %s", file_name, rrd_strerror(errno));
237 lseek(rrd_simple_file->fd, 0, SEEK_SET);
240 #ifdef HAVE_POSIX_FADVISE
241 /* In general we need no read-ahead when dealing with rrd_files.
242 When we stop reading, it is highly unlikely that we start up again.
243 In this manner we actually save time and disk access (and buffer cache).
244 Thanks to Dave Plonka for the idea of using POSIX_FADV_RANDOM here. */
245 posix_fadvise(rrd_simple_file->fd, 0, 0, POSIX_FADV_RANDOM);
249 if (rdwr & RRD_READWRITE)
251 if (setvbuf((rrd_simple_file->fd),NULL,_IONBF,2)) {
252 rrd_set_error("failed to disable the stream buffer\n");
259 #ifndef HAVE_POSIX_FALLOCATE
260 /* force allocating the file on the underlying filesystem to prevent any
261 * future bus error when the filesystem is full and attempting to write
262 * through the file mapping. Filling the file using memset on the file
263 * mapping can also lead to a bus error, so we use the old fashioned
266 if (rdwr & RRD_CREAT) {
270 memset(buf, DNAN, sizeof buf);
271 lseek(rrd_simple_file->fd, offset, SEEK_SET);
273 for (i = 0; i < (newfile_size - 1) / sizeof buf; ++i)
275 if (write(rrd_simple_file->fd, buf, sizeof buf) == -1)
277 rrd_set_error("write '%s': %s", file_name, rrd_strerror(errno));
282 if (write(rrd_simple_file->fd, buf,
283 (newfile_size - 1) % sizeof buf) == -1)
285 rrd_set_error("write '%s': %s", file_name, rrd_strerror(errno));
289 lseek(rrd_simple_file->fd, 0, SEEK_SET);
293 data = mmap(0, rrd_file->file_len,
294 rrd_simple_file->mm_prot, rrd_simple_file->mm_flags,
295 rrd_simple_file->fd, offset);
297 /* let's see if the mmap worked */
298 if (data == MAP_FAILED) {
299 rrd_set_error("mmaping file '%s': %s", file_name,
300 rrd_strerror(errno));
303 rrd_simple_file->file_start = data;
304 if (rdwr & RRD_CREAT) {
308 if (rdwr & RRD_CREAT)
311 if (rdwr & RRD_COPY) {
312 /* We will read everything in a moment (copying) */
313 madvise(data, rrd_file->file_len, MADV_WILLNEED );
314 madvise(data, rrd_file->file_len, MADV_SEQUENTIAL );
316 /* We do not need to read anything in for the moment */
317 madvise(data, rrd_file->file_len, MADV_RANDOM);
318 /* the stat_head will be needed soonish, so hint accordingly */
319 madvise(data, sizeof(stat_head_t), MADV_WILLNEED);
320 madvise(data, sizeof(stat_head_t), MADV_RANDOM);
324 __rrd_read(rrd->stat_head, stat_head_t,
327 /* let's do some tests to see if we are on track ... */
328 if (memcmp(rrd->stat_head->cookie, RRD_COOKIE, sizeof(RRD_COOKIE)) != 0) {
329 rrd_set_error("'%s' is not an RRD file", file_name);
330 goto out_nullify_head;
333 if (rrd->stat_head->float_cookie != FLOAT_COOKIE) {
334 rrd_set_error("This RRD was created on another architecture");
335 goto out_nullify_head;
338 version = atoi(rrd->stat_head->version);
340 if (version > atoi(RRD_VERSION)) {
341 rrd_set_error("can't handle RRD file version %s",
342 rrd->stat_head->version);
343 goto out_nullify_head;
345 #if defined USE_MADVISE
346 /* the ds_def will be needed soonish, so hint accordingly */
347 madvise(data + PAGE_START(offset),
348 sizeof(ds_def_t) * rrd->stat_head->ds_cnt, MADV_WILLNEED);
350 __rrd_read(rrd->ds_def, ds_def_t,
351 rrd->stat_head->ds_cnt);
353 #if defined USE_MADVISE
354 /* the rra_def will be needed soonish, so hint accordingly */
355 madvise(data + PAGE_START(offset),
356 sizeof(rra_def_t) * rrd->stat_head->rra_cnt, MADV_WILLNEED);
358 __rrd_read(rrd->rra_def, rra_def_t,
359 rrd->stat_head->rra_cnt);
361 /* handle different format for the live_head */
363 rrd->live_head = (live_head_t *) malloc(sizeof(live_head_t));
364 if (rrd->live_head == NULL) {
365 rrd_set_error("live_head_t malloc");
368 #if defined USE_MADVISE
369 /* the live_head will be needed soonish, so hint accordingly */
370 madvise(data + PAGE_START(offset), sizeof(time_t), MADV_WILLNEED);
372 __rrd_read(rrd->legacy_last_up, time_t,
375 rrd->live_head->last_up = *rrd->legacy_last_up;
376 rrd->live_head->last_up_usec = 0;
378 #if defined USE_MADVISE
379 /* the live_head will be needed soonish, so hint accordingly */
380 madvise(data + PAGE_START(offset),
381 sizeof(live_head_t), MADV_WILLNEED);
383 __rrd_read(rrd->live_head, live_head_t,
386 __rrd_read(rrd->pdp_prep, pdp_prep_t,
387 rrd->stat_head->ds_cnt);
388 __rrd_read(rrd->cdp_prep, cdp_prep_t,
389 rrd->stat_head->rra_cnt * rrd->stat_head->ds_cnt);
390 __rrd_read(rrd->rra_ptr, rra_ptr_t,
391 rrd->stat_head->rra_cnt);
393 rrd_file->header_len = offset;
394 rrd_file->pos = offset;
397 unsigned long row_cnt = 0;
399 for (ui=0; ui<rrd->stat_head->rra_cnt; ui++)
400 row_cnt += rrd->rra_def[ui].row_cnt;
402 size_t correct_len = rrd_file->header_len +
403 sizeof(rrd_value_t) * row_cnt * rrd->stat_head->ds_cnt;
405 if (correct_len > rrd_file->file_len)
407 rrd_set_error("'%s' is too small (should be %ld bytes)",
408 file_name, (long long) correct_len);
409 goto out_nullify_head;
416 rrd->stat_head = NULL;
419 if (data != MAP_FAILED)
420 munmap(data, rrd_file->file_len);
423 close(rrd_simple_file->fd);
431 #if defined DEBUG && DEBUG > 1
432 /* Print list of in-core pages of the current rrd_file. */
435 rrd_file_t *rrd_file,
438 rrd_simple_file_t *rrd_simple_file;
439 rrd_simple_file = (rrd_simple_file_t *)rrd_file->pvt;
441 /* pretty print blocks in core */
444 ssize_t _page_size = sysconf(_SC_PAGESIZE);
446 off = rrd_file->file_len +
447 ((rrd_file->file_len + _page_size - 1) / _page_size);
451 if (mincore(rrd_simple_file->file_start, rrd_file->file_len, vec) == 0) {
453 unsigned is_in = 0, was_in = 0;
455 for (off = 0, prev = 0; off < rrd_file->file_len; ++off) {
456 is_in = vec[off] & 1; /* if lsb set then is core resident */
459 if (was_in != is_in) {
460 fprintf(stderr, "%s: %sin core: %p len %ld\n", mark,
461 was_in ? "" : "not ", vec + prev, off - prev);
467 "%s: %sin core: %p len %ld\n", mark,
468 was_in ? "" : "not ", vec + prev, off - prev);
470 fprintf(stderr, "mincore: %s", rrd_strerror(errno));
473 fprintf(stderr, "sorry mincore only works with mmap");
476 #endif /* defined DEBUG && DEBUG > 1 */
479 * get exclusive lock to whole file.
480 * lock gets removed when we close the file
482 * returns 0 on success
485 rrd_file_t *rrd_file)
488 rrd_simple_file_t *rrd_simple_file;
489 rrd_simple_file = (rrd_simple_file_t *)rrd_file->pvt;
492 #if defined(_WIN32) && !defined(__CYGWIN__) && !defined(__CYGWIN32__)
495 if (_fstat(rrd_simple_file->fd, &st) == 0) {
496 rcstat = _locking(rrd_simple_file->fd, _LK_NBLCK, st.st_size);
503 lock.l_type = F_WRLCK; /* exclusive write lock */
504 lock.l_len = 0; /* whole file */
505 lock.l_start = 0; /* start of file */
506 lock.l_whence = SEEK_SET; /* l_start is relative to the start of the file */
508 rcstat = fcntl(rrd_simple_file->fd, F_SETLK, &lock);
516 /* drop cache except for the header and the active pages */
518 rrd_file_t *rrd_file,
521 rrd_simple_file_t *rrd_simple_file = (rrd_simple_file_t *)rrd_file->pvt;
522 #if defined USE_MADVISE || defined HAVE_POSIX_FADVISE
523 size_t dontneed_start;
527 ssize_t _page_size = sysconf(_SC_PAGESIZE);
529 if (rrd_file == NULL) {
530 #if defined DEBUG && DEBUG
531 fprintf (stderr, "rrd_dontneed: Argument 'rrd_file' is NULL.\n");
536 #if defined DEBUG && DEBUG > 1
537 mincore_print(rrd_file, "before");
540 /* ignoring errors from RRDs that are smaller than the file_len+rounding */
541 rra_start = rrd_file->header_len;
542 dontneed_start = PAGE_START(rra_start) + _page_size;
543 for (i = 0; i < rrd->stat_head->rra_cnt; ++i) {
546 + rrd->rra_ptr[i].cur_row
547 * rrd->stat_head->ds_cnt * sizeof(rrd_value_t));
548 if (active_block > dontneed_start) {
550 madvise(rrd_simple_file->file_start + dontneed_start,
551 active_block - dontneed_start - 1, MADV_DONTNEED);
553 /* in linux at least only fadvise DONTNEED seems to purge pages from cache */
554 #ifdef HAVE_POSIX_FADVISE
555 posix_fadvise(rrd_simple_file->fd, dontneed_start,
556 active_block - dontneed_start - 1,
557 POSIX_FADV_DONTNEED);
560 dontneed_start = active_block;
561 /* do not release 'hot' block if update for this RRA will occur
562 * within 10 minutes */
563 if (rrd->stat_head->pdp_step * rrd->rra_def[i].pdp_cnt -
564 rrd->live_head->last_up % (rrd->stat_head->pdp_step *
565 rrd->rra_def[i].pdp_cnt) < 10 * 60) {
566 dontneed_start += _page_size;
569 rrd->rra_def[i].row_cnt * rrd->stat_head->ds_cnt *
573 if (dontneed_start < rrd_file->file_len) {
575 madvise(rrd_simple_file->file_start + dontneed_start,
576 rrd_file->file_len - dontneed_start, MADV_DONTNEED);
578 #ifdef HAVE_POSIX_FADVISE
579 posix_fadvise(rrd_simple_file->fd, dontneed_start,
580 rrd_file->file_len - dontneed_start,
581 POSIX_FADV_DONTNEED);
585 #if defined DEBUG && DEBUG > 1
586 mincore_print(rrd_file, "after");
588 #endif /* without madvise and posix_fadvise it does not make much sense to do anything */
596 rrd_file_t *rrd_file)
598 rrd_simple_file_t *rrd_simple_file;
599 rrd_simple_file = (rrd_simple_file_t *)rrd_file->pvt;
603 ret = msync(rrd_simple_file->file_start, rrd_file->file_len, MS_ASYNC);
605 rrd_set_error("msync rrd_file: %s", rrd_strerror(errno));
606 ret = munmap(rrd_simple_file->file_start, rrd_file->file_len);
608 rrd_set_error("munmap rrd_file: %s", rrd_strerror(errno));
610 ret = close(rrd_simple_file->fd);
612 rrd_set_error("closing file: %s", rrd_strerror(errno));
620 /* Set position of rrd_file. */
623 rrd_file_t *rrd_file,
628 rrd_simple_file_t *rrd_simple_file;
629 rrd_simple_file = (rrd_simple_file_t *)rrd_file->pvt;
632 if (whence == SEEK_SET)
634 else if (whence == SEEK_CUR)
635 rrd_file->pos += off;
636 else if (whence == SEEK_END)
637 rrd_file->pos = rrd_file->file_len + off;
639 ret = lseek(rrd_simple_file->fd, off, whence);
641 rrd_set_error("lseek: %s", rrd_strerror(errno));
644 /* mimic fseek, which returns 0 upon success */
645 return ret < 0; /*XXX: or just ret to mimic lseek */
649 /* Get current position in rrd_file. */
652 rrd_file_t *rrd_file)
654 return rrd_file->pos;
658 /* Read count bytes into buffer buf, starting at rrd_file->pos.
659 * Returns the number of bytes read or <0 on error. */
662 rrd_file_t *rrd_file,
666 rrd_simple_file_t *rrd_simple_file = (rrd_simple_file_t *)rrd_file->pvt;
671 if (rrd_file->pos > rrd_file->file_len || _cnt == 0) /* EOF */
674 return -1; /* EINVAL */
675 _surplus = rrd_file->pos + _cnt - rrd_file->file_len;
676 if (_surplus > 0) { /* short read */
681 buf = memcpy(buf, rrd_simple_file->file_start + rrd_file->pos, _cnt);
683 rrd_file->pos += _cnt; /* mimic read() semantics */
688 ret = read(rrd_simple_file->fd, buf, count);
690 rrd_file->pos += ret; /* mimic read() semantics */
696 /* Write count bytes from buffer buf to the current position
697 * rrd_file->pos of rrd_simple_file->fd.
698 * Returns the number of bytes written or <0 on error. */
701 rrd_file_t *rrd_file,
705 rrd_simple_file_t *rrd_simple_file = (rrd_simple_file_t *)rrd_file->pvt;
707 size_t old_size = rrd_file->file_len;
711 return -1; /* EINVAL */
713 if((rrd_file->pos + count) > old_size)
715 rrd_set_error("attempting to write beyond end of file (%ld + %ld > %ld)",rrd_file->pos, count, old_size);
718 memcpy(rrd_simple_file->file_start + rrd_file->pos, buf, count);
719 rrd_file->pos += count;
720 return count; /* mimic write() semantics */
722 ssize_t _sz = write(rrd_simple_file->fd, buf, count);
725 rrd_file->pos += _sz;
731 /* this is a leftover from the old days, it serves no purpose
732 and is therefore turned into a no-op */
734 rrd_file_t UNUSED(*rrd_file))
738 /* Initialize RRD header. */
743 rrd->stat_head = NULL;
746 rrd->live_head = NULL;
747 rrd->legacy_last_up = NULL;
749 rrd->pdp_prep = NULL;
750 rrd->cdp_prep = NULL;
751 rrd->rrd_value = NULL;
755 /* free RRD header data. */
761 if (rrd->legacy_last_up) { /* this gets set for version < 3 only */
762 free(rrd->live_head);
769 free(rrd->live_head);
770 free(rrd->stat_head);
776 free(rrd->rrd_value);
781 /* routine used by external libraries to free memory allocated by
791 * rra_update informs us about the RRAs being updated
792 * The low level storage API may use this information for
793 * aligning RRAs within stripes, or other performance enhancements
796 rrd_file_t UNUSED(*rrd_file),
798 unsigned long UNUSED(rra_row),
799 time_t UNUSED(rra_time))
804 * This function is called when creating a new RRD
805 * The storage implementation can use this opportunity to select
806 * a sensible starting row within the file.
807 * The default implementation is random, to ensure that all RRAs
808 * don't change to a new disk block at the same time
810 unsigned long rrd_select_initial_row(
811 rrd_file_t UNUSED(*rrd_file),
816 return rrd_random() % rra->row_cnt;