#include "verneuil.h" #include "vfs.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SQLITE_EXTENSION_INIT1 /** * The Verneuil VFS layer is based on the Linux VFS, with Rust hooks * around file read, write, and locking operations. * * Unlike the Linux VFS, this VFS accidentally passes * `unixexcl-1.2.4.singleproc`, but fails `unixexcl-1.1.4.multiproc`: * the `sqlite3PendingByte` offset is only available when built with * the amalgamated source, so we can't use the test-only lock range. */ /* Temporary directories must have some room left for filenames. */ #define LINUX_VFS_TEMPPATH_MAX (PATH_MAX - 200) #ifndef SQLITE_DEFAULT_FILE_PERMISSIONS # define SQLITE_DEFAULT_FILE_PERMISSIONS 0644 #endif /* * Avoid using file descriptors lower than this value: we don't want * writes to what should be stdout or stderr to stomp over our data. */ #ifndef SQLITE_MINIMUM_FILE_DESCRIPTOR # define SQLITE_MINIMUM_FILE_DESCRIPTOR 3 #endif #ifndef SQLITE_TEMP_FILE_PREFIX # define SQLITE_TEMP_FILE_PREFIX "etilqs_" #endif /** * Some older distributions fail to expose these constants. Hardcode * them if necessary: open file description locks have been around * since Linux 3.15, and we don't try to support anything that old * (getrandom was introduced in 3.17, and we also use that). */ #ifndef F_OFD_GETLK #define F_OFD_GETLK 36 #define F_OFD_SETLK 37 #define F_OFD_SETLKW 38 #endif #if (F_OFD_GETLK != 36) || (F_OFD_SETLK != 37) || (F_OFD_SETLKW != 38) # error "Mismatch in fallback OFD fcntl constants." #endif #ifdef SQLITE_TEST /* * Define these test-only counters in BSS; when the sqlite test driver * also has a definition for these, they will be merged at link-time. */ int sqlite3_sync_count; int sqlite3_fullsync_count; int sqlite3_open_file_count; int sqlite3_current_time; #endif struct verneuil_tracker; struct verneuil_snapshot; struct linux_file { sqlite3_file base; /* * The descriptor for this file object. The underlying kernel * file object must be unique owned by this file struct (i.e., * can't dup this). */ int fd; /* * Current lock level for this file object on the underlying * inode. Matches the level parameters of xLock / xUnlock: * * SQLITE_LOCK_NONE 0 * SQLITE_LOCK_SHARED 1 * SQLITE_LOCK_RESERVED 2 * SQLITE_LOCK_PENDING 3 * SQLITE_LOCK_EXCLUSIVE 4 */ int lock_level; /* * NULLable. Only NULL for temporary files, otherwise this is * an absolute path to the file. */ const char *path; /* * This metadata identifies the inode that `fd` refers to. */ dev_t device; ino_t inode; /* * The Rust-side's change tracking state. */ struct verneuil_tracker *tracker; /* * Wait up to this many milliseconds for each lock * acquisition. */ uint32_t lock_timeout_ms; /* * sqlite expects us to fsync the parent directory along with * this (journal) file, in order to guarantee its visibility. * * We don't actually do that, but we still want to fake it for * tests. */ bool dirsync_pending; /* * This flag is set to true once sqlite acquires the write * lock, and cleared after the first write to the locked file. * * `verneuil__file_write` (in `vfs_ops.rs`) uses this to * remember to update the file's xattr before the first write. */ bool first_write_in_transaction; /* * When this flag is true, the VFS should attempt to * synchronously flush all spooled replication data when * closing the file. */ bool flush_on_close; }; static_assert(sizeof(struct sqlite3_file) == sizeof(void *), "vfs_ops.rs assumes sqlite3_file consists of one vtable pointer."); static_assert(sizeof(dev_t) == sizeof(uint64_t), "vfs_ops.rs assumes dev_t as a u64."); static_assert(sizeof(ino_t) == sizeof(uint64_t), "vfs_ops.rs assumes ino_t as a u64."); struct snapshot_file { sqlite3_file base; bool locked; bool auto_refresh; struct verneuil_snapshot *snapshot; }; static_assert(SQLITE_LOCK_NONE == 0, "vfs_ops.rs assumes NONE == 0"); static_assert(SQLITE_LOCK_SHARED == 1, "vfs_ops.rs assumes SHARED == 1"); static_assert(SQLITE_LOCK_RESERVED == 2, "vfs_ops.rs assumes RESERVED == 2"); static_assert(SQLITE_LOCK_PENDING == 3, "vfs_ops.rs assumes PENDING == 3"); static_assert(SQLITE_LOCK_EXCLUSIVE == 4, "vfs_ops.rs assumes EXCLUSIVE == 4"); typedef void dlfun_t(void); static int linux_open(sqlite3_vfs *, const char *name, sqlite3_file *, int flags, int *OUT_flags); static int linux_delete(sqlite3_vfs *, const char *name, int syncDir); static int linux_access(sqlite3_vfs *, const char *name, int flags, int *OUT_res); static int linux_full_pathname(sqlite3_vfs *, const char *name, int n, char *dst); static void *linux_dlopen(sqlite3_vfs *, const char *name); static void linux_dlerror(sqlite3_vfs *, int n, char *OUT_error); static dlfun_t *linux_dlsym(sqlite3_vfs *, void *, const char *symbol); static void linux_dlclose(sqlite3_vfs *, void *); static int linux_randomness(sqlite3_vfs *, int n, char *dst); static int linux_sleep(sqlite3_vfs *, int microseconds); static int linux_get_last_error(sqlite3_vfs *, int n, char *OUT_error); static int linux_current_time_int64(sqlite3_vfs *, sqlite3_int64 *); static int linux_set_syscall(sqlite3_vfs *, const char *, sqlite3_syscall_ptr); static sqlite3_syscall_ptr linux_get_syscall(sqlite3_vfs *, const char *); static const char *linux_next_syscall(sqlite3_vfs *, const char *); static int linux_file_check_reserved_lock(sqlite3_file *, int *OUT_result); static int linux_file_control(sqlite3_file *, int op, void *arg); static int linux_file_sector_size(sqlite3_file *); static int linux_file_device_characteristics(sqlite3_file *); static int snapshot_open(sqlite3_vfs *, const char *name, sqlite3_file *, int flags, int *OUT_flags); static int snapshot_full_pathname(sqlite3_vfs *, const char *name, int n, char *dst); static int snapshot_check_reserved_lock(sqlite3_file *, int *OUT_result); static int snapshot_file_control(sqlite3_file *, int op, void *arg); static int snapshot_device_characteristics(sqlite3_file *); /* * The directory for temporary files. It is lazily computed once, * and then cached. The first value published to this variable * is sticky: all writes must compare-and-swap with NULL. */ static const char *_Atomic linux_vfs_tempdir = NULL; /* * We use this vtable of IO methods for files that do not require * write tracking for replication, i.e., everything but main DB files. */ static const struct sqlite3_io_methods verneuil_io_methods = { .iVersion = 1, /* No WAL or mmap method */ .xClose = verneuil__file_close_impl, .xRead = verneuil__file_read_impl, .xWrite = verneuil__file_write_impl, .xTruncate = verneuil__file_truncate_impl, .xSync = verneuil__file_sync_impl, .xFileSize = verneuil__file_size_impl, .xLock = verneuil__file_lock_impl, .xUnlock = verneuil__file_unlock_impl, .xCheckReservedLock = linux_file_check_reserved_lock, .xFileControl = linux_file_control, .xSectorSize = linux_file_sector_size, .xDeviceCharacteristics = linux_file_device_characteristics, }; /* * We use this vtable for main DB files, to let Rust code intercept * I/O. */ static const struct sqlite3_io_methods verneuil_intercept_io_methods = { .iVersion = 1, /* No WAL or mmap method */ .xClose = verneuil__file_close, .xRead = verneuil__file_read, .xWrite = verneuil__file_write, .xTruncate = verneuil__file_truncate, .xSync = verneuil__file_sync, .xFileSize = verneuil__file_size, .xLock = verneuil__file_lock, .xUnlock = verneuil__file_unlock, .xCheckReservedLock = linux_file_check_reserved_lock, .xFileControl = linux_file_control, .xSectorSize = linux_file_sector_size, .xDeviceCharacteristics = linux_file_device_characteristics, }; /* * This vtable handles read-only snapshot files, where writes always fail, * locking always succeeds, and nearly everything is serviced by Rust code. */ static const struct sqlite3_io_methods verneuil_snapshot_io_methods = { .iVersion = 1, /* No WAL or mmap method */ .xClose = verneuil__snapshot_close, .xRead = verneuil__snapshot_read, .xWrite = verneuil__snapshot_write, .xTruncate = verneuil__snapshot_truncate, .xSync = verneuil__snapshot_sync, .xFileSize = verneuil__snapshot_size, .xLock = verneuil__snapshot_lock, .xUnlock = verneuil__snapshot_unlock, .xCheckReservedLock = snapshot_check_reserved_lock, .xFileControl = snapshot_file_control, .xSectorSize = linux_file_sector_size, .xDeviceCharacteristics = snapshot_device_characteristics, }; static sqlite3_vfs verneuil_vfs = { .iVersion = 3, .szOsFile = sizeof(struct linux_file), .mxPathname = PATH_MAX, .zName = "verneuil", .xOpen = linux_open, .xDelete = linux_delete, .xAccess = linux_access, .xFullPathname = linux_full_pathname, .xDlOpen = linux_dlopen, .xDlError = linux_dlerror, .xDlSym = linux_dlsym, .xDlClose = linux_dlclose, .xRandomness = linux_randomness, .xSleep = linux_sleep, /* CurrentTime isn't used when CurrentTimeInt64 is available. */ .xGetLastError = linux_get_last_error, .xCurrentTimeInt64 = linux_current_time_int64, /* * Parts of the test suite requires these methods to exist, * although they don't have to actually do anything. */ .xSetSystemCall = linux_set_syscall, .xGetSystemCall = linux_get_syscall, .xNextSystemCall = linux_next_syscall, }; static sqlite3_vfs verneuil_snapshot_vfs = { .iVersion = 3, /* * The VFS uses snapshot_file for the main DB file, and * linux_file for everything else. */ .szOsFile = sizeof(struct linux_file), .mxPathname = PATH_MAX, .zName = "verneuil_snapshot", .xOpen = snapshot_open, .xDelete = linux_delete, .xAccess = linux_access, .xFullPathname = snapshot_full_pathname, .xDlOpen = linux_dlopen, .xDlError = linux_dlerror, .xDlSym = linux_dlsym, .xDlClose = linux_dlclose, .xRandomness = linux_randomness, .xSleep = linux_sleep, /* CurrentTime isn't used when CurrentTimeInt64 is available. */ .xGetLastError = linux_get_last_error, .xCurrentTimeInt64 = linux_current_time_int64, }; static_assert(sizeof(struct linux_file) >= sizeof(struct snapshot_file), "verneuil_snapshot_open assumes a snapshot_file fits in a linux_file"); /** * Is `dir` currently a valid temporary directory? */ static bool is_valid_tempdir(const char *dir) { struct stat sb; int r; /* Is the path too long? */ if (strnlen(dir, LINUX_VFS_TEMPPATH_MAX) >= LINUX_VFS_TEMPPATH_MAX) return false; /* Is it a directory? */ do { r = stat(dir, &sb); } while (r != 0 && errno == EINTR); if (r != 0 || S_ISDIR(sb.st_mode) == 0) return false; /* Do we have read and write access to it? */ do { r = access(dir, W_OK | X_OK); } while (r != 0 && errno == EINTR); return r == 0; } static int verneuil_set_tempdir(const char *dir) { static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; /* Reserve room for the file name itself. */ static char tempdir_copy[LINUX_VFS_TEMPPATH_MAX]; const char *expected; int r; bool success = false; if (is_valid_tempdir(dir) == false) return SQLITE_MISUSE; r = pthread_mutex_lock(&lock); if (r != 0) return SQLITE_INTERNAL; /* tempdir already computed. */ if (atomic_load(&linux_vfs_tempdir) != NULL) goto out; /* * This strncpy is safe because `tempdir_copy` must be empty: * no other thread may be in the current critical section, and * we ensure `linux_vfs_tempdir` is non-NULL before releasing * the lock. If we get here, it's the first call to * `sqlite3_linux_vfs_set_tempdir`. */ strncpy(tempdir_copy, dir, sizeof(tempdir_copy)); tempdir_copy[sizeof(tempdir_copy) - 1] = '\0'; expected = NULL; success = atomic_compare_exchange_strong(&linux_vfs_tempdir, &expected, tempdir_copy); out: r = pthread_mutex_unlock(&lock); assert(r == 0 && "pthread_mutex_unlock failed"); return (success == true) ? SQLITE_OK : SQLITE_LOCKED; } char *sqlite3_temp_directory __attribute__((weak)); static const char * compute_tempdir(void) { const char *dirs[] = { /* Use the same priority as the Unix VFS. */ sqlite3_temp_directory, /* * We call `getenv` the first time we compute the * temporary directory, and then cache the result. * That's not safe in there are calls to `setenv`, but * there's essentially nothing we can do to fix that; * the environment should be considered static. * * POSIX also allows the implementation to reuse a * mutable buffer for the return value of `getenv`. * We assume no libc is that broken. */ NULL, NULL, "/var/tmp", "/usr/tmp", "/tmp", ".", }; #ifdef _GNU_SOURCE /* * When GNU extensions are available, use secure_getenv to * ignore the environment in suid binaries: we don't want * untrusted user to control where our writes go. */ dirs[1] = secure_getenv("SQLITE_TMPDIR"); dirs[2] = secure_getenv("TMPDIR"); #else dirs[1] = getenv("SQLITE_TMPDIR"); dirs[2] = getenv("TMPDIR"); #endif for (size_t i = 0; i < sizeof(dirs) / sizeof(dirs[0]); i++) { const char *dir = dirs[i]; if (dir != NULL && is_valid_tempdir(dir)) return dir; } /* If nothing works, default to "/tmp" */ return "/tmp"; } static const char * get_tempdir_base(void) { const char *copy; const char *computed; copy = atomic_load_explicit(&linux_vfs_tempdir, memory_order_relaxed); if (copy != NULL) return copy; computed = compute_tempdir(); if (atomic_compare_exchange_strong(&linux_vfs_tempdir, ©, computed) == true) return computed; return copy; } /** * Ensures that `fd >= SQLITE_MINIMUM_FILE_DESCRIPTOR`, or * returns a negative value (failure). * * If `fd < 0`, it is immediately returned. */ static int linux_ensure_high_fd(int fd) { int err, fd2; if (fd < 0 || fd >= SQLITE_MINIMUM_FILE_DESCRIPTOR) return fd; /* * This FD is too low. Dup it up higher. */ do { fd2 = fcntl(fd, F_DUPFD_CLOEXEC, SQLITE_MINIMUM_FILE_DESCRIPTOR); } while (fd2 < 0 && errno == EINTR); err = errno; close(fd); errno = err; return fd2; } /** * Wraps open(2) to retry on EINTR and avoid returning file * descriptors below `SQLITE_MINIMUM_FILE_DESCRIPTOR` (usually 3, * to avoid stdin/stdout/stderr). */ static int linux_safe_open(const char *path, int flags, mode_t mode) { int fd; do { fd = open(path, flags | O_CLOEXEC, mode); } while (fd < 0 && errno == EINTR); fd = linux_ensure_high_fd(fd); assert(fd < 0 || fd >= SQLITE_MINIMUM_FILE_DESCRIPTOR); return fd; } static bool linux_path_exists(const char *path) { int r; do { r = access(path, F_OK); } while (r != 0 && errno == EINTR); return r == 0; } static int linux_open(sqlite3_vfs *vfs, const char *name, sqlite3_file *vfile, int flags, int *OUT_flags) { struct linux_file *file = (void *)vfile; const struct sqlite3_io_methods *io_methods; const int journal_mask = SQLITE_OPEN_SUPER_JOURNAL | SQLITE_OPEN_MAIN_JOURNAL | SQLITE_OPEN_WAL; int open_flags = O_CLOEXEC | O_LARGEFILE | O_NOFOLLOW; int fd, rc; const bool is_uri = (flags & SQLITE_OPEN_URI) != 0; (void)vfs; /* * Intercept IO calls on main DB files. These DB files should * have a name and be persistent, but if we are ever asked to * open such temporary main DBs, don't intercept their IO. * * When locking is disabled we can't tell what's a good time * to snapshot; there's also no change to track for immutable * files. In both cases, we don't want to intercept IO. */ if ((flags & SQLITE_OPEN_MAIN_DB) != 0 && name != NULL && (flags & SQLITE_OPEN_DELETEONCLOSE) == 0 && sqlite3_uri_boolean(is_uri ? name : NULL, "nolock", 0) == 0 && sqlite3_uri_boolean(is_uri ? name : NULL, "immutable", 0) == 0) { io_methods = &verneuil_intercept_io_methods; } else { io_methods = &verneuil_io_methods; } /* * It is safe to borrow `name` for the lifetime of the `file`. * https://www.sqlite.org/c3ref/vfs.html says * * SQLite further guarantees that the string will be valid * and unchanged until xClose() is called. Because of the * previous sentence, the sqlite3_file can safely store a * pointer to the filename if it needs to remember the * filename for some reason. */ *file = (struct linux_file) { .base.pMethods = io_methods, .fd = -1, .path = name, /* * If we're creating a new journal file, test code * expects the parent directory to be fsynced the * first time the journal itself is synced. We don't * do that, but we do want to increment the sync * counter in tests. */ .dirsync_pending = (flags & SQLITE_OPEN_CREATE) != 0 && (flags & journal_mask) != 0, }; if ((flags & SQLITE_OPEN_READONLY) != 0) open_flags |= O_RDONLY; if ((flags & SQLITE_OPEN_READWRITE) != 0) open_flags |= O_RDWR; /* * name == NULL means a temporary file, and implies * delete-on-close. The default unix VFS implements that flag * latter as open + unlink; let's skip that intermediate step * and get an `O_TMPFILE` in `get_tempdir_base()`. */ if (name == NULL || (flags & SQLITE_OPEN_DELETEONCLOSE) != 0) { const char *base; base = get_tempdir_base(); file->path = NULL; fd = linux_safe_open(base, O_TMPFILE | O_EXCL | open_flags, 0600); if (fd < 0 && errno == EACCES) { rc = SQLITE_READONLY_DIRECTORY; goto fail; } /* * Some filesystems do not support O_TMPFILE. Try a * minimalistic mkstemp-based fallback. If anything * goes wrong, propagate the EOPNOTSUPP error: we * don't really care too much about such filesystems. */ if (fd < 0 && errno == EOPNOTSUPP) { char *path; if (asprintf(&path, "%s/verneuil.XXXXXX", base) < 0) { rc = SQLITE_CANTOPEN_NOTEMPDIR; goto fail; } fd = mkostemp(path, O_CLOEXEC | O_LARGEFILE); if (fd >= 0) (void)unlink(path); free(path); fd = linux_ensure_high_fd(fd); if (fd < 0) { rc = SQLITE_CANTOPEN_NOTEMPDIR; goto fail; } } } else { if ((flags & SQLITE_OPEN_CREATE) != 0) open_flags |= O_CREAT; if ((flags & SQLITE_OPEN_EXCLUSIVE) != 0) open_flags |= O_EXCL; /* * TODO: we should ideally inherit the permission and * ownership of the main DB when opening journals. * * In most cases however, it's fine to just use the * the default (0644), and let umask do its thing. */ fd = linux_safe_open(name, open_flags, SQLITE_DEFAULT_FILE_PERMISSIONS); /* Bail early if we found a directory. */ if (fd < 0 && errno == EISDIR) { rc = SQLITE_CANTOPEN; goto fail; } /* * If we failed to create a new file that does not * already exist, report a read-only parent directory. */ if (fd < 0 && (flags & SQLITE_OPEN_CREATE) != 0 && errno == EACCES && linux_path_exists(name) == false) { rc = SQLITE_READONLY_DIRECTORY; goto fail; } /* * Try again in read-only mode. */ if (fd < 0 && (flags & SQLITE_OPEN_READWRITE) != 0) { flags &= ~(SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE); flags |= SQLITE_OPEN_READONLY; open_flags &= ~(O_RDWR | O_CREAT); open_flags |= O_RDONLY; fd = linux_safe_open(name, open_flags, 0); } } if (fd < 0) { rc = SQLITE_CANTOPEN; goto fail; } #ifdef SQLITE_TEST extern int sqlite3_open_file_count; sqlite3_open_file_count++; #endif file->fd = fd; /* * fstat the file to remember its current identity. */ if (name != NULL) { struct stat sb; int r; do { r = fstat(fd, &sb); } while (r != 0 && errno == EINTR); if (r != 0) { verneuil__file_close_impl(vfile); return SQLITE_IOERR_FSTAT; } file->device = sb.st_dev; file->inode = sb.st_ino; } /* * If we're intercepting this file's IO in Rust, run the * post-open logic. */ if (io_methods == &verneuil_intercept_io_methods) { rc = verneuil__file_post_open(file); if (rc != SQLITE_OK) { verneuil__file_close((struct sqlite3_file *)file); goto fail; } } if (OUT_flags != NULL) *OUT_flags = flags; return SQLITE_OK; fail: *file = (struct linux_file) { .fd = -1 }; return rc; } static int snapshot_open(sqlite3_vfs *vfs, const char *name, sqlite3_file *vfile, int flags, int *OUT_flags) { struct snapshot_file *file = (void *)vfile; int rc; (void)vfs; /* Snapshots don't have journals. */ if ((flags & SQLITE_OPEN_MAIN_JOURNAL) != 0) { errno = ENOENT; return SQLITE_CANTOPEN; } /* * Open a regular linux file unless the target is a persistent * main db file. */ if ((flags & SQLITE_OPEN_MAIN_DB) == 0 || name == NULL || (flags & SQLITE_OPEN_DELETEONCLOSE) != 0) { return linux_open(vfs, name, vfile, flags, OUT_flags); } /* * Simulate a read-only file. */ flags &= ~(SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE); flags |= SQLITE_OPEN_READONLY; *file = (struct snapshot_file) { .base.pMethods = &verneuil_snapshot_io_methods, }; rc = verneuil__snapshot_open(file, name); if (rc == SQLITE_OK && OUT_flags != NULL) *OUT_flags = flags; return rc; } static int linux_delete(sqlite3_vfs *vfs, const char *name, int sync_dir) { int r; (void)vfs; /* * We never fsync the parent directory (i.e., assume * `SQLITE_DISABLE_DIRSYNC` is always set). In the worst * case, this means a rollback journal could remain visible if * the OS crashes soon after a transaction commit. If that * happens, that last transaction may be lost. However, while * we take a hit on durability, the db will always be valid. * * We're willing to take that risk, for simpler code. */ (void)sync_dir; #ifndef SQLITE_DISABLE_DIRSYNC # warning "The Linux VFS assumes DIRSYNC is disabled." #endif do { r = unlink(name); } while (r != 0 && errno == EINTR); if (r != 0) { if (errno == ENOENT) return SQLITE_IOERR_DELETE_NOENT; return SQLITE_IOERR_DELETE; } #ifdef SQLITE_TEST /* * Some tests assert on the number of sync calls. Update that * counter as expected, even if we don't actually fsync. */ if ((sync_dir & 1) != 0) { extern int sqlite3_sync_count; sqlite3_sync_count++; } #endif return SQLITE_OK; } static int linux_access(sqlite3_vfs *vfs, const char *name, int flags, int *OUT_res) { int access_mode = F_OK; int r; (void)vfs; /* * The Unix VFS says a file exists if it's not a regular file, * or it's non-empty... Replicate that logic here. */ if (flags == SQLITE_ACCESS_EXISTS) { struct stat sb; do { r = stat(name, &sb); } while (r != 0 && errno == EINTR); /* If we can't stat the file, assume it doesn't exist. */ if (r != 0) { *OUT_res = 0; return SQLITE_OK; } if (S_ISREG(sb.st_mode) != 0) { *OUT_res = 1; return SQLITE_OK; } *OUT_res = (sb.st_size > 0) ? 1 : 0; return SQLITE_OK; } if (flags == SQLITE_ACCESS_READWRITE) { access_mode = W_OK | R_OK; } else if (flags == SQLITE_ACCESS_READ) { access_mode = R_OK; } do { r = access(name, access_mode); } while (r != 0 && errno == EINTR); *OUT_res = (r == 0) ? 1 : 0; return SQLITE_OK; } /** * Determines whether `name` is a symlink. * * @returns true on success, false on failure; */ static bool check_if_symlink(const char *name, bool *OUT_is_symlink) { struct stat sb; int r; *OUT_is_symlink = false; do { r = lstat(name, &sb); } while (r != 0 && errno == EINTR); if (r != 0) { /* * If the path doesn't exist yet, it's clearly not a * symlink. */ return errno == ENOENT; } *OUT_is_symlink = S_ISLNK(sb.st_mode) != 0; return true; } /** * Attempts to copy the result of `realpath` to dst. * * @returns true on success, false on failure. */ static bool resolve_path(const char *name, int n, char *dst) { char buf[PATH_MAX]; const char *resolved; size_t resolved_size; resolved = realpath(name, buf); if (resolved == NULL) return false; resolved_size = strlen(resolved) + 1; if ((int)resolved_size > n) return false; memcpy(dst, resolved, resolved_size); return true; } /** * Iteratively constructs the symlinked path for `link` in `dst`, * until `dst` is not a symlink. * * @returns true on success, false on failure. */ static bool walk_symlink(const char *link, char *dst, size_t n) { #ifdef SQLITE_MAX_SYMLINKS const size_t limit = SQLITE_MAX_SYMLINKS; #else const size_t limit = 100; #endif /* Copy `link` to `dst` */ { size_t link_len; link_len = strnlen(link, n); if (link_len >= n) return false; memcpy(dst, link, link_len + 1); } /* * Expand the last component of `dst` in place. */ for (size_t i = 0; i <= limit; i++) { char target[PATH_MAX]; char *last_slash; ssize_t r; bool is_symlink; /* * If we don't have a symlink, there's nothing else to * do. Also ignore all errors: either `resolve_parent_path` * will handle it, or `open(2)` will fail. */ if (check_if_symlink(dst, &is_symlink) == false || is_symlink == false) return true; do { r = readlink(dst, target, sizeof(target)); } while (r < 0 && errno == EINTR); if ((size_t)r >= sizeof(target)) return false; target[r] = '\0'; last_slash = strrchr(dst, '/'); if (last_slash == NULL) { if ((size_t)r >= n) return false; memcpy(dst, target, r + 1); } else { size_t remaining = (dst + n) - (last_slash + 1); if ((size_t)r > remaining) return false; memcpy(last_slash + 1, target, r + 1); } } /* * If we hit the iteration limit, that was too many symlinks, * and we should fail. */ return false; } /** * Find the real path for a file that might not exist yet, by * resolving only the parent directory, and appending the file to it. * The `name` string will be modified in place. * * This will fail to do something useful if the parent directory also * does not exist, but that's fine: that means there can't be funny * symlink shenanigans, and the open call will fail anyway. * * @return true on success, false on failure; */ static bool resolve_parent_path(char *name, int n_dst, char *dst) { char resolved_dir[PATH_MAX]; const char *parent; const char *filename; const char *resolved; int r; /* * Split `name` in a parent directory and a trailing filename. */ { char *last_slash; last_slash = strrchr(name, '/'); if (last_slash == NULL) { parent = "."; filename = name; } else { filename = last_slash + 1; /* While the last two chars are "//", go back one. */ while (last_slash > name + 1 && last_slash[-1] == '/') last_slash--; *last_slash = '\0'; parent = name; } } resolved = realpath(parent, resolved_dir); if (resolved == NULL) { /* * If some component is missing, we want to use the * parent directory as is, and let the open(2) call deal * with any problem. Similarly for EACCES, which * realpath can also return if we lack read * permissions (that aren't needed for open(2)). */ if (errno != ENOENT && errno != EACCES) return false; resolved = parent; } if (strcmp(resolved, "/") == 0) { r = snprintf(dst, n_dst, "/%s", filename); } else { r = snprintf(dst, n_dst, "%s/%s", resolved, filename); } return r >= 0 && r < n_dst; } /** * sqlite uses xFullPathName to figure out where a database file * actually lives: journal files are created in the same directory. * * This name-based scheme also means that opening DB files through * symlinks can be a bad idea; that's why we must return * SQLITE_OK_SYMLINK when we successfully construct the real path for * `name`, but `name` is actually a symlink. */ static int linux_full_pathname(sqlite3_vfs *vfs, const char *name, int n, char *dst) { char target[PATH_MAX]; int ok; bool is_symlink; (void)vfs; if (check_if_symlink(name, &is_symlink) == false) return SQLITE_CANTOPEN; ok = (is_symlink == true) ? SQLITE_OK_SYMLINK : SQLITE_OK; if (resolve_path(name, n, dst) == true) return ok; /* * We now have to handle paths that don't exist yet, and * compute something that matches the file that would be * created if we were to open(2) through that path. */ /* First, iteratively resolve when the direct `name` is a symlink. */ if (walk_symlink(name, target, sizeof(target)) == false) return SQLITE_CANTOPEN; /* Next, try to resolve the parent directory. */ if (resolve_parent_path(target, n, dst) == false) return SQLITE_CANTOPEN; return ok; } /* * Snapshot paths can look like URIs; if they do, skip any * normalisation. We don't expect to find or process any journal file * next to the manifest file, so the full path doesn't really matter. */ static int snapshot_full_pathname(sqlite3_vfs *vfs, const char *name, int n, char *dst) { static const char *const prefixes[] = { "file://", "http://", "https://", "s3://", "verneuil://", }; for (size_t i = 0; i < sizeof(prefixes) / sizeof(prefixes[0]); i++) { const char *prefix = prefixes[i]; if (strncmp(name, prefix, strlen(prefix)) == 0) { size_t len; len = strlen(name); if (n < 0 || len >= (size_t)n) return SQLITE_CANTOPEN; memcpy(dst, name, len + 1); return SQLITE_OK; } } return linux_full_pathname(vfs, name, n, dst); } static void * linux_dlopen(sqlite3_vfs *vfs, const char *name) { (void)vfs; return dlopen(name, RTLD_NOW | RTLD_GLOBAL); } static void linux_dlerror(sqlite3_vfs *vfs, int n, char *OUT_error) { const char *err; (void)vfs; /* * The whole dlerror interface is thread-hostile. Let's hope * this is good enough for sqlite. */ err = dlerror(); if (err != NULL) snprintf(OUT_error, n, "%s", err); return; } static dlfun_t * linux_dlsym(sqlite3_vfs *vfs, void *handle, const char *symbol) { (void)vfs; return (dlfun_t *)dlsym(handle, symbol); } static void linux_dlclose(sqlite3_vfs *vfs, void *handle) { (void)vfs; dlclose(handle); return; } /* * Linux added the getrandom syscall in 3.17, but glibc only recently * added a wrapper. Define our own getrandom. */ static ssize_t getrandom_compat(void *buf, size_t buflen, unsigned int flags) { return syscall(SYS_getrandom, buf, buflen, flags); } static int linux_randomness(sqlite3_vfs *vfs, int n, char *dst) { (void)vfs; memset(dst, 0, n); #if !defined(SQLITE_TEST) && !defined(SQLITE_OMIT_RANDOMNESS) ssize_t r; do { /* * This is only called to initialise a small * (256-byte) seed, so we don't have to worry * about short reads. */ r = getrandom_compat(dst, n, /*flags=*/0); } while (r < 0 && errno == EINTR); /* * It's ok to fail silently: we zero-filled, so no UB, * and the output is only used for a non-crypto PRNG. */ #endif return n; } static int linux_sleep(sqlite3_vfs *vfs, int microseconds) { struct timespec to_sleep = { .tv_sec = microseconds / 1000000, .tv_nsec = (microseconds % 1000000) * 1000, }; (void)vfs; nanosleep(&to_sleep, NULL); return microseconds; } static int linux_get_last_error(sqlite3_vfs *vfs, int n, char *OUT_error) { (void)vfs; (void)n; (void)OUT_error; /* As of sqlite 3.35.5, only the return code is used. */ return errno; } static int linux_current_time_int64(sqlite3_vfs *vfs, sqlite3_int64 *out) { /* Offset copied from os_unix.c */ static const int64_t epoch = 24405875 * (int64_t)8640000; struct timespec now; (void)vfs; clock_gettime(CLOCK_REALTIME, &now); *out = epoch + (int64_t)1000 * now.tv_sec + now.tv_nsec / 1000000; #ifdef SQLITE_TEST extern int sqlite3_current_time; if (sqlite3_current_time != 0) *out = epoch + (int64_t)1000 * sqlite3_current_time; #endif return SQLITE_OK; } /** * We don't actually implement any syscall fault injection logic. * However, some test code fails if we don't implement the interface * at all. Expose no-op implementations to improve test coverage. */ static int linux_set_syscall(sqlite3_vfs *vfs, const char *name, sqlite3_syscall_ptr ptr) { (void)vfs; (void)ptr; /* No name -> reset. */ if (name == NULL) return SQLITE_OK; return SQLITE_NOTFOUND; } static sqlite3_syscall_ptr linux_get_syscall(sqlite3_vfs *vfs, const char *name) { (void)vfs; (void)name; return NULL; } static const char * linux_next_syscall(sqlite3_vfs *vfs, const char *name) { (void)vfs; (void)name; return NULL; } int verneuil__file_close_impl(sqlite3_file *vfile) { struct linux_file *file = (void *)vfile; if (file->fd >= 0) { /* * *never* retry close: it might fail after recycling * the file descriptor id. */ close(file->fd); #ifdef SQLITE_TEST extern int sqlite3_open_file_count; sqlite3_open_file_count--; #endif } *file = (struct linux_file) { .fd = -1 }; return SQLITE_OK; } int verneuil__file_read_impl(sqlite3_file *vfile, void *dst, int n, sqlite3_int64 off) { struct linux_file *file = (void *)vfile; while (n > 0) { ssize_t r; r = pread(file->fd, dst, n, off); if (r == 0) break; if (r > 0) { assert(r <= n); dst = (char *)dst + r; n -= r; off += r; } else if (errno != EINTR) { switch (errno) { /* Upstream converts these to CORRUPTFS. */ case ERANGE: case EIO: case ENXIO: #ifdef EDEVERR case EDEVERR: #endif return SQLITE_IOERR_CORRUPTFS; default: return SQLITE_IOERR_READ; } } } if (n == 0) return SQLITE_OK; /* * We don't return the actual read length; short reads must * instead zero-fill the remainder of the destination buffer. */ memset(dst, 0, n); return SQLITE_IOERR_SHORT_READ; } int verneuil__file_write_impl(sqlite3_file *vfile, const void *src, int n, sqlite3_int64 off) { struct linux_file *file = (void *)vfile; while (n > 0) { ssize_t r; r = pwrite(file->fd, src, n, off); /* r == 0 shouldn't happen... */ if (r >= 0) { assert(r <= n); src = (const char *)src + r; n -= r; off += r; } else if (errno != EINTR) { switch (errno) { case EDQUOT: case ENOSPC: return SQLITE_FULL; default: return SQLITE_IOERR_WRITE; } } } return SQLITE_OK; } int verneuil__file_truncate_impl(sqlite3_file *vfile, sqlite3_int64 size) { struct linux_file *file = (void *)vfile; int r; #ifdef __ANDROID__ /* * Sqlite says ftruncate() always uses 32-bit offsets on * android, and it's safe to just ignore any requests * for more than 2GB!? */ if (size > INT32_MAX) return SQLITE_OK; #endif do { r = ftruncate(file->fd, size); } while (r != 0 && errno == EINTR); if (r != 0) return SQLITE_IOERR_TRUNCATE; return SQLITE_OK; } int verneuil__file_sync_impl(sqlite3_file *vfile, int flags) { struct linux_file *file = (void *)vfile; int r; (void)flags; #ifdef SQLITE_TEST extern int sqlite3_fullsync_count; extern int sqlite3_sync_count; if ((flags & 0x0F) == SQLITE_SYNC_FULL) sqlite3_fullsync_count++; sqlite3_sync_count++; /* Fake fsync-ing the parent directory, if needed.*/ if (file->dirsync_pending == true) sqlite3_sync_count++; #endif /* * If we did implement dirsync, we would only have to do so * once after file creation. */ file->dirsync_pending = false; /* * If the file doesn't exist in a directory, it won't be * visible after a crash, so there's nothing to sync. */ if (file->path == NULL) return SQLITE_OK; do { r = fsync(file->fd); } while (r != 0 && errno == EINTR); if (r != 0) { switch (errno) { case ENOSPC: case EDQUOT: return SQLITE_FULL; default: return SQLITE_IOERR_FSYNC; } } return SQLITE_OK; } int verneuil__file_size_impl(sqlite3_file *vfile, sqlite3_int64 *OUT_size) { struct stat sb; struct linux_file *file = (void *)vfile; int r; do { r = fstat(file->fd, &sb); } while (r != 0 && errno == EINTR); if (r != 0) return SQLITE_IOERR_FSTAT; *OUT_size = sb.st_size; return SQLITE_OK; } /** * The Linux VFS replicates the default Unix VFS's locking scheme. * * The lock bytes start at `sqlite3PendingByte` or `0x40000000`. * * The first lock byte is the "PENDING" lock: when a writer has * acquired this lock, new read locks may not be acquired. * * The next one is the "RESERVED" lock: writers race for this lock to * determine which one can enter the write state machine. * * Finally, the next 510 bytes are for "SHARED" (read) locks. Sqlite * uses this range to allow concurrent readers on systems that do not * support shared file locks. We always acquire the whole range. */ static inline off_t linux_file_pending_lock_offset(void) { return 0x40000000; } static inline off_t linux_file_reserved_lock_offset(void) { return linux_file_pending_lock_offset() + 1; } static inline off_t linux_file_shared_lock_offset(void) { return linux_file_pending_lock_offset() + 2; } static const size_t linux_file_shared_lock_size = 510; /* * The locks all live in a 512-byte region starting at * `linux_file_pending_lock_offset()`. */ static const size_t linux_file_all_lock_size = 512; static uint64_t now_ms_boottime(void) { struct timespec now; clock_gettime(CLOCK_BOOTTIME, &now); return 1000 * (uint64_t)now.tv_sec + (now.tv_nsec / 1000000); } /** * Sleeps for a few milliseconds, or until `end_ms`. * * @param num_attempts the number of times we have already slept * while trying to acquire the same lock. * Returns false if the current time is past the `end_ms` deadline. */ static bool sleep_until_at_most(uint64_t num_attempts, uint64_t end_ms) { struct timespec to_sleep = { .tv_sec = 0 }; uint64_t now; double random_value; double max_sleep_ms; double sleep_ms; now = now_ms_boottime(); if (now >= end_ms) return false; { uint64_t random_bits; sqlite3_randomness(sizeof(random_bits), &random_bits); random_value = (1.0 / UINT64_MAX) * random_bits; } /* * Use exponential backoff to set the maximum sleep value, * from an initial value 0.1 milliseconds up to 10 * milliseconds. * * We use `num_attempts / 4` as our backoff exponent to * approximate a smoother base factor of ~1.2x instead of 2x * per attempt. This asymptotically guarantees that, if the * lock is held continuously until we acquire it, we will * sleep at most ~50% (modulo jitter) longer than the time * the lock was actually unavailable to us. */ if (num_attempts < 7 * 4) { max_sleep_ms = 0.1 * (1UL << (num_attempts / 4)); } else { max_sleep_ms = 10.0; } if (max_sleep_ms > end_ms - now) max_sleep_ms = end_ms - now; /* And jitter uniformly within that limit. */ sleep_ms = random_value * max_sleep_ms; to_sleep.tv_nsec = 1e6 * sleep_ms + 0.5; nanosleep(&to_sleep, NULL); return true; } static int linux_flock_op(const struct linux_file *file, int op, struct flock *fl, int default_error) { uint64_t begin, end; uint64_t num_sleep = 0; uint32_t timeout = file->lock_timeout_ms; int r; begin = now_ms_boottime(); end = begin + timeout; do { r = fcntl(file->fd, op, fl); /* * If we failed to acquire the lock, and there's a * non-zero timeout, sleep for a bit at most 20 times * per timeout millisecond: this limit guarantees that * we won't sleep forever in case the clock isn't * monotonic. * * The initial sleeps are expected to be ~0.05 ms on * average and start ramping up after the 4th, so * allowing 20 sleeps per millisecond of timeout means * that we will definitely not stop due to this sleep * *count* limit unless time goes wonky or our calls * to nanosleep are interrupted very frequently. */ if (r < 0 && num_sleep / 20 < timeout && (errno == EACCES || errno == EAGAIN)) { if (sleep_until_at_most(num_sleep, end) == false) { errno = EACCES; break; } num_sleep++; /* * Pretend we were just interrupted after * sleeping for a bit, to let the do / while * loop try again. */ errno = EINTR; } } while (r < 0 && errno == EINTR); if (r != 0) { switch (errno) { case EACCES: case EAGAIN: case ETIMEDOUT: case EBUSY: case EINTR: case ENOLCK: return SQLITE_BUSY; case EPERM: return SQLITE_PERM; default: return default_error; } } return SQLITE_OK; } static int acquire_shared_lock(struct linux_file *file) { struct flock fl = { .l_type = F_RDLCK, .l_whence = SEEK_SET, .l_start = linux_file_pending_lock_offset(), .l_len = 1, }; int r; /* * Before acquiring a shared lock, we must make sure that the * PENDING byte is free. It's OK if the byte is then taken * while we try to grab a read lock on the shared lock range: * the PENDING byte only exists to protect against starvation, * when new readers keep preventing the writer from acquiring * the shared range for writes. * * Once the writer has acquired the PENDING byte, only a * bounded number of readers may have already observed it as * free, but not acquired the shared range for reads yet. * A race here cannot starve the writer forever. */ r = linux_flock_op(file, F_OFD_GETLK, &fl, SQLITE_IOERR_LOCK); if (r != SQLITE_OK) return r; if (fl.l_type != F_UNLCK) return SQLITE_BUSY; /* Now, we can try to acquire the shared lock range for reads. */ r = linux_flock_op(file, F_OFD_SETLK, &(struct flock) { .l_type = F_RDLCK, .l_whence = SEEK_SET, .l_start = linux_file_shared_lock_offset(), .l_len = linux_file_shared_lock_size, }, SQLITE_IOERR_LOCK); if (r != SQLITE_OK) return r; file->lock_level = SQLITE_LOCK_SHARED; return SQLITE_OK; } static int acquire_reserved_lock(struct linux_file *file) { int r; r = linux_flock_op(file, F_OFD_SETLK, &(struct flock) { .l_type = F_WRLCK, .l_whence = SEEK_SET, .l_start = linux_file_reserved_lock_offset(), .l_len = 1, }, SQLITE_IOERR_LOCK); if (r != SQLITE_OK) return r; file->lock_level = SQLITE_LOCK_RESERVED; return SQLITE_OK; } static int acquire_exclusive_lock(struct linux_file *file) { int r; /* * Acquire the "intent to write" lock if necessary. * * I don't think this can happen with sqlite, but let's make * the locking code obviously correct. */ if (file->lock_level < SQLITE_LOCK_RESERVED) { r = acquire_reserved_lock(file); if (r != SQLITE_OK) return r; } /* * Before acquiring an exclusive lock, we must first acquire * the pending lock byte, to tell readers to drain out. * * This should not fail with SQLITE_BUSY, now that we own the * RESERVED byte. */ r = linux_flock_op(file, F_OFD_SETLK, &(struct flock) { .l_type = F_WRLCK, .l_whence = SEEK_SET, .l_start = linux_file_pending_lock_offset(), .l_len = 1, }, SQLITE_IOERR_LOCK); if (r != SQLITE_OK) return r; file->lock_level = SQLITE_LOCK_PENDING; /* Now, we can try to acquire the shared lock range exclusively. */ r = linux_flock_op(file, F_OFD_SETLK, &(struct flock) { .l_type = F_WRLCK, .l_whence = SEEK_SET, .l_start = linux_file_shared_lock_offset(), .l_len = linux_file_shared_lock_size, }, SQLITE_IOERR_LOCK); if (r != SQLITE_OK) return r; file->lock_level = SQLITE_LOCK_EXCLUSIVE; return SQLITE_OK; } int verneuil__file_lock_impl(sqlite3_file *vfile, int level) { struct linux_file *file = (void *)vfile; /* xLock never downgrades, and instead no-ops. */ if (file->lock_level >= level) return SQLITE_OK; switch (level) { case SQLITE_LOCK_SHARED: return acquire_shared_lock(file); case SQLITE_LOCK_RESERVED: /* We're not supposed to go from NONE to RESERVED. */ assert(file->lock_level == SQLITE_LOCK_SHARED); return acquire_reserved_lock(file); case SQLITE_LOCK_EXCLUSIVE: return acquire_exclusive_lock(file); case SQLITE_LOCK_NONE: case SQLITE_LOCK_PENDING: /* PENDING is an internal state. */ default: /* Shouldn't happen. */ return SQLITE_ERROR; } } static int release_all_locks(struct linux_file *file) { int r; r = linux_flock_op(file, F_OFD_SETLK, &(struct flock) { .l_type = F_UNLCK, .l_whence = SEEK_SET, .l_start = linux_file_pending_lock_offset(), .l_len = linux_file_all_lock_size, }, SQLITE_IOERR_UNLOCK); if (r != SQLITE_OK) return r; file->lock_level = SQLITE_LOCK_NONE; return SQLITE_OK; } static int downgrade_write_lock_to_shared(struct linux_file *file) { int r; /* * Start by converting all our locks to shared. */ r = linux_flock_op(file, F_OFD_SETLK, &(struct flock) { .l_type = F_RDLCK, .l_whence = SEEK_SET, .l_start = linux_file_pending_lock_offset(), .l_len = linux_file_all_lock_size, }, SQLITE_IOERR_UNLOCK); /* * Downgrades should not fail with SQLITE_BUSY (that can only * happen if another process locks the file with an * incompatible scheme). If they do, the unix VFS says we * should instead return IOERR_RDLOCK to avoid asserts. */ if (r == SQLITE_BUSY) return SQLITE_IOERR_RDLOCK; if (r != SQLITE_OK) return r; /* * At this point, we definitely don't have an exclusive lock, * and new read locks can be acquired. * * If the next step fails, other writers will not be able to * make progress until we release all our locks, but they * would have been blocked when upgrading from RESERVED to * EXCLUSIVE anyway. */ file->lock_level = SQLITE_LOCK_SHARED; /* * And now release the (shared) reserved and pending lock: we * may not actually own the pending lock, but, given that we * do hold the reserved lock, no one can. */ r = linux_flock_op(file, F_OFD_SETLK, &(struct flock) { .l_type = F_UNLCK, .l_whence = SEEK_SET, .l_start = linux_file_pending_lock_offset(), .l_len = 2, }, SQLITE_IOERR_UNLOCK); if (r != SQLITE_OK) return SQLITE_IOERR_RDLOCK; return SQLITE_OK; } int verneuil__file_unlock_impl(sqlite3_file *vfile, int level) { struct linux_file *file = (void *)vfile; /* xUnlock never upgrades, and instead no-ops. */ if (file->lock_level <= level) return SQLITE_OK; switch (level) { case SQLITE_LOCK_NONE: return release_all_locks(file); case SQLITE_LOCK_SHARED: return downgrade_write_lock_to_shared(file); case SQLITE_LOCK_RESERVED: case SQLITE_LOCK_EXCLUSIVE: case SQLITE_LOCK_PENDING: /* PENDING is an internal state. */ default: /* Shouldn't happen. */ return SQLITE_ERROR; } } static int linux_file_check_reserved_lock(sqlite3_file *vfile, int *OUT_result) { struct flock fl = { /* The reserved byte is only really acquired for writes. */ .l_type = F_RDLCK, .l_whence = SEEK_SET, .l_start = linux_file_reserved_lock_offset(), .l_len = 1, }; struct linux_file *file = (void *)vfile; int r; *OUT_result = 0; /* * This fcntl should say "yes the RESERVED byte lock is taken" * even when it's owned by the current `file`. OFD locks * can't tell us that (a file description can always override * its own locks), so we must first look at the file's current * lock level. */ if (file->lock_level >= SQLITE_LOCK_RESERVED) { *OUT_result = 1; return SQLITE_OK; } r = linux_flock_op(file, F_OFD_GETLK, &fl, SQLITE_IOERR_CHECKRESERVEDLOCK); if (r != SQLITE_OK) return r; *OUT_result = (fl.l_type == F_UNLCK) ? 0 : 1; return SQLITE_OK; } static int snapshot_check_reserved_lock(sqlite3_file *vfile, int *OUT_result) { (void)vfile; /* * The reserved lock is always taken: we will never let sqlite * acquire the write lock on a snapshot. */ *OUT_result = 1; return SQLITE_OK; } static bool linux_file_has_moved(const struct linux_file *file) { struct stat sb; const char *path = file->path; int r; /* * Assume temporary files don't move: we ideally don't even * want them to have a name. */ if (path == NULL) return false; do { r = stat(path, &sb); } while (r != 0 && errno == EINTR); /* Assume this means trouble. */ if (r != 0) return true; return sb.st_dev != file->device || sb.st_ino != file->inode; } static int linux_tempfilename(char **dst) { static _Atomic uint64_t counter = 0; struct timespec now = { 0 }; uint64_t noise[2] = { 0 }; uint64_t unique; ssize_t r; int pid; clock_gettime(CLOCK_REALTIME, &now); do { r = getrandom_compat(noise, sizeof(noise), 0); } while (r <= 0 && errno == EINTR); unique = atomic_fetch_add(&counter, 1); pid = getpid(); static_assert( LINUX_VFS_TEMPPATH_MAX + sizeof(SQLITE_TEMP_FILE_PREFIX) /* the harcoded string pattern is < 100 chars. */ + 100 /* pid fits in 10 chars. */ + 10 /* time and counter fit in 20 chars each. */ + 2 * 20 /* finally, the two random u64 are 16 chars each. */ + 2 * 16 < PATH_MAX, "TEMPPATH_MAX + suffix must fit in PATH_MAX"); *dst = sqlite3_mprintf( "%s/"SQLITE_TEMP_FILE_PREFIX"linux_vfs.pid=%i" ".time=%lld.counter=%"PRIu64".rand=%016"PRIx64"%016"PRIx64".tmp", get_tempdir_base(), pid, (long long)now.tv_sec, unique, noise[0], noise[1]); if (*dst == NULL) return SQLITE_NOMEM; return SQLITE_OK; } /** * Converts a string parameter to a truth value: * * If the string is empty, returns the default value. * Otherwise, returns false iff the string's first character * is '0', 'f'(alse), or 'n'(o). */ static bool parse_bool_param(const char *param, bool default_value) { if (param == NULL) return default_value; /* * https://www.sqlite.org/pragma.html says boolean parameters * map "0", "no", "false", and "off" to false. Extend the * classic switch on the first character to also handle "off". */ switch (param[0]) { case '0': case 'f': case 'F': case 'n': case 'N': return false; default: if (strcasecmp(param, "off") == 0) return false; return true; } } static int linux_file_control(sqlite3_file *vfile, int op, void *arg) { struct linux_file *file = (void *)vfile; switch (op) { /* Advisory fcntl used in tests. */ case SQLITE_FCNTL_CHUNK_SIZE: return SQLITE_OK; case SQLITE_FCNTL_VFSNAME: { #ifdef TEST_VFS /* * Pretend we're "unix" in tests, to avoid * accidentally losing coverage. */ const char *vfsname = "unix"; #else const char *vfsname = "linux"; #endif *(char**)arg = sqlite3_mprintf("%s", vfsname); return SQLITE_OK; } case SQLITE_FCNTL_HAS_MOVED: *(int *)arg = linux_file_has_moved(file) ? 1 : 0; return SQLITE_OK; case SQLITE_FCNTL_TEMPFILENAME: return linux_tempfilename(arg); case SQLITE_FCNTL_PRAGMA: { char **argv = arg; char **dst = &argv[0]; const char *pragma = argv[1]; const char *param = argv[2]; if (strcmp(pragma, "verneuil_flush_replication_data") == 0) { bool ret; if (param == NULL || strcmp(param, "now") == 0 || strcmp(param, "force") == 0 || strcmp(param, "2") == 0) { ret = verneuil__file_flush_replication_data(file) == 0; } else { bool value; value = parse_bool_param(param, true); ret = file->flush_on_close; file->flush_on_close = value; } *dst = sqlite3_mprintf("%s", ret ? "1" : "0"); return SQLITE_OK; } return SQLITE_NOTFOUND; } /* These are used in tests, and should be implemented. */ case SQLITE_FCNTL_LOCKSTATE: *(int *)arg = file->lock_level; return SQLITE_OK; case SQLITE_FCNTL_LOCK_TIMEOUT: { uint32_t old = file->lock_timeout_ms; file->lock_timeout_ms = *(uint32_t *)arg; *(uint32_t *)arg = old; return SQLITE_OK; } default: return SQLITE_NOTFOUND; } } static int snapshot_refresh(sqlite3_file *vfile, char **dst, uint32_t force_level) { struct snapshot_file *file = (void *)vfile; struct timestamp updated = { 0 }; const char *result; size_t len; if (file->locked == true) { *dst = sqlite3_mprintf( "verneuil snapshot may not be %srefreshed within a transaction", (force_level > 1 ? "force " : "")); return SQLITE_LOCKED; } result = verneuil__snapshot_refresh(file, &updated, &len, force_level); if (result == NULL) { *dst = sqlite3_mprintf(TIMESTAMP_FMT, TIMESTAMP_ARG(updated)); return SQLITE_OK; } *dst = sqlite3_mprintf("failed to %srefresh verneuil snapshot: %*s", (force_level > 1 ? "force " : ""), (int)len, result); return SQLITE_ERROR; } static int snapshot_file_control(sqlite3_file *vfile, int op, void *arg) { switch (op) { case SQLITE_FCNTL_VFSNAME: *(char**)arg = sqlite3_mprintf("%s", "verneuil_snapshot"); return SQLITE_OK; case SQLITE_FCNTL_HAS_MOVED: *(int *)arg = 0; return SQLITE_OK; case SQLITE_FCNTL_TEMPFILENAME: return linux_tempfilename(arg); case SQLITE_FCNTL_PRAGMA: { char **argv = arg; char **dst = &argv[0]; const char *pragma = argv[1]; const char *param = argv[2]; if (strcmp(pragma, "verneuil_snapshot_async_reload") == 0) { bool refreshed; refreshed = verneuil__snapshot_async_reload((void *)vfile); *dst = sqlite3_mprintf("%s", refreshed ? "1" : "0"); return SQLITE_OK; } if (strcmp(pragma, "verneuil_snapshot_auto_refresh") == 0) { struct snapshot_file *file = (void *)vfile; bool prev; /* No param -> get the current value. */ if (param == NULL) { prev = file->auto_refresh; } else { bool enable; enable = parse_bool_param(param, true); prev = verneuil__snapshot_auto_refresh(file, enable); } /* * https://www.sqlite.org/pragma.html says * boolean return values are often returned as * numeric 0/1; it certainly seems to be the * norm. */ *dst = sqlite3_mprintf("%s", (prev ? "1" : "0")); return SQLITE_OK; } if (strcmp(pragma, "verneuil_snapshot_ctime") == 0) { struct timestamp ctime; ctime = verneuil__snapshot_ctime((void *)vfile); *dst = sqlite3_mprintf(TIMESTAMP_FMT, TIMESTAMP_ARG(ctime)); return SQLITE_OK; } if (strcmp(pragma, "verneuil_snapshot_refresh") == 0) { uint32_t force_level; if (param == NULL) { force_level = 0; } else if (strcmp(param, "2") == 0 || strncasecmp(param, "force", strlen("force")) == 0) { force_level = 2; } else { force_level = parse_bool_param(param, false) ? 1 : 0; } return snapshot_refresh(vfile, dst, force_level); } if (strcmp(pragma, "verneuil_snapshot_updated") == 0) { struct timestamp updated; updated = verneuil__snapshot_updated((void *)vfile); *dst = sqlite3_mprintf(TIMESTAMP_FMT, TIMESTAMP_ARG(updated)); return SQLITE_OK; } return SQLITE_NOTFOUND; } default: return SQLITE_NOTFOUND; } } static int linux_file_sector_size(sqlite3_file *vfile) { enum { /* * sqlite assumes the sector size is a multiple of its * min value, 512 bytes. */ MIN_SECTOR_SIZE = 512, #ifdef PAGE_SIZE DEFAULT_SECTOR_SIZE = PAGE_SIZE, #else DEFAULT_SECTOR_SIZE = 4096, #endif }; static atomic_int cached_sector_size = 0; long sector_size; (void)vfile; /* * Our sector size value is independent of the underlying * filesystem, so we can use a process-global cache. * * Any non-zero value must be valid: the sector size * computation should be deterministic, so redundant writes * will always store the same value. */ sector_size = atomic_load_explicit(&cached_sector_size, memory_order_relaxed); if (sector_size != 0) return sector_size; /* * Let the sector size match the OS page size, if we can find * it. That's the granularity at which Linux's buffered IO * works, so should be safe. It also happens to match the * sqlite default on x86 and x86-64. */ sector_size = sysconf(_SC_PAGESIZE); if (sector_size <= 0 || sector_size > INT_MAX) sector_size = DEFAULT_SECTOR_SIZE; if ((sector_size % MIN_SECTOR_SIZE) != 0) sector_size = MIN_SECTOR_SIZE; atomic_store_explicit(&cached_sector_size, sector_size, memory_order_relaxed); return sector_size; } static int linux_file_device_characteristics(sqlite3_file *vfile) { /* * We don't have any kind of failure-atomic write guarantee on * Linux, so SQLITE_IOCAP_ATOMIC* must be off. * * We have seen what looks like files being extended with zero * bytes before storing the new data, when VMs froze while * writing to ext4 / Ceph, so `SQLITE_IOCAP_SAFE_APPEND` isn't. * * Linux will reorder buffered I/O, so * `SQLITE_IOCAP_SEQUENTIAL` is not a thing. * * Open files can be unlinked under POSIX, so * `SQLITE_IOCAP_UNDELETABLE_WHEN_OPEN` is also not a thing. * * We know data is overwritten a full OS page at a time, and * maybe worse with SSDs, so `SQLITE_IOCAP_POWERSAFE_OVERWRITE` * doesn't seem true at all... however, stock sqlite turns it * on by default because otherwise the journaling I/O is * really sucky. See https://sqlite.org/psow.html for * details. * * Although some filesystems (e.g., xfs) support fully * immutable files, we don't check for it, so let's assume * `SQLITE_IOCAP_IMMUTABLE` is false. * * Finally, we also don't implement F2FS-style batch atomic * commits, so `SQLITE_IOCAP_BATCH_ATOMIC` is false. */ (void)vfile; return SQLITE_IOCAP_POWERSAFE_OVERWRITE; } static int snapshot_device_characteristics(sqlite3_file *vfile) { (void)vfile; /* * Should we tag snapshots that will never be refreshed, * and let those return `SQLITE_IOCAP_IMMUTABLE`? */ return SQLITE_IOCAP_POWERSAFE_OVERWRITE; } int verneuil_configure_impl(const struct verneuil_options *options) { static const struct verneuil_options default_options; int rc; if (options == NULL) options = &default_options; #ifdef SQLITE_CORE rc = sqlite3_initialize(); if (rc != SQLITE_OK) return rc; #endif rc = sqlite3_vfs_register(&verneuil_vfs, options->make_default ? 1 : 0); if (rc != SQLITE_OK) return rc; rc = sqlite3_vfs_register(&verneuil_snapshot_vfs, /*makeDflt=*/0); if (rc != SQLITE_OK) return rc; if (options->tempdir != NULL) return verneuil_set_tempdir(options->tempdir); return SQLITE_OK; } int verneuil_init_impl(sqlite3 *db, char **pzErrMsg, const sqlite3_api_routines *pApi, const char *tempdir, bool make_default) { int rc; (void)db; (void)pzErrMsg; SQLITE_EXTENSION_INIT2(pApi); /* * When building in test mode, also shadow the "unix" vfs: * some tests instantiate it directly, and we want that * coverage. */ #ifdef TEST_VFS { static sqlite3_vfs verneuil_fake_unix_vfs; if (verneuil_fake_unix_vfs.zName == NULL) { verneuil_fake_unix_vfs = verneuil_vfs; verneuil_fake_unix_vfs.zName = "unix"; } rc = sqlite3_vfs_register(&verneuil_fake_unix_vfs, /*makeDflt=*/0); if (rc != SQLITE_OK) return rc; } #endif rc = sqlite3_vfs_register(&verneuil_vfs, make_default); if (rc != SQLITE_OK) return rc; rc = sqlite3_vfs_register(&verneuil_snapshot_vfs, /*makeDflt=*/0); if (rc != SQLITE_OK) return rc; if (tempdir != NULL) { rc = verneuil_set_tempdir(tempdir); if (rc != SQLITE_OK) return rc; } return SQLITE_OK_LOAD_PERMANENTLY; } #if defined(TEST_VFS) && defined(SQLITE_CORE) int verneuil_test_only_register(void) { char *error = NULL; int rc; rc = verneuil_init_impl(NULL, &error, NULL, NULL, true); sqlite3_free(error); if (rc == SQLITE_OK_LOAD_PERMANENTLY) rc = SQLITE_OK; return rc; } #endif /* * We pass this dummy result callback to sqlite3 to make it obvious * our SQL statements must be executed. */ static int dummy_cb(void *vsize, int ncol, char **values, char **columns) { (void)vsize; (void)ncol; (void)values; (void)columns; return 0; } int verneuil__cycle_db(const char *path, bool vacuum) { sqlite3 *db; const char *statement; int rc; rc = sqlite3_open_v2(path, &db, SQLITE_OPEN_READWRITE, "verneuil"); if (rc != SQLITE_OK) return rc; if (vacuum == true) { statement = "PRAGMA page_size = 65536; " "VACUUM;"; } else { statement = "SELECT COUNT(*) FROM sqlite_schema;"; } rc = sqlite3_exec(db, statement, dummy_cb, NULL, NULL); (void)sqlite3_close_v2(db); return rc; }