

new file mode 100644

index 0000000000000..64087c34327fe

--- /dev/null

+++ b/ diff --git a/Documentation/filesystems/btrfs.txt b/Documentation/filesystems/btrfs.txtnew file mode 100644index 0000000000000..64087c34327fe--- /dev/null+++ b/ Documentation/filesystems/btrfs.txt @@ -0,0 +1,91 @@ + + BTRFS + ===== + +Btrfs is a new copy on write filesystem for Linux aimed at +implementing advanced features while focusing on fault tolerance, +repair and easy administration. Initially developed by Oracle, Btrfs +is licensed under the GPL and open for contribution from anyone. + +Linux has a wealth of filesystems to choose from, but we are facing a +number of challenges with scaling to the large storage subsystems that +are becoming common in today's data centers. Filesystems need to scale +in their ability to address and manage large storage, and also in +their ability to detect, repair and tolerate errors in the data stored +on disk. Btrfs is under heavy development, and is not suitable for +any uses other than benchmarking and review. The Btrfs disk format is +not yet finalized. + +The main Btrfs features include: + + * Extent based file storage (2^64 max file size) + * Space efficient packing of small files + * Space efficient indexed directories + * Dynamic inode allocation + * Writable snapshots + * Subvolumes (separate internal filesystem roots) + * Object level mirroring and striping + * Checksums on data and metadata (multiple algorithms available) + * Compression + * Integrated multiple device support, with several raid algorithms + * Online filesystem check (not yet implemented) + * Very fast offline filesystem check + * Efficient incremental backup and FS mirroring (not yet implemented) + * Online filesystem defragmentation + + + + MAILING LIST + ============ + +There is a Btrfs mailing list hosted on vger.kernel.org. You can +find details on how to subscribe here: + +http://vger.kernel.org/vger-lists.html#linux-btrfs + +Mailing list archives are available from gmane: + +http://dir.gmane.org/gmane.comp.file-systems.btrfs + + + + IRC + === + +Discussion of Btrfs also occurs on the #btrfs channel of the Freenode +IRC network. + + + + UTILITIES + ========= + +Userspace tools for creating and manipulating Btrfs file systems are +available from the git repository at the following location: + + http://git.kernel.org/?p=linux/kernel/git/mason/btrfs-progs-unstable.git + git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-progs-unstable.git + +These include the following tools: + +mkfs.btrfs: create a filesystem + +btrfsctl: control program to create snapshots and subvolumes: + + mount /dev/sda2 /mnt + btrfsctl -s new_subvol_name /mnt + btrfsctl -s snapshot_of_default /mnt/default + btrfsctl -s snapshot_of_new_subvol /mnt/new_subvol_name + btrfsctl -s snapshot_of_a_snapshot /mnt/snapshot_of_new_subvol + ls /mnt + default snapshot_of_a_snapshot snapshot_of_new_subvol + new_subvol_name snapshot_of_default + + Snapshots and subvolumes cannot be deleted right now, but you can + rm -rf all the files and directories inside them. + +btrfsck: do a limited check of the FS extent trees. + +btrfs-debug-tree: print all of the FS metadata in text form. Example: + + btrfs-debug-tree /dev/sda2 >& big_output_file

index 32883589ee54d..02cff86af1b4f 100644

--- a/

+++ b/ diff --git a/fs/Kconfig b/fs/Kconfigindex 32883589ee54d..02cff86af1b4f 100644--- a/ fs/Kconfig +++ b/ fs/Kconfig @@ -269,6 +269,25 @@ config OCFS2_FS_POSIX_ACL Posix Access Control Lists (ACLs) support permissions for users and groups beyond the owner/group/world scheme. +config BTRFS_FS + tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format" + depends on EXPERIMENTAL + select LIBCRC32C + select ZLIB_INFLATE + select ZLIB_DEFLATE + help + Btrfs is a new filesystem with extents, writable snapshotting, + support for multiple devices and many more features. + + Btrfs is highly experimental, and THE DISK FORMAT IS NOT YET + FINALIZED. You should say N here unless you are interested in + testing Btrfs with non-critical data. + + To compile this file system support as a module, choose M here. The + module will be called btrfs. + + If unsure, say N. + endif # BLOCK source "fs/notify/Kconfig"

index c830611550d3d..bc4e14df10823 100644

--- a/

+++ b/ diff --git a/fs/Makefile b/fs/Makefileindex c830611550d3d..bc4e14df10823 100644--- a/ fs/Makefile +++ b/ fs/Makefile @@ -119,4 +119,5 @@ obj-$(CONFIG_HOSTFS) += hostfs/ obj-$(CONFIG_HPPFS) += hppfs/ obj-$(CONFIG_DEBUG_FS) += debugfs/ obj-$(CONFIG_OCFS2_FS) += ocfs2/ +obj-$(CONFIG_BTRFS_FS) += btrfs/ obj-$(CONFIG_GFS2_FS) += gfs2/

new file mode 100644

index 0000000000000..d2cf5a54a4b81

--- /dev/null

+++ b/ diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefilenew file mode 100644index 0000000000000..d2cf5a54a4b81--- /dev/null+++ b/ fs/btrfs/Makefile @@ -0,0 +1,25 @@ +ifneq ($(KERNELRELEASE),) +# kbuild part of makefile + +obj-$(CONFIG_BTRFS_FS) := btrfs.o +btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ + file-item.o inode-item.o inode-map.o disk-io.o \ + transaction.o inode.o file.o tree-defrag.o \ + extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ + extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ + ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \ + compression.o +else + +# Normal Makefile + +KERNELDIR := /lib/modules/`uname -r`/build +all: + $(MAKE) -C $(KERNELDIR) M=`pwd` CONFIG_BTRFS_FS=m modules + +modules_install: + $(MAKE) -C $(KERNELDIR) M=`pwd` modules_install +clean: + $(MAKE) -C $(KERNELDIR) M=`pwd` clean + +endif

new file mode 100644

index 0000000000000..1d53b62dbba51

--- /dev/null

+++ b/ diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.cnew file mode 100644index 0000000000000..1d53b62dbba51--- /dev/null+++ b/ fs/btrfs/acl.c @@ -0,0 +1,351 @@ +/* + * Copyright (C) 2007 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/string.h> +#include <linux/xattr.h> +#include <linux/posix_acl_xattr.h> +#include <linux/posix_acl.h> +#include <linux/sched.h> + +#include "ctree.h" +#include "btrfs_inode.h" +#include "xattr.h" + +#ifdef CONFIG_FS_POSIX_ACL + +static void btrfs_update_cached_acl(struct inode *inode, + struct posix_acl **p_acl, + struct posix_acl *acl) +{ + spin_lock(&inode->i_lock); + if (*p_acl && *p_acl != BTRFS_ACL_NOT_CACHED) + posix_acl_release(*p_acl); + *p_acl = posix_acl_dup(acl); + spin_unlock(&inode->i_lock); +} + +static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) +{ + int size; + const char *name; + char *value = NULL; + struct posix_acl *acl = NULL, **p_acl; + + switch (type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + p_acl = &BTRFS_I(inode)->i_acl; + break; + case ACL_TYPE_DEFAULT: + name = POSIX_ACL_XATTR_DEFAULT; + p_acl = &BTRFS_I(inode)->i_default_acl; + break; + default: + return ERR_PTR(-EINVAL); + } + + spin_lock(&inode->i_lock); + if (*p_acl != BTRFS_ACL_NOT_CACHED) + acl = posix_acl_dup(*p_acl); + spin_unlock(&inode->i_lock); + + if (acl) + return acl; + + + size = __btrfs_getxattr(inode, name, "", 0); + if (size > 0) { + value = kzalloc(size, GFP_NOFS); + if (!value) + return ERR_PTR(-ENOMEM); + size = __btrfs_getxattr(inode, name, value, size); + if (size > 0) { + acl = posix_acl_from_xattr(value, size); + btrfs_update_cached_acl(inode, p_acl, acl); + } + kfree(value); + } else if (size == -ENOENT) { + acl = NULL; + btrfs_update_cached_acl(inode, p_acl, acl); + } + + return acl; +} + +static int btrfs_xattr_get_acl(struct inode *inode, int type, + void *value, size_t size) +{ + struct posix_acl *acl; + int ret = 0; + + acl = btrfs_get_acl(inode, type); + + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl == NULL) + return -ENODATA; + ret = posix_acl_to_xattr(acl, value, size); + posix_acl_release(acl); + + return ret; +} + +/* + * Needs to be called with fs_mutex held + */ +static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + int ret, size = 0; + const char *name; + struct posix_acl **p_acl; + char *value = NULL; + mode_t mode; + + if (acl) { + ret = posix_acl_valid(acl); + if (ret < 0) + return ret; + ret = 0; + } + + switch (type) { + case ACL_TYPE_ACCESS: + mode = inode->i_mode; + ret = posix_acl_equiv_mode(acl, &mode); + if (ret < 0) + return ret; + ret = 0; + inode->i_mode = mode; + name = POSIX_ACL_XATTR_ACCESS; + p_acl = &BTRFS_I(inode)->i_acl; + break; + case ACL_TYPE_DEFAULT: + if (!S_ISDIR(inode->i_mode)) + return acl ? -EINVAL : 0; + name = POSIX_ACL_XATTR_DEFAULT; + p_acl = &BTRFS_I(inode)->i_default_acl; + break; + default: + return -EINVAL; + } + + if (acl) { + size = posix_acl_xattr_size(acl->a_count); + value = kmalloc(size, GFP_NOFS); + if (!value) { + ret = -ENOMEM; + goto out; + } + + ret = posix_acl_to_xattr(acl, value, size); + if (ret < 0) + goto out; + } + + ret = __btrfs_setxattr(inode, name, value, size, 0); + +out: + kfree(value); + + if (!ret) + btrfs_update_cached_acl(inode, p_acl, acl); + + return ret; +} + +static int btrfs_xattr_set_acl(struct inode *inode, int type, + const void *value, size_t size) +{ + int ret = 0; + struct posix_acl *acl = NULL; + + if (value) { + acl = posix_acl_from_xattr(value, size); + if (acl == NULL) { + value = NULL; + size = 0; + } else if (IS_ERR(acl)) { + return PTR_ERR(acl); + } + } + + ret = btrfs_set_acl(inode, acl, type); + + posix_acl_release(acl); + + return ret; +} + + +static int btrfs_xattr_acl_access_get(struct inode *inode, const char *name, + void *value, size_t size) +{ + return btrfs_xattr_get_acl(inode, ACL_TYPE_ACCESS, value, size); +} + +static int btrfs_xattr_acl_access_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + return btrfs_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size); +} + +static int btrfs_xattr_acl_default_get(struct inode *inode, const char *name, + void *value, size_t size) +{ + return btrfs_xattr_get_acl(inode, ACL_TYPE_DEFAULT, value, size); +} + +static int btrfs_xattr_acl_default_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + return btrfs_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size); +} + +int btrfs_check_acl(struct inode *inode, int mask) +{ + struct posix_acl *acl; + int error = -EAGAIN; + + acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS); + + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl) { + error = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + } + + return error; +} + +/* + * btrfs_init_acl is already generally called under fs_mutex, so the locking + * stuff has been fixed to work with that. If the locking stuff changes, we + * need to re-evaluate the acl locking stuff. + */ +int btrfs_init_acl(struct inode *inode, struct inode *dir) +{ + struct posix_acl *acl = NULL; + int ret = 0; + + /* this happens with subvols */ + if (!dir) + return 0; + + if (!S_ISLNK(inode->i_mode)) { + if (IS_POSIXACL(dir)) { + acl = btrfs_get_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) + return PTR_ERR(acl); + } + + if (!acl) + inode->i_mode &= ~current->fs->umask; + } + + if (IS_POSIXACL(dir) && acl) { + struct posix_acl *clone; + mode_t mode; + + if (S_ISDIR(inode->i_mode)) { + ret = btrfs_set_acl(inode, acl, ACL_TYPE_DEFAULT); + if (ret) + goto failed; + } + clone = posix_acl_clone(acl, GFP_NOFS); + ret = -ENOMEM; + if (!clone) + goto failed; + + mode = inode->i_mode; + ret = posix_acl_create_masq(clone, &mode); + if (ret >= 0) { + inode->i_mode = mode; + if (ret > 0) { + /* we need an acl */ + ret = btrfs_set_acl(inode, clone, + ACL_TYPE_ACCESS); + } + } + } +failed: + posix_acl_release(acl); + + return ret; +} + +int btrfs_acl_chmod(struct inode *inode) +{ + struct posix_acl *acl, *clone; + int ret = 0; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + if (!IS_POSIXACL(inode)) + return 0; + + acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl) || !acl) + return PTR_ERR(acl); + + clone = posix_acl_clone(acl, GFP_KERNEL); + posix_acl_release(acl); + if (!clone) + return -ENOMEM; + + ret = posix_acl_chmod_masq(clone, inode->i_mode); + if (!ret) + ret = btrfs_set_acl(inode, clone, ACL_TYPE_ACCESS); + + posix_acl_release(clone); + + return ret; +} + +struct xattr_handler btrfs_xattr_acl_default_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .get = btrfs_xattr_acl_default_get, + .set = btrfs_xattr_acl_default_set, +}; + +struct xattr_handler btrfs_xattr_acl_access_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .get = btrfs_xattr_acl_access_get, + .set = btrfs_xattr_acl_access_set, +}; + +#else /* CONFIG_FS_POSIX_ACL */ + +int btrfs_acl_chmod(struct inode *inode) +{ + return 0; +} + +int btrfs_init_acl(struct inode *inode, struct inode *dir) +{ + return 0; +} + +int btrfs_check_acl(struct inode *inode, int mask) +{ + return 0; +} + +#endif /* CONFIG_FS_POSIX_ACL */

new file mode 100644

index 0000000000000..8e2fec05dbe0c

--- /dev/null

+++ b/ diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.cnew file mode 100644index 0000000000000..8e2fec05dbe0c--- /dev/null+++ b/ fs/btrfs/async-thread.c @@ -0,0 +1,419 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/version.h> +#include <linux/kthread.h> +#include <linux/list.h> +#include <linux/spinlock.h> +# include <linux/freezer.h> +#include "async-thread.h" + +#define WORK_QUEUED_BIT 0 +#define WORK_DONE_BIT 1 +#define WORK_ORDER_DONE_BIT 2 + +/* + * container for the kthread task pointer and the list of pending work + * One of these is allocated per thread. + */ +struct btrfs_worker_thread { + /* pool we belong to */ + struct btrfs_workers *workers; + + /* list of struct btrfs_work that are waiting for service */ + struct list_head pending; + + /* list of worker threads from struct btrfs_workers */ + struct list_head worker_list; + + /* kthread */ + struct task_struct *task; + + /* number of things on the pending list */ + atomic_t num_pending; + + unsigned long sequence; + + /* protects the pending list. */ + spinlock_t lock; + + /* set to non-zero when this thread is already awake and kicking */ + int working; + + /* are we currently idle */ + int idle; +}; + +/* + * helper function to move a thread onto the idle list after it + * has finished some requests. + */ +static void check_idle_worker(struct btrfs_worker_thread *worker) +{ + if (!worker->idle && atomic_read(&worker->num_pending) < + worker->workers->idle_thresh / 2) { + unsigned long flags; + spin_lock_irqsave(&worker->workers->lock, flags); + worker->idle = 1; + list_move(&worker->worker_list, &worker->workers->idle_list); + spin_unlock_irqrestore(&worker->workers->lock, flags); + } +} + +/* + * helper function to move a thread off the idle list after new + * pending work is added. + */ +static void check_busy_worker(struct btrfs_worker_thread *worker) +{ + if (worker->idle && atomic_read(&worker->num_pending) >= + worker->workers->idle_thresh) { + unsigned long flags; + spin_lock_irqsave(&worker->workers->lock, flags); + worker->idle = 0; + list_move_tail(&worker->worker_list, + &worker->workers->worker_list); + spin_unlock_irqrestore(&worker->workers->lock, flags); + } +} + +static noinline int run_ordered_completions(struct btrfs_workers *workers, + struct btrfs_work *work) +{ + unsigned long flags; + + if (!workers->ordered) + return 0; + + set_bit(WORK_DONE_BIT, &work->flags); + + spin_lock_irqsave(&workers->lock, flags); + + while (!list_empty(&workers->order_list)) { + work = list_entry(workers->order_list.next, + struct btrfs_work, order_list); + + if (!test_bit(WORK_DONE_BIT, &work->flags)) + break; + + /* we are going to call the ordered done function, but + * we leave the work item on the list as a barrier so + * that later work items that are done don't have their + * functions called before this one returns + */ + if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags)) + break; + + spin_unlock_irqrestore(&workers->lock, flags); + + work->ordered_func(work); + + /* now take the lock again and call the freeing code */ + spin_lock_irqsave(&workers->lock, flags); + list_del(&work->order_list); + work->ordered_free(work); + } + + spin_unlock_irqrestore(&workers->lock, flags); + return 0; +} + +/* + * main loop for servicing work items + */ +static int worker_loop(void *arg) +{ + struct btrfs_worker_thread *worker = arg; + struct list_head *cur; + struct btrfs_work *work; + do { + spin_lock_irq(&worker->lock); + while (!list_empty(&worker->pending)) { + cur = worker->pending.next; + work = list_entry(cur, struct btrfs_work, list); + list_del(&work->list); + clear_bit(WORK_QUEUED_BIT, &work->flags); + + work->worker = worker; + spin_unlock_irq(&worker->lock); + + work->func(work); + + atomic_dec(&worker->num_pending); + /* + * unless this is an ordered work queue, + * 'work' was probably freed by func above. + */ + run_ordered_completions(worker->workers, work); + + spin_lock_irq(&worker->lock); + check_idle_worker(worker); + + } + worker->working = 0; + if (freezing(current)) { + refrigerator(); + } else { + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irq(&worker->lock); + if (!kthread_should_stop()) + schedule(); + __set_current_state(TASK_RUNNING); + } + } while (!kthread_should_stop()); + return 0; +} + +/* + * this will wait for all the worker threads to shutdown + */ +int btrfs_stop_workers(struct btrfs_workers *workers) +{ + struct list_head *cur; + struct btrfs_worker_thread *worker; + + list_splice_init(&workers->idle_list, &workers->worker_list); + while (!list_empty(&workers->worker_list)) { + cur = workers->worker_list.next; + worker = list_entry(cur, struct btrfs_worker_thread, + worker_list); + kthread_stop(worker->task); + list_del(&worker->worker_list); + kfree(worker); + } + return 0; +} + +/* + * simple init on struct btrfs_workers + */ +void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max) +{ + workers->num_workers = 0; + INIT_LIST_HEAD(&workers->worker_list); + INIT_LIST_HEAD(&workers->idle_list); + INIT_LIST_HEAD(&workers->order_list); + spin_lock_init(&workers->lock); + workers->max_workers = max; + workers->idle_thresh = 32; + workers->name = name; + workers->ordered = 0; +} + +/* + * starts new worker threads. This does not enforce the max worker + * count in case you need to temporarily go past it. + */ +int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) +{ + struct btrfs_worker_thread *worker; + int ret = 0; + int i; + + for (i = 0; i < num_workers; i++) { + worker = kzalloc(sizeof(*worker), GFP_NOFS); + if (!worker) { + ret = -ENOMEM; + goto fail; + } + + INIT_LIST_HEAD(&worker->pending); + INIT_LIST_HEAD(&worker->worker_list); + spin_lock_init(&worker->lock); + atomic_set(&worker->num_pending, 0); + worker->task = kthread_run(worker_loop, worker, + "btrfs-%s-%d", workers->name, + workers->num_workers + i); + worker->workers = workers; + if (IS_ERR(worker->task)) { + kfree(worker); + ret = PTR_ERR(worker->task); + goto fail; + } + + spin_lock_irq(&workers->lock); + list_add_tail(&worker->worker_list, &workers->idle_list); + worker->idle = 1; + workers->num_workers++; + spin_unlock_irq(&workers->lock); + } + return 0; +fail: + btrfs_stop_workers(workers); + return ret; +} + +/* + * run through the list and find a worker thread that doesn't have a lot + * to do right now. This can return null if we aren't yet at the thread + * count limit and all of the threads are busy. + */ +static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers) +{ + struct btrfs_worker_thread *worker; + struct list_head *next; + int enforce_min = workers->num_workers < workers->max_workers; + + /* + * if we find an idle thread, don't move it to the end of the + * idle list. This improves the chance that the next submission + * will reuse the same thread, and maybe catch it while it is still + * working + */ + if (!list_empty(&workers->idle_list)) { + next = workers->idle_list.next; + worker = list_entry(next, struct btrfs_worker_thread, + worker_list); + return worker; + } + if (enforce_min || list_empty(&workers->worker_list)) + return NULL; + + /* + * if we pick a busy task, move the task to the end of the list. + * hopefully this will keep things somewhat evenly balanced. + * Do the move in batches based on the sequence number. This groups + * requests submitted at roughly the same time onto the same worker. + */ + next = workers->worker_list.next; + worker = list_entry(next, struct btrfs_worker_thread, worker_list); + atomic_inc(&worker->num_pending); + worker->sequence++; + + if (worker->sequence % workers->idle_thresh == 0) + list_move_tail(next, &workers->worker_list); + return worker; +} + +/* + * selects a worker thread to take the next job. This will either find + * an idle worker, start a new worker up to the max count, or just return + * one of the existing busy workers. + */ +static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers) +{ + struct btrfs_worker_thread *worker; + unsigned long flags; + +again: + spin_lock_irqsave(&workers->lock, flags); + worker = next_worker(workers); + spin_unlock_irqrestore(&workers->lock, flags); + + if (!worker) { + spin_lock_irqsave(&workers->lock, flags); + if (workers->num_workers >= workers->max_workers) { + struct list_head *fallback = NULL; + /* + * we have failed to find any workers, just + * return the force one + */ + if (!list_empty(&workers->worker_list)) + fallback = workers->worker_list.next; + if (!list_empty(&workers->idle_list)) + fallback = workers->idle_list.next; + BUG_ON(!fallback); + worker = list_entry(fallback, + struct btrfs_worker_thread, worker_list); + spin_unlock_irqrestore(&workers->lock, flags); + } else { + spin_unlock_irqrestore(&workers->lock, flags); + /* we're below the limit, start another worker */ + btrfs_start_workers(workers, 1); + goto again; + } + } + return worker; +} + +/* + * btrfs_requeue_work just puts the work item back on the tail of the list + * it was taken from. It is intended for use with long running work functions + * that make some progress and want to give the cpu up for others. + */ +int btrfs_requeue_work(struct btrfs_work *work) +{ + struct btrfs_worker_thread *worker = work->worker; + unsigned long flags; + + if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) + goto out; + + spin_lock_irqsave(&worker->lock, flags); + atomic_inc(&worker->num_pending); + list_add_tail(&work->list, &worker->pending); + + /* by definition we're busy, take ourselves off the idle + * list + */ + if (worker->idle) { + spin_lock_irqsave(&worker->workers->lock, flags); + worker->idle = 0; + list_move_tail(&worker->worker_list, + &worker->workers->worker_list); + spin_unlock_irqrestore(&worker->workers->lock, flags); + } + + spin_unlock_irqrestore(&worker->lock, flags); + +out: + return 0; +} + +/* + * places a struct btrfs_work into the pending queue of one of the kthreads + */ +int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) +{ + struct btrfs_worker_thread *worker; + unsigned long flags; + int wake = 0; + + /* don't requeue something already on a list */ + if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) + goto out; + + worker = find_worker(workers); + if (workers->ordered) { + spin_lock_irqsave(&workers->lock, flags); + list_add_tail(&work->order_list, &workers->order_list); + spin_unlock_irqrestore(&workers->lock, flags); + } else { + INIT_LIST_HEAD(&work->order_list); + } + + spin_lock_irqsave(&worker->lock, flags); + atomic_inc(&worker->num_pending); + check_busy_worker(worker); + list_add_tail(&work->list, &worker->pending); + + /* + * avoid calling into wake_up_process if this thread has already + * been kicked + */ + if (!worker->working) + wake = 1; + worker->working = 1; + + spin_unlock_irqrestore(&worker->lock, flags); + + if (wake) + wake_up_process(worker->task); +out: + return 0; +}

new file mode 100644

index 0000000000000..31be4ed8b63ea

--- /dev/null

+++ b/ diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.hnew file mode 100644index 0000000000000..31be4ed8b63ea--- /dev/null+++ b/ fs/btrfs/async-thread.h @@ -0,0 +1,101 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_ASYNC_THREAD_ +#define __BTRFS_ASYNC_THREAD_ + +struct btrfs_worker_thread; + +/* + * This is similar to a workqueue, but it is meant to spread the operations + * across all available cpus instead of just the CPU that was used to + * queue the work. There is also some batching introduced to try and + * cut down on context switches. + * + * By default threads are added on demand up to 2 * the number of cpus. + * Changing struct btrfs_workers->max_workers is one way to prevent + * demand creation of kthreads. + * + * the basic model of these worker threads is to embed a btrfs_work + * structure in your own data struct, and use container_of in a + * work function to get back to your data struct. + */ +struct btrfs_work { + /* + * func should be set to the function you want called + * your work struct is passed as the only arg + * + * ordered_func must be set for work sent to an ordered work queue, + * and it is called to complete a given work item in the same + * order they were sent to the queue. + */ + void (*func)(struct btrfs_work *work); + void (*ordered_func)(struct btrfs_work *work); + void (*ordered_free)(struct btrfs_work *work); + + /* + * flags should be set to zero. It is used to make sure the + * struct is only inserted once into the list. + */ + unsigned long flags; + + /* don't touch these */ + struct btrfs_worker_thread *worker; + struct list_head list; + struct list_head order_list; +}; + +struct btrfs_workers { + /* current number of running workers */ + int num_workers; + + /* max number of workers allowed. changed by btrfs_start_workers */ + int max_workers; + + /* once a worker has this many requests or fewer, it is idle */ + int idle_thresh; + + /* force completions in the order they were queued */ + int ordered; + + /* list with all the work threads. The workers on the idle thread + * may be actively servicing jobs, but they haven't yet hit the + * idle thresh limit above. + */ + struct list_head worker_list; + struct list_head idle_list; + + /* + * when operating in ordered mode, this maintains the list + * of work items waiting for completion + */ + struct list_head order_list; + + /* lock for finding the next worker thread to queue on */ + spinlock_t lock; + + /* extra name for this worker, used for current->name */ + char *name; +}; + +int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); +int btrfs_start_workers(struct btrfs_workers *workers, int num_workers); +int btrfs_stop_workers(struct btrfs_workers *workers); +void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max); +int btrfs_requeue_work(struct btrfs_work *work); +#endif

new file mode 100644

index 0000000000000..a8c9693b75ac4

--- /dev/null

+++ b/ diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.hnew file mode 100644index 0000000000000..a8c9693b75ac4--- /dev/null+++ b/ fs/btrfs/btrfs_inode.h @@ -0,0 +1,131 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_I__ +#define __BTRFS_I__ + +#include "extent_map.h" +#include "extent_io.h" +#include "ordered-data.h" + +/* in memory btrfs inode */ +struct btrfs_inode { + /* which subvolume this inode belongs to */ + struct btrfs_root *root; + + /* key used to find this inode on disk. This is used by the code + * to read in roots of subvolumes + */ + struct btrfs_key location; + + /* the extent_tree has caches of all the extent mappings to disk */ + struct extent_map_tree extent_tree; + + /* the io_tree does range state (DIRTY, LOCKED etc) */ + struct extent_io_tree io_tree; + + /* special utility tree used to record which mirrors have already been + * tried when checksums fail for a given block + */ + struct extent_io_tree io_failure_tree; + + /* held while inesrting or deleting extents from files */ + struct mutex extent_mutex; + + /* held while logging the inode in tree-log.c */ + struct mutex log_mutex; + + /* used to order data wrt metadata */ + struct btrfs_ordered_inode_tree ordered_tree; + + /* standard acl pointers */ + struct posix_acl *i_acl; + struct posix_acl *i_default_acl; + + /* for keeping track of orphaned inodes */ + struct list_head i_orphan; + + /* list of all the delalloc inodes in the FS. There are times we need + * to write all the delalloc pages to disk, and this list is used + * to walk them all. + */ + struct list_head delalloc_inodes; + + /* full 64 bit generation number, struct vfs_inode doesn't have a big + * enough field for this. + */ + u64 generation; + + /* sequence number for NFS changes */ + u64 sequence; + + /* + * transid of the trans_handle that last modified this inode + */ + u64 last_trans; + /* + * transid that last logged this inode + */ + u64 logged_trans; + + /* + * trans that last made a change that should be fully fsync'd. This + * gets reset to zero each time the inode is logged + */ + u64 log_dirty_trans; + + /* total number of bytes pending delalloc, used by stat to calc the + * real block usage of the file + */ + u64 delalloc_bytes; + + /* + * the size of the file stored in the metadata on disk. data=ordered + * means the in-memory i_size might be larger than the size on disk + * because not all the blocks are written yet. + */ + u64 disk_i_size; + + /* flags field from the on disk inode */ + u32 flags; + + /* + * if this is a directory then index_cnt is the counter for the index + * number for new files that are created + */ + u64 index_cnt; + + /* the start of block group preferred for allocations. */ + u64 block_group; + + struct inode vfs_inode; +}; + +static inline struct btrfs_inode *BTRFS_I(struct inode *inode) +{ + return container_of(inode, struct btrfs_inode, vfs_inode); +} + +static inline void btrfs_i_size_write(struct inode *inode, u64 size) +{ + inode->i_size = size; + BTRFS_I(inode)->disk_i_size = size; +} + + +#endif

new file mode 100644

index 0000000000000..7c4503ef6efdc

--- /dev/null

+++ b/ diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.hnew file mode 100644index 0000000000000..7c4503ef6efdc--- /dev/null+++ b/ fs/btrfs/compat.h @@ -0,0 +1,7 @@ +#ifndef _COMPAT_H_ +#define _COMPAT_H_ + +#define btrfs_drop_nlink(inode) drop_nlink(inode) +#define btrfs_inc_nlink(inode) inc_nlink(inode) + +#endif /* _COMPAT_H_ */

new file mode 100644

index 0000000000000..ee848d8585d9b

--- /dev/null

+++ b/ diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.cnew file mode 100644index 0000000000000..ee848d8585d9b--- /dev/null+++ b/ fs/btrfs/compression.c @@ -0,0 +1,709 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/kernel.h> +#include <linux/bio.h> +#include <linux/buffer_head.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> +#include <linux/time.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/smp_lock.h> +#include <linux/backing-dev.h> +#include <linux/mpage.h> +#include <linux/swap.h> +#include <linux/writeback.h> +#include <linux/bit_spinlock.h> +#include <linux/version.h> +#include <linux/pagevec.h> +#include "compat.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "volumes.h" +#include "ordered-data.h" +#include "compression.h" +#include "extent_io.h" +#include "extent_map.h" + +struct compressed_bio { + /* number of bios pending for this compressed extent */ + atomic_t pending_bios; + + /* the pages with the compressed data on them */ + struct page **compressed_pages; + + /* inode that owns this data */ + struct inode *inode; + + /* starting offset in the inode for our pages */ + u64 start; + + /* number of bytes in the inode we're working on */ + unsigned long len; + + /* number of bytes on disk */ + unsigned long compressed_len; + + /* number of compressed pages in the array */ + unsigned long nr_pages; + + /* IO errors */ + int errors; + int mirror_num; + + /* for reads, this is the bio we are copying the data into */ + struct bio *orig_bio; + + /* + * the start of a variable length array of checksums only + * used by reads + */ + u32 sums; +}; + +static inline int compressed_bio_size(struct btrfs_root *root, + unsigned long disk_size) +{ + u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); + return sizeof(struct compressed_bio) + + ((disk_size + root->sectorsize - 1) / root->sectorsize) * + csum_size; +} + +static struct bio *compressed_bio_alloc(struct block_device *bdev, + u64 first_byte, gfp_t gfp_flags) +{ + struct bio *bio; + int nr_vecs; + + nr_vecs = bio_get_nr_vecs(bdev); + bio = bio_alloc(gfp_flags, nr_vecs); + + if (bio == NULL && (current->flags & PF_MEMALLOC)) { + while (!bio && (nr_vecs /= 2)) + bio = bio_alloc(gfp_flags, nr_vecs); + } + + if (bio) { + bio->bi_size = 0; + bio->bi_bdev = bdev; + bio->bi_sector = first_byte >> 9; + } + return bio; +} + +static int check_compressed_csum(struct inode *inode, + struct compressed_bio *cb, + u64 disk_start) +{ + int ret; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct page *page; + unsigned long i; + char *kaddr; + u32 csum; + u32 *cb_sum = &cb->sums; + + if (btrfs_test_flag(inode, NODATASUM)) + return 0; + + for (i = 0; i < cb->nr_pages; i++) { + page = cb->compressed_pages[i]; + csum = ~(u32)0; + + kaddr = kmap_atomic(page, KM_USER0); + csum = btrfs_csum_data(root, kaddr, csum, PAGE_CACHE_SIZE); + btrfs_csum_final(csum, (char *)&csum); + kunmap_atomic(kaddr, KM_USER0); + + if (csum != *cb_sum) { + printk(KERN_INFO "btrfs csum failed ino %lu " + "extent %llu csum %u " + "wanted %u mirror %d

", inode->i_ino, + (unsigned long long)disk_start, + csum, *cb_sum, cb->mirror_num); + ret = -EIO; + goto fail; + } + cb_sum++; + + } + ret = 0; +fail: + return ret; +} + +/* when we finish reading compressed pages from the disk, we + * decompress them and then run the bio end_io routines on the + * decompressed pages (in the inode address space). + * + * This allows the checksumming and other IO error handling routines + * to work normally + * + * The compressed pages are freed here, and it must be run + * in process context + */ +static void end_compressed_bio_read(struct bio *bio, int err) +{ + struct extent_io_tree *tree; + struct compressed_bio *cb = bio->bi_private; + struct inode *inode; + struct page *page; + unsigned long index; + int ret; + + if (err) + cb->errors = 1; + + /* if there are more bios still pending for this compressed + * extent, just exit + */ + if (!atomic_dec_and_test(&cb->pending_bios)) + goto out; + + inode = cb->inode; + ret = check_compressed_csum(inode, cb, (u64)bio->bi_sector << 9); + if (ret) + goto csum_failed; + + /* ok, we're the last bio for this extent, lets start + * the decompression. + */ + tree = &BTRFS_I(inode)->io_tree; + ret = btrfs_zlib_decompress_biovec(cb->compressed_pages, + cb->start, + cb->orig_bio->bi_io_vec, + cb->orig_bio->bi_vcnt, + cb->compressed_len); +csum_failed: + if (ret) + cb->errors = 1; + + /* release the compressed pages */ + index = 0; + for (index = 0; index < cb->nr_pages; index++) { + page = cb->compressed_pages[index]; + page->mapping = NULL; + page_cache_release(page); + } + + /* do io completion on the original bio */ + if (cb->errors) { + bio_io_error(cb->orig_bio); + } else { + int bio_index = 0; + struct bio_vec *bvec = cb->orig_bio->bi_io_vec; + + /* + * we have verified the checksum already, set page + * checked so the end_io handlers know about it + */ + while (bio_index < cb->orig_bio->bi_vcnt) { + SetPageChecked(bvec->bv_page); + bvec++; + bio_index++; + } + bio_endio(cb->orig_bio, 0); + } + + /* finally free the cb struct */ + kfree(cb->compressed_pages); + kfree(cb); +out: + bio_put(bio); +} + +/* + * Clear the writeback bits on all of the file + * pages for a compressed write + */ +static noinline int end_compressed_writeback(struct inode *inode, u64 start, + unsigned long ram_size) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT; + struct page *pages[16]; + unsigned long nr_pages = end_index - index + 1; + int i; + int ret; + + while (nr_pages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min_t(unsigned long, + nr_pages, ARRAY_SIZE(pages)), pages); + if (ret == 0) { + nr_pages -= 1; + index += 1; + continue; + } + for (i = 0; i < ret; i++) { + end_page_writeback(pages[i]); + page_cache_release(pages[i]); + } + nr_pages -= ret; + index += ret; + } + /* the inode may be gone now */ + return 0; +} + +/* + * do the cleanup once all the compressed pages hit the disk. + * This will clear writeback on the file pages and free the compressed + * pages. + * + * This also calls the writeback end hooks for the file pages so that + * metadata and checksums can be updated in the file. + */ +static void end_compressed_bio_write(struct bio *bio, int err) +{ + struct extent_io_tree *tree; + struct compressed_bio *cb = bio->bi_private; + struct inode *inode; + struct page *page; + unsigned long index; + + if (err) + cb->errors = 1; + + /* if there are more bios still pending for this compressed + * extent, just exit + */ + if (!atomic_dec_and_test(&cb->pending_bios)) + goto out; + + /* ok, we're the last bio for this extent, step one is to + * call back into the FS and do all the end_io operations + */ + inode = cb->inode; + tree = &BTRFS_I(inode)->io_tree; + cb->compressed_pages[0]->mapping = cb->inode->i_mapping; + tree->ops->writepage_end_io_hook(cb->compressed_pages[0], + cb->start, + cb->start + cb->len - 1, + NULL, 1); + cb->compressed_pages[0]->mapping = NULL; + + end_compressed_writeback(inode, cb->start, cb->len); + /* note, our inode could be gone now */ + + /* + * release the compressed pages, these came from alloc_page and + * are not attached to the inode at all + */ + index = 0; + for (index = 0; index < cb->nr_pages; index++) { + page = cb->compressed_pages[index]; + page->mapping = NULL; + page_cache_release(page); + } + + /* finally free the cb struct */ + kfree(cb->compressed_pages); + kfree(cb); +out: + bio_put(bio); +} + +/* + * worker function to build and submit bios for previously compressed pages. + * The corresponding pages in the inode should be marked for writeback + * and the compressed pages should have a reference on them for dropping + * when the IO is complete. + * + * This also checksums the file bytes and gets things ready for + * the end io hooks. + */ +int btrfs_submit_compressed_write(struct inode *inode, u64 start, + unsigned long len, u64 disk_start, + unsigned long compressed_len, + struct page **compressed_pages, + unsigned long nr_pages) +{ + struct bio *bio = NULL; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct compressed_bio *cb; + unsigned long bytes_left; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + int page_index = 0; + struct page *page; + u64 first_byte = disk_start; + struct block_device *bdev; + int ret; + + WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1)); + cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); + atomic_set(&cb->pending_bios, 0); + cb->errors = 0; + cb->inode = inode; + cb->start = start; + cb->len = len; + cb->mirror_num = 0; + cb->compressed_pages = compressed_pages; + cb->compressed_len = compressed_len; + cb->orig_bio = NULL; + cb->nr_pages = nr_pages; + + bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + + bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); + bio->bi_private = cb; + bio->bi_end_io = end_compressed_bio_write; + atomic_inc(&cb->pending_bios); + + /* create and submit bios for the compressed pages */ + bytes_left = compressed_len; + for (page_index = 0; page_index < cb->nr_pages; page_index++) { + page = compressed_pages[page_index]; + page->mapping = inode->i_mapping; + if (bio->bi_size) + ret = io_tree->ops->merge_bio_hook(page, 0, + PAGE_CACHE_SIZE, + bio, 0); + else + ret = 0; + + page->mapping = NULL; + if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < + PAGE_CACHE_SIZE) { + bio_get(bio); + + /* + * inc the count before we submit the bio so + * we know the end IO handler won't happen before + * we inc the count. Otherwise, the cb might get + * freed before we're done setting it up + */ + atomic_inc(&cb->pending_bios); + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + BUG_ON(ret); + + ret = btrfs_csum_one_bio(root, inode, bio, start, 1); + BUG_ON(ret); + + ret = btrfs_map_bio(root, WRITE, bio, 0, 1); + BUG_ON(ret); + + bio_put(bio); + + bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); + bio->bi_private = cb; + bio->bi_end_io = end_compressed_bio_write; + bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); + } + if (bytes_left < PAGE_CACHE_SIZE) { + printk("bytes left %lu compress len %lu nr %lu

", + bytes_left, cb->compressed_len, cb->nr_pages); + } + bytes_left -= PAGE_CACHE_SIZE; + first_byte += PAGE_CACHE_SIZE; + cond_resched(); + } + bio_get(bio); + + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + BUG_ON(ret); + + ret = btrfs_csum_one_bio(root, inode, bio, start, 1); + BUG_ON(ret); + + ret = btrfs_map_bio(root, WRITE, bio, 0, 1); + BUG_ON(ret); + + bio_put(bio); + return 0; +} + +static noinline int add_ra_bio_pages(struct inode *inode, + u64 compressed_end, + struct compressed_bio *cb) +{ + unsigned long end_index; + unsigned long page_index; + u64 last_offset; + u64 isize = i_size_read(inode); + int ret; + struct page *page; + unsigned long nr_pages = 0; + struct extent_map *em; + struct address_space *mapping = inode->i_mapping; + struct pagevec pvec; + struct extent_map_tree *em_tree; + struct extent_io_tree *tree; + u64 end; + int misses = 0; + + page = cb->orig_bio->bi_io_vec[cb->orig_bio->bi_vcnt - 1].bv_page; + last_offset = (page_offset(page) + PAGE_CACHE_SIZE); + em_tree = &BTRFS_I(inode)->extent_tree; + tree = &BTRFS_I(inode)->io_tree; + + if (isize == 0) + return 0; + + end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; + + pagevec_init(&pvec, 0); + while (last_offset < compressed_end) { + page_index = last_offset >> PAGE_CACHE_SHIFT; + + if (page_index > end_index) + break; + + rcu_read_lock(); + page = radix_tree_lookup(&mapping->page_tree, page_index); + rcu_read_unlock(); + if (page) { + misses++; + if (misses > 4) + break; + goto next; + } + + page = alloc_page(mapping_gfp_mask(mapping) | GFP_NOFS); + if (!page) + break; + + page->index = page_index; + /* + * what we want to do here is call add_to_page_cache_lru, + * but that isn't exported, so we reproduce it here + */ + if (add_to_page_cache(page, mapping, + page->index, GFP_NOFS)) { + page_cache_release(page); + goto next; + } + + /* open coding of lru_cache_add, also not exported */ + page_cache_get(page); + if (!pagevec_add(&pvec, page)) + __pagevec_lru_add_file(&pvec); + + end = last_offset + PAGE_CACHE_SIZE - 1; + /* + * at this point, we have a locked page in the page cache + * for these bytes in the file. But, we have to make + * sure they map to this compressed extent on disk. + */ + set_page_extent_mapped(page); + lock_extent(tree, last_offset, end, GFP_NOFS); + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, last_offset, + PAGE_CACHE_SIZE); + spin_unlock(&em_tree->lock); + + if (!em || last_offset < em->start || + (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) || + (em->block_start >> 9) != cb->orig_bio->bi_sector) { + free_extent_map(em); + unlock_extent(tree, last_offset, end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + break; + } + free_extent_map(em); + + if (page->index == end_index) { + char *userpage; + size_t zero_offset = isize & (PAGE_CACHE_SIZE - 1); + + if (zero_offset) { + int zeros; + zeros = PAGE_CACHE_SIZE - zero_offset; + userpage = kmap_atomic(page, KM_USER0); + memset(userpage + zero_offset, 0, zeros); + flush_dcache_page(page); + kunmap_atomic(userpage, KM_USER0); + } + } + + ret = bio_add_page(cb->orig_bio, page, + PAGE_CACHE_SIZE, 0); + + if (ret == PAGE_CACHE_SIZE) { + nr_pages++; + page_cache_release(page); + } else { + unlock_extent(tree, last_offset, end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + break; + } +next: + last_offset += PAGE_CACHE_SIZE; + } + if (pagevec_count(&pvec)) + __pagevec_lru_add_file(&pvec); + return 0; +} + +/* + * for a compressed read, the bio we get passed has all the inode pages + * in it. We don't actually do IO on those pages but allocate new ones + * to hold the compressed pages on disk. + * + * bio->bi_sector points to the compressed extent on disk + * bio->bi_io_vec points to all of the inode pages + * bio->bi_vcnt is a count of pages + * + * After the compressed pages are read, we copy the bytes into the + * bio we were passed and then call the bio end_io calls + */ +int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags) +{ + struct extent_io_tree *tree; + struct extent_map_tree *em_tree; + struct compressed_bio *cb; + struct btrfs_root *root = BTRFS_I(inode)->root; + unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE; + unsigned long compressed_len; + unsigned long nr_pages; + unsigned long page_index; + struct page *page; + struct block_device *bdev; + struct bio *comp_bio; + u64 cur_disk_byte = (u64)bio->bi_sector << 9; + u64 em_len; + u64 em_start; + struct extent_map *em; + int ret; + u32 *sums; + + tree = &BTRFS_I(inode)->io_tree; + em_tree = &BTRFS_I(inode)->extent_tree; + + /* we need the actual starting offset of this extent in the file */ + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, + page_offset(bio->bi_io_vec->bv_page), + PAGE_CACHE_SIZE); + spin_unlock(&em_tree->lock); + + compressed_len = em->block_len; + cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); + atomic_set(&cb->pending_bios, 0); + cb->errors = 0; + cb->inode = inode; + cb->mirror_num = mirror_num; + sums = &cb->sums; + + cb->start = em->orig_start; + em_len = em->len; + em_start = em->start; + + free_extent_map(em); + em = NULL; + + cb->len = uncompressed_len; + cb->compressed_len = compressed_len; + cb->orig_bio = bio; + + nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) / + PAGE_CACHE_SIZE; + cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages, + GFP_NOFS); + bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + + for (page_index = 0; page_index < nr_pages; page_index++) { + cb->compressed_pages[page_index] = alloc_page(GFP_NOFS | + __GFP_HIGHMEM); + } + cb->nr_pages = nr_pages; + + add_ra_bio_pages(inode, em_start + em_len, cb); + + /* include any pages we added in add_ra-bio_pages */ + uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE; + cb->len = uncompressed_len; + + comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS); + comp_bio->bi_private = cb; + comp_bio->bi_end_io = end_compressed_bio_read; + atomic_inc(&cb->pending_bios); + + for (page_index = 0; page_index < nr_pages; page_index++) { + page = cb->compressed_pages[page_index]; + page->mapping = inode->i_mapping; + page->index = em_start >> PAGE_CACHE_SHIFT; + + if (comp_bio->bi_size) + ret = tree->ops->merge_bio_hook(page, 0, + PAGE_CACHE_SIZE, + comp_bio, 0); + else + ret = 0; + + page->mapping = NULL; + if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) < + PAGE_CACHE_SIZE) { + bio_get(comp_bio); + + ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); + BUG_ON(ret); + + /* + * inc the count before we submit the bio so + * we know the end IO handler won't happen before + * we inc the count. Otherwise, the cb might get + * freed before we're done setting it up + */ + atomic_inc(&cb->pending_bios); + + if (!btrfs_test_flag(inode, NODATASUM)) { + btrfs_lookup_bio_sums(root, inode, comp_bio, + sums); + } + sums += (comp_bio->bi_size + root->sectorsize - 1) / + root->sectorsize; + + ret = btrfs_map_bio(root, READ, comp_bio, + mirror_num, 0); + BUG_ON(ret); + + bio_put(comp_bio); + + comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, + GFP_NOFS); + comp_bio->bi_private = cb; + comp_bio->bi_end_io = end_compressed_bio_read; + + bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0); + } + cur_disk_byte += PAGE_CACHE_SIZE; + } + bio_get(comp_bio); + + ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); + BUG_ON(ret); + + if (!btrfs_test_flag(inode, NODATASUM)) + btrfs_lookup_bio_sums(root, inode, comp_bio, sums); + + ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); + BUG_ON(ret); + + bio_put(comp_bio); + return 0; +}

new file mode 100644

index 0000000000000..421f5b4aa7151

--- /dev/null

+++ b/ diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.hnew file mode 100644index 0000000000000..421f5b4aa7151--- /dev/null+++ b/ fs/btrfs/compression.h @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_COMPRESSION_ +#define __BTRFS_COMPRESSION_ + +int btrfs_zlib_decompress(unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen); +int btrfs_zlib_compress_pages(struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out); +int btrfs_zlib_decompress_biovec(struct page **pages_in, + u64 disk_start, + struct bio_vec *bvec, + int vcnt, + size_t srclen); +void btrfs_zlib_exit(void); +int btrfs_submit_compressed_write(struct inode *inode, u64 start, + unsigned long len, u64 disk_start, + unsigned long compressed_len, + struct page **compressed_pages, + unsigned long nr_pages); +int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags); +#endif

new file mode 100644

index 0000000000000..6e1b3de367004

--- /dev/null

+++ b/ diff --git a/fs/btrfs/crc32c.h b/fs/btrfs/crc32c.hnew file mode 100644index 0000000000000..6e1b3de367004--- /dev/null+++ b/ fs/btrfs/crc32c.h @@ -0,0 +1,29 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_CRC32C__ +#define __BTRFS_CRC32C__ +#include <linux/crc32c.h> + +/* + * this file used to do more for selecting the HW version of crc32c, + * perhaps it will one day again soon. + */ +#define btrfs_crc32c(seed, data, length) crc32c(seed, data, length) +#endif +

new file mode 100644

index 0000000000000..9e46c07768167

--- /dev/null

+++ b/ diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.cnew file mode 100644index 0000000000000..9e46c07768167--- /dev/null+++ b/ fs/btrfs/ctree.c @@ -0,0 +1,3953 @@ +/* + * Copyright (C) 2007,2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "print-tree.h" +#include "locking.h" + +static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int level); +static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *ins_key, + struct btrfs_path *path, int data_size, int extend); +static int push_node_left(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *dst, + struct extent_buffer *src, int empty); +static int balance_node_right(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *dst_buf, + struct extent_buffer *src_buf); +static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int level, int slot); + +inline void btrfs_init_path(struct btrfs_path *p) +{ + memset(p, 0, sizeof(*p)); +} + +struct btrfs_path *btrfs_alloc_path(void) +{ + struct btrfs_path *path; + path = kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS); + if (path) { + btrfs_init_path(path); + path->reada = 1; + } + return path; +} + +/* this also releases the path */ +void btrfs_free_path(struct btrfs_path *p) +{ + btrfs_release_path(NULL, p); + kmem_cache_free(btrfs_path_cachep, p); +} + +/* + * path release drops references on the extent buffers in the path + * and it drops any locks held by this path + * + * It is safe to call this on paths that no locks or extent buffers held. + */ +noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p) +{ + int i; + + for (i = 0; i < BTRFS_MAX_LEVEL; i++) { + p->slots[i] = 0; + if (!p->nodes[i]) + continue; + if (p->locks[i]) { + btrfs_tree_unlock(p->nodes[i]); + p->locks[i] = 0; + } + free_extent_buffer(p->nodes[i]); + p->nodes[i] = NULL; + } +} + +/* + * safely gets a reference on the root node of a tree. A lock + * is not taken, so a concurrent writer may put a different node + * at the root of the tree. See btrfs_lock_root_node for the + * looping required. + * + * The extent buffer returned by this has a reference taken, so + * it won't disappear. It may stop being the root of the tree + * at any time because there are no locks held. + */ +struct extent_buffer *btrfs_root_node(struct btrfs_root *root) +{ + struct extent_buffer *eb; + spin_lock(&root->node_lock); + eb = root->node; + extent_buffer_get(eb); + spin_unlock(&root->node_lock); + return eb; +} + +/* loop around taking references on and locking the root node of the + * tree until you end up with a lock on the root. A locked buffer + * is returned, with a reference held. + */ +struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) +{ + struct extent_buffer *eb; + + while (1) { + eb = btrfs_root_node(root); + btrfs_tree_lock(eb); + + spin_lock(&root->node_lock); + if (eb == root->node) { + spin_unlock(&root->node_lock); + break; + } + spin_unlock(&root->node_lock); + + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + } + return eb; +} + +/* cowonly root (everything not a reference counted cow subvolume), just get + * put onto a simple dirty list. transaction.c walks this to make sure they + * get properly updated on disk. + */ +static void add_root_to_dirty_list(struct btrfs_root *root) +{ + if (root->track_dirty && list_empty(&root->dirty_list)) { + list_add(&root->dirty_list, + &root->fs_info->dirty_cowonly_roots); + } +} + +/* + * used by snapshot creation to make a copy of a root for a tree with + * a given objectid. The buffer with the new root node is returned in + * cow_ret, and this func returns zero on success or a negative error code. + */ +int btrfs_copy_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer **cow_ret, u64 new_root_objectid) +{ + struct extent_buffer *cow; + u32 nritems; + int ret = 0; + int level; + struct btrfs_root *new_root; + + new_root = kmalloc(sizeof(*new_root), GFP_NOFS); + if (!new_root) + return -ENOMEM; + + memcpy(new_root, root, sizeof(*new_root)); + new_root->root_key.objectid = new_root_objectid; + + WARN_ON(root->ref_cows && trans->transid != + root->fs_info->running_transaction->transid); + WARN_ON(root->ref_cows && trans->transid != root->last_trans); + + level = btrfs_header_level(buf); + nritems = btrfs_header_nritems(buf); + + cow = btrfs_alloc_free_block(trans, new_root, buf->len, 0, + new_root_objectid, trans->transid, + level, buf->start, 0); + if (IS_ERR(cow)) { + kfree(new_root); + return PTR_ERR(cow); + } + + copy_extent_buffer(cow, buf, 0, 0, cow->len); + btrfs_set_header_bytenr(cow, cow->start); + btrfs_set_header_generation(cow, trans->transid); + btrfs_set_header_owner(cow, new_root_objectid); + btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN); + + write_extent_buffer(cow, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(cow), + BTRFS_FSID_SIZE); + + WARN_ON(btrfs_header_generation(buf) > trans->transid); + ret = btrfs_inc_ref(trans, new_root, buf, cow, NULL); + kfree(new_root); + + if (ret) + return ret; + + btrfs_mark_buffer_dirty(cow); + *cow_ret = cow; + return 0; +} + +/* + * does the dirty work in cow of a single block. The parent block (if + * supplied) is updated to point to the new cow copy. The new buffer is marked + * dirty and returned locked. If you modify the block it needs to be marked + * dirty again. + * + * search_start -- an allocation hint for the new block + * + * empty_size -- a hint that you plan on doing more cow. This is the size in + * bytes the allocator should try to find free next to the block it returns. + * This is just a hint and may be ignored by the allocator. + * + * prealloc_dest -- if you have already reserved a destination for the cow, + * this uses that block instead of allocating a new one. + * btrfs_alloc_reserved_extent is used to finish the allocation. + */ +static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer *parent, int parent_slot, + struct extent_buffer **cow_ret, + u64 search_start, u64 empty_size, + u64 prealloc_dest) +{ + u64 parent_start; + struct extent_buffer *cow; + u32 nritems; + int ret = 0; + int level; + int unlock_orig = 0; + + if (*cow_ret == buf) + unlock_orig = 1; + + WARN_ON(!btrfs_tree_locked(buf)); + + if (parent) + parent_start = parent->start; + else + parent_start = 0; + + WARN_ON(root->ref_cows && trans->transid != + root->fs_info->running_transaction->transid); + WARN_ON(root->ref_cows && trans->transid != root->last_trans); + + level = btrfs_header_level(buf); + nritems = btrfs_header_nritems(buf); + + if (prealloc_dest) { + struct btrfs_key ins; + + ins.objectid = prealloc_dest; + ins.offset = buf->len; + ins.type = BTRFS_EXTENT_ITEM_KEY; + + ret = btrfs_alloc_reserved_extent(trans, root, parent_start, + root->root_key.objectid, + trans->transid, level, &ins); + BUG_ON(ret); + cow = btrfs_init_new_buffer(trans, root, prealloc_dest, + buf->len); + } else { + cow = btrfs_alloc_free_block(trans, root, buf->len, + parent_start, + root->root_key.objectid, + trans->transid, level, + search_start, empty_size); + } + if (IS_ERR(cow)) + return PTR_ERR(cow); + + copy_extent_buffer(cow, buf, 0, 0, cow->len); + btrfs_set_header_bytenr(cow, cow->start); + btrfs_set_header_generation(cow, trans->transid); + btrfs_set_header_owner(cow, root->root_key.objectid); + btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN); + + write_extent_buffer(cow, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(cow), + BTRFS_FSID_SIZE); + + WARN_ON(btrfs_header_generation(buf) > trans->transid); + if (btrfs_header_generation(buf) != trans->transid) { + u32 nr_extents; + ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents); + if (ret) + return ret; + + ret = btrfs_cache_ref(trans, root, buf, nr_extents); + WARN_ON(ret); + } else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) { + /* + * There are only two places that can drop reference to + * tree blocks owned by living reloc trees, one is here, + * the other place is btrfs_drop_subtree. In both places, + * we check reference count while tree block is locked. + * Furthermore, if reference count is one, it won't get + * increased by someone else. + */ + u32 refs; + ret = btrfs_lookup_extent_ref(trans, root, buf->start, + buf->len, &refs); + BUG_ON(ret); + if (refs == 1) { + ret = btrfs_update_ref(trans, root, buf, cow, + 0, nritems); + clean_tree_block(trans, root, buf); + } else { + ret = btrfs_inc_ref(trans, root, buf, cow, NULL); + } + BUG_ON(ret); + } else { + ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems); + if (ret) + return ret; + clean_tree_block(trans, root, buf); + } + + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { + ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start); + WARN_ON(ret); + } + + if (buf == root->node) { + WARN_ON(parent && parent != buf); + + spin_lock(&root->node_lock); + root->node = cow; + extent_buffer_get(cow); + spin_unlock(&root->node_lock); + + if (buf != root->commit_root) { + btrfs_free_extent(trans, root, buf->start, + buf->len, buf->start, + root->root_key.objectid, + btrfs_header_generation(buf), + level, 1); + } + free_extent_buffer(buf); + add_root_to_dirty_list(root); + } else { + btrfs_set_node_blockptr(parent, parent_slot, + cow->start); + WARN_ON(trans->transid == 0); + btrfs_set_node_ptr_generation(parent, parent_slot, + trans->transid); + btrfs_mark_buffer_dirty(parent); + WARN_ON(btrfs_header_generation(parent) != trans->transid); + btrfs_free_extent(trans, root, buf->start, buf->len, + parent_start, btrfs_header_owner(parent), + btrfs_header_generation(parent), level, 1); + } + if (unlock_orig) + btrfs_tree_unlock(buf); + free_extent_buffer(buf); + btrfs_mark_buffer_dirty(cow); + *cow_ret = cow; + return 0; +} + +/* + * cows a single block, see __btrfs_cow_block for the real work. + * This version of it has extra checks so that a block isn't cow'd more than + * once per transaction, as long as it hasn't been written yet + */ +noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *parent, int parent_slot, + struct extent_buffer **cow_ret, u64 prealloc_dest) +{ + u64 search_start; + int ret; + + if (trans->transaction != root->fs_info->running_transaction) { + printk(KERN_CRIT "trans %llu running %llu

", + (unsigned long long)trans->transid, + (unsigned long long) + root->fs_info->running_transaction->transid); + WARN_ON(1); + } + if (trans->transid != root->fs_info->generation) { + printk(KERN_CRIT "trans %llu running %llu

", + (unsigned long long)trans->transid, + (unsigned long long)root->fs_info->generation); + WARN_ON(1); + } + + spin_lock(&root->fs_info->hash_lock); + if (btrfs_header_generation(buf) == trans->transid && + btrfs_header_owner(buf) == root->root_key.objectid && + !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { + *cow_ret = buf; + spin_unlock(&root->fs_info->hash_lock); + WARN_ON(prealloc_dest); + return 0; + } + spin_unlock(&root->fs_info->hash_lock); + search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1); + ret = __btrfs_cow_block(trans, root, buf, parent, + parent_slot, cow_ret, search_start, 0, + prealloc_dest); + return ret; +} + +/* + * helper function for defrag to decide if two blocks pointed to by a + * node are actually close by + */ +static int close_blocks(u64 blocknr, u64 other, u32 blocksize) +{ + if (blocknr < other && other - (blocknr + blocksize) < 32768) + return 1; + if (blocknr > other && blocknr - (other + blocksize) < 32768) + return 1; + return 0; +} + +/* + * compare two keys in a memcmp fashion + */ +static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2) +{ + struct btrfs_key k1; + + btrfs_disk_key_to_cpu(&k1, disk); + + if (k1.objectid > k2->objectid) + return 1; + if (k1.objectid < k2->objectid) + return -1; + if (k1.type > k2->type) + return 1; + if (k1.type < k2->type) + return -1; + if (k1.offset > k2->offset) + return 1; + if (k1.offset < k2->offset) + return -1; + return 0; +} + +/* + * same as comp_keys only with two btrfs_key's + */ +static int comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2) +{ + if (k1->objectid > k2->objectid) + return 1; + if (k1->objectid < k2->objectid) + return -1; + if (k1->type > k2->type) + return 1; + if (k1->type < k2->type) + return -1; + if (k1->offset > k2->offset) + return 1; + if (k1->offset < k2->offset) + return -1; + return 0; +} + +/* + * this is used by the defrag code to go through all the + * leaves pointed to by a node and reallocate them so that + * disk order is close to key order + */ +int btrfs_realloc_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *parent, + int start_slot, int cache_only, u64 *last_ret, + struct btrfs_key *progress) +{ + struct extent_buffer *cur; + u64 blocknr; + u64 gen; + u64 search_start = *last_ret; + u64 last_block = 0; + u64 other; + u32 parent_nritems; + int end_slot; + int i; + int err = 0; + int parent_level; + int uptodate; + u32 blocksize; + int progress_passed = 0; + struct btrfs_disk_key disk_key; + + parent_level = btrfs_header_level(parent); + if (cache_only && parent_level != 1) + return 0; + + if (trans->transaction != root->fs_info->running_transaction) + WARN_ON(1); + if (trans->transid != root->fs_info->generation) + WARN_ON(1); + + parent_nritems = btrfs_header_nritems(parent); + blocksize = btrfs_level_size(root, parent_level - 1); + end_slot = parent_nritems; + + if (parent_nritems == 1) + return 0; + + for (i = start_slot; i < end_slot; i++) { + int close = 1; + + if (!parent->map_token) { + map_extent_buffer(parent, + btrfs_node_key_ptr_offset(i), + sizeof(struct btrfs_key_ptr), + &parent->map_token, &parent->kaddr, + &parent->map_start, &parent->map_len, + KM_USER1); + } + btrfs_node_key(parent, &disk_key, i); + if (!progress_passed && comp_keys(&disk_key, progress) < 0) + continue; + + progress_passed = 1; + blocknr = btrfs_node_blockptr(parent, i); + gen = btrfs_node_ptr_generation(parent, i); + if (last_block == 0) + last_block = blocknr; + + if (i > 0) { + other = btrfs_node_blockptr(parent, i - 1); + close = close_blocks(blocknr, other, blocksize); + } + if (!close && i < end_slot - 2) { + other = btrfs_node_blockptr(parent, i + 1); + close = close_blocks(blocknr, other, blocksize); + } + if (close) { + last_block = blocknr; + continue; + } + if (parent->map_token) { + unmap_extent_buffer(parent, parent->map_token, + KM_USER1); + parent->map_token = NULL; + } + + cur = btrfs_find_tree_block(root, blocknr, blocksize); + if (cur) + uptodate = btrfs_buffer_uptodate(cur, gen); + else + uptodate = 0; + if (!cur || !uptodate) { + if (cache_only) { + free_extent_buffer(cur); + continue; + } + if (!cur) { + cur = read_tree_block(root, blocknr, + blocksize, gen); + } else if (!uptodate) { + btrfs_read_buffer(cur, gen); + } + } + if (search_start == 0) + search_start = last_block; + + btrfs_tree_lock(cur); + err = __btrfs_cow_block(trans, root, cur, parent, i, + &cur, search_start, + min(16 * blocksize, + (end_slot - i) * blocksize), 0); + if (err) { + btrfs_tree_unlock(cur); + free_extent_buffer(cur); + break; + } + search_start = cur->start; + last_block = cur->start; + *last_ret = search_start; + btrfs_tree_unlock(cur); + free_extent_buffer(cur); + } + if (parent->map_token) { + unmap_extent_buffer(parent, parent->map_token, + KM_USER1); + parent->map_token = NULL; + } + return err; +} + +/* + * The leaf data grows from end-to-front in the node. + * this returns the address of the start of the last item, + * which is the stop of the leaf data stack + */ +static inline unsigned int leaf_data_end(struct btrfs_root *root, + struct extent_buffer *leaf) +{ + u32 nr = btrfs_header_nritems(leaf); + if (nr == 0) + return BTRFS_LEAF_DATA_SIZE(root); + return btrfs_item_offset_nr(leaf, nr - 1); +} + +/* + * extra debugging checks to make sure all the items in a key are + * well formed and in the proper order + */ +static int check_node(struct btrfs_root *root, struct btrfs_path *path, + int level) +{ + struct extent_buffer *parent = NULL; + struct extent_buffer *node = path->nodes[level]; + struct btrfs_disk_key parent_key; + struct btrfs_disk_key node_key; + int parent_slot; + int slot; + struct btrfs_key cpukey; + u32 nritems = btrfs_header_nritems(node); + + if (path->nodes[level + 1]) + parent = path->nodes[level + 1]; + + slot = path->slots[level]; + BUG_ON(nritems == 0); + if (parent) { + parent_slot = path->slots[level + 1]; + btrfs_node_key(parent, &parent_key, parent_slot); + btrfs_node_key(node, &node_key, 0); + BUG_ON(memcmp(&parent_key, &node_key, + sizeof(struct btrfs_disk_key))); + BUG_ON(btrfs_node_blockptr(parent, parent_slot) != + btrfs_header_bytenr(node)); + } + BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root)); + if (slot != 0) { + btrfs_node_key_to_cpu(node, &cpukey, slot - 1); + btrfs_node_key(node, &node_key, slot); + BUG_ON(comp_keys(&node_key, &cpukey) <= 0); + } + if (slot < nritems - 1) { + btrfs_node_key_to_cpu(node, &cpukey, slot + 1); + btrfs_node_key(node, &node_key, slot); + BUG_ON(comp_keys(&node_key, &cpukey) >= 0); + } + return 0; +} + +/* + * extra checking to make sure all the items in a leaf are + * well formed and in the proper order + */ +static int check_leaf(struct btrfs_root *root, struct btrfs_path *path, + int level) +{ + struct extent_buffer *leaf = path->nodes[level]; + struct extent_buffer *parent = NULL; + int parent_slot; + struct btrfs_key cpukey; + struct btrfs_disk_key parent_key; + struct btrfs_disk_key leaf_key; + int slot = path->slots[0]; + + u32 nritems = btrfs_header_nritems(leaf); + + if (path->nodes[level + 1]) + parent = path->nodes[level + 1]; + + if (nritems == 0) + return 0; + + if (parent) { + parent_slot = path->slots[level + 1]; + btrfs_node_key(parent, &parent_key, parent_slot); + btrfs_item_key(leaf, &leaf_key, 0); + + BUG_ON(memcmp(&parent_key, &leaf_key, + sizeof(struct btrfs_disk_key))); + BUG_ON(btrfs_node_blockptr(parent, parent_slot) != + btrfs_header_bytenr(leaf)); + } + if (slot != 0 && slot < nritems - 1) { + btrfs_item_key(leaf, &leaf_key, slot); + btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1); + if (comp_keys(&leaf_key, &cpukey) <= 0) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "slot %d offset bad key

", slot); + BUG_ON(1); + } + if (btrfs_item_offset_nr(leaf, slot - 1) != + btrfs_item_end_nr(leaf, slot)) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "slot %d offset bad

", slot); + BUG_ON(1); + } + } + if (slot < nritems - 1) { + btrfs_item_key(leaf, &leaf_key, slot); + btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1); + BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0); + if (btrfs_item_offset_nr(leaf, slot) != + btrfs_item_end_nr(leaf, slot + 1)) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "slot %d offset bad

", slot); + BUG_ON(1); + } + } + BUG_ON(btrfs_item_offset_nr(leaf, 0) + + btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root)); + return 0; +} + +static noinline int check_block(struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + return 0; + if (level == 0) + return check_leaf(root, path, level); + return check_node(root, path, level); +} + +/* + * search for key in the extent_buffer. The items start at offset p, + * and they are item_size apart. There are 'max' items in p. + * + * the slot in the array is returned via slot, and it points to + * the place where you would insert key if it is not found in + * the array. + * + * slot may point to max if the key is bigger than all of the keys + */ +static noinline int generic_bin_search(struct extent_buffer *eb, + unsigned long p, + int item_size, struct btrfs_key *key, + int max, int *slot) +{ + int low = 0; + int high = max; + int mid; + int ret; + struct btrfs_disk_key *tmp = NULL; + struct btrfs_disk_key unaligned; + unsigned long offset; + char *map_token = NULL; + char *kaddr = NULL; + unsigned long map_start = 0; + unsigned long map_len = 0; + int err; + + while (low < high) { + mid = (low + high) / 2; + offset = p + mid * item_size; + + if (!map_token || offset < map_start || + (offset + sizeof(struct btrfs_disk_key)) > + map_start + map_len) { + if (map_token) { + unmap_extent_buffer(eb, map_token, KM_USER0); + map_token = NULL; + } + + err = map_private_extent_buffer(eb, offset, + sizeof(struct btrfs_disk_key), + &map_token, &kaddr, + &map_start, &map_len, KM_USER0); + + if (!err) { + tmp = (struct btrfs_disk_key *)(kaddr + offset - + map_start); + } else { + read_extent_buffer(eb, &unaligned, + offset, sizeof(unaligned)); + tmp = &unaligned; + } + + } else { + tmp = (struct btrfs_disk_key *)(kaddr + offset - + map_start); + } + ret = comp_keys(tmp, key); + + if (ret < 0) + low = mid + 1; + else if (ret > 0) + high = mid; + else { + *slot = mid; + if (map_token) + unmap_extent_buffer(eb, map_token, KM_USER0); + return 0; + } + } + *slot = low; + if (map_token) + unmap_extent_buffer(eb, map_token, KM_USER0); + return 1; +} + +/* + * simple bin_search frontend that does the right thing for + * leaves vs nodes + */ +static int bin_search(struct extent_buffer *eb, struct btrfs_key *key, + int level, int *slot) +{ + if (level == 0) { + return generic_bin_search(eb, + offsetof(struct btrfs_leaf, items), + sizeof(struct btrfs_item), + key, btrfs_header_nritems(eb), + slot); + } else { + return generic_bin_search(eb, + offsetof(struct btrfs_node, ptrs), + sizeof(struct btrfs_key_ptr), + key, btrfs_header_nritems(eb), + slot); + } + return -1; +} + +/* given a node and slot number, this reads the blocks it points to. The + * extent buffer is returned with a reference taken (but unlocked). + * NULL is returned on error. + */ +static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, + struct extent_buffer *parent, int slot) +{ + int level = btrfs_header_level(parent); + if (slot < 0) + return NULL; + if (slot >= btrfs_header_nritems(parent)) + return NULL; + + BUG_ON(level == 0); + + return read_tree_block(root, btrfs_node_blockptr(parent, slot), + btrfs_level_size(root, level - 1), + btrfs_node_ptr_generation(parent, slot)); +} + +/* + * node level balancing, used to make sure nodes are in proper order for + * item deletion. We balance from the top down, so we have to make sure + * that a deletion won't leave an node completely empty later on. + */ +static noinline int balance_level(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + struct extent_buffer *right = NULL; + struct extent_buffer *mid; + struct extent_buffer *left = NULL; + struct extent_buffer *parent = NULL; + int ret = 0; + int wret; + int pslot; + int orig_slot = path->slots[level]; + int err_on_enospc = 0; + u64 orig_ptr; + + if (level == 0) + return 0; + + mid = path->nodes[level]; + WARN_ON(!path->locks[level]); + WARN_ON(btrfs_header_generation(mid) != trans->transid); + + orig_ptr = btrfs_node_blockptr(mid, orig_slot); + + if (level < BTRFS_MAX_LEVEL - 1) + parent = path->nodes[level + 1]; + pslot = path->slots[level + 1]; + + /* + * deal with the case where there is only one pointer in the root + * by promoting the node below to a root + */ + if (!parent) { + struct extent_buffer *child; + + if (btrfs_header_nritems(mid) != 1) + return 0; + + /* promote the child to a root */ + child = read_node_slot(root, mid, 0); + btrfs_tree_lock(child); + BUG_ON(!child); + ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0); + BUG_ON(ret); + + spin_lock(&root->node_lock); + root->node = child; + spin_unlock(&root->node_lock); + + ret = btrfs_update_extent_ref(trans, root, child->start, + mid->start, child->start, + root->root_key.objectid, + trans->transid, level - 1); + BUG_ON(ret); + + add_root_to_dirty_list(root); + btrfs_tree_unlock(child); + path->locks[level] = 0; + path->nodes[level] = NULL; + clean_tree_block(trans, root, mid); + btrfs_tree_unlock(mid); + /* once for the path */ + free_extent_buffer(mid); + ret = btrfs_free_extent(trans, root, mid->start, mid->len, + mid->start, root->root_key.objectid, + btrfs_header_generation(mid), + level, 1); + /* once for the root ptr */ + free_extent_buffer(mid); + return ret; + } + if (btrfs_header_nritems(mid) > + BTRFS_NODEPTRS_PER_BLOCK(root) / 4) + return 0; + + if (btrfs_header_nritems(mid) < 2) + err_on_enospc = 1; + + left = read_node_slot(root, parent, pslot - 1); + if (left) { + btrfs_tree_lock(left); + wret = btrfs_cow_block(trans, root, left, + parent, pslot - 1, &left, 0); + if (wret) { + ret = wret; + goto enospc; + } + } + right = read_node_slot(root, parent, pslot + 1); + if (right) { + btrfs_tree_lock(right); + wret = btrfs_cow_block(trans, root, right, + parent, pslot + 1, &right, 0); + if (wret) { + ret = wret; + goto enospc; + } + } + + /* first, try to make some room in the middle buffer */ + if (left) { + orig_slot += btrfs_header_nritems(left); + wret = push_node_left(trans, root, left, mid, 1); + if (wret < 0) + ret = wret; + if (btrfs_header_nritems(mid) < 2) + err_on_enospc = 1; + } + + /* + * then try to empty the right most buffer into the middle + */ + if (right) { + wret = push_node_left(trans, root, mid, right, 1); + if (wret < 0 && wret != -ENOSPC) + ret = wret; + if (btrfs_header_nritems(right) == 0) { + u64 bytenr = right->start; + u64 generation = btrfs_header_generation(parent); + u32 blocksize = right->len; + + clean_tree_block(trans, root, right); + btrfs_tree_unlock(right); + free_extent_buffer(right); + right = NULL; + wret = del_ptr(trans, root, path, level + 1, pslot + + 1); + if (wret) + ret = wret; + wret = btrfs_free_extent(trans, root, bytenr, + blocksize, parent->start, + btrfs_header_owner(parent), + generation, level, 1); + if (wret) + ret = wret; + } else { + struct btrfs_disk_key right_key; + btrfs_node_key(right, &right_key, 0); + btrfs_set_node_key(parent, &right_key, pslot + 1); + btrfs_mark_buffer_dirty(parent); + } + } + if (btrfs_header_nritems(mid) == 1) { + /* + * we're not allowed to leave a node with one item in the + * tree during a delete. A deletion from lower in the tree + * could try to delete the only pointer in this node. + * So, pull some keys from the left. + * There has to be a left pointer at this point because + * otherwise we would have pulled some pointers from the + * right + */ + BUG_ON(!left); + wret = balance_node_right(trans, root, mid, left); + if (wret < 0) { + ret = wret; + goto enospc; + } + if (wret == 1) { + wret = push_node_left(trans, root, left, mid, 1); + if (wret < 0) + ret = wret; + } + BUG_ON(wret == 1); + } + if (btrfs_header_nritems(mid) == 0) { + /* we've managed to empty the middle node, drop it */ + u64 root_gen = btrfs_header_generation(parent); + u64 bytenr = mid->start; + u32 blocksize = mid->len; + + clean_tree_block(trans, root, mid); + btrfs_tree_unlock(mid); + free_extent_buffer(mid); + mid = NULL; + wret = del_ptr(trans, root, path, level + 1, pslot); + if (wret) + ret = wret; + wret = btrfs_free_extent(trans, root, bytenr, blocksize, + parent->start, + btrfs_header_owner(parent), + root_gen, level, 1); + if (wret) + ret = wret; + } else { + /* update the parent key to reflect our changes */ + struct btrfs_disk_key mid_key; + btrfs_node_key(mid, &mid_key, 0); + btrfs_set_node_key(parent, &mid_key, pslot); + btrfs_mark_buffer_dirty(parent); + } + + /* update the path */ + if (left) { + if (btrfs_header_nritems(left) > orig_slot) { + extent_buffer_get(left); + /* left was locked after cow */ + path->nodes[level] = left; + path->slots[level + 1] -= 1; + path->slots[level] = orig_slot; + if (mid) { + btrfs_tree_unlock(mid); + free_extent_buffer(mid); + } + } else { + orig_slot -= btrfs_header_nritems(left); + path->slots[level] = orig_slot; + } + } + /* double check we haven't messed things up */ + check_block(root, path, level); + if (orig_ptr != + btrfs_node_blockptr(path->nodes[level], path->slots[level])) + BUG(); +enospc: + if (right) { + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + if (left) { + if (path->nodes[level] != left) + btrfs_tree_unlock(left); + free_extent_buffer(left); + } + return ret; +} + +/* Node balancing for insertion. Here we only split or push nodes around + * when they are completely full. This is also done top down, so we + * have to be pessimistic. + */ +static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + struct extent_buffer *right = NULL; + struct extent_buffer *mid; + struct extent_buffer *left = NULL; + struct extent_buffer *parent = NULL; + int ret = 0; + int wret; + int pslot; + int orig_slot = path->slots[level]; + u64 orig_ptr; + + if (level == 0) + return 1; + + mid = path->nodes[level]; + WARN_ON(btrfs_header_generation(mid) != trans->transid); + orig_ptr = btrfs_node_blockptr(mid, orig_slot); + + if (level < BTRFS_MAX_LEVEL - 1) + parent = path->nodes[level + 1]; + pslot = path->slots[level + 1]; + + if (!parent) + return 1; + + left = read_node_slot(root, parent, pslot - 1); + + /* first, try to make some room in the middle buffer */ + if (left) { + u32 left_nr; + + btrfs_tree_lock(left); + left_nr = btrfs_header_nritems(left); + if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { + wret = 1; + } else { + ret = btrfs_cow_block(trans, root, left, parent, + pslot - 1, &left, 0); + if (ret) + wret = 1; + else { + wret = push_node_left(trans, root, + left, mid, 0); + } + } + if (wret < 0) + ret = wret; + if (wret == 0) { + struct btrfs_disk_key disk_key; + orig_slot += left_nr; + btrfs_node_key(mid, &disk_key, 0); + btrfs_set_node_key(parent, &disk_key, pslot); + btrfs_mark_buffer_dirty(parent); + if (btrfs_header_nritems(left) > orig_slot) { + path->nodes[level] = left; + path->slots[level + 1] -= 1; + path->slots[level] = orig_slot; + btrfs_tree_unlock(mid); + free_extent_buffer(mid); + } else { + orig_slot -= + btrfs_header_nritems(left); + path->slots[level] = orig_slot; + btrfs_tree_unlock(left); + free_extent_buffer(left); + } + return 0; + } + btrfs_tree_unlock(left); + free_extent_buffer(left); + } + right = read_node_slot(root, parent, pslot + 1); + + /* + * then try to empty the right most buffer into the middle + */ + if (right) { + u32 right_nr; + btrfs_tree_lock(right); + right_nr = btrfs_header_nritems(right); + if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { + wret = 1; + } else { + ret = btrfs_cow_block(trans, root, right, + parent, pslot + 1, + &right, 0); + if (ret) + wret = 1; + else { + wret = balance_node_right(trans, root, + right, mid); + } + } + if (wret < 0) + ret = wret; + if (wret == 0) { + struct btrfs_disk_key disk_key; + + btrfs_node_key(right, &disk_key, 0); + btrfs_set_node_key(parent, &disk_key, pslot + 1); + btrfs_mark_buffer_dirty(parent); + + if (btrfs_header_nritems(mid) <= orig_slot) { + path->nodes[level] = right; + path->slots[level + 1] += 1; + path->slots[level] = orig_slot - + btrfs_header_nritems(mid); + btrfs_tree_unlock(mid); + free_extent_buffer(mid); + } else { + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + return 0; + } + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + return 1; +} + +/* + * readahead one full node of leaves, finding things that are close + * to the block in 'slot', and triggering ra on them. + */ +static noinline void reada_for_search(struct btrfs_root *root, + struct btrfs_path *path, + int level, int slot, u64 objectid) +{ + struct extent_buffer *node; + struct btrfs_disk_key disk_key; + u32 nritems; + u64 search; + u64 lowest_read; + u64 highest_read; + u64 nread = 0; + int direction = path->reada; + struct extent_buffer *eb; + u32 nr; + u32 blocksize; + u32 nscan = 0; + + if (level != 1) + return; + + if (!path->nodes[level]) + return; + + node = path->nodes[level]; + + search = btrfs_node_blockptr(node, slot); + blocksize = btrfs_level_size(root, level - 1); + eb = btrfs_find_tree_block(root, search, blocksize); + if (eb) { + free_extent_buffer(eb); + return; + } + + highest_read = search; + lowest_read = search; + + nritems = btrfs_header_nritems(node); + nr = slot; + while (1) { + if (direction < 0) { + if (nr == 0) + break; + nr--; + } else if (direction > 0) { + nr++; + if (nr >= nritems) + break; + } + if (path->reada < 0 && objectid) { + btrfs_node_key(node, &disk_key, nr); + if (btrfs_disk_key_objectid(&disk_key) != objectid) + break; + } + search = btrfs_node_blockptr(node, nr); + if ((search >= lowest_read && search <= highest_read) || + (search < lowest_read && lowest_read - search <= 16384) || + (search > highest_read && search - highest_read <= 16384)) { + readahead_tree_block(root, search, blocksize, + btrfs_node_ptr_generation(node, nr)); + nread += blocksize; + } + nscan++; + if (path->reada < 2 && (nread > (64 * 1024) || nscan > 32)) + break; + + if (nread > (256 * 1024) || nscan > 128) + break; + + if (search < lowest_read) + lowest_read = search; + if (search > highest_read) + highest_read = search; + } +} + +/* + * when we walk down the tree, it is usually safe to unlock the higher layers + * in the tree. The exceptions are when our path goes through slot 0, because + * operations on the tree might require changing key pointers higher up in the + * tree. + * + * callers might also have set path->keep_locks, which tells this code to keep + * the lock if the path points to the last slot in the block. This is part of + * walking through the tree, and selecting the next slot in the higher block. + * + * lowest_unlock sets the lowest level in the tree we're allowed to unlock. so + * if lowest_unlock is 1, level 0 won't be unlocked + */ +static noinline void unlock_up(struct btrfs_path *path, int level, + int lowest_unlock) +{ + int i; + int skip_level = level; + int no_skips = 0; + struct extent_buffer *t; + + for (i = level; i < BTRFS_MAX_LEVEL; i++) { + if (!path->nodes[i]) + break; + if (!path->locks[i]) + break; + if (!no_skips && path->slots[i] == 0) { + skip_level = i + 1; + continue; + } + if (!no_skips && path->keep_locks) { + u32 nritems; + t = path->nodes[i]; + nritems = btrfs_header_nritems(t); + if (nritems < 1 || path->slots[i] >= nritems - 1) { + skip_level = i + 1; + continue; + } + } + if (skip_level < i && i >= lowest_unlock) + no_skips = 1; + + t = path->nodes[i]; + if (i >= lowest_unlock && i > skip_level && path->locks[i]) { + btrfs_tree_unlock(t); + path->locks[i] = 0; + } + } +} + +/* + * look for key in the tree. path is filled in with nodes along the way + * if key is found, we return zero and you can find the item in the leaf + * level of the path (level 0) + * + * If the key isn't found, the path points to the slot where it should + * be inserted, and 1 is returned. If there are other errors during the + * search a negative error number is returned. + * + * if ins_len > 0, nodes and leaves will be split as we walk down the + * tree. if ins_len < 0, nodes will be merged as we walk down the tree (if + * possible) + */ +int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_path *p, int + ins_len, int cow) +{ + struct extent_buffer *b; + struct extent_buffer *tmp; + int slot; + int ret; + int level; + int should_reada = p->reada; + int lowest_unlock = 1; + int blocksize; + u8 lowest_level = 0; + u64 blocknr; + u64 gen; + struct btrfs_key prealloc_block; + + lowest_level = p->lowest_level; + WARN_ON(lowest_level && ins_len > 0); + WARN_ON(p->nodes[0] != NULL); + + if (ins_len < 0) + lowest_unlock = 2; + + prealloc_block.objectid = 0; + +again: + if (p->skip_locking) + b = btrfs_root_node(root); + else + b = btrfs_lock_root_node(root); + + while (b) { + level = btrfs_header_level(b); + + /* + * setup the path here so we can release it under lock + * contention with the cow code + */ + p->nodes[level] = b; + if (!p->skip_locking) + p->locks[level] = 1; + + if (cow) { + int wret; + + /* is a cow on this block not required */ + spin_lock(&root->fs_info->hash_lock); + if (btrfs_header_generation(b) == trans->transid && + btrfs_header_owner(b) == root->root_key.objectid && + !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) { + spin_unlock(&root->fs_info->hash_lock); + goto cow_done; + } + spin_unlock(&root->fs_info->hash_lock); + + /* ok, we have to cow, is our old prealloc the right + * size? + */ + if (prealloc_block.objectid && + prealloc_block.offset != b->len) { + btrfs_free_reserved_extent(root, + prealloc_block.objectid, + prealloc_block.offset); + prealloc_block.objectid = 0; + } + + /* + * for higher level blocks, try not to allocate blocks + * with the block and the parent locks held. + */ + if (level > 1 && !prealloc_block.objectid && + btrfs_path_lock_waiting(p, level)) { + u32 size = b->len; + u64 hint = b->start; + + btrfs_release_path(root, p); + ret = btrfs_reserve_extent(trans, root, + size, size, 0, + hint, (u64)-1, + &prealloc_block, 0); + BUG_ON(ret); + goto again; + } + + wret = btrfs_cow_block(trans, root, b, + p->nodes[level + 1], + p->slots[level + 1], + &b, prealloc_block.objectid); + prealloc_block.objectid = 0; + if (wret) { + free_extent_buffer(b); + ret = wret; + goto done; + } + } +cow_done: + BUG_ON(!cow && ins_len); + if (level != btrfs_header_level(b)) + WARN_ON(1); + level = btrfs_header_level(b); + + p->nodes[level] = b; + if (!p->skip_locking) + p->locks[level] = 1; + + ret = check_block(root, p, level); + if (ret) { + ret = -1; + goto done; + } + + ret = bin_search(b, key, level, &slot); + if (level != 0) { + if (ret && slot > 0) + slot -= 1; + p->slots[level] = slot; + if ((p->search_for_split || ins_len > 0) && + btrfs_header_nritems(b) >= + BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { + int sret = split_node(trans, root, p, level); + BUG_ON(sret > 0); + if (sret) { + ret = sret; + goto done; + } + b = p->nodes[level]; + slot = p->slots[level]; + } else if (ins_len < 0) { + int sret = balance_level(trans, root, p, + level); + if (sret) { + ret = sret; + goto done; + } + b = p->nodes[level]; + if (!b) { + btrfs_release_path(NULL, p); + goto again; + } + slot = p->slots[level]; + BUG_ON(btrfs_header_nritems(b) == 1); + } + unlock_up(p, level, lowest_unlock); + + /* this is only true while dropping a snapshot */ + if (level == lowest_level) { + ret = 0; + goto done; + } + + blocknr = btrfs_node_blockptr(b, slot); + gen = btrfs_node_ptr_generation(b, slot); + blocksize = btrfs_level_size(root, level - 1); + + tmp = btrfs_find_tree_block(root, blocknr, blocksize); + if (tmp && btrfs_buffer_uptodate(tmp, gen)) { + b = tmp; + } else { + /* + * reduce lock contention at high levels + * of the btree by dropping locks before + * we read. + */ + if (level > 1) { + btrfs_release_path(NULL, p); + if (tmp) + free_extent_buffer(tmp); + if (should_reada) + reada_for_search(root, p, + level, slot, + key->objectid); + + tmp = read_tree_block(root, blocknr, + blocksize, gen); + if (tmp) + free_extent_buffer(tmp); + goto again; + } else { + if (tmp) + free_extent_buffer(tmp); + if (should_reada) + reada_for_search(root, p, + level, slot, + key->objectid); + b = read_node_slot(root, b, slot); + } + } + if (!p->skip_locking) + btrfs_tree_lock(b); + } else { + p->slots[level] = slot; + if (ins_len > 0 && + btrfs_leaf_free_space(root, b) < ins_len) { + int sret = split_leaf(trans, root, key, + p, ins_len, ret == 0); + BUG_ON(sret > 0); + if (sret) { + ret = sret; + goto done; + } + } + if (!p->search_for_split) + unlock_up(p, level, lowest_unlock); + goto done; + } + } + ret = 1; +done: + if (prealloc_block.objectid) { + btrfs_free_reserved_extent(root, + prealloc_block.objectid, + prealloc_block.offset); + } + + return ret; +} + +int btrfs_merge_path(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *node_keys, + u64 *nodes, int lowest_level) +{ + struct extent_buffer *eb; + struct extent_buffer *parent; + struct btrfs_key key; + u64 bytenr; + u64 generation; + u32 blocksize; + int level; + int slot; + int key_match; + int ret; + + eb = btrfs_lock_root_node(root); + ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0); + BUG_ON(ret); + + parent = eb; + while (1) { + level = btrfs_header_level(parent); + if (level == 0 || level <= lowest_level) + break; + + ret = bin_search(parent, &node_keys[lowest_level], level, + &slot); + if (ret && slot > 0) + slot--; + + bytenr = btrfs_node_blockptr(parent, slot); + if (nodes[level - 1] == bytenr) + break; + + blocksize = btrfs_level_size(root, level - 1); + generation = btrfs_node_ptr_generation(parent, slot); + btrfs_node_key_to_cpu(eb, &key, slot); + key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key)); + + if (generation == trans->transid) { + eb = read_tree_block(root, bytenr, blocksize, + generation); + btrfs_tree_lock(eb); + } + + /* + * if node keys match and node pointer hasn't been modified + * in the running transaction, we can merge the path. for + * blocks owened by reloc trees, the node pointer check is + * skipped, this is because these blocks are fully controlled + * by the space balance code, no one else can modify them. + */ + if (!nodes[level - 1] || !key_match || + (generation == trans->transid && + btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID)) { + if (level == 1 || level == lowest_level + 1) { + if (generation == trans->transid) { + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + } + break; + } + + if (generation != trans->transid) { + eb = read_tree_block(root, bytenr, blocksize, + generation); + btrfs_tree_lock(eb); + } + + ret = btrfs_cow_block(trans, root, eb, parent, slot, + &eb, 0); + BUG_ON(ret); + + if (root->root_key.objectid == + BTRFS_TREE_RELOC_OBJECTID) { + if (!nodes[level - 1]) { + nodes[level - 1] = eb->start; + memcpy(&node_keys[level - 1], &key, + sizeof(node_keys[0])); + } else { + WARN_ON(1); + } + } + + btrfs_tree_unlock(parent); + free_extent_buffer(parent); + parent = eb; + continue; + } + + btrfs_set_node_blockptr(parent, slot, nodes[level - 1]); + btrfs_set_node_ptr_generation(parent, slot, trans->transid); + btrfs_mark_buffer_dirty(parent); + + ret = btrfs_inc_extent_ref(trans, root, + nodes[level - 1], + blocksize, parent->start, + btrfs_header_owner(parent), + btrfs_header_generation(parent), + level - 1); + BUG_ON(ret); + + /* + * If the block was created in the running transaction, + * it's possible this is the last reference to it, so we + * should drop the subtree. + */ + if (generation == trans->transid) { + ret = btrfs_drop_subtree(trans, root, eb, parent); + BUG_ON(ret); + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + } else { + ret = btrfs_free_extent(trans, root, bytenr, + blocksize, parent->start, + btrfs_header_owner(parent), + btrfs_header_generation(parent), + level - 1, 1); + BUG_ON(ret); + } + break; + } + btrfs_tree_unlock(parent); + free_extent_buffer(parent); + return 0; +} + +/* + * adjust the pointers going up the tree, starting at level + * making sure the right key of each node is points to 'key'. + * This is used after shifting pointers to the left, so it stops + * fixing up pointers when a given leaf/node is not in slot 0 of the + * higher levels + * + * If this fails to write a tree block, it returns -1, but continues + * fixing up the blocks in ram so the tree is consistent. + */ +static int fixup_low_keys(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_disk_key *key, int level) +{ + int i; + int ret = 0; + struct extent_buffer *t; + + for (i = level; i < BTRFS_MAX_LEVEL; i++) { + int tslot = path->slots[i]; + if (!path->nodes[i]) + break; + t = path->nodes[i]; + btrfs_set_node_key(t, key, tslot); + btrfs_mark_buffer_dirty(path->nodes[i]); + if (tslot != 0) + break; + } + return ret; +} + +/* + * update item key. + * + * This function isn't completely safe. It's the caller's responsibility + * that the new key won't break the order + */ +int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *n