GNU Linux-libre 4.14.266-gnu1
[releases.git] / fs / btrfs / ctree.c
1 /*
2  * Copyright (C) 2007,2008 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18
19 #include <linux/sched.h>
20 #include <linux/slab.h>
21 #include <linux/rbtree.h>
22 #include <linux/mm.h>
23 #include "ctree.h"
24 #include "disk-io.h"
25 #include "transaction.h"
26 #include "print-tree.h"
27 #include "locking.h"
28
/*
 * Forward declarations for the tree balancing helpers defined later in
 * this file; the search/insert paths reference them before their
 * definitions.
 */
static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *ins_key, struct btrfs_path *path,
		      int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_fs_info *fs_info,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_fs_info *fs_info,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot);
static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
				 struct extent_buffer *eb);
46
47 struct btrfs_path *btrfs_alloc_path(void)
48 {
49         return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
50 }
51
/*
 * set all locked nodes in the path to blocking locks.  This should
 * be done before scheduling, because spinning locks cannot be held
 * across a context switch.
 */
noinline void btrfs_set_path_blocking(struct btrfs_path *p)
{
	int i;
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		/* skip levels with no node or no lock held */
		if (!p->nodes[i] || !p->locks[i])
			continue;
		btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
		/* record the new (blocking) lock type in the path */
		if (p->locks[i] == BTRFS_READ_LOCK)
			p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
		else if (p->locks[i] == BTRFS_WRITE_LOCK)
			p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
	}
}
69
/*
 * reset all the locked nodes in the path to spinning locks.
 *
 * held is used to keep lockdep happy, when lockdep is enabled
 * we set held to a blocking lock before we go around and
 * retake all the spinlocks in the path.  You can safely use NULL
 * for held
 */
noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
					struct extent_buffer *held, int held_rw)
{
	int i;

	if (held) {
		/* mark 'held' blocking first so lockdep sees a consistent order */
		btrfs_set_lock_blocking_rw(held, held_rw);
		if (held_rw == BTRFS_WRITE_LOCK)
			held_rw = BTRFS_WRITE_LOCK_BLOCKING;
		else if (held_rw == BTRFS_READ_LOCK)
			held_rw = BTRFS_READ_LOCK_BLOCKING;
	}
	btrfs_set_path_blocking(p);

	/* retake spinning locks from the highest level down */
	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
		if (p->nodes[i] && p->locks[i]) {
			btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
			if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
				p->locks[i] = BTRFS_WRITE_LOCK;
			else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
				p->locks[i] = BTRFS_READ_LOCK;
		}
	}

	if (held)
		btrfs_clear_lock_blocking_rw(held, held_rw);
}
105
/*
 * Free a path allocated with btrfs_alloc_path().  This also releases
 * any locks and extent buffer references the path still holds.
 * Passing NULL is a no-op.
 */
void btrfs_free_path(struct btrfs_path *p)
{
	if (!p)
		return;
	btrfs_release_path(p);
	kmem_cache_free(btrfs_path_cachep, p);
}
114
115 /*
116  * path release drops references on the extent buffers in the path
117  * and it drops any locks held by this path
118  *
119  * It is safe to call this on paths that no locks or extent buffers held.
120  */
121 noinline void btrfs_release_path(struct btrfs_path *p)
122 {
123         int i;
124
125         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
126                 p->slots[i] = 0;
127                 if (!p->nodes[i])
128                         continue;
129                 if (p->locks[i]) {
130                         btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
131                         p->locks[i] = 0;
132                 }
133                 free_extent_buffer(p->nodes[i]);
134                 p->nodes[i] = NULL;
135         }
136 }
137
/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was COWed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		/* wait for the COW to settle, then retry the dereference */
		synchronize_rcu();
	}
	return eb;
}
171
172 /* loop around taking references on and locking the root node of the
173  * tree until you end up with a lock on the root.  A locked buffer
174  * is returned, with a reference held.
175  */
176 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
177 {
178         struct extent_buffer *eb;
179
180         while (1) {
181                 eb = btrfs_root_node(root);
182                 btrfs_tree_lock(eb);
183                 if (eb == root->node)
184                         break;
185                 btrfs_tree_unlock(eb);
186                 free_extent_buffer(eb);
187         }
188         return eb;
189 }
190
191 /* loop around taking references on and locking the root node of the
192  * tree until you end up with a lock on the root.  A locked buffer
193  * is returned, with a reference held.
194  */
195 static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
196 {
197         struct extent_buffer *eb;
198
199         while (1) {
200                 eb = btrfs_root_node(root);
201                 btrfs_tree_read_lock(eb);
202                 if (eb == root->node)
203                         break;
204                 btrfs_tree_read_unlock(eb);
205                 free_extent_buffer(eb);
206         }
207         return eb;
208 }
209
/* cowonly root (everything not a reference counted cow subvolume), just get
 * put onto a simple dirty list.  transaction.c walks this to make sure they
 * get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	/* unlocked fast path: already dirty, or dirty tracking disabled */
	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

	spin_lock(&fs_info->trans_lock);
	/* recheck atomically; only the winner moves the root onto the list */
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
		if (root->objectid == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &fs_info->dirty_cowonly_roots);
		else
			list_move(&root->dirty_list,
				  &fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&fs_info->trans_lock);
}
234
/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	/* reference-counted trees may only be COWed in their own transaction */
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	/* grab the first key of the root as a hint for the new block */
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
			&disk_key, level, buf->start, 0);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* clone the old root's contents and rewrite the header in place */
	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer_fsid(cow, fs_info->fsid);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	/*
	 * NOTE(review): the last argument to btrfs_inc_ref() appears to
	 * select full-backref mode for the reloc tree case - confirm
	 * against its definition in extent-tree.c.
	 */
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);
	if (ret) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}
296
/*
 * Operation types recorded in the tree mod log.  The WHILE_FREEING and
 * WHILE_MOVING removal variants are key removals logged as a side effect
 * of freeing a whole node or moving keys within one, respectively.
 */
enum mod_log_op {
	MOD_LOG_KEY_REPLACE,
	MOD_LOG_KEY_ADD,
	MOD_LOG_KEY_REMOVE,
	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
	MOD_LOG_MOVE_KEYS,
	MOD_LOG_ROOT_REPLACE,
};
306
/* Parameters of a MOD_LOG_MOVE_KEYS record (source slot is tm->slot). */
struct tree_mod_move {
	int dst_slot;
	int nr_items;
};

/* Description of the replaced root for a MOD_LOG_ROOT_REPLACE record. */
struct tree_mod_root {
	u64 logical;
	u8 level;
};
316
/* One tree mod log record; kept in an rb-tree ordered by (logical, seq). */
struct tree_mod_elem {
	struct rb_node node;
	u64 logical;
	u64 seq;
	enum mod_log_op op;

	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
	int slot;

	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
	u64 generation;

	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* this is used for op == MOD_LOG_MOVE_KEYS */
	struct tree_mod_move move;

	/* this is used for op == MOD_LOG_ROOT_REPLACE */
	struct tree_mod_root old_root;
};
339
340 /*
341  * Pull a new tree mod seq number for our operation.
342  */
343 static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
344 {
345         return atomic64_inc_return(&fs_info->tree_mod_seq);
346 }
347
/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
 * to record tree modifications, it should ensure to set elem->seq to zero
 * before calling btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree log modification sequence number, even if no new
 * blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct seq_list *elem)
{
	write_lock(&fs_info->tree_mod_log_lock);
	if (!elem->seq) {
		/* first call for this blocker: register it on the list */
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
	}
	write_unlock(&fs_info->tree_mod_log_lock);

	return elem->seq;
}
368
/*
 * Drop the blocker @elem from the tree mod log blocker list and, if no
 * older blocker remains, garbage collect every log element whose
 * sequence number is below the new minimum.
 */
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct seq_list *cur_elem;
	struct tree_mod_elem *tm;
	u64 min_seq = (u64)-1;
	u64 seq_putting = elem->seq;

	/* seq of zero means we were never registered as a blocker */
	if (!seq_putting)
		return;

	write_lock(&fs_info->tree_mod_log_lock);
	list_del(&elem->list);
	elem->seq = 0;

	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
		if (cur_elem->seq < min_seq) {
			if (seq_putting > cur_elem->seq) {
				/*
				 * blocker with lower sequence number exists, we
				 * cannot remove anything from the log
				 */
				write_unlock(&fs_info->tree_mod_log_lock);
				return;
			}
			min_seq = cur_elem->seq;
		}
	}

	/*
	 * anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		/* grab the successor first, rb_erase invalidates @node */
		next = rb_next(node);
		tm = rb_entry(node, struct tree_mod_elem, node);
		if (tm->seq >= min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	write_unlock(&fs_info->tree_mod_log_lock);
}
416
/*
 * key order of the log:
 *       node/leaf start address -> sequence
 *
 * The 'start address' is the logical address of the *new* root node
 * for root replace operations, or the logical address of the affected
 * block for all other operations.
 *
 * Note the inverted comparisons below: larger (logical, seq) values go
 * into the left subtree.  __tree_mod_log_search() walks the tree with
 * the same inverted ordering.
 *
 * Note: must be called with write lock for fs_info::tree_mod_log_lock.
 */
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	/* the element's sequence number is assigned at insertion time */
	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = rb_entry(*new, struct tree_mod_elem, node);
		parent = *new;
		if (cur->logical < tm->logical)
			new = &((*new)->rb_left);
		else if (cur->logical > tm->logical)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}
458
/*
 * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
 * returns zero with the tree_mod_log_lock acquired. The caller must hold
 * this until all tree mod log insertions are recorded in the rb tree and then
 * write unlock fs_info::tree_mod_log_lock.
 */
static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb) {
	smp_mb();
	if (list_empty(&(fs_info)->tree_mod_seq_list))
		return 1;
	/* leaves (level 0) are never tracked by the tree mod log */
	if (eb && btrfs_header_level(eb) == 0)
		return 1;

	write_lock(&fs_info->tree_mod_log_lock);
	/* recheck under the lock; a racing put may have emptied the list */
	if (list_empty(&(fs_info)->tree_mod_seq_list)) {
		write_unlock(&fs_info->tree_mod_log_lock);
		return 1;
	}

	return 0;
}
481
482 /* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
483 static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
484                                     struct extent_buffer *eb)
485 {
486         smp_mb();
487         if (list_empty(&(fs_info)->tree_mod_seq_list))
488                 return 0;
489         if (eb && btrfs_header_level(eb) == 0)
490                 return 0;
491
492         return 1;
493 }
494
/*
 * Allocate and fill a tree mod log element for the pointer at @slot of
 * node @eb.  For every op except MOD_LOG_KEY_ADD the current key and
 * block pointer are captured so a rewind can restore them.
 * Returns NULL on allocation failure.
 */
static struct tree_mod_elem *
alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
		    enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm)
		return NULL;

	tm->logical = eb->start;
	if (op != MOD_LOG_KEY_ADD) {
		btrfs_node_key(eb, &tm->key, slot);
		tm->blockptr = btrfs_node_blockptr(eb, slot);
	}
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);
	/* not linked into the rb-tree yet; error paths test for this */
	RB_CLEAR_NODE(&tm->node);

	return tm;
}
517
518 static noinline int
519 tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
520                         struct extent_buffer *eb, int slot,
521                         enum mod_log_op op, gfp_t flags)
522 {
523         struct tree_mod_elem *tm;
524         int ret;
525
526         if (!tree_mod_need_log(fs_info, eb))
527                 return 0;
528
529         tm = alloc_tree_mod_elem(eb, slot, op, flags);
530         if (!tm)
531                 return -ENOMEM;
532
533         if (tree_mod_dont_log(fs_info, eb)) {
534                 kfree(tm);
535                 return 0;
536         }
537
538         ret = __tree_mod_log_insert(fs_info, tm);
539         write_unlock(&eb->fs_info->tree_mod_log_lock);
540         if (ret)
541                 kfree(tm);
542
543         return ret;
544 }
545
/*
 * Log a move of @nr_items node keys from @src_slot to @dst_slot within
 * @eb.  Moves towards the front of the buffer overwrite keys, so those
 * are additionally logged as removals before the MOVE_KEYS record.
 * Returns 0 on success (or when logging is not needed), negative errno
 * on failure; on failure nothing remains inserted in the log.
 */
static noinline int
tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
			 struct extent_buffer *eb, int dst_slot, int src_slot,
			 int nr_items)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	int locked = 0;	/* tracks whether tree_mod_log_lock is held */

	if (!tree_mod_need_log(fs_info, eb))
		return 0;

	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = eb->start;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = MOD_LOG_MOVE_KEYS;

	/* pre-allocate the removal records before taking the lock */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
		    MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	/* on return 0 the tree_mod_log_lock is held */
	if (tree_mod_dont_log(fs_info, eb))
		goto free_tms;
	locked = 1;

	/*
	 * When we override something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = __tree_mod_log_insert(fs_info, tm);
	if (ret)
		goto free_tms;
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;
free_tms:
	/* unwind: erase anything already inserted, then free everything */
	for (i = 0; i < nr_items; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);
	kfree(tm);

	return ret;
}
620
/*
 * Insert all elements of @tm_list into the log, highest index first.
 * On failure, the elements already inserted are erased again; freeing
 * the elements themselves is left to the caller.
 * Caller must hold fs_info::tree_mod_log_lock for writing.
 */
static inline int
__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
		       struct tree_mod_elem **tm_list,
		       int nritems)
{
	int i, j;
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			/* erase what we managed to insert so far */
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}
641
/*
 * Log the replacement of tree root @old_root by @new_root.  When
 * @log_removal is set and the old root is a node, all of its keys are
 * first logged as removed (MOD_LOG_KEY_REMOVE_WHILE_FREEING) so the
 * rewind code can reconstruct the old root's contents.
 * Returns 0 on success or when logging is not needed, negative errno on
 * failure.
 */
static noinline int
tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
			 struct extent_buffer *old_root,
			 struct extent_buffer *new_root,
			 int log_removal)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
				  GFP_NOFS);
		if (!tm_list) {
			ret = -ENOMEM;
			goto free_tms;
		}
		/* pre-allocate all removal records before taking the lock */
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
			    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto free_tms;
			}
		}
	}

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	/* root replace records are keyed by the *new* root's address */
	tm->logical = new_root->start;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = MOD_LOG_ROOT_REPLACE;

	/* on return 0 the tree_mod_log_lock is held */
	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;

	if (tm_list)
		ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = __tree_mod_log_insert(fs_info, tm);

	write_unlock(&fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}
712
/*
 * Look up a tree mod log element for block @start with seq >= @min_seq.
 * With @smallest set, return the element with the lowest qualifying
 * sequence number (the oldest); otherwise the one with the highest (the
 * most recent).  Returns NULL when there is no match.
 */
static struct tree_mod_elem *
__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
		      int smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;

	read_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = rb_entry(node, struct tree_mod_elem, node);
		/* inverted ordering, see __tree_mod_log_insert() */
		if (cur->logical < start) {
			node = node->rb_left;
		} else if (cur->logical > start) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* we want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* we want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			/* exact match on min_seq, cannot do better */
			found = cur;
			break;
		}
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return found;
}
754
/*
 * this returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
			   u64 min_seq)
{
	/* smallest == 1: pick the lowest qualifying sequence number */
	return __tree_mod_log_search(fs_info, start, min_seq, 1);
}
766
/*
 * this returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
{
	/* smallest == 0: pick the highest qualifying sequence number */
	return __tree_mod_log_search(fs_info, start, min_seq, 0);
}
777
/*
 * Log the copy of @nr_items keys from @src (starting at @src_offset)
 * into @dst (starting at @dst_offset): each copied key is logged as
 * removed from @src and added to @dst.  Returns 0 on success or when
 * logging is not needed, negative errno on failure; on failure nothing
 * remains inserted in the log.
 */
static noinline int
tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
		     struct extent_buffer *src, unsigned long dst_offset,
		     unsigned long src_offset, int nr_items)
{
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add, **tm_list_rem;
	int i;
	int locked = 0;	/* tracks whether tree_mod_log_lock is held */

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	/* leaf-to-leaf copies are not tracked by the tree mod log */
	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	/* one array: first half holds ADD records, second half REMOVEs */
	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
		    MOD_LOG_KEY_REMOVE, GFP_NOFS);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
		    MOD_LOG_KEY_ADD, GFP_NOFS);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	/* on return 0 the tree_mod_log_lock is held */
	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;
	locked = 1;

	for (i = 0; i < nr_items; i++) {
		ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}

	write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	/* unwind: erase anything already inserted, then free everything */
	for (i = 0; i < nr_items * 2; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return ret;
}
848
/*
 * Log a key move inside node @dst.  A logging failure would leave the
 * mod log inconsistent, so it is fatal (BUG).
 */
static inline void
tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
		     int dst_offset, int src_offset, int nr_items)
{
	BUG_ON(tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
					nr_items) < 0);
}
858
859 static noinline void
860 tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
861                           struct extent_buffer *eb, int slot, int atomic)
862 {
863         int ret;
864
865         ret = tree_mod_log_insert_key(fs_info, eb, slot,
866                                         MOD_LOG_KEY_REPLACE,
867                                         atomic ? GFP_ATOMIC : GFP_NOFS);
868         BUG_ON(ret < 0);
869 }
870
/*
 * Log that the whole node @eb is going away: queue one
 * MOD_LOG_KEY_REMOVE_WHILE_FREEING element for every key pointer currently
 * in the node, so readers replaying the log can reconstruct its content at
 * an older time_seq.
 *
 * Returns 0 on success, -ENOMEM on allocation failure, or the error from
 * __tree_mod_log_free_eb().
 */
static noinline int
tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
{
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	/* leaves are not tracked in the tree mod log, nothing to do */
	if (btrfs_header_level(eb) == 0)
		return 0;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	/*
	 * Allocate all log elements up front, before taking the
	 * tree_mod_log_lock, so the insertion below cannot run out of
	 * memory half way through.
	 */
	nritems = btrfs_header_nritems(eb);
	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i,
		    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	/* when tree_mod_dont_log() returns 0 it holds tree_mod_log_lock */
	if (tree_mod_dont_log(fs_info, eb))
		goto free_tms;

	ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
	/* releases the lock taken by tree_mod_dont_log() above */
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return 0;

free_tms:
	/* elements were not (or no longer are) in the rb-tree, just free */
	for (i = 0; i < nritems; i++)
		kfree(tm_list[i]);
	kfree(tm_list);

	return ret;
}
917
918 static noinline void
919 tree_mod_log_set_root_pointer(struct btrfs_root *root,
920                               struct extent_buffer *new_root_node,
921                               int log_removal)
922 {
923         int ret;
924         ret = tree_mod_log_insert_root(root->fs_info, root->node,
925                                        new_root_node, log_removal);
926         BUG_ON(ret < 0);
927 }
928
929 /*
930  * check if the tree block can be shared by multiple trees
931  */
932 int btrfs_block_can_be_shared(struct btrfs_root *root,
933                               struct extent_buffer *buf)
934 {
935         /*
936          * Tree blocks not in reference counted trees and tree roots
937          * are never shared. If a block was allocated after the last
938          * snapshot and the block was not allocated by tree relocation,
939          * we know the block is not shared.
940          */
941         if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
942             buf != root->node && buf != root->commit_root &&
943             (btrfs_header_generation(buf) <=
944              btrfs_root_last_snapshot(&root->root_item) ||
945              btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
946                 return 1;
947 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
948         if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
949             btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
950                 return 1;
951 #endif
952         return 0;
953 }
954
/*
 * Update extent backrefs when @buf is COWed into @cow.
 *
 * *last_ref is set to 1 when @buf drops its last reference and may be
 * freed by the caller; it is left untouched otherwise.
 *
 * Returns 0 on success or a negative errno from the backref helpers
 * (including -EROFS when a shareable block unexpectedly has zero refs).
 */
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree block
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is been relocating
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is some operations (such as drop tree)
	 * are only allowed for blocks use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		/* look up the on-disk refcount and backref flags */
		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (refs == 0) {
			/* a shareable block with zero refs is corruption */
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			return ret;
		}
	} else {
		/* block cannot be shared, only this tree references it */
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			/* convert @buf's backrefs to the full-backref form */
			ret = btrfs_inc_ref(trans, root, buf, 1);
			if (ret)
				return ret;

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				if (ret)
					return ret;
				ret = btrfs_inc_ref(trans, root, cow, 1);
				if (ret)
					return ret;
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
		}
		if (new_flags != 0) {
			int level = btrfs_header_level(buf);

			/* persist the new backref flags on disk */
			ret = btrfs_set_disk_extent_flags(trans, fs_info,
							  buf->start,
							  buf->len,
							  new_flags, level, 0);
			if (ret)
				return ret;
		}
	} else {
		/* single ref: @buf itself is released after the COW */
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
			ret = btrfs_dec_ref(trans, root, buf, 1);
			if (ret)
				return ret;
		}
		clean_tree_block(fs_info, buf);
		*last_ref = 1;
	}
	return 0;
}
1065
1066 /*
1067  * does the dirty work in cow of a single block.  The parent block (if
1068  * supplied) is updated to point to the new cow copy.  The new buffer is marked
1069  * dirty and returned locked.  If you modify the block it needs to be marked
1070  * dirty again.
1071  *
1072  * search_start -- an allocation hint for the new block
1073  *
1074  * empty_size -- a hint that you plan on doing more cow.  This is the size in
1075  * bytes the allocator should try to find free next to the block it returns.
1076  * This is just a hint and may be ignored by the allocator.
1077  */
1078 static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
1079                              struct btrfs_root *root,
1080                              struct extent_buffer *buf,
1081                              struct extent_buffer *parent, int parent_slot,
1082                              struct extent_buffer **cow_ret,
1083                              u64 search_start, u64 empty_size)
1084 {
1085         struct btrfs_fs_info *fs_info = root->fs_info;
1086         struct btrfs_disk_key disk_key;
1087         struct extent_buffer *cow;
1088         int level, ret;
1089         int last_ref = 0;
1090         int unlock_orig = 0;
1091         u64 parent_start = 0;
1092
1093         if (*cow_ret == buf)
1094                 unlock_orig = 1;
1095
1096         btrfs_assert_tree_locked(buf);
1097
1098         WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
1099                 trans->transid != fs_info->running_transaction->transid);
1100         WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
1101                 trans->transid != root->last_trans);
1102
1103         level = btrfs_header_level(buf);
1104
1105         if (level == 0)
1106                 btrfs_item_key(buf, &disk_key, 0);
1107         else
1108                 btrfs_node_key(buf, &disk_key, 0);
1109
1110         if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
1111                 parent_start = parent->start;
1112
1113         cow = btrfs_alloc_tree_block(trans, root, parent_start,
1114                         root->root_key.objectid, &disk_key, level,
1115                         search_start, empty_size);
1116         if (IS_ERR(cow))
1117                 return PTR_ERR(cow);
1118
1119         /* cow is set to blocking by btrfs_init_new_buffer */
1120
1121         copy_extent_buffer_full(cow, buf);
1122         btrfs_set_header_bytenr(cow, cow->start);
1123         btrfs_set_header_generation(cow, trans->transid);
1124         btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
1125         btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
1126                                      BTRFS_HEADER_FLAG_RELOC);
1127         if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1128                 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
1129         else
1130                 btrfs_set_header_owner(cow, root->root_key.objectid);
1131
1132         write_extent_buffer_fsid(cow, fs_info->fsid);
1133
1134         ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
1135         if (ret) {
1136                 btrfs_tree_unlock(cow);
1137                 free_extent_buffer(cow);
1138                 btrfs_abort_transaction(trans, ret);
1139                 return ret;
1140         }
1141
1142         if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
1143                 ret = btrfs_reloc_cow_block(trans, root, buf, cow);
1144                 if (ret) {
1145                         btrfs_tree_unlock(cow);
1146                         free_extent_buffer(cow);
1147                         btrfs_abort_transaction(trans, ret);
1148                         return ret;
1149                 }
1150         }
1151
1152         if (buf == root->node) {
1153                 WARN_ON(parent && parent != buf);
1154                 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
1155                     btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
1156                         parent_start = buf->start;
1157
1158                 extent_buffer_get(cow);
1159                 tree_mod_log_set_root_pointer(root, cow, 1);
1160                 rcu_assign_pointer(root->node, cow);
1161
1162                 btrfs_free_tree_block(trans, root, buf, parent_start,
1163                                       last_ref);
1164                 free_extent_buffer(buf);
1165                 add_root_to_dirty_list(root);
1166         } else {
1167                 WARN_ON(trans->transid != btrfs_header_generation(parent));
1168                 tree_mod_log_insert_key(fs_info, parent, parent_slot,
1169                                         MOD_LOG_KEY_REPLACE, GFP_NOFS);
1170                 btrfs_set_node_blockptr(parent, parent_slot,
1171                                         cow->start);
1172                 btrfs_set_node_ptr_generation(parent, parent_slot,
1173                                               trans->transid);
1174                 btrfs_mark_buffer_dirty(parent);
1175                 if (last_ref) {
1176                         ret = tree_mod_log_free_eb(fs_info, buf);
1177                         if (ret) {
1178                                 btrfs_tree_unlock(cow);
1179                                 free_extent_buffer(cow);
1180                                 btrfs_abort_transaction(trans, ret);
1181                                 return ret;
1182                         }
1183                 }
1184                 btrfs_free_tree_block(trans, root, buf, parent_start,
1185                                       last_ref);
1186         }
1187         if (unlock_orig)
1188                 btrfs_tree_unlock(buf);
1189         free_extent_buffer_stale(buf);
1190         btrfs_mark_buffer_dirty(cow);
1191         *cow_ret = cow;
1192         return 0;
1193 }
1194
1195 /*
1196  * returns the logical address of the oldest predecessor of the given root.
1197  * entries older than time_seq are ignored.
1198  */
1199 static struct tree_mod_elem *
1200 __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
1201                            struct extent_buffer *eb_root, u64 time_seq)
1202 {
1203         struct tree_mod_elem *tm;
1204         struct tree_mod_elem *found = NULL;
1205         u64 root_logical = eb_root->start;
1206         int looped = 0;
1207
1208         if (!time_seq)
1209                 return NULL;
1210
1211         /*
1212          * the very last operation that's logged for a root is the
1213          * replacement operation (if it is replaced at all). this has
1214          * the logical address of the *new* root, making it the very
1215          * first operation that's logged for this root.
1216          */
1217         while (1) {
1218                 tm = tree_mod_log_search_oldest(fs_info, root_logical,
1219                                                 time_seq);
1220                 if (!looped && !tm)
1221                         return NULL;
1222                 /*
1223                  * if there are no tree operation for the oldest root, we simply
1224                  * return it. this should only happen if that (old) root is at
1225                  * level 0.
1226                  */
1227                 if (!tm)
1228                         break;
1229
1230                 /*
1231                  * if there's an operation that's not a root replacement, we
1232                  * found the oldest version of our root. normally, we'll find a
1233                  * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
1234                  */
1235                 if (tm->op != MOD_LOG_ROOT_REPLACE)
1236                         break;
1237
1238                 found = tm;
1239                 root_logical = tm->old_root.logical;
1240                 looped = 1;
1241         }
1242
1243         /* if there's no old root to return, return what we found instead */
1244         if (!found)
1245                 found = tm;
1246
1247         return found;
1248 }
1249
1250 /*
1251  * tm is a pointer to the first operation to rewind within eb. then, all
1252  * previous operations will be rewound (until we reach something older than
1253  * time_seq).
1254  */
1255 static void
1256 __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
1257                       u64 time_seq, struct tree_mod_elem *first_tm)
1258 {
1259         u32 n;
1260         struct rb_node *next;
1261         struct tree_mod_elem *tm = first_tm;
1262         unsigned long o_dst;
1263         unsigned long o_src;
1264         unsigned long p_size = sizeof(struct btrfs_key_ptr);
1265
1266         n = btrfs_header_nritems(eb);
1267         read_lock(&fs_info->tree_mod_log_lock);
1268         while (tm && tm->seq >= time_seq) {
1269                 /*
1270                  * all the operations are recorded with the operator used for
1271                  * the modification. as we're going backwards, we do the
1272                  * opposite of each operation here.
1273                  */
1274                 switch (tm->op) {
1275                 case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
1276                         BUG_ON(tm->slot < n);
1277                         /* Fallthrough */
1278                 case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
1279                 case MOD_LOG_KEY_REMOVE:
1280                         btrfs_set_node_key(eb, &tm->key, tm->slot);
1281                         btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1282                         btrfs_set_node_ptr_generation(eb, tm->slot,
1283                                                       tm->generation);
1284                         n++;
1285                         break;
1286                 case MOD_LOG_KEY_REPLACE:
1287                         BUG_ON(tm->slot >= n);
1288                         btrfs_set_node_key(eb, &tm->key, tm->slot);
1289                         btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1290                         btrfs_set_node_ptr_generation(eb, tm->slot,
1291                                                       tm->generation);
1292                         break;
1293                 case MOD_LOG_KEY_ADD:
1294                         /* if a move operation is needed it's in the log */
1295                         n--;
1296                         break;
1297                 case MOD_LOG_MOVE_KEYS:
1298                         o_dst = btrfs_node_key_ptr_offset(tm->slot);
1299                         o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
1300                         memmove_extent_buffer(eb, o_dst, o_src,
1301                                               tm->move.nr_items * p_size);
1302                         break;
1303                 case MOD_LOG_ROOT_REPLACE:
1304                         /*
1305                          * this operation is special. for roots, this must be
1306                          * handled explicitly before rewinding.
1307                          * for non-roots, this operation may exist if the node
1308                          * was a root: root A -> child B; then A gets empty and
1309                          * B is promoted to the new root. in the mod log, we'll
1310                          * have a root-replace operation for B, a tree block
1311                          * that is no root. we simply ignore that operation.
1312                          */
1313                         break;
1314                 }
1315                 next = rb_next(&tm->node);
1316                 if (!next)
1317                         break;
1318                 tm = rb_entry(next, struct tree_mod_elem, node);
1319                 if (tm->logical != first_tm->logical)
1320                         break;
1321         }
1322         read_unlock(&fs_info->tree_mod_log_lock);
1323         btrfs_set_header_nritems(eb, n);
1324 }
1325
1326 /*
1327  * Called with eb read locked. If the buffer cannot be rewound, the same buffer
1328  * is returned. If rewind operations happen, a fresh buffer is returned. The
1329  * returned buffer is always read-locked. If the returned buffer is not the
1330  * input buffer, the lock on the input buffer is released and the input buffer
1331  * is freed (its refcount is decremented).
1332  */
1333 static struct extent_buffer *
1334 tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
1335                     struct extent_buffer *eb, u64 time_seq)
1336 {
1337         struct extent_buffer *eb_rewin;
1338         struct tree_mod_elem *tm;
1339
1340         if (!time_seq)
1341                 return eb;
1342
1343         if (btrfs_header_level(eb) == 0)
1344                 return eb;
1345
1346         tm = tree_mod_log_search(fs_info, eb->start, time_seq);
1347         if (!tm)
1348                 return eb;
1349
1350         btrfs_set_path_blocking(path);
1351         btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
1352
1353         if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1354                 BUG_ON(tm->slot != 0);
1355                 eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
1356                 if (!eb_rewin) {
1357                         btrfs_tree_read_unlock_blocking(eb);
1358                         free_extent_buffer(eb);
1359                         return NULL;
1360                 }
1361                 btrfs_set_header_bytenr(eb_rewin, eb->start);
1362                 btrfs_set_header_backref_rev(eb_rewin,
1363                                              btrfs_header_backref_rev(eb));
1364                 btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
1365                 btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
1366         } else {
1367                 eb_rewin = btrfs_clone_extent_buffer(eb);
1368                 if (!eb_rewin) {
1369                         btrfs_tree_read_unlock_blocking(eb);
1370                         free_extent_buffer(eb);
1371                         return NULL;
1372                 }
1373         }
1374
1375         btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK);
1376         btrfs_tree_read_unlock_blocking(eb);
1377         free_extent_buffer(eb);
1378
1379         btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
1380                                        eb_rewin, btrfs_header_level(eb_rewin));
1381         btrfs_tree_read_lock(eb_rewin);
1382         __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
1383         WARN_ON(btrfs_header_nritems(eb_rewin) >
1384                 BTRFS_NODEPTRS_PER_BLOCK(fs_info));
1385
1386         return eb_rewin;
1387 }
1388
1389 /*
1390  * get_old_root() rewinds the state of @root's root node to the given @time_seq
1391  * value. If there are no changes, the current root->root_node is returned. If
1392  * anything changed in between, there's a fresh buffer allocated on which the
1393  * rewind operations are done. In any case, the returned buffer is read locked.
1394  * Returns NULL on error (with no locks held).
1395  */
1396 static inline struct extent_buffer *
1397 get_old_root(struct btrfs_root *root, u64 time_seq)
1398 {
1399         struct btrfs_fs_info *fs_info = root->fs_info;
1400         struct tree_mod_elem *tm;
1401         struct extent_buffer *eb = NULL;
1402         struct extent_buffer *eb_root;
1403         u64 eb_root_owner = 0;
1404         struct extent_buffer *old;
1405         struct tree_mod_root *old_root = NULL;
1406         u64 old_generation = 0;
1407         u64 logical;
1408
1409         eb_root = btrfs_read_lock_root_node(root);
1410         tm = __tree_mod_log_oldest_root(fs_info, eb_root, time_seq);
1411         if (!tm)
1412                 return eb_root;
1413
1414         if (tm->op == MOD_LOG_ROOT_REPLACE) {
1415                 old_root = &tm->old_root;
1416                 old_generation = tm->generation;
1417                 logical = old_root->logical;
1418         } else {
1419                 logical = eb_root->start;
1420         }
1421
1422         tm = tree_mod_log_search(fs_info, logical, time_seq);
1423         if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1424                 btrfs_tree_read_unlock(eb_root);
1425                 free_extent_buffer(eb_root);
1426                 old = read_tree_block(fs_info, logical, 0);
1427                 if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
1428                         if (!IS_ERR(old))
1429                                 free_extent_buffer(old);
1430                         btrfs_warn(fs_info,
1431                                    "failed to read tree block %llu from get_old_root",
1432                                    logical);
1433                 } else {
1434                         struct tree_mod_elem *tm2;
1435
1436                         btrfs_tree_read_lock(old);
1437                         eb = btrfs_clone_extent_buffer(old);
1438                         /*
1439                          * After the lookup for the most recent tree mod operation
1440                          * above and before we locked and cloned the extent buffer
1441                          * 'old', a new tree mod log operation may have been added.
1442                          * So lookup for a more recent one to make sure the number
1443                          * of mod log operations we replay is consistent with the
1444                          * number of items we have in the cloned extent buffer,
1445                          * otherwise we can hit a BUG_ON when rewinding the extent
1446                          * buffer.
1447                          */
1448                         tm2 = tree_mod_log_search(fs_info, logical, time_seq);
1449                         btrfs_tree_read_unlock(old);
1450                         free_extent_buffer(old);
1451                         ASSERT(tm2);
1452                         ASSERT(tm2 == tm || tm2->seq > tm->seq);
1453                         if (!tm2 || tm2->seq < tm->seq) {
1454                                 free_extent_buffer(eb);
1455                                 return NULL;
1456                         }
1457                         tm = tm2;
1458                 }
1459         } else if (old_root) {
1460                 eb_root_owner = btrfs_header_owner(eb_root);
1461                 btrfs_tree_read_unlock(eb_root);
1462                 free_extent_buffer(eb_root);
1463                 eb = alloc_dummy_extent_buffer(fs_info, logical);
1464         } else {
1465                 btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
1466                 eb = btrfs_clone_extent_buffer(eb_root);
1467                 btrfs_tree_read_unlock_blocking(eb_root);
1468                 free_extent_buffer(eb_root);
1469         }
1470
1471         if (!eb)
1472                 return NULL;
1473         if (old_root) {
1474                 btrfs_set_header_bytenr(eb, eb->start);
1475                 btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
1476                 btrfs_set_header_owner(eb, eb_root_owner);
1477                 btrfs_set_header_level(eb, old_root->level);
1478                 btrfs_set_header_generation(eb, old_generation);
1479         }
1480         btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
1481                                        btrfs_header_level(eb));
1482         btrfs_tree_read_lock(eb);
1483         if (tm)
1484                 __tree_mod_log_rewind(fs_info, eb, time_seq, tm);
1485         else
1486                 WARN_ON(btrfs_header_level(eb) != 0);
1487         WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));
1488
1489         return eb;
1490 }
1491
1492 int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
1493 {
1494         struct tree_mod_elem *tm;
1495         int level;
1496         struct extent_buffer *eb_root = btrfs_root_node(root);
1497
1498         tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
1499         if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
1500                 level = tm->old_root.level;
1501         } else {
1502                 level = btrfs_header_level(eb_root);
1503         }
1504         free_extent_buffer(eb_root);
1505
1506         return level;
1507 }
1508
1509 static inline int should_cow_block(struct btrfs_trans_handle *trans,
1510                                    struct btrfs_root *root,
1511                                    struct extent_buffer *buf)
1512 {
1513         if (btrfs_is_testing(root->fs_info))
1514                 return 0;
1515
1516         /* ensure we can see the force_cow */
1517         smp_rmb();
1518
1519         /*
1520          * We do not need to cow a block if
1521          * 1) this block is not created or changed in this transaction;
1522          * 2) this block does not belong to TREE_RELOC tree;
1523          * 3) the root is not forced COW.
1524          *
1525          * What is forced COW:
1526          *    when we create snapshot during committing the transaction,
1527          *    after we've finished coping src root, we must COW the shared
1528          *    block to ensure the metadata consistency.
1529          */
1530         if (btrfs_header_generation(buf) == trans->transid &&
1531             !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
1532             !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
1533               btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
1534             !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
1535                 return 0;
1536         return 1;
1537 }
1538
1539 /*
1540  * cows a single block, see __btrfs_cow_block for the real work.
1541  * This version of it has extra checks so that a block isn't COWed more than
1542  * once per transaction, as long as it hasn't been written yet
1543  */
1544 noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
1545                     struct btrfs_root *root, struct extent_buffer *buf,
1546                     struct extent_buffer *parent, int parent_slot,
1547                     struct extent_buffer **cow_ret)
1548 {
1549         struct btrfs_fs_info *fs_info = root->fs_info;
1550         u64 search_start;
1551         int ret;
1552
1553         if (trans->transaction != fs_info->running_transaction)
1554                 WARN(1, KERN_CRIT "trans %llu running %llu\n",
1555                        trans->transid,
1556                        fs_info->running_transaction->transid);
1557
1558         if (trans->transid != fs_info->generation)
1559                 WARN(1, KERN_CRIT "trans %llu running %llu\n",
1560                        trans->transid, fs_info->generation);
1561
1562         if (!should_cow_block(trans, root, buf)) {
1563                 trans->dirty = true;
1564                 *cow_ret = buf;
1565                 return 0;
1566         }
1567
1568         search_start = buf->start & ~((u64)SZ_1G - 1);
1569
1570         if (parent)
1571                 btrfs_set_lock_blocking(parent);
1572         btrfs_set_lock_blocking(buf);
1573
1574         ret = __btrfs_cow_block(trans, root, buf, parent,
1575                                  parent_slot, cow_ret, search_start, 0);
1576
1577         trace_btrfs_cow_block(root, buf, *cow_ret);
1578
1579         return ret;
1580 }
1581
1582 /*
1583  * helper function for defrag to decide if two blocks pointed to by a
1584  * node are actually close by
1585  */
1586 static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
1587 {
1588         if (blocknr < other && other - (blocknr + blocksize) < 32768)
1589                 return 1;
1590         if (blocknr > other && blocknr - (other + blocksize) < 32768)
1591                 return 1;
1592         return 0;
1593 }
1594
1595 /*
1596  * compare two keys in a memcmp fashion
1597  */
1598 static int comp_keys(const struct btrfs_disk_key *disk,
1599                      const struct btrfs_key *k2)
1600 {
1601         struct btrfs_key k1;
1602
1603         btrfs_disk_key_to_cpu(&k1, disk);
1604
1605         return btrfs_comp_cpu_keys(&k1, k2);
1606 }
1607
1608 /*
1609  * same as comp_keys only with two btrfs_key's
1610  */
1611 int btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
1612 {
1613         if (k1->objectid > k2->objectid)
1614                 return 1;
1615         if (k1->objectid < k2->objectid)
1616                 return -1;
1617         if (k1->type > k2->type)
1618                 return 1;
1619         if (k1->type < k2->type)
1620                 return -1;
1621         if (k1->offset > k2->offset)
1622                 return 1;
1623         if (k1->offset < k2->offset)
1624                 return -1;
1625         return 0;
1626 }
1627
1628 /*
1629  * this is used by the defrag code to go through all the
1630  * leaves pointed to by a node and reallocate them so that
1631  * disk order is close to key order
1632  */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *parent,
		       int start_slot, u64 *last_ret,
		       struct btrfs_key *progress)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cur;
	u64 blocknr;
	u64 gen;
	u64 search_start = *last_ret;	/* allocator hint from the caller */
	u64 last_block = 0;
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
	int parent_level;
	int uptodate;
	u32 blocksize;
	int progress_passed = 0;	/* set once we pass @progress */
	struct btrfs_disk_key disk_key;

	parent_level = btrfs_header_level(parent);

	WARN_ON(trans->transaction != fs_info->running_transaction);
	WARN_ON(trans->transid != fs_info->generation);

	parent_nritems = btrfs_header_nritems(parent);
	blocksize = fs_info->nodesize;
	end_slot = parent_nritems - 1;

	/* Nothing to reorder with zero or one child. */
	if (parent_nritems <= 1)
		return 0;

	btrfs_set_lock_blocking(parent);

	for (i = start_slot; i <= end_slot; i++) {
		int close = 1;

		/* Skip slots whose keys sort before @progress. */
		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
		blocknr = btrfs_node_blockptr(parent, i);
		gen = btrfs_node_ptr_generation(parent, i);
		if (last_block == 0)
			last_block = blocknr;

		/* Leave blocks alone that already sit near a neighbour. */
		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		/* Make sure the child buffer is in memory and up to date. */
		cur = find_extent_buffer(fs_info, blocknr);
		if (cur)
			uptodate = btrfs_buffer_uptodate(cur, gen, 0);
		else
			uptodate = 0;
		if (!cur || !uptodate) {
			if (!cur) {
				cur = read_tree_block(fs_info, blocknr, gen);
				if (IS_ERR(cur)) {
					return PTR_ERR(cur);
				} else if (!extent_buffer_uptodate(cur)) {
					free_extent_buffer(cur);
					return -EIO;
				}
			} else if (!uptodate) {
				err = btrfs_read_buffer(cur, gen);
				if (err) {
					free_extent_buffer(cur);
					return err;
				}
			}
		}
		if (search_start == 0)
			search_start = last_block;

		/*
		 * Relocate the child by COWing it near search_start; the
		 * last argument bounds how far ahead the allocator may
		 * reserve for the remaining children.
		 */
		btrfs_tree_lock(cur);
		btrfs_set_lock_blocking(cur);
		err = __btrfs_cow_block(trans, root, cur, parent, i,
					&cur, search_start,
					min(16 * blocksize,
					    (end_slot - i) * blocksize));
		if (err) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;	/* report progress to caller */
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return err;
}
1739
1740 /*
1741  * search for key in the extent_buffer.  The items start at offset p,
1742  * and they are item_size apart.  There are 'max' items in p.
1743  *
1744  * the slot in the array is returned via slot, and it points to
1745  * the place where you would insert key if it is not found in
1746  * the array.
1747  *
1748  * slot may point to max if the key is bigger than all of the keys
1749  */
static noinline int generic_bin_search(struct extent_buffer *eb,
				       unsigned long p, int item_size,
				       const struct btrfs_key *key,
				       int max, int *slot)
{
	int low = 0;
	int high = max;
	int mid;
	int ret;
	struct btrfs_disk_key *tmp = NULL;
	struct btrfs_disk_key unaligned;
	unsigned long offset;
	char *kaddr = NULL;
	unsigned long map_start = 0;
	unsigned long map_len = 0;
	int err;

	/* Only possible if a corrupted header handed us a negative max. */
	if (low > high) {
		btrfs_err(eb->fs_info,
		 "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
			  __func__, low, high, eb->start,
			  btrfs_header_owner(eb), btrfs_header_level(eb));
		return -EINVAL;
	}

	while (low < high) {
		mid = (low + high) / 2;
		offset = p + mid * item_size;

		/*
		 * (Re)map the buffer page containing the key at 'offset'
		 * unless the currently mapped window already covers it.
		 */
		if (!kaddr || offset < map_start ||
		    (offset + sizeof(struct btrfs_disk_key)) >
		    map_start + map_len) {

			err = map_private_extent_buffer(eb, offset,
						sizeof(struct btrfs_disk_key),
						&kaddr, &map_start, &map_len);

			if (!err) {
				tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
			} else if (err == 1) {
				/*
				 * Key straddles a page boundary: copy it
				 * into an aligned buffer on the stack.
				 */
				read_extent_buffer(eb, &unaligned,
						   offset, sizeof(unaligned));
				tmp = &unaligned;
			} else {
				return err;
			}

		} else {
			tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
		}
		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			/* exact match */
			*slot = mid;
			return 0;
		}
	}
	/* Not found: 'low' is the insertion point; it may equal max. */
	*slot = low;
	return 1;
}
1816
1817 /*
1818  * simple bin_search frontend that does the right thing for
1819  * leaves vs nodes
1820  */
1821 static int bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
1822                       int level, int *slot)
1823 {
1824         if (level == 0)
1825                 return generic_bin_search(eb,
1826                                           offsetof(struct btrfs_leaf, items),
1827                                           sizeof(struct btrfs_item),
1828                                           key, btrfs_header_nritems(eb),
1829                                           slot);
1830         else
1831                 return generic_bin_search(eb,
1832                                           offsetof(struct btrfs_node, ptrs),
1833                                           sizeof(struct btrfs_key_ptr),
1834                                           key, btrfs_header_nritems(eb),
1835                                           slot);
1836 }
1837
/*
 * Public wrapper around bin_search(): look up @key in @eb at @level and
 * store the found (or insertion) slot in *slot.  Returns 0 if the key
 * was found, 1 if not, negative errno on error.
 */
int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
		     int level, int *slot)
{
	return bin_search(eb, key, level, slot);
}
1843
/* Add @size bytes to the root item's bytes-used accounting, under the
 * root's accounting spinlock. */
static void root_add_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) + size);
	spin_unlock(&root->accounting_lock);
}
1851
/* Subtract @size bytes from the root item's bytes-used accounting, under
 * the root's accounting spinlock.  Counterpart of root_add_used(). */
static void root_sub_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) - size);
	spin_unlock(&root->accounting_lock);
}
1859
1860 /* given a node and slot number, this reads the blocks it points to.  The
1861  * extent buffer is returned with a reference taken (but unlocked).
1862  */
1863 static noinline struct extent_buffer *
1864 read_node_slot(struct btrfs_fs_info *fs_info, struct extent_buffer *parent,
1865                int slot)
1866 {
1867         int level = btrfs_header_level(parent);
1868         struct extent_buffer *eb;
1869
1870         if (slot < 0 || slot >= btrfs_header_nritems(parent))
1871                 return ERR_PTR(-ENOENT);
1872
1873         BUG_ON(level == 0);
1874
1875         eb = read_tree_block(fs_info, btrfs_node_blockptr(parent, slot),
1876                              btrfs_node_ptr_generation(parent, slot));
1877         if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
1878                 free_extent_buffer(eb);
1879                 eb = ERR_PTR(-EIO);
1880         }
1881
1882         return eb;
1883 }
1884
1885 /*
1886  * node level balancing, used to make sure nodes are in proper order for
1887  * item deletion.  We balance from the top down, so we have to make sure
1888  * that a deletion won't leave an node completely empty later on.
1889  */
static noinline int balance_level(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	if (level == 0)
		return 0;

	mid = path->nodes[level];

	/* The caller must hold a write lock on mid and have COWed it in
	 * this transaction. */
	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
		path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	/* Remember the block the path points at, to re-check at the end. */
	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
	if (!parent) {
		struct extent_buffer *child;

		if (btrfs_header_nritems(mid) != 1)
			return 0;

		/* promote the child to a root */
		child = read_node_slot(fs_info, mid, 0);
		if (IS_ERR(child)) {
			ret = PTR_ERR(child);
			btrfs_handle_fs_error(fs_info, ret, NULL);
			goto enospc;
		}

		btrfs_tree_lock(child);
		btrfs_set_lock_blocking(child);
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
		if (ret) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			goto enospc;
		}

		tree_mod_log_set_root_pointer(root, child, 1);
		rcu_assign_pointer(root->node, child);

		add_root_to_dirty_list(root);
		btrfs_tree_unlock(child);

		path->locks[level] = 0;
		path->nodes[level] = NULL;
		clean_tree_block(fs_info, mid);
		btrfs_tree_unlock(mid);
		/* once for the path */
		free_extent_buffer(mid);

		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		/* once for the root ptr */
		free_extent_buffer_stale(mid);
		return 0;
	}
	/* Node still at least a quarter full: no balancing needed. */
	if (btrfs_header_nritems(mid) >
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
		return 0;

	/* Read errors on siblings are tolerated: just skip that side. */
	left = read_node_slot(fs_info, parent, pslot - 1);
	if (IS_ERR(left))
		left = NULL;

	if (left) {
		btrfs_tree_lock(left);
		btrfs_set_lock_blocking(left);
		wret = btrfs_cow_block(trans, root, left,
				       parent, pslot - 1, &left);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	right = read_node_slot(fs_info, parent, pslot + 1);
	if (IS_ERR(right))
		right = NULL;

	if (right) {
		btrfs_tree_lock(right);
		btrfs_set_lock_blocking(right);
		wret = btrfs_cow_block(trans, root, right,
				       parent, pslot + 1, &right);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	/* first, try to make some room in the middle buffer */
	if (left) {
		orig_slot += btrfs_header_nritems(left);
		wret = push_node_left(trans, fs_info, left, mid, 1);
		if (wret < 0)
			ret = wret;
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		wret = push_node_left(trans, fs_info, mid, right, 1);
		if (wret < 0 && wret != -ENOSPC)
			ret = wret;
		if (btrfs_header_nritems(right) == 0) {
			/* right is now empty: remove it from the tree */
			clean_tree_block(fs_info, right);
			btrfs_tree_unlock(right);
			del_ptr(root, path, level + 1, pslot + 1);
			root_sub_used(root, right->len);
			btrfs_free_tree_block(trans, root, right, 0, 1);
			free_extent_buffer_stale(right);
			right = NULL;
		} else {
			/* right shrank: update its key in the parent */
			struct btrfs_disk_key right_key;
			btrfs_node_key(right, &right_key, 0);
			tree_mod_log_set_node_key(fs_info, parent,
						  pslot + 1, 0);
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);
		}
	}
	if (btrfs_header_nritems(mid) == 1) {
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete.  A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right
		 */
		if (!left) {
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			goto enospc;
		}
		wret = balance_node_right(trans, fs_info, mid, left);
		if (wret < 0) {
			ret = wret;
			goto enospc;
		}
		if (wret == 1) {
			wret = push_node_left(trans, fs_info, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
		BUG_ON(wret == 1);
	}
	if (btrfs_header_nritems(mid) == 0) {
		/* mid is now empty: remove it entirely */
		clean_tree_block(fs_info, mid);
		btrfs_tree_unlock(mid);
		del_ptr(root, path, level + 1, pslot);
		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		free_extent_buffer_stale(mid);
		mid = NULL;
	} else {
		/* update the parent key to reflect our changes */
		struct btrfs_disk_key mid_key;
		btrfs_node_key(mid, &mid_key, 0);
		tree_mod_log_set_node_key(fs_info, parent, pslot, 0);
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(parent);
	}

	/* update the path */
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			extent_buffer_get(left);
			/* left was locked after cow */
			path->nodes[level] = left;
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
			if (mid) {
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			}
		} else {
			orig_slot -= btrfs_header_nritems(left);
			path->slots[level] = orig_slot;
		}
	}
	/* double check we haven't messed things up */
	if (orig_ptr !=
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
		BUG();
enospc:
	/* NOTE: despite the label name, this is also the normal exit path;
	 * drop the sibling locks and references acquired above. */
	if (right) {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	if (left) {
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	return ret;
}
2110
2111 /* Node balancing for insertion.  Here we only split or push nodes around
2112  * when they are completely full.  This is also done top down, so we
2113  * have to be pessimistic.
2114  */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];

	/* Returns 0 when room was made by pushing, 1 when the caller must
	 * split instead. */
	if (level == 0)
		return 1;

	mid = path->nodes[level];
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/* No parent means mid is the root: nothing to push into. */
	if (!parent)
		return 1;

	left = read_node_slot(fs_info, parent, pslot - 1);
	if (IS_ERR(left))
		left = NULL;

	/* first, try to make some room in the middle buffer */
	if (left) {
		u32 left_nr;

		btrfs_tree_lock(left);
		btrfs_set_lock_blocking(left);

		left_nr = btrfs_header_nritems(left);
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			/* left sibling is itself (nearly) full: can't push */
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, left, parent,
					      pslot - 1, &left);
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, fs_info,
						      left, mid, 0);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			/* push succeeded: fix up the parent key and path */
			struct btrfs_disk_key disk_key;
			orig_slot += left_nr;
			btrfs_node_key(mid, &disk_key, 0);
			tree_mod_log_set_node_key(fs_info, parent, pslot, 0);
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				/* the target slot moved into the left node */
				path->nodes[level] = left;
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				orig_slot -=
					btrfs_header_nritems(left);
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
			}
			return 0;
		}
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	right = read_node_slot(fs_info, parent, pslot + 1);
	if (IS_ERR(right))
		right = NULL;

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		u32 right_nr;

		btrfs_tree_lock(right);
		btrfs_set_lock_blocking(right);

		right_nr = btrfs_header_nritems(right);
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			/* right sibling is itself (nearly) full */
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
					      &right);
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, fs_info,
							  right, mid);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			/* push succeeded: fix up the parent key and path */
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			tree_mod_log_set_node_key(fs_info, parent,
						  pslot + 1, 0);
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				/* target slot moved into the right node */
				path->nodes[level] = right;
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
					btrfs_header_nritems(mid);
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
			}
			return 0;
		}
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	/* Neither sibling could absorb anything: caller must split. */
	return 1;
}
2250
2251 /*
2252  * readahead one full node of leaves, finding things that are close
2253  * to the block in 'slot', and triggering ra on them.
2254  */
static void reada_for_search(struct btrfs_fs_info *fs_info,
			     struct btrfs_path *path,
			     int level, int slot, u64 objectid)
{
	struct extent_buffer *node;
	struct btrfs_disk_key disk_key;
	u32 nritems;
	u64 search;
	u64 target;
	u64 nread = 0;
	struct extent_buffer *eb;
	u32 nr;
	u32 blocksize;
	u32 nscan = 0;

	/* Only readahead leaves, i.e. children of a level-1 node. */
	if (level != 1)
		return;

	if (!path->nodes[level])
		return;

	node = path->nodes[level];

	search = btrfs_node_blockptr(node, slot);
	blocksize = fs_info->nodesize;
	eb = find_extent_buffer(fs_info, search);
	if (eb) {
		/* Target block already cached: nothing to gain. */
		free_extent_buffer(eb);
		return;
	}

	target = search;

	nritems = btrfs_header_nritems(node);
	nr = slot;

	/*
	 * Walk outward from 'slot' in the configured readahead direction,
	 * issuing reads for blocks that live within 64k of the target on
	 * disk.
	 */
	while (1) {
		if (path->reada == READA_BACK) {
			if (nr == 0)
				break;
			nr--;
		} else if (path->reada == READA_FORWARD) {
			nr++;
			if (nr >= nritems)
				break;
		}
		if (path->reada == READA_BACK && objectid) {
			/* Stop once keys leave the requested objectid. */
			btrfs_node_key(node, &disk_key, nr);
			if (btrfs_disk_key_objectid(&disk_key) != objectid)
				break;
		}
		search = btrfs_node_blockptr(node, nr);
		if ((search <= target && target - search <= 65536) ||
		    (search > target && search - target <= 65536)) {
			readahead_tree_block(fs_info, search);
			nread += blocksize;
		}
		nscan++;
		/* Bound the work: at most 64k queued or 32 slots scanned. */
		if ((nread > 65536 || nscan > 32))
			break;
	}
}
2317
2318 static noinline void reada_for_balance(struct btrfs_fs_info *fs_info,
2319                                        struct btrfs_path *path, int level)
2320 {
2321         int slot;
2322         int nritems;
2323         struct extent_buffer *parent;
2324         struct extent_buffer *eb;
2325         u64 gen;
2326         u64 block1 = 0;
2327         u64 block2 = 0;
2328
2329         parent = path->nodes[level + 1];
2330         if (!parent)
2331                 return;
2332
2333         nritems = btrfs_header_nritems(parent);
2334         slot = path->slots[level + 1];
2335
2336         if (slot > 0) {
2337                 block1 = btrfs_node_blockptr(parent, slot - 1);
2338                 gen = btrfs_node_ptr_generation(parent, slot - 1);
2339                 eb = find_extent_buffer(fs_info, block1);
2340                 /*
2341                  * if we get -eagain from btrfs_buffer_uptodate, we
2342                  * don't want to return eagain here.  That will loop
2343                  * forever
2344                  */
2345                 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2346                         block1 = 0;
2347                 free_extent_buffer(eb);
2348         }
2349         if (slot + 1 < nritems) {
2350                 block2 = btrfs_node_blockptr(parent, slot + 1);
2351                 gen = btrfs_node_ptr_generation(parent, slot + 1);
2352                 eb = find_extent_buffer(fs_info, block2);
2353                 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2354                         block2 = 0;
2355                 free_extent_buffer(eb);
2356         }
2357
2358         if (block1)
2359                 readahead_tree_block(fs_info, block1);
2360         if (block2)
2361                 readahead_tree_block(fs_info, block2);
2362 }
2363
2364
2365 /*
2366  * when we walk down the tree, it is usually safe to unlock the higher layers
2367  * in the tree.  The exceptions are when our path goes through slot 0, because
2368  * operations on the tree might require changing key pointers higher up in the
2369  * tree.
2370  *
2371  * callers might also have set path->keep_locks, which tells this code to keep
2372  * the lock if the path points to the last slot in the block.  This is part of
2373  * walking through the tree, and selecting the next slot in the higher block.
2374  *
2375  * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
2376  * if lowest_unlock is 1, level 0 won't be unlocked
2377  */
static noinline void unlock_up(struct btrfs_path *path, int level,
                               int lowest_unlock, int min_write_lock_level,
                               int *write_lock_level)
{
	int i;
	int skip_level = level; /* levels at or below this stay locked */
	int no_skips = 0;       /* once set, stop raising skip_level */
	struct extent_buffer *t;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;
		/*
		 * slot 0 is special: a key change here must propagate
		 * into the parent, so keep the parent locked too.
		 */
		if (!no_skips && path->slots[i] == 0) {
			skip_level = i + 1;
			continue;
		}
		/*
		 * keep_locks: the caller is stepping to the next slot, so
		 * if we are at the last slot of this block the parent is
		 * needed to move on — keep it locked as well.
		 */
		if (!no_skips && path->keep_locks) {
			u32 nritems;
			t = path->nodes[i];
			nritems = btrfs_header_nritems(t);
			if (nritems < 1 || path->slots[i] >= nritems - 1) {
				skip_level = i + 1;
				continue;
			}
		}
		if (skip_level < i && i >= lowest_unlock)
			no_skips = 1;

		t = path->nodes[i];
		if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
			btrfs_tree_unlock_rw(t, path->locks[i]);
			path->locks[i] = 0;
			/* lower the write-lock watermark as locks drop */
			if (write_lock_level &&
			    i > min_write_lock_level &&
			    i <= *write_lock_level) {
				*write_lock_level = i - 1;
			}
		}
	}
}
2420
2421 /*
2422  * This releases any locks held in the path starting at level and
2423  * going all the way up to the root.
2424  *
2425  * btrfs_search_slot will keep the lock held on higher nodes in a few
2426  * corner cases, such as COW of the block at slot zero in the node.  This
2427  * ignores those rules, and it should only be called when there are no
2428  * more updates to be done higher up in the tree.
2429  */
2430 noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2431 {
2432         int i;
2433
2434         if (path->keep_locks)
2435                 return;
2436
2437         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2438                 if (!path->nodes[i])
2439                         continue;
2440                 if (!path->locks[i])
2441                         continue;
2442                 btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
2443                 path->locks[i] = 0;
2444         }
2445 }
2446
2447 /*
2448  * helper function for btrfs_search_slot.  The goal is to find a block
2449  * in cache without setting the path to blocking.  If we find the block
2450  * we return zero and the path is unchanged.
2451  *
2452  * If we can't find the block, we set the path blocking and do some
2453  * reada.  -EAGAIN is returned and the search must be repeated.
2454  */
2455 static int
2456 read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
2457                       struct extent_buffer **eb_ret, int level, int slot,
2458                       const struct btrfs_key *key)
2459 {
2460         struct btrfs_fs_info *fs_info = root->fs_info;
2461         u64 blocknr;
2462         u64 gen;
2463         struct extent_buffer *b = *eb_ret;
2464         struct extent_buffer *tmp;
2465         int ret;
2466
2467         blocknr = btrfs_node_blockptr(b, slot);
2468         gen = btrfs_node_ptr_generation(b, slot);
2469
2470         tmp = find_extent_buffer(fs_info, blocknr);
2471         if (tmp) {
2472                 /* first we do an atomic uptodate check */
2473                 if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
2474                         *eb_ret = tmp;
2475                         return 0;
2476                 }
2477
2478                 /* the pages were up to date, but we failed
2479                  * the generation number check.  Do a full
2480                  * read for the generation number that is correct.
2481                  * We must do this without dropping locks so
2482                  * we can trust our generation number
2483                  */
2484                 btrfs_set_path_blocking(p);
2485
2486                 /* now we're allowed to do a blocking uptodate check */
2487                 ret = btrfs_read_buffer(tmp, gen);
2488                 if (!ret) {
2489                         *eb_ret = tmp;
2490                         return 0;
2491                 }
2492                 free_extent_buffer(tmp);
2493                 btrfs_release_path(p);
2494                 return -EIO;
2495         }
2496
2497         /*
2498          * reduce lock contention at high levels
2499          * of the btree by dropping locks before
2500          * we read.  Don't release the lock on the current
2501          * level because we need to walk this node to figure
2502          * out which blocks to read.
2503          */
2504         btrfs_unlock_up_safe(p, level + 1);
2505         btrfs_set_path_blocking(p);
2506
2507         free_extent_buffer(tmp);
2508         if (p->reada != READA_NONE)
2509                 reada_for_search(fs_info, p, level, slot, key->objectid);
2510
2511         ret = -EAGAIN;
2512         tmp = read_tree_block(fs_info, blocknr, gen);
2513         if (!IS_ERR(tmp)) {
2514                 /*
2515                  * If the read above didn't mark this buffer up to date,
2516                  * it will never end up being up to date.  Set ret to EIO now
2517                  * and give up so that our caller doesn't loop forever
2518                  * on our EAGAINs.
2519                  */
2520                 if (!btrfs_buffer_uptodate(tmp, 0, 0))
2521                         ret = -EIO;
2522                 free_extent_buffer(tmp);
2523         } else {
2524                 ret = PTR_ERR(tmp);
2525         }
2526
2527         btrfs_release_path(p);
2528         return ret;
2529 }
2530
2531 /*
2532  * helper function for btrfs_search_slot.  This does all of the checks
2533  * for node-level blocks and does any balancing required based on
2534  * the ins_len.
2535  *
2536  * If no extra work was required, zero is returned.  If we had to
2537  * drop the path, -EAGAIN is returned and btrfs_search_slot must
2538  * start over
2539  */
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root, struct btrfs_path *p,
                       struct extent_buffer *b, int level, int ins_len,
                       int *write_lock_level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;

	/*
	 * Inserting (or splitting an item): split this node if it is
	 * nearly full so there is room for new key pointers.
	 */
	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
		int sret;

		/* splitting requires a write lock on the parent level */
		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}

		btrfs_set_path_blocking(p);
		reada_for_balance(fs_info, p, level);
		sret = split_node(trans, root, p, level);
		btrfs_clear_path_blocking(p, NULL, 0);

		BUG_ON(sret > 0);
		if (sret) {
			ret = sret;
			goto done;
		}
		b = p->nodes[level];
	} else if (ins_len < 0 && btrfs_header_nritems(b) <
		   BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
		/*
		 * Deleting: balance/merge this node if it is less than
		 * half full, to keep the tree from going sparse.
		 */
		int sret;

		/* balancing also needs a write lock on the parent level */
		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}

		btrfs_set_path_blocking(p);
		reada_for_balance(fs_info, p, level);
		sret = balance_level(trans, root, p, level);
		btrfs_clear_path_blocking(p, NULL, 0);

		if (sret) {
			ret = sret;
			goto done;
		}
		b = p->nodes[level];
		/* balance_level may have collapsed this level entirely */
		if (!b) {
			btrfs_release_path(p);
			goto again;
		}
		BUG_ON(btrfs_header_nritems(b) == 1);
	}
	return 0;

again:
	ret = -EAGAIN;
done:
	return ret;
}
2603
static void key_search_validate(struct extent_buffer *b,
                                const struct btrfs_key *key,
                                int level)
{
#ifdef CONFIG_BTRFS_ASSERT
	struct btrfs_disk_key disk_key;
	unsigned long key_off;

	btrfs_cpu_key_to_disk(&disk_key, key);

	/* the first key lives at a different offset in leaves vs. nodes */
	if (level == 0)
		key_off = offsetof(struct btrfs_leaf, items[0].key);
	else
		key_off = offsetof(struct btrfs_node, ptrs[0].key);

	ASSERT(!memcmp_extent_buffer(b, &disk_key, key_off,
				     sizeof(disk_key)));
#endif
}
2623
static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
		      int level, int *prev_cmp, int *slot)
{
	/*
	 * If an earlier level already compared equal, the leftmost
	 * descendant holds the same key, so slot 0 matches without a
	 * fresh binary search (validated when asserts are enabled).
	 */
	if (*prev_cmp == 0) {
		key_search_validate(b, key, level);
		*slot = 0;
		return 0;
	}

	*prev_cmp = bin_search(b, key, level, slot);
	return *prev_cmp;
}
2637
2638 int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
2639                 u64 iobjectid, u64 ioff, u8 key_type,
2640                 struct btrfs_key *found_key)
2641 {
2642         int ret;
2643         struct btrfs_key key;
2644         struct extent_buffer *eb;
2645
2646         ASSERT(path);
2647         ASSERT(found_key);
2648
2649         key.type = key_type;
2650         key.objectid = iobjectid;
2651         key.offset = ioff;
2652
2653         ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
2654         if (ret < 0)
2655                 return ret;
2656
2657         eb = path->nodes[0];
2658         if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
2659                 ret = btrfs_next_leaf(fs_root, path);
2660                 if (ret)
2661                         return ret;
2662                 eb = path->nodes[0];
2663         }
2664
2665         btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
2666         if (found_key->type != key.type ||
2667                         found_key->objectid != key.objectid)
2668                 return 1;
2669
2670         return 0;
2671 }
2672
2673 /*
2674  * look for key in the tree.  path is filled in with nodes along the way
2675  * if key is found, we return zero and you can find the item in the leaf
2676  * level of the path (level 0)
2677  *
2678  * If the key isn't found, the path points to the slot where it should
2679  * be inserted, and 1 is returned.  If there are other errors during the
2680  * search a negative error number is returned.
2681  *
2682  * if ins_len > 0, nodes and leaves will be split as we walk down the
2683  * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
2684  * possible)
2685  */
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *key, struct btrfs_path *p,
		      int ins_len, int cow)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	int root_lock;
	/* everything at write_lock_level or lower must be write locked */
	int write_lock_level = 0;
	u8 lowest_level = 0;
	int min_write_lock_level;
	int prev_cmp;

	lowest_level = p->lowest_level;
	WARN_ON(lowest_level && ins_len > 0);
	WARN_ON(p->nodes[0] != NULL);
	BUG_ON(!cow && ins_len);

	if (ins_len < 0) {
		lowest_unlock = 2;

		/* when we are removing items, we might have to go up to level
		 * two as we update tree pointers  Make sure we keep write
		 * for those levels as well
		 */
		write_lock_level = 2;
	} else if (ins_len > 0) {
		/*
		 * for inserting items, make sure we have a write lock on
		 * level 1 so we can update keys
		 */
		write_lock_level = 1;
	}

	if (!cow)
		write_lock_level = -1;

	if (cow && (p->keep_locks || p->lowest_level))
		write_lock_level = BTRFS_MAX_LEVEL;

	min_write_lock_level = write_lock_level;

again:
	prev_cmp = -1;
	/*
	 * we try very hard to do read locks on the root
	 */
	root_lock = BTRFS_READ_LOCK;
	level = 0;
	if (p->search_commit_root) {
		/*
		 * the commit roots are read only
		 * so we always do read locks
		 */
		if (p->need_commit_sem)
			down_read(&fs_info->commit_root_sem);
		b = root->commit_root;
		extent_buffer_get(b);
		level = btrfs_header_level(b);
		if (p->need_commit_sem)
			up_read(&fs_info->commit_root_sem);
		if (!p->skip_locking)
			btrfs_tree_read_lock(b);
	} else {
		if (p->skip_locking) {
			b = btrfs_root_node(root);
			level = btrfs_header_level(b);
		} else {
			/* we don't know the level of the root node
			 * until we actually have it read locked
			 */
			b = btrfs_read_lock_root_node(root);
			level = btrfs_header_level(b);
			if (level <= write_lock_level) {
				/* whoops, must trade for write lock */
				btrfs_tree_read_unlock(b);
				free_extent_buffer(b);
				b = btrfs_lock_root_node(root);
				root_lock = BTRFS_WRITE_LOCK;

				/* the level might have changed, check again */
				level = btrfs_header_level(b);
			}
		}
	}
	p->nodes[level] = b;
	if (!p->skip_locking)
		p->locks[level] = root_lock;

	/* walk down the tree, COWing and balancing nodes as needed */
	while (b) {
		level = btrfs_header_level(b);

		/*
		 * setup the path here so we can release it under lock
		 * contention with the cow code
		 */
		if (cow) {
			bool last_level = (level == (BTRFS_MAX_LEVEL - 1));

			/*
			 * if we don't really need to cow this block
			 * then we don't want to set the path blocking,
			 * so we test it here
			 */
			if (!should_cow_block(trans, root, b)) {
				trans->dirty = true;
				goto cow_done;
			}

			/*
			 * must have write locks on this node and the
			 * parent
			 */
			if (level > write_lock_level ||
			    (level + 1 > write_lock_level &&
			    level + 1 < BTRFS_MAX_LEVEL &&
			    p->nodes[level + 1])) {
				write_lock_level = level + 1;
				btrfs_release_path(p);
				goto again;
			}

			btrfs_set_path_blocking(p);
			if (last_level)
				err = btrfs_cow_block(trans, root, b, NULL, 0,
						      &b);
			else
				err = btrfs_cow_block(trans, root, b,
						      p->nodes[level + 1],
						      p->slots[level + 1], &b);
			if (err) {
				ret = err;
				goto done;
			}
		}
cow_done:
		p->nodes[level] = b;
		btrfs_clear_path_blocking(p, NULL, 0);

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way to for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 *
		 * If we're inserting or deleting (ins_len != 0), then we might
		 * be changing slot zero, which may require changing the parent.
		 * So, we can't drop the lock until after we know which slot
		 * we're operating on.
		 */
		if (!ins_len && !p->keep_locks) {
			int u = level + 1;

			if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
				btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
				p->locks[u] = 0;
			}
		}

		ret = key_search(b, key, level, &prev_cmp, &slot);
		if (ret < 0)
			goto done;

		if (level != 0) {
			int dec = 0;
			/*
			 * no exact match at this node: follow the pointer
			 * just before the insert position
			 */
			if (ret && slot > 0) {
				dec = 1;
				slot -= 1;
			}
			p->slots[level] = slot;
			err = setup_nodes_for_search(trans, root, p, b, level,
					     ins_len, &write_lock_level);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}
			b = p->nodes[level];
			slot = p->slots[level];

			/*
			 * slot 0 is special, if we change the key
			 * we have to update the parent pointer
			 * which means we must have a write lock
			 * on the parent
			 */
			if (slot == 0 && ins_len &&
			    write_lock_level < level + 1) {
				write_lock_level = level + 1;
				btrfs_release_path(p);
				goto again;
			}

			unlock_up(p, level, lowest_unlock,
				  min_write_lock_level, &write_lock_level);

			if (level == lowest_level) {
				if (dec)
					p->slots[level]++;
				goto done;
			}

			err = read_block_for_search(root, p, &b, level,
						    slot, key);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}

			if (!p->skip_locking) {
				level = btrfs_header_level(b);
				if (level <= write_lock_level) {
					/* try without blocking first; fall
					 * back to a blocking lock if busy */
					err = btrfs_try_tree_write_lock(b);
					if (!err) {
						btrfs_set_path_blocking(p);
						btrfs_tree_lock(b);
						btrfs_clear_path_blocking(p, b,
								  BTRFS_WRITE_LOCK);
					}
					p->locks[level] = BTRFS_WRITE_LOCK;
				} else {
					err = btrfs_tree_read_lock_atomic(b);
					if (!err) {
						btrfs_set_path_blocking(p);
						btrfs_tree_read_lock(b);
						btrfs_clear_path_blocking(p, b,
								  BTRFS_READ_LOCK);
					}
					p->locks[level] = BTRFS_READ_LOCK;
				}
				p->nodes[level] = b;
			}
		} else {
			/* leaf level: ensure room if we're inserting */
			p->slots[level] = slot;
			if (ins_len > 0 &&
			    btrfs_leaf_free_space(fs_info, b) < ins_len) {
				if (write_lock_level < 1) {
					write_lock_level = 1;
					btrfs_release_path(p);
					goto again;
				}

				btrfs_set_path_blocking(p);
				err = split_leaf(trans, root, key,
						 p, ins_len, ret == 0);
				btrfs_clear_path_blocking(p, NULL, 0);

				BUG_ON(err > 0);
				if (err) {
					ret = err;
					goto done;
				}
			}
			if (!p->search_for_split)
				unlock_up(p, level, lowest_unlock,
					  min_write_lock_level, &write_lock_level);
			goto done;
		}
	}
	ret = 1;
done:
	/*
	 * we don't really know what they plan on doing with the path
	 * from here on, so for now just mark it as blocking
	 */
	if (!p->leave_spinning)
		btrfs_set_path_blocking(p);
	if (ret < 0 && !p->skip_release_on_error)
		btrfs_release_path(p);
	return ret;
}
2965
2966 /*
2967  * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2968  * current state of the tree together with the operations recorded in the tree
2969  * modification log to search for the key in a previous version of this tree, as
2970  * denoted by the time_seq parameter.
2971  *
2972  * Naturally, there is no support for insert, delete or cow operations.
2973  *
2974  * The resulting path and return value will be set up as if we called
2975  * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2976  */
int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
			  struct btrfs_path *p, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	u8 lowest_level = 0;
	int prev_cmp = -1;

	lowest_level = p->lowest_level;
	WARN_ON(p->nodes[0] != NULL);

	/* commit roots are immutable, no rewind needed */
	if (p->search_commit_root) {
		BUG_ON(time_seq);
		return btrfs_search_slot(NULL, root, key, p, 0, 0);
	}

again:
	b = get_old_root(root, time_seq);
	if (!b) {
		ret = -EIO;
		goto done;
	}
	level = btrfs_header_level(b);
	p->locks[level] = BTRFS_READ_LOCK;

	while (b) {
		level = btrfs_header_level(b);
		p->nodes[level] = b;
		btrfs_clear_path_blocking(p, NULL, 0);

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way to for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 */
		btrfs_unlock_up_safe(p, level + 1);

		/*
		 * Since we can unwind ebs we want to do a real search every
		 * time.
		 */
		prev_cmp = -1;
		ret = key_search(b, key, level, &prev_cmp, &slot);

		if (level != 0) {
			int dec = 0;
			/*
			 * no exact match: follow the pointer just before
			 * the insert position
			 */
			if (ret && slot > 0) {
				dec = 1;
				slot -= 1;
			}
			p->slots[level] = slot;
			unlock_up(p, level, lowest_unlock, 0, NULL);

			if (level == lowest_level) {
				if (dec)
					p->slots[level]++;
				goto done;
			}

			err = read_block_for_search(root, p, &b, level,
						    slot, key);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}

			level = btrfs_header_level(b);
			/* try the cheap atomic read lock first */
			err = btrfs_tree_read_lock_atomic(b);
			if (!err) {
				btrfs_set_path_blocking(p);
				btrfs_tree_read_lock(b);
				btrfs_clear_path_blocking(p, b,
							  BTRFS_READ_LOCK);
			}
			/* rewind the buffer to its state at time_seq */
			b = tree_mod_log_rewind(fs_info, p, b, time_seq);
			if (!b) {
				ret = -ENOMEM;
				goto done;
			}
			p->locks[level] = BTRFS_READ_LOCK;
			p->nodes[level] = b;
		} else {
			p->slots[level] = slot;
			unlock_up(p, level, lowest_unlock, 0, NULL);
			goto done;
		}
	}
	ret = 1;
done:
	if (!p->leave_spinning)
		btrfs_set_path_blocking(p);
	if (ret < 0)
		btrfs_release_path(p);

	return ret;
}
3081
/*
 * helper to use instead of search slot if no exact match is needed but
 * instead the next or previous item should be returned.
 * When find_higher is true, the next higher item is returned, the next lower
 * otherwise.
 * When return_any and find_higher are both true, and no higher item is found,
 * return the next lower instead.
 * When return_any is true and find_higher is false, and no lower item is found,
 * return the next higher instead.
 * It returns 0 if any item is found, 1 if none is found (tree empty), and
 * < 0 on error
 */
int btrfs_search_slot_for_read(struct btrfs_root *root,
			       const struct btrfs_key *key,
			       struct btrfs_path *p, int find_higher,
			       int return_any)
{
	int ret;
	struct extent_buffer *leaf;

again:
	/* read-only search: no transaction handle, cow == 0 */
	ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
	if (ret <= 0)
		return ret;
	/*
	 * a return value of 1 means the path is at the position where the
	 * item should be inserted. Normally this is the next bigger item,
	 * but in case the previous item is the last in a leaf, path points
	 * to the first free slot in the previous leaf, i.e. at an invalid
	 * item.
	 */
	leaf = p->nodes[0];

	if (find_higher) {
		if (p->slots[0] >= btrfs_header_nritems(leaf)) {
			/* slot points past the last item, advance a leaf */
			ret = btrfs_next_leaf(root, p);
			if (ret <= 0)
				return ret;
			if (!return_any)
				return 1;
			/*
			 * no higher item found, return the next
			 * lower instead
			 */
			return_any = 0;
			find_higher = 0;
			btrfs_release_path(p);
			goto again;
		}
	} else {
		if (p->slots[0] == 0) {
			/* nothing lower in this leaf, step to the previous */
			ret = btrfs_prev_leaf(root, p);
			if (ret < 0)
				return ret;
			if (!ret) {
				leaf = p->nodes[0];
				/*
				 * btrfs_prev_leaf() can leave the slot one
				 * past the last item; clamp to the last
				 * valid item of the previous leaf.
				 */
				if (p->slots[0] == btrfs_header_nritems(leaf))
					p->slots[0]--;
				return 0;
			}
			if (!return_any)
				return 1;
			/*
			 * no lower item found, return the next
			 * higher instead
			 */
			return_any = 0;
			find_higher = 1;
			btrfs_release_path(p);
			goto again;
		} else {
			/* step back to the item just below the search key */
			--p->slots[0];
		}
	}
	return 0;
}
3158
3159 /*
3160  * adjust the pointers going up the tree, starting at level
3161  * making sure the right key of each node is points to 'key'.
3162  * This is used after shifting pointers to the left, so it stops
3163  * fixing up pointers when a given leaf/node is not in slot 0 of the
3164  * higher levels
3165  *
3166  */
3167 static void fixup_low_keys(struct btrfs_fs_info *fs_info,
3168                            struct btrfs_path *path,
3169                            struct btrfs_disk_key *key, int level)
3170 {
3171         int i;
3172         struct extent_buffer *t;
3173
3174         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
3175                 int tslot = path->slots[i];
3176                 if (!path->nodes[i])
3177                         break;
3178                 t = path->nodes[i];
3179                 tree_mod_log_set_node_key(fs_info, t, tslot, 1);
3180                 btrfs_set_node_key(t, key, tslot);
3181                 btrfs_mark_buffer_dirty(path->nodes[i]);
3182                 if (tslot != 0)
3183                         break;
3184         }
3185 }
3186
3187 /*
3188  * update item key.
3189  *
3190  * This function isn't completely safe. It's the caller's responsibility
3191  * that the new key won't break the order
3192  */
3193 void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
3194                              struct btrfs_path *path,
3195                              const struct btrfs_key *new_key)
3196 {
3197         struct btrfs_disk_key disk_key;
3198         struct extent_buffer *eb;
3199         int slot;
3200
3201         eb = path->nodes[0];
3202         slot = path->slots[0];
3203         if (slot > 0) {
3204                 btrfs_item_key(eb, &disk_key, slot - 1);
3205                 BUG_ON(comp_keys(&disk_key, new_key) >= 0);
3206         }
3207         if (slot < btrfs_header_nritems(eb) - 1) {
3208                 btrfs_item_key(eb, &disk_key, slot + 1);
3209                 BUG_ON(comp_keys(&disk_key, new_key) <= 0);
3210         }
3211
3212         btrfs_cpu_key_to_disk(&disk_key, new_key);
3213         btrfs_set_item_key(eb, &disk_key, slot);
3214         btrfs_mark_buffer_dirty(eb);
3215         if (slot == 0)
3216                 fixup_low_keys(fs_info, path, &disk_key, 1);
3217 }
3218
/*
 * try to push data from one node into the next node left in the
 * tree.
 *
 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
 * error, and > 0 if there was no room in the left hand block.
 */
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_fs_info *fs_info,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty)
{
	int push_items = 0;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	/* free key/pointer slots remaining in the destination node */
	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
	/* both nodes must already be COWed in the running transaction */
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	/* unless we are emptying src, don't bother with a nearly empty node */
	if (!empty && src_nritems <= 8)
		return 1;

	if (push_items <= 0)
		return 1;

	if (empty) {
		push_items = min(src_nritems, push_items);
		if (push_items < src_nritems) {
			/* leave at least 8 pointers in the node if
			 * we aren't going to empty it
			 */
			if (src_nritems - push_items < 8) {
				if (push_items <= 8)
					return 1;
				push_items -= 8;
			}
		}
	} else
		push_items = min(src_nritems - 8, push_items);

	/* log the pointer move before modifying dst */
	ret = tree_mod_log_eb_copy(fs_info, dst, src, dst_nritems, 0,
				   push_items);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	/* append the first push_items key/ptr pairs of src to dst */
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(dst_nritems),
			   btrfs_node_key_ptr_offset(0),
			   push_items * sizeof(struct btrfs_key_ptr));

	if (push_items < src_nritems) {
		/*
		 * don't call tree_mod_log_eb_move here, key removal was already
		 * fully logged by tree_mod_log_eb_copy above.
		 */
		/* close the gap at the front of src */
		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
				      btrfs_node_key_ptr_offset(push_items),
				      (src_nritems - push_items) *
				      sizeof(struct btrfs_key_ptr));
	}
	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);
	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
3291
/*
 * try to push data from one node into the next node right in the
 * tree.
 *
 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
 * error, and > 0 if there was no room in the right hand block.
 *
 * this will  only push up to 1/2 the contents of the left node over
 */
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_fs_info *fs_info,
			      struct extent_buffer *dst,
			      struct extent_buffer *src)
{
	int push_items = 0;
	int max_push;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	/* both nodes must already be COWed in the running transaction */
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	/* free key/pointer slots remaining in the destination node */
	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
	if (push_items <= 0)
		return 1;

	/* nearly empty source, not worth rebalancing */
	if (src_nritems < 4)
		return 1;

	max_push = src_nritems / 2 + 1;
	/* don't try to empty the node */
	if (max_push >= src_nritems)
		return 1;

	if (max_push < push_items)
		push_items = max_push;

	/* shift dst's existing pointers right to make room at the front */
	tree_mod_log_eb_move(fs_info, dst, push_items, 0, dst_nritems);
	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
				      btrfs_node_key_ptr_offset(0),
				      (dst_nritems) *
				      sizeof(struct btrfs_key_ptr));

	/* log, then copy the tail of src into the front of dst */
	ret = tree_mod_log_eb_copy(fs_info, dst, src, 0,
				   src_nritems - push_items, push_items);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(src_nritems - push_items),
			   push_items * sizeof(struct btrfs_key_ptr));

	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);

	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
3357
/*
 * helper function to insert a new root level in the tree.
 * A new node is allocated, and a single item is inserted to
 * point to the existing root
 *
 * returns zero on success or < 0 on failure.
 */
static noinline int insert_new_root(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 lower_gen;
	struct extent_buffer *lower;
	struct extent_buffer *c;
	struct extent_buffer *old;
	struct btrfs_disk_key lower_key;

	/* the new level must be empty and sit directly above the old root */
	BUG_ON(path->nodes[level]);
	BUG_ON(path->nodes[level-1] != root->node);

	/* the new root's single key is the first key of the old root */
	lower = path->nodes[level-1];
	if (level == 1)
		btrfs_item_key(lower, &lower_key, 0);
	else
		btrfs_node_key(lower, &lower_key, 0);

	c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
				   &lower_key, level, root->node->start, 0);
	if (IS_ERR(c))
		return PTR_ERR(c);

	root_add_used(root, fs_info->nodesize);

	/* initialize the header of the freshly allocated node */
	memzero_extent_buffer(c, 0, sizeof(struct btrfs_header));
	btrfs_set_header_nritems(c, 1);
	btrfs_set_header_level(c, level);
	btrfs_set_header_bytenr(c, c->start);
	btrfs_set_header_generation(c, trans->transid);
	btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
	btrfs_set_header_owner(c, root->root_key.objectid);

	write_extent_buffer_fsid(c, fs_info->fsid);
	write_extent_buffer_chunk_tree_uuid(c, fs_info->chunk_tree_uuid);

	/* slot 0 points at the old root */
	btrfs_set_node_key(c, &lower_key, 0);
	btrfs_set_node_blockptr(c, 0, lower->start);
	lower_gen = btrfs_header_generation(lower);
	WARN_ON(lower_gen != trans->transid);

	btrfs_set_node_ptr_generation(c, 0, lower_gen);

	btrfs_mark_buffer_dirty(c);

	/* publish the new root; log the swap before readers can see it */
	old = root->node;
	tree_mod_log_set_root_pointer(root, c, 0);
	rcu_assign_pointer(root->node, c);

	/* the super has an extra ref to root->node */
	free_extent_buffer(old);

	add_root_to_dirty_list(root);
	/* the path takes its own reference on the new root */
	extent_buffer_get(c);
	path->nodes[level] = c;
	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
	path->slots[level] = 0;
	return 0;
}
3426
/*
 * worker function to insert a single pointer in a node.
 * the node should have enough room for the pointer already
 *
 * slot and level indicate where you want the key to go, and
 * blocknr is the block the key points to.
 */
static void insert_ptr(struct btrfs_trans_handle *trans,
		       struct btrfs_fs_info *fs_info, struct btrfs_path *path,
		       struct btrfs_disk_key *key, u64 bytenr,
		       int slot, int level)
{
	struct extent_buffer *lower;
	int nritems;
	int ret;

	BUG_ON(!path->nodes[level]);
	btrfs_assert_tree_locked(path->nodes[level]);
	lower = path->nodes[level];
	nritems = btrfs_header_nritems(lower);
	BUG_ON(slot > nritems);
	/* caller guarantees there is room for one more pointer */
	BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(fs_info));
	if (slot != nritems) {
		/* leaves (level 0) are not tracked by the tree mod log */
		if (level)
			tree_mod_log_eb_move(fs_info, lower, slot + 1,
					     slot, nritems - slot);
		/* shift existing pointers right to open up the slot */
		memmove_extent_buffer(lower,
			      btrfs_node_key_ptr_offset(slot + 1),
			      btrfs_node_key_ptr_offset(slot),
			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
	}
	if (level) {
		ret = tree_mod_log_insert_key(fs_info, lower, slot,
					      MOD_LOG_KEY_ADD, GFP_NOFS);
		BUG_ON(ret < 0);
	}
	btrfs_set_node_key(lower, key, slot);
	btrfs_set_node_blockptr(lower, slot, bytenr);
	WARN_ON(trans->transid == 0);
	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
	btrfs_set_header_nritems(lower, nritems + 1);
	btrfs_mark_buffer_dirty(lower);
}
3470
/*
 * split the node at the specified level in path in two.
 * The path is corrected to point to the appropriate node after the split
 *
 * Before splitting this tries to make some room in the node by pushing
 * left and right, if either one works, it returns right away.
 *
 * returns 0 on success and < 0 on failure
 */
static noinline int split_node(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *c;
	struct extent_buffer *split;
	struct btrfs_disk_key disk_key;
	int mid;
	int ret;
	u32 c_nritems;

	c = path->nodes[level];
	WARN_ON(btrfs_header_generation(c) != trans->transid);
	if (c == root->node) {
		/*
		 * trying to split the root, lets make a new one
		 *
		 * tree mod log: We don't log_removal old root in
		 * insert_new_root, because that root buffer will be kept as a
		 * normal node. We are going to log removal of half of the
		 * elements below with tree_mod_log_eb_copy. We're holding a
		 * tree lock on the buffer, which is why we cannot race with
		 * other tree_mod_log users.
		 */
		ret = insert_new_root(trans, root, path, level + 1);
		if (ret)
			return ret;
	} else {
		/* pushing to a neighbor may make the split unnecessary */
		ret = push_nodes_for_insert(trans, root, path, level);
		c = path->nodes[level];
		if (!ret && btrfs_header_nritems(c) <
		    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
			return 0;
		if (ret < 0)
			return ret;
	}

	/* the upper half [mid, c_nritems) moves into the new node */
	c_nritems = btrfs_header_nritems(c);
	mid = (c_nritems + 1) / 2;
	btrfs_node_key(c, &disk_key, mid);

	split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
			&disk_key, level, c->start, 0);
	if (IS_ERR(split))
		return PTR_ERR(split);

	root_add_used(root, fs_info->nodesize);

	/* initialize the header of the new sibling node */
	memzero_extent_buffer(split, 0, sizeof(struct btrfs_header));
	btrfs_set_header_level(split, btrfs_header_level(c));
	btrfs_set_header_bytenr(split, split->start);
	btrfs_set_header_generation(split, trans->transid);
	btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
	btrfs_set_header_owner(split, root->root_key.objectid);
	write_extent_buffer_fsid(split, fs_info->fsid);
	write_extent_buffer_chunk_tree_uuid(split, fs_info->chunk_tree_uuid);

	/* log the pointer move before copying it into the new node */
	ret = tree_mod_log_eb_copy(fs_info, split, c, 0, mid, c_nritems - mid);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	copy_extent_buffer(split, c,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(mid),
			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
	btrfs_set_header_nritems(split, c_nritems - mid);
	btrfs_set_header_nritems(c, mid);
	ret = 0;

	btrfs_mark_buffer_dirty(c);
	btrfs_mark_buffer_dirty(split);

	/* link the new node into the parent, right after c */
	insert_ptr(trans, fs_info, path, &disk_key, split->start,
		   path->slots[level + 1] + 1, level + 1);

	/* fix the path to point at whichever node now holds our slot */
	if (path->slots[level] >= mid) {
		path->slots[level] -= mid;
		btrfs_tree_unlock(c);
		free_extent_buffer(c);
		path->nodes[level] = split;
		path->slots[level + 1] += 1;
	} else {
		btrfs_tree_unlock(split);
		free_extent_buffer(split);
	}
	return ret;
}
3569
3570 /*
3571  * how many bytes are required to store the items in a leaf.  start
3572  * and nr indicate which items in the leaf to check.  This totals up the
3573  * space used both by the item structs and the item data
3574  */
3575 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3576 {
3577         struct btrfs_item *start_item;
3578         struct btrfs_item *end_item;
3579         struct btrfs_map_token token;
3580         int data_len;
3581         int nritems = btrfs_header_nritems(l);
3582         int end = min(nritems, start + nr) - 1;
3583
3584         if (!nr)
3585                 return 0;
3586         btrfs_init_map_token(&token);
3587         start_item = btrfs_item_nr(start);
3588         end_item = btrfs_item_nr(end);
3589         data_len = btrfs_token_item_offset(l, start_item, &token) +
3590                 btrfs_token_item_size(l, start_item, &token);
3591         data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
3592         data_len += sizeof(struct btrfs_item) * nr;
3593         WARN_ON(data_len < 0);
3594         return data_len;
3595 }
3596
3597 /*
3598  * The space between the end of the leaf items and
3599  * the start of the leaf data.  IOW, how much room
3600  * the leaf has left for both items and data
3601  */
3602 noinline int btrfs_leaf_free_space(struct btrfs_fs_info *fs_info,
3603                                    struct extent_buffer *leaf)
3604 {
3605         int nritems = btrfs_header_nritems(leaf);
3606         int ret;
3607
3608         ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
3609         if (ret < 0) {
3610                 btrfs_crit(fs_info,
3611                            "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3612                            ret,
3613                            (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
3614                            leaf_space_used(leaf, 0, nritems), nritems);
3615         }
3616         return ret;
3617 }
3618
/*
 * min slot controls the lowest index we're willing to push to the
 * right.  We'll push up to and including min_slot, but no lower
 */
static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info,
				      struct btrfs_path *path,
				      int data_size, int empty,
				      struct extent_buffer *right,
				      int free_space, u32 left_nritems,
				      u32 min_slot)
{
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *upper = path->nodes[1];
	struct btrfs_map_token token;
	struct btrfs_disk_key disk_key;
	int slot;
	u32 i;
	int push_space = 0;
	int push_items = 0;
	struct btrfs_item *item;
	u32 nr;
	u32 right_nritems;
	u32 data_end;
	u32 this_item_size;

	btrfs_init_map_token(&token);

	if (empty)
		nr = 0;
	else
		nr = max_t(u32, 1, min_slot);

	/* reserve room for the pending insert if it lands past the end */
	if (path->slots[0] >= left_nritems)
		push_space += data_size;

	slot = path->slots[1];
	/*
	 * walk the items from the end of the left leaf downward, counting
	 * how many fit into the right leaf's free space.
	 */
	i = left_nritems - 1;
	while (i >= nr) {
		item = btrfs_item_nr(i);

		if (!empty && push_items > 0) {
			/* never push the slot the caller is working on away */
			if (path->slots[0] > i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(fs_info, left);
				if (space + push_space * 2 > free_space)
					break;
			}
		}

		/* account for the pending insert at this slot */
		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(left, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(*item);
		/* i is unsigned, guard against wrapping below zero */
		if (i == 0)
			break;
		i--;
	}

	if (push_items == 0)
		goto out_unlock;

	WARN_ON(!empty && push_items == left_nritems);

	/* push left to right */
	right_nritems = btrfs_header_nritems(right);

	/* bytes of item data being moved out of the left leaf */
	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
	push_space -= leaf_data_end(fs_info, left);

	/* make room in the right data area */
	data_end = leaf_data_end(fs_info, right);
	memmove_extent_buffer(right,
			      BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
			      BTRFS_LEAF_DATA_OFFSET + data_end,
			      BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);

	/* copy from the left data area */
	copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
		     BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
		     BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, left),
		     push_space);

	/* shift the right leaf's item headers to make room at the front */
	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
			      btrfs_item_nr_offset(0),
			      right_nritems * sizeof(struct btrfs_item));

	/* copy the items from left to right */
	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
		   btrfs_item_nr_offset(left_nritems - push_items),
		   push_items * sizeof(struct btrfs_item));

	/* update the item pointers */
	right_nritems += push_items;
	btrfs_set_header_nritems(right, right_nritems);
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
	/* recompute every data offset in the right leaf from scratch */
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(i);
		push_space -= btrfs_token_item_size(right, item, &token);
		btrfs_set_token_item_offset(right, item, push_space, &token);
	}

	left_nritems -= push_items;
	btrfs_set_header_nritems(left, left_nritems);

	if (left_nritems)
		btrfs_mark_buffer_dirty(left);
	else
		clean_tree_block(fs_info, left);

	btrfs_mark_buffer_dirty(right);

	/* the right leaf's first key changed, update the parent */
	btrfs_item_key(right, &disk_key, 0);
	btrfs_set_node_key(upper, &disk_key, slot + 1);
	btrfs_mark_buffer_dirty(upper);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] >= left_nritems) {
		path->slots[0] -= left_nritems;
		if (btrfs_header_nritems(path->nodes[0]) == 0)
			clean_tree_block(fs_info, path->nodes[0]);
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 0;

out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}
3760
/*
 * push some data in the path leaf to the right, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 *
 * returns 1 if the push failed because the other node didn't have enough
 * room, 0 if everything worked out and < 0 if there were major errors.
 *
 * this will push starting from min_slot to the end of the leaf.  It won't
 * push any slot lower than min_slot
 */
static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
			   *root, struct btrfs_path *path,
			   int min_data_size, int data_size,
			   int empty, u32 min_slot)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *right;
	struct extent_buffer *upper;
	int slot;
	int free_space;
	u32 left_nritems;
	int ret;

	/* no parent means this leaf is the root, nothing to push to */
	if (!path->nodes[1])
		return 1;

	slot = path->slots[1];
	upper = path->nodes[1];
	/* already the rightmost child of the parent */
	if (slot >= btrfs_header_nritems(upper) - 1)
		return 1;

	btrfs_assert_tree_locked(path->nodes[1]);

	right = read_node_slot(fs_info, upper, slot + 1);
	/*
	 * slot + 1 is not valid or we fail to read the right node,
	 * no big deal, just return.
	 */
	if (IS_ERR(right))
		return 1;

	btrfs_tree_lock(right);
	btrfs_set_lock_blocking(right);

	free_space = btrfs_leaf_free_space(fs_info, right);
	if (free_space < data_size)
		goto out_unlock;

	/* cow and double check */
	ret = btrfs_cow_block(trans, root, right, upper,
			      slot + 1, &right);
	if (ret)
		goto out_unlock;

	/* COW may have changed the buffer, re-check the free space */
	free_space = btrfs_leaf_free_space(fs_info, right);
	if (free_space < data_size)
		goto out_unlock;

	left_nritems = btrfs_header_nritems(left);
	if (left_nritems == 0)
		goto out_unlock;

	if (path->slots[0] == left_nritems && !empty) {
		/* Key greater than all keys in the leaf, right neighbor has
		 * enough room for it and we're not emptying our leaf to delete
		 * it, therefore use right neighbor to insert the new item and
		 * no need to touch/dirty our left leaft. */
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
		path->nodes[0] = right;
		path->slots[0] = 0;
		path->slots[1]++;
		return 0;
	}

	/* __push_leaf_right() takes over the lock/ref on right */
	return __push_leaf_right(fs_info, path, min_data_size, empty,
				right, free_space, left_nritems, min_slot);
out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}
3844
/*
 * push some data in the path leaf to the left, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 *
 * max_slot can put a limit on how far into the leaf we'll push items.  The
 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
 * items
 *
 * 'left' must be locked by the caller; on return it has either been
 * installed into the path or had its lock and reference dropped.
 */
static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info,
				     struct btrfs_path *path, int data_size,
				     int empty, struct extent_buffer *left,
				     int free_space, u32 right_nritems,
				     u32 max_slot)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *right = path->nodes[0];
	int i;
	int push_space = 0;
	int push_items = 0;
	struct btrfs_item *item;
	u32 old_left_nritems;
	u32 nr;
	int ret = 0;
	u32 this_item_size;
	u32 old_left_item_size;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	/*
	 * Unless we're emptying the leaf, keep at least one item in the
	 * right leaf.
	 */
	if (empty)
		nr = min(right_nritems, max_slot);
	else
		nr = min(right_nritems - 1, max_slot);

	/* count how many items will fit in 'left' without overflowing it */
	for (i = 0; i < nr; i++) {
		item = btrfs_item_nr(i);

		if (!empty && push_items > 0) {
			if (path->slots[0] < i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(fs_info, right);
				if (space + push_space * 2 > free_space)
					break;
			}
		}

		/* reserve room in the right leaf for the pending insertion */
		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(right, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(*item);
	}

	if (push_items == 0) {
		ret = 1;
		goto out;
	}
	WARN_ON(!empty && push_items == btrfs_header_nritems(right));

	/* push data from right to left */
	copy_extent_buffer(left, right,
			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
			   btrfs_item_nr_offset(0),
			   push_items * sizeof(struct btrfs_item));

	/* item data is packed against the end of the leaf */
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
		     btrfs_item_offset_nr(right, push_items - 1);

	copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
		     leaf_data_end(fs_info, left) - push_space,
		     BTRFS_LEAF_DATA_OFFSET +
		     btrfs_item_offset_nr(right, push_items - 1),
		     push_space);
	old_left_nritems = btrfs_header_nritems(left);
	/* u32, so this is effectively an old_left_nritems == 0 check */
	BUG_ON(old_left_nritems <= 0);

	/* rebase the data offsets of the items just copied into 'left' */
	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
		u32 ioff;

		item = btrfs_item_nr(i);

		ioff = btrfs_token_item_offset(left, item, &token);
		btrfs_set_token_item_offset(left, item,
		      ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size),
		      &token);
	}
	btrfs_set_header_nritems(left, old_left_nritems + push_items);

	/* fixup right node */
	if (push_items > right_nritems)
		WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
		       right_nritems);

	if (push_items < right_nritems) {
		/* slide the remaining item data to the end of the leaf */
		push_space = btrfs_item_offset_nr(right, push_items - 1) -
						  leaf_data_end(fs_info, right);
		memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
				      BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
				      BTRFS_LEAF_DATA_OFFSET +
				      leaf_data_end(fs_info, right), push_space);

		/* ... and the remaining item headers down to slot 0 */
		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
			      btrfs_item_nr_offset(push_items),
			     (btrfs_header_nritems(right) - push_items) *
			     sizeof(struct btrfs_item));
	}
	right_nritems -= push_items;
	btrfs_set_header_nritems(right, right_nritems);
	/* recompute every data offset in the right leaf from scratch */
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(i);

		push_space = push_space - btrfs_token_item_size(right,
								item, &token);
		btrfs_set_token_item_offset(right, item, push_space, &token);
	}

	btrfs_mark_buffer_dirty(left);
	if (right_nritems)
		btrfs_mark_buffer_dirty(right);
	else
		clean_tree_block(fs_info, right);

	/* the right leaf's first key changed, propagate it up the path */
	btrfs_item_key(right, &disk_key, 0);
	fixup_low_keys(fs_info, path, &disk_key, 1);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] < push_items) {
		/* our slot moved into 'left'; retarget the path at it */
		path->slots[0] += old_left_nritems;
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = left;
		path->slots[1] -= 1;
	} else {
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
		path->slots[0] -= push_items;
	}
	BUG_ON(path->slots[0] < 0);
	return ret;
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
}
3996
3997 /*
3998  * push some data in the path leaf to the left, trying to free up at
3999  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
4000  *
4001  * max_slot can put a limit on how far into the leaf we'll push items.  The
4002  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
4003  * items
4004  */
4005 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
4006                           *root, struct btrfs_path *path, int min_data_size,
4007                           int data_size, int empty, u32 max_slot)
4008 {
4009         struct btrfs_fs_info *fs_info = root->fs_info;
4010         struct extent_buffer *right = path->nodes[0];
4011         struct extent_buffer *left;
4012         int slot;
4013         int free_space;
4014         u32 right_nritems;
4015         int ret = 0;
4016
4017         slot = path->slots[1];
4018         if (slot == 0)
4019                 return 1;
4020         if (!path->nodes[1])
4021                 return 1;
4022
4023         right_nritems = btrfs_header_nritems(right);
4024         if (right_nritems == 0)
4025                 return 1;
4026
4027         btrfs_assert_tree_locked(path->nodes[1]);
4028
4029         left = read_node_slot(fs_info, path->nodes[1], slot - 1);
4030         /*
4031          * slot - 1 is not valid or we fail to read the left node,
4032          * no big deal, just return.
4033          */
4034         if (IS_ERR(left))
4035                 return 1;
4036
4037         btrfs_tree_lock(left);
4038         btrfs_set_lock_blocking(left);
4039
4040         free_space = btrfs_leaf_free_space(fs_info, left);
4041         if (free_space < data_size) {
4042                 ret = 1;
4043                 goto out;
4044         }
4045
4046         /* cow and double check */
4047         ret = btrfs_cow_block(trans, root, left,
4048                               path->nodes[1], slot - 1, &left);
4049         if (ret) {
4050                 /* we hit -ENOSPC, but it isn't fatal here */
4051                 if (ret == -ENOSPC)
4052                         ret = 1;
4053                 goto out;
4054         }
4055
4056         free_space = btrfs_leaf_free_space(fs_info, left);
4057         if (free_space < data_size) {
4058                 ret = 1;
4059                 goto out;
4060         }
4061
4062         return __push_leaf_left(fs_info, path, min_data_size,
4063                                empty, left, free_space, right_nritems,
4064                                max_slot);
4065 out:
4066         btrfs_tree_unlock(left);
4067         free_extent_buffer(left);
4068         return ret;
4069 }
4070
/*
 * Copy the items [mid, nritems) of leaf 'l' into the freshly allocated
 * leaf 'right' and link 'right' into the parent, as the second half of
 * a leaf split.  On return the path points at whichever leaf now holds
 * the original slot.
 */
static noinline void copy_for_split(struct btrfs_trans_handle *trans,
				    struct btrfs_fs_info *fs_info,
				    struct btrfs_path *path,
				    struct extent_buffer *l,
				    struct extent_buffer *right,
				    int slot, int mid, int nritems)
{
	int data_copy_size;
	int rt_data_off;
	int i;
	struct btrfs_disk_key disk_key;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	/* 'right' receives everything from slot 'mid' onwards */
	nritems = nritems - mid;
	btrfs_set_header_nritems(right, nritems);
	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(fs_info, l);

	/* copy the item headers ... */
	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
			   btrfs_item_nr_offset(mid),
			   nritems * sizeof(struct btrfs_item));

	/* ... and the item data, packed against the end of 'right' */
	copy_extent_buffer(right, l,
		     BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
		     data_copy_size, BTRFS_LEAF_DATA_OFFSET +
		     leaf_data_end(fs_info, l), data_copy_size);

	/* the data moved towards the leaf end; rebase every item offset */
	rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);

	for (i = 0; i < nritems; i++) {
		struct btrfs_item *item = btrfs_item_nr(i);
		u32 ioff;

		ioff = btrfs_token_item_offset(right, item, &token);
		btrfs_set_token_item_offset(right, item,
					    ioff + rt_data_off, &token);
	}

	btrfs_set_header_nritems(l, mid);
	/* insert the pointer to 'right' just after the original leaf */
	btrfs_item_key(right, &disk_key, 0);
	insert_ptr(trans, fs_info, path, &disk_key, right->start,
		   path->slots[1] + 1, 1);

	btrfs_mark_buffer_dirty(right);
	btrfs_mark_buffer_dirty(l);
	BUG_ON(path->slots[0] != slot);

	/* retarget the path at whichever half now holds the slot */
	if (mid <= slot) {
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[0] -= mid;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}

	BUG_ON(path->slots[0] < 0);
}
4136
4137 /*
4138  * double splits happen when we need to insert a big item in the middle
4139  * of a leaf.  A double split can leave us with 3 mostly empty leaves:
4140  * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
4141  *          A                 B                 C
4142  *
4143  * We avoid this by trying to push the items on either side of our target
4144  * into the adjacent leaves.  If all goes well we can avoid the double split
4145  * completely.
4146  */
4147 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
4148                                           struct btrfs_root *root,
4149                                           struct btrfs_path *path,
4150                                           int data_size)
4151 {
4152         struct btrfs_fs_info *fs_info = root->fs_info;
4153         int ret;
4154         int progress = 0;
4155         int slot;
4156         u32 nritems;
4157         int space_needed = data_size;
4158
4159         slot = path->slots[0];
4160         if (slot < btrfs_header_nritems(path->nodes[0]))
4161                 space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]);
4162
4163         /*
4164          * try to push all the items after our slot into the
4165          * right leaf
4166          */
4167         ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
4168         if (ret < 0)
4169                 return ret;
4170
4171         if (ret == 0)
4172                 progress++;
4173
4174         nritems = btrfs_header_nritems(path->nodes[0]);
4175         /*
4176          * our goal is to get our slot at the start or end of a leaf.  If
4177          * we've done so we're done
4178          */
4179         if (path->slots[0] == 0 || path->slots[0] == nritems)
4180                 return 0;
4181
4182         if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= data_size)
4183                 return 0;
4184
4185         /* try to push all the items before our slot into the next leaf */
4186         slot = path->slots[0];
4187         space_needed = data_size;
4188         if (slot > 0)
4189                 space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]);
4190         ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
4191         if (ret < 0)
4192                 return ret;
4193
4194         if (ret == 0)
4195                 progress++;
4196
4197         if (progress)
4198                 return 0;
4199         return 1;
4200 }
4201
/*
 * split the path's leaf in two, making sure there is at least data_size
 * available for the resulting leaf level of the path.
 *
 * returns 0 if all went well and < 0 on failure.
 */
static noinline int split_leaf(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       const struct btrfs_key *ins_key,
			       struct btrfs_path *path, int data_size,
			       int extend)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *l;
	u32 nritems;
	int mid;
	int slot;
	struct extent_buffer *right;
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;
	int wret;
	int split;
	int num_doubles = 0;
	int tried_avoid_double = 0;

	l = path->nodes[0];
	slot = path->slots[0];
	/* an extended item must still fit in an otherwise empty leaf */
	if (extend && data_size + btrfs_item_size_nr(l, slot) +
	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
		return -EOVERFLOW;

	/* first try to make some room by pushing left and right */
	if (data_size && path->nodes[1]) {
		int space_needed = data_size;

		if (slot < btrfs_header_nritems(l))
			space_needed -= btrfs_leaf_free_space(fs_info, l);

		wret = push_leaf_right(trans, root, path, space_needed,
				       space_needed, 0, 0);
		if (wret < 0)
			return wret;
		if (wret) {
			space_needed = data_size;
			if (slot > 0)
				space_needed -= btrfs_leaf_free_space(fs_info,
								      l);
			wret = push_leaf_left(trans, root, path, space_needed,
					      space_needed, 0, (u32)-1);
			if (wret < 0)
				return wret;
		}
		l = path->nodes[0];

		/* did the pushes work? */
		if (btrfs_leaf_free_space(fs_info, l) >= data_size)
			return 0;
	}

	/* splitting the root leaf: grow the tree by one level first */
	if (!path->nodes[1]) {
		ret = insert_new_root(trans, root, path, 1);
		if (ret)
			return ret;
	}
again:
	/*
	 * split == 1: normal split at 'mid', both halves keep items
	 * split == 0: don't move any items, just add an empty leaf
	 * split == 2: one split isn't enough, split a second time
	 */
	split = 1;
	l = path->nodes[0];
	slot = path->slots[0];
	nritems = btrfs_header_nritems(l);
	mid = (nritems + 1) / 2;

	if (mid <= slot) {
		if (nritems == 1 ||
		    leaf_space_used(l, mid, nritems - mid) + data_size >
			BTRFS_LEAF_DATA_SIZE(fs_info)) {
			if (slot >= nritems) {
				split = 0;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2;
				}
			}
		}
	} else {
		if (leaf_space_used(l, 0, mid) + data_size >
			BTRFS_LEAF_DATA_SIZE(fs_info)) {
			if (!extend && data_size && slot == 0) {
				split = 0;
			} else if ((extend || !data_size) && slot == 0) {
				mid = 1;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2;
				}
			}
		}
	}

	if (split == 0)
		btrfs_cpu_key_to_disk(&disk_key, ins_key);
	else
		btrfs_item_key(l, &disk_key, mid);

	right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
			&disk_key, 0, l->start, 0);
	if (IS_ERR(right))
		return PTR_ERR(right);

	root_add_used(root, fs_info->nodesize);

	/* initialize the header of the new empty leaf */
	memzero_extent_buffer(right, 0, sizeof(struct btrfs_header));
	btrfs_set_header_bytenr(right, right->start);
	btrfs_set_header_generation(right, trans->transid);
	btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
	btrfs_set_header_owner(right, root->root_key.objectid);
	btrfs_set_header_level(right, 0);
	write_extent_buffer_fsid(right, fs_info->fsid);
	write_extent_buffer_chunk_tree_uuid(right, fs_info->chunk_tree_uuid);

	if (split == 0) {
		if (mid <= slot) {
			/* the new item goes into the new leaf on the right */
			btrfs_set_header_nritems(right, 0);
			insert_ptr(trans, fs_info, path, &disk_key,
				   right->start, path->slots[1] + 1, 1);
			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			path->slots[1] += 1;
		} else {
			/* the new item goes into a new leaf on the left */
			btrfs_set_header_nritems(right, 0);
			insert_ptr(trans, fs_info, path, &disk_key,
				   right->start, path->slots[1], 1);
			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			if (path->slots[1] == 0)
				fixup_low_keys(fs_info, path, &disk_key, 1);
		}
		/*
		 * We create a new leaf 'right' for the required ins_len and
		 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
		 * the content of ins_len to 'right'.
		 */
		return ret;
	}

	copy_for_split(trans, fs_info, path, l, right, slot, mid, nritems);

	if (split == 2) {
		BUG_ON(num_doubles != 0);
		num_doubles++;
		goto again;
	}

	return 0;

push_for_double:
	/* try to avoid the double split by pushing into the neighbors */
	push_for_double_split(trans, root, path, data_size);
	tried_avoid_double = 1;
	if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= data_size)
		return 0;
	goto again;
}
4377
/*
 * Make sure the leaf at path->slots[0] has at least ins_len bytes free
 * so the item there can be split in place.  The path must point at an
 * EXTENT_DATA or EXTENT_CSUM item.
 *
 * Because the path is released and re-searched, the item is re-checked
 * (same size, and for extent items the same length) before splitting;
 * returns -EAGAIN if it changed under us and the caller must retry,
 * 0 on success, or another negative errno on error.
 */
static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root,
					 struct btrfs_path *path, int ins_len)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	u64 extent_len = 0;
	u32 item_size;
	int ret;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
	       key.type != BTRFS_EXTENT_CSUM_KEY);

	if (btrfs_leaf_free_space(fs_info, leaf) >= ins_len)
		return 0;

	/* remember enough about the item to detect concurrent changes */
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	if (key.type == BTRFS_EXTENT_DATA_KEY) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
	}
	btrfs_release_path(path);

	path->keep_locks = 1;
	path->search_for_split = 1;
	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	path->search_for_split = 0;
	/* ret > 0 means the exact key is gone; the caller must retry */
	if (ret > 0)
		ret = -EAGAIN;
	if (ret < 0)
		goto err;

	ret = -EAGAIN;
	leaf = path->nodes[0];
	/* if our item isn't there, return now */
	if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
		goto err;

	/* the leaf has  changed, it now has room.  return now */
	if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= ins_len)
		goto err;

	if (key.type == BTRFS_EXTENT_DATA_KEY) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
			goto err;
	}

	btrfs_set_path_blocking(path);
	ret = split_leaf(trans, root, &key, path, ins_len, 1);
	if (ret)
		goto err;

	path->keep_locks = 0;
	btrfs_unlock_up_safe(path, 1);
	return 0;
err:
	path->keep_locks = 0;
	return ret;
}
4445
/*
 * Split the item at path->slots[0] in two at 'split_offset' bytes from
 * its start.  The tail becomes a new item in the next slot with key
 * 'new_key'; the path keeps pointing at the (now shorter) original item.
 * The leaf must already have room for one more struct btrfs_item header
 * (the data itself isn't duplicated, only split).
 */
static noinline int split_item(struct btrfs_fs_info *fs_info,
			       struct btrfs_path *path,
			       const struct btrfs_key *new_key,
			       unsigned long split_offset)
{
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	struct btrfs_item *new_item;
	int slot;
	char *buf;
	u32 nritems;
	u32 item_size;
	u32 orig_offset;
	struct btrfs_disk_key disk_key;

	leaf = path->nodes[0];
	BUG_ON(btrfs_leaf_free_space(fs_info, leaf) < sizeof(struct btrfs_item));

	btrfs_set_path_blocking(path);

	item = btrfs_item_nr(path->slots[0]);
	orig_offset = btrfs_item_offset(leaf, item);
	item_size = btrfs_item_size(leaf, item);

	/* stash a copy of the item data; both halves are rewritten below */
	buf = kmalloc(item_size, GFP_NOFS);
	if (!buf)
		return -ENOMEM;

	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
			    path->slots[0]), item_size);

	slot = path->slots[0] + 1;
	nritems = btrfs_header_nritems(leaf);
	if (slot != nritems) {
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
				btrfs_item_nr_offset(slot),
				(nritems - slot) * sizeof(struct btrfs_item));
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(leaf, &disk_key, slot);

	new_item = btrfs_item_nr(slot);

	/* the new item takes over the tail of the original's data area */
	btrfs_set_item_offset(leaf, new_item, orig_offset);
	btrfs_set_item_size(leaf, new_item, item_size - split_offset);

	/* the original item shrinks to the first split_offset bytes */
	btrfs_set_item_offset(leaf, item,
			      orig_offset + item_size - split_offset);
	btrfs_set_item_size(leaf, item, split_offset);

	btrfs_set_header_nritems(leaf, nritems + 1);

	/* write the data for the start of the original item */
	write_extent_buffer(leaf, buf,
			    btrfs_item_ptr_offset(leaf, path->slots[0]),
			    split_offset);

	/* write the data for the new item */
	write_extent_buffer(leaf, buf + split_offset,
			    btrfs_item_ptr_offset(leaf, slot),
			    item_size - split_offset);
	btrfs_mark_buffer_dirty(leaf);

	BUG_ON(btrfs_leaf_free_space(fs_info, leaf) < 0);
	kfree(buf);
	return 0;
}
4515
4516 /*
4517  * This function splits a single item into two items,
4518  * giving 'new_key' to the new item and splitting the
4519  * old one at split_offset (from the start of the item).
4520  *
4521  * The path may be released by this operation.  After
4522  * the split, the path is pointing to the old item.  The
4523  * new item is going to be in the same node as the old one.
4524  *
4525  * Note, the item being split must be smaller enough to live alone on
4526  * a tree block with room for one extra struct btrfs_item
4527  *
4528  * This allows us to split the item in place, keeping a lock on the
4529  * leaf the entire time.
4530  */
4531 int btrfs_split_item(struct btrfs_trans_handle *trans,
4532                      struct btrfs_root *root,
4533                      struct btrfs_path *path,
4534                      const struct btrfs_key *new_key,
4535                      unsigned long split_offset)
4536 {
4537         int ret;
4538         ret = setup_leaf_for_split(trans, root, path,
4539                                    sizeof(struct btrfs_item));
4540         if (ret)
4541                 return ret;
4542
4543         ret = split_item(root->fs_info, path, new_key, split_offset);
4544         return ret;
4545 }
4546
4547 /*
4548  * This function duplicate a item, giving 'new_key' to the new item.
4549  * It guarantees both items live in the same tree leaf and the new item
4550  * is contiguous with the original item.
4551  *
4552  * This allows us to split file extent in place, keeping a lock on the
4553  * leaf the entire time.
4554  */
4555 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4556                          struct btrfs_root *root,
4557                          struct btrfs_path *path,
4558                          const struct btrfs_key *new_key)
4559 {
4560         struct extent_buffer *leaf;
4561         int ret;
4562         u32 item_size;
4563
4564         leaf = path->nodes[0];
4565         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4566         ret = setup_leaf_for_split(trans, root, path,
4567                                    item_size + sizeof(struct btrfs_item));
4568         if (ret)
4569                 return ret;
4570
4571         path->slots[0]++;
4572         setup_items_for_insert(root, path, new_key, &item_size,
4573                                item_size, item_size +
4574                                sizeof(struct btrfs_item), 1);
4575         leaf = path->nodes[0];
4576         memcpy_extent_buffer(leaf,
4577                              btrfs_item_ptr_offset(leaf, path->slots[0]),
4578                              btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4579                              item_size);
4580         return 0;
4581 }
4582
/*
 * make the item pointed to by the path smaller.  new_size indicates
 * how small to make it, and from_end tells us if we just chop bytes
 * off the end of the item or if we shift the item to chop bytes off
 * the front.
 */
void btrfs_truncate_item(struct btrfs_fs_info *fs_info,
			 struct btrfs_path *path, u32 new_size, int from_end)
{
	int slot;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data_start;
	unsigned int old_size;
	unsigned int size_diff;
	int i;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	leaf = path->nodes[0];
	slot = path->slots[0];

	old_size = btrfs_item_size_nr(leaf, slot);
	/* nothing to do if the item is already the requested size */
	if (old_size == new_size)
		return;

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(fs_info, leaf);

	old_data_start = btrfs_item_offset_nr(leaf, slot);

	size_diff = old_size - new_size;

	BUG_ON(slot < 0);
	BUG_ON(slot >= nritems);

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	for (i = slot; i < nritems; i++) {
		u32 ioff;
		item = btrfs_item_nr(i);

		ioff = btrfs_token_item_offset(leaf, item, &token);
		btrfs_set_token_item_offset(leaf, item,
					    ioff + size_diff, &token);
	}

	/* shift the data */
	if (from_end) {
		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
			      data_end, old_data_start + new_size - data_end);
	} else {
		struct btrfs_disk_key disk_key;
		u64 offset;

		btrfs_item_key(leaf, &disk_key, slot);

		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
			unsigned long ptr;
			struct btrfs_file_extent_item *fi;

			fi = btrfs_item_ptr(leaf, slot,
					    struct btrfs_file_extent_item);
			/* where fi will live after the data shifts */
			fi = (struct btrfs_file_extent_item *)(
			     (unsigned long)fi - size_diff);

			if (btrfs_file_extent_type(leaf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE) {
				/*
				 * keep the fixed-size extent header at the
				 * item start so only inline data is chopped
				 */
				ptr = btrfs_item_ptr_offset(leaf, slot);
				memmove_extent_buffer(leaf, ptr,
				      (unsigned long)fi,
				      BTRFS_FILE_EXTENT_INLINE_DATA_START);
			}
		}

		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
			      data_end, old_data_start - data_end);

		/* chopping the front moves the key's offset forward */
		offset = btrfs_disk_key_offset(&disk_key);
		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
		btrfs_set_item_key(leaf, &disk_key, slot);
		if (slot == 0)
			fixup_low_keys(fs_info, path, &disk_key, 1);
	}

	item = btrfs_item_nr(slot);
	btrfs_set_item_size(leaf, item, new_size);
	btrfs_mark_buffer_dirty(leaf);

	if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}
4684
4685 /*
4686  * make the item pointed to by the path bigger, data_size is the added size.
4687  */
void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
		       u32 data_size)
{
	int slot;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data;
	unsigned int old_size;
	int i;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	leaf = path->nodes[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(fs_info, leaf);

	/* caller must have reserved enough space; growing past the free
	 * space would corrupt the leaf */
	if (btrfs_leaf_free_space(fs_info, leaf) < data_size) {
		btrfs_print_leaf(leaf);
		BUG();
	}
	slot = path->slots[0];
	/* end offset of the item's data before growing it */
	old_data = btrfs_item_end_nr(leaf, slot);

	BUG_ON(slot < 0);
	if (slot >= nritems) {
		btrfs_print_leaf(leaf);
		btrfs_crit(fs_info, "slot %d too large, nritems %d",
			   slot, nritems);
		BUG_ON(1);
	}

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	/* items at and after @slot have their data shifted toward the
	 * front of the data area, so their offsets shrink by data_size */
	for (i = slot; i < nritems; i++) {
		u32 ioff;
		item = btrfs_item_nr(i);

		ioff = btrfs_token_item_offset(leaf, item, &token);
		btrfs_set_token_item_offset(leaf, item,
					    ioff - data_size, &token);
	}

	/* shift the data */
	/* move everything from data_end up to this item's old end left by
	 * data_size bytes, opening a gap at the end of this item */
	memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
		      data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
		      data_end, old_data - data_end);

	data_end = old_data;
	old_size = btrfs_item_size_nr(leaf, slot);
	item = btrfs_item_nr(slot);
	btrfs_set_item_size(leaf, item, old_size + data_size);
	btrfs_mark_buffer_dirty(leaf);

	/* sanity check: the leaf must still be internally consistent */
	if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}
4752
4753 /*
4754  * this is a helper for btrfs_insert_empty_items, the main goal here is
4755  * to save stack depth by doing the bulk of the work in a function
4756  * that doesn't call btrfs_search_slot
4757  */
void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
			    const struct btrfs_key *cpu_key, u32 *data_size,
			    u32 total_data, u32 total_size, int nr)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_item *item;
	int i;
	u32 nritems;
	unsigned int data_end;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *leaf;
	int slot;
	struct btrfs_map_token token;

	/*
	 * Inserting at slot 0 changes the lowest key of this leaf, so the
	 * parent node keys must be updated to match before we touch the leaf.
	 */
	if (path->slots[0] == 0) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
		fixup_low_keys(fs_info, path, &disk_key, 1);
	}
	btrfs_unlock_up_safe(path, 1);

	btrfs_init_map_token(&token);

	leaf = path->nodes[0];
	slot = path->slots[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(fs_info, leaf);

	/* the caller (btrfs_search_slot with ins_len) must have made room */
	if (btrfs_leaf_free_space(fs_info, leaf) < total_size) {
		btrfs_print_leaf(leaf);
		btrfs_crit(fs_info, "not enough freespace need %u have %d",
			   total_size, btrfs_leaf_free_space(fs_info, leaf));
		BUG();
	}

	/* inserting in the middle: make room for @nr new items */
	if (slot != nritems) {
		unsigned int old_data = btrfs_item_end_nr(leaf, slot);

		if (old_data < data_end) {
			btrfs_print_leaf(leaf);
			btrfs_crit(fs_info, "slot %d old_data %d data_end %d",
				   slot, old_data, data_end);
			BUG_ON(1);
		}
		/*
		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
		 */
		/* first correct the data pointers */
		/* data for items at and after @slot moves toward the front
		 * of the data area to free up @total_data bytes */
		for (i = slot; i < nritems; i++) {
			u32 ioff;

			item = btrfs_item_nr(i);
			ioff = btrfs_token_item_offset(leaf, item, &token);
			btrfs_set_token_item_offset(leaf, item,
						    ioff - total_data, &token);
		}
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
			      btrfs_item_nr_offset(slot),
			      (nritems - slot) * sizeof(struct btrfs_item));

		/* shift the data */
		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
			      data_end, old_data - data_end);
		data_end = old_data;
	}

	/* setup the item for the new data */
	/* write keys and item headers; data payloads are filled in by the
	 * caller after this returns */
	for (i = 0; i < nr; i++) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
		btrfs_set_item_key(leaf, &disk_key, slot + i);
		item = btrfs_item_nr(slot + i);
		btrfs_set_token_item_offset(leaf, item,
					    data_end - data_size[i], &token);
		data_end -= data_size[i];
		btrfs_set_token_item_size(leaf, item, data_size[i], &token);
	}

	btrfs_set_header_nritems(leaf, nritems + nr);
	btrfs_mark_buffer_dirty(leaf);

	/* sanity check: free space must not have gone negative */
	if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}
4845
4846 /*
4847  * Given a key and some data, insert items into the tree.
4848  * This does all the path init required, making room in the tree if needed.
4849  */
4850 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4851                             struct btrfs_root *root,
4852                             struct btrfs_path *path,
4853                             const struct btrfs_key *cpu_key, u32 *data_size,
4854                             int nr)
4855 {
4856         int ret = 0;
4857         int slot;
4858         int i;
4859         u32 total_size = 0;
4860         u32 total_data = 0;
4861
4862         for (i = 0; i < nr; i++)
4863                 total_data += data_size[i];
4864
4865         total_size = total_data + (nr * sizeof(struct btrfs_item));
4866         ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4867         if (ret == 0)
4868                 return -EEXIST;
4869         if (ret < 0)
4870                 return ret;
4871
4872         slot = path->slots[0];
4873         BUG_ON(slot < 0);
4874
4875         setup_items_for_insert(root, path, cpu_key, data_size,
4876                                total_data, total_size, nr);
4877         return 0;
4878 }
4879
4880 /*
4881  * Given a key and some data, insert an item into the tree.
4882  * This does all the path init required, making room in the tree if needed.
4883  */
4884 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4885                       const struct btrfs_key *cpu_key, void *data,
4886                       u32 data_size)
4887 {
4888         int ret = 0;
4889         struct btrfs_path *path;
4890         struct extent_buffer *leaf;
4891         unsigned long ptr;
4892
4893         path = btrfs_alloc_path();
4894         if (!path)
4895                 return -ENOMEM;
4896         ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4897         if (!ret) {
4898                 leaf = path->nodes[0];
4899                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4900                 write_extent_buffer(leaf, data, ptr, data_size);
4901                 btrfs_mark_buffer_dirty(leaf);
4902         }
4903         btrfs_free_path(path);
4904         return ret;
4905 }
4906
4907 /*
4908  * delete the pointer from a given node.
4909  *
4910  * the tree should have been previously balanced so the deletion does not
4911  * empty a node.
4912  */
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *parent = path->nodes[level];
	u32 nritems;
	int ret;

	nritems = btrfs_header_nritems(parent);
	if (slot != nritems - 1) {
		/*
		 * Record the pointer move in the tree mod log (nodes only)
		 * before shifting the key pointers left over the deleted
		 * slot; the log must see the pre-move layout.
		 */
		if (level)
			tree_mod_log_eb_move(fs_info, parent, slot,
					     slot + 1, nritems - slot - 1);
		memmove_extent_buffer(parent,
			      btrfs_node_key_ptr_offset(slot),
			      btrfs_node_key_ptr_offset(slot + 1),
			      sizeof(struct btrfs_key_ptr) *
			      (nritems - slot - 1));
	} else if (level) {
		/* deleting the last slot: no move, just log the removal */
		ret = tree_mod_log_insert_key(fs_info, parent, slot,
					      MOD_LOG_KEY_REMOVE, GFP_NOFS);
		BUG_ON(ret < 0);
	}

	nritems--;
	btrfs_set_header_nritems(parent, nritems);
	if (nritems == 0 && parent == root->node) {
		BUG_ON(btrfs_header_level(root->node) != 1);
		/* just turn the root into a leaf and break */
		btrfs_set_header_level(root->node, 0);
	} else if (slot == 0) {
		struct btrfs_disk_key disk_key;

		/* the node's lowest key changed; propagate it upward */
		btrfs_node_key(parent, &disk_key, 0);
		fixup_low_keys(fs_info, path, &disk_key, level + 1);
	}
	btrfs_mark_buffer_dirty(parent);
}
4951
4952 /*
4953  * a helper function to delete the leaf pointed to by path->slots[1] and
4954  * path->nodes[1].
4955  *
4956  * This deletes the pointer in path->nodes[1] and frees the leaf
4957  * block extent.  zero is returned if it all worked out, < 0 otherwise.
4958  *
4959  * The path must have already been setup for deleting the leaf, including
4960  * all the proper balancing.  path->nodes[1] must be locked.
4961  */
static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *leaf)
{
	/* only leaves created/CoWed in this transaction may be freed here */
	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
	del_ptr(root, path, 1, path->slots[1]);

	/*
	 * btrfs_free_extent is expensive, we want to make sure we
	 * aren't holding any locks when we call it
	 */
	btrfs_unlock_up_safe(path, 0);

	root_sub_used(root, leaf->len);

	/* hold an extra ref so the buffer survives btrfs_free_tree_block */
	extent_buffer_get(leaf);
	btrfs_free_tree_block(trans, root, leaf, 0, 1);
	/* drop our ref and mark the buffer stale so it won't be reused */
	free_extent_buffer_stale(leaf);
}
4982 /*
4983  * delete the item at the leaf level in path.  If that empties
4984  * the leaf, remove it from the tree
4985  */
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct btrfs_path *path, int slot, int nr)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 last_off;
	u32 dsize = 0;
	int ret = 0;
	int wret;
	int i;
	u32 nritems;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	leaf = path->nodes[0];
	/* data offset of the last item being deleted (lowest data address) */
	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);

	/* total data bytes occupied by the items being deleted */
	for (i = 0; i < nr; i++)
		dsize += btrfs_item_size_nr(leaf, slot + i);

	nritems = btrfs_header_nritems(leaf);

	if (slot + nr != nritems) {
		int data_end = leaf_data_end(fs_info, leaf);

		/* close the data gap: shift trailing items' data right by
		 * dsize bytes */
		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end + dsize,
			      BTRFS_LEAF_DATA_OFFSET + data_end,
			      last_off - data_end);

		/* fix the moved items' recorded data offsets */
		for (i = slot + nr; i < nritems; i++) {
			u32 ioff;

			item = btrfs_item_nr(i);
			ioff = btrfs_token_item_offset(leaf, item, &token);
			btrfs_set_token_item_offset(leaf, item,
						    ioff + dsize, &token);
		}

		/* close the item-header gap left by the deleted items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
			      btrfs_item_nr_offset(slot + nr),
			      sizeof(struct btrfs_item) *
			      (nritems - slot - nr));
	}
	btrfs_set_header_nritems(leaf, nritems - nr);
	nritems -= nr;

	/* delete the leaf if we've emptied it */
	if (nritems == 0) {
		if (leaf == root->node) {
			/* an empty root just becomes an empty leaf */
			btrfs_set_header_level(leaf, 0);
		} else {
			btrfs_set_path_blocking(path);
			clean_tree_block(fs_info, leaf);
			btrfs_del_leaf(trans, root, path, leaf);
		}
	} else {
		int used = leaf_space_used(leaf, 0, nritems);
		if (slot == 0) {
			struct btrfs_disk_key disk_key;

			/* deleting slot 0 changes the leaf's lowest key */
			btrfs_item_key(leaf, &disk_key, 0);
			fixup_low_keys(fs_info, path, &disk_key, 1);
		}

		/* delete the leaf if it is mostly empty */
		if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
			/* push_leaf_left fixes the path.
			 * make sure the path still points to our leaf
			 * for possible call to del_ptr below
			 */
			slot = path->slots[1];
			extent_buffer_get(leaf);

			btrfs_set_path_blocking(path);
			wret = push_leaf_left(trans, root, path, 1, 1,
					      1, (u32)-1);
			if (wret < 0 && wret != -ENOSPC)
				ret = wret;

			/* if items remain and the path still points at our
			 * leaf, try pushing the rest to the right neighbor */
			if (path->nodes[0] == leaf &&
			    btrfs_header_nritems(leaf)) {
				wret = push_leaf_right(trans, root, path, 1,
						       1, 1, 0);
				if (wret < 0 && wret != -ENOSPC)
					ret = wret;
			}

			if (btrfs_header_nritems(leaf) == 0) {
				path->slots[1] = slot;
				btrfs_del_leaf(trans, root, path, leaf);
				free_extent_buffer(leaf);
				ret = 0;
			} else {
				/* if we're still in the path, make sure
				 * we're dirty.  Otherwise, one of the
				 * push_leaf functions must have already
				 * dirtied this buffer
				 */
				if (path->nodes[0] == leaf)
					btrfs_mark_buffer_dirty(leaf);
				free_extent_buffer(leaf);
			}
		} else {
			btrfs_mark_buffer_dirty(leaf);
		}
	}
	return ret;
}
5097
5098 /*
5099  * search the tree again to find a leaf with lesser keys
5100  * returns 0 if it found something or 1 if there are no lesser leaves.
5101  * returns < 0 on io errors.
5102  *
5103  * This may release the path, and so you may lose any locks held at the
5104  * time you call it.
5105  */
5106 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
5107 {
5108         struct btrfs_key key;
5109         struct btrfs_disk_key found_key;
5110         int ret;
5111
5112         btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
5113
5114         if (key.offset > 0) {
5115                 key.offset--;
5116         } else if (key.type > 0) {
5117                 key.type--;
5118                 key.offset = (u64)-1;
5119         } else if (key.objectid > 0) {
5120                 key.objectid--;
5121                 key.type = (u8)-1;
5122                 key.offset = (u64)-1;
5123         } else {
5124                 return 1;
5125         }
5126
5127         btrfs_release_path(path);
5128         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5129         if (ret < 0)
5130                 return ret;
5131         btrfs_item_key(path->nodes[0], &found_key, 0);
5132         ret = comp_keys(&found_key, &key);
5133         /*
5134          * We might have had an item with the previous key in the tree right
5135          * before we released our path. And after we released our path, that
5136          * item might have been pushed to the first slot (0) of the leaf we
5137          * were holding due to a tree balance. Alternatively, an item with the
5138          * previous key can exist as the only element of a leaf (big fat item).
5139          * Therefore account for these 2 cases, so that our callers (like
5140          * btrfs_previous_item) don't miss an existing item with a key matching
5141          * the previous key we computed above.
5142          */
5143         if (ret <= 0)
5144                 return 0;
5145         return 1;
5146 }
5147
5148 /*
5149  * A helper function to walk down the tree starting at min_key, and looking
5150  * for nodes or leaves that have a minimum transaction id.
5151  * This is used by the btree defrag code, and tree logging
5152  *
5153  * This does not cow, but it does stuff the starting key it finds back
5154  * into min_key, so you can call btrfs_search_slot with cow=1 on the
5155  * key and get a writable path.
5156  *
5157  * This does lock as it descends, and path->keep_locks should be set
5158  * to 1 by the caller.
5159  *
5160  * This honors path->lowest_level to prevent descent past a given level
5161  * of the tree.
5162  *
5163  * min_trans indicates the oldest transaction that you are interested
5164  * in walking through.  Any nodes or leaves older than min_trans are
5165  * skipped over (without reading them).
5166  *
5167  * returns zero if something useful was found, < 0 on error and 1 if there
5168  * was nothing in the tree that matched the search criteria.
5169  */
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
			 struct btrfs_path *path,
			 u64 min_trans)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cur;
	struct btrfs_key found_key;
	int slot;
	int sret;
	u32 nritems;
	int level;
	int ret = 1;
	int keep_locks = path->keep_locks;

	/* we need locks kept on the whole descent; restore the caller's
	 * setting before returning */
	path->keep_locks = 1;
again:
	cur = btrfs_read_lock_root_node(root);
	level = btrfs_header_level(cur);
	WARN_ON(path->nodes[level]);
	path->nodes[level] = cur;
	path->locks[level] = BTRFS_READ_LOCK;

	/* the whole tree is older than min_trans: nothing to find */
	if (btrfs_header_generation(cur) < min_trans) {
		ret = 1;
		goto out;
	}
	while (1) {
		nritems = btrfs_header_nritems(cur);
		level = btrfs_header_level(cur);
		sret = bin_search(cur, min_key, level, &slot);

		/* at the lowest level, we're done, setup the path and exit */
		if (level == path->lowest_level) {
			if (slot >= nritems)
				goto find_next_key;
			ret = 0;
			path->slots[level] = slot;
			btrfs_item_key_to_cpu(cur, &found_key, slot);
			goto out;
		}
		/* no exact match: step back to the slot covering min_key */
		if (sret && slot > 0)
			slot--;
		/*
		 * check this node pointer against the min_trans parameters.
		 * If it is too old, skip to the next one.
		 */
		while (slot < nritems) {
			u64 gen;

			gen = btrfs_node_ptr_generation(cur, slot);
			if (gen < min_trans) {
				slot++;
				continue;
			}
			break;
		}
find_next_key:
		/*
		 * we didn't find a candidate key in this node, walk forward
		 * and find another one
		 */
		if (slot >= nritems) {
			path->slots[level] = slot;
			btrfs_set_path_blocking(path);
			sret = btrfs_find_next_key(root, path, min_key, level,
						  min_trans);
			if (sret == 0) {
				/* found a new starting key: restart descent */
				btrfs_release_path(path);
				goto again;
			} else {
				goto out;
			}
		}
		/* save our key for returning back */
		btrfs_node_key_to_cpu(cur, &found_key, slot);
		path->slots[level] = slot;
		if (level == path->lowest_level) {
			ret = 0;
			goto out;
		}
		btrfs_set_path_blocking(path);
		cur = read_node_slot(fs_info, cur, slot);
		if (IS_ERR(cur)) {
			ret = PTR_ERR(cur);
			goto out;
		}

		btrfs_tree_read_lock(cur);

		/* descend one level, dropping locks we no longer need */
		path->locks[level - 1] = BTRFS_READ_LOCK;
		path->nodes[level - 1] = cur;
		unlock_up(path, level, 1, 0, NULL);
		btrfs_clear_path_blocking(path, NULL, 0);
	}
out:
	path->keep_locks = keep_locks;
	if (ret == 0) {
		btrfs_unlock_up_safe(path, path->lowest_level + 1);
		btrfs_set_path_blocking(path);
		/* hand the found key back so the caller can re-search */
		memcpy(min_key, &found_key, sizeof(found_key));
	}
	return ret;
}
5273
5274 static int tree_move_down(struct btrfs_fs_info *fs_info,
5275                            struct btrfs_path *path,
5276                            int *level)
5277 {
5278         struct extent_buffer *eb;
5279
5280         BUG_ON(*level == 0);
5281         eb = read_node_slot(fs_info, path->nodes[*level], path->slots[*level]);
5282         if (IS_ERR(eb))
5283                 return PTR_ERR(eb);
5284
5285         path->nodes[*level - 1] = eb;
5286         path->slots[*level - 1] = 0;
5287         (*level)--;
5288         return 0;
5289 }
5290
5291 static int tree_move_next_or_upnext(struct btrfs_path *path,
5292                                     int *level, int root_level)
5293 {
5294         int ret = 0;
5295         int nritems;
5296         nritems = btrfs_header_nritems(path->nodes[*level]);
5297
5298         path->slots[*level]++;
5299
5300         while (path->slots[*level] >= nritems) {
5301                 if (*level == root_level)
5302                         return -1;
5303
5304                 /* move upnext */
5305                 path->slots[*level] = 0;
5306                 free_extent_buffer(path->nodes[*level]);
5307                 path->nodes[*level] = NULL;
5308                 (*level)++;
5309                 path->slots[*level]++;
5310
5311                 nritems = btrfs_header_nritems(path->nodes[*level]);
5312                 ret = 1;
5313         }
5314         return ret;
5315 }
5316
5317 /*
5318  * Returns 1 if it had to move up and next. 0 is returned if it moved only next
5319  * or down.
5320  */
5321 static int tree_advance(struct btrfs_fs_info *fs_info,
5322                         struct btrfs_path *path,
5323                         int *level, int root_level,
5324                         int allow_down,
5325                         struct btrfs_key *key)
5326 {
5327         int ret;
5328
5329         if (*level == 0 || !allow_down) {
5330                 ret = tree_move_next_or_upnext(path, level, root_level);
5331         } else {
5332                 ret = tree_move_down(fs_info, path, level);
5333         }
5334         if (ret >= 0) {
5335                 if (*level == 0)
5336                         btrfs_item_key_to_cpu(path->nodes[*level], key,
5337                                         path->slots[*level]);
5338                 else
5339                         btrfs_node_key_to_cpu(path->nodes[*level], key,
5340                                         path->slots[*level]);
5341         }
5342         return ret;
5343 }
5344
5345 static int tree_compare_item(struct btrfs_path *left_path,
5346                              struct btrfs_path *right_path,
5347                              char *tmp_buf)
5348 {
5349         int cmp;
5350         int len1, len2;
5351         unsigned long off1, off2;
5352
5353         len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
5354         len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
5355         if (len1 != len2)
5356                 return 1;
5357
5358         off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
5359         off2 = btrfs_item_ptr_offset(right_path->nodes[0],
5360                                 right_path->slots[0]);
5361
5362         read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
5363
5364         cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
5365         if (cmp)
5366                 return 1;
5367         return 0;
5368 }
5369
5370 #define ADVANCE 1
5371 #define ADVANCE_ONLY_NEXT -1
5372
5373 /*
5374  * This function compares two trees and calls the provided callback for
5375  * every changed/new/deleted item it finds.
5376  * If shared tree blocks are encountered, whole subtrees are skipped, making
5377  * the compare pretty fast on snapshotted subvolumes.
5378  *
5379  * This currently works on commit roots only. As commit roots are read only,
5380  * we don't do any locking. The commit roots are protected with transactions.
5381  * Transactions are ended and rejoined when a commit is tried in between.
5382  *
5383  * This function checks for modifications done to the trees while comparing.
5384  * If it detects a change, it aborts immediately.
5385  */
5386 int btrfs_compare_trees(struct btrfs_root *left_root,
5387                         struct btrfs_root *right_root,
5388                         btrfs_changed_cb_t changed_cb, void *ctx)
5389 {
5390         struct btrfs_fs_info *fs_info = left_root->fs_info;
5391         int ret;
5392         int cmp;
5393         struct btrfs_path *left_path = NULL;
5394         struct btrfs_path *right_path = NULL;
5395         struct btrfs_key left_key;
5396         struct btrfs_key right_key;
5397         char *tmp_buf = NULL;
5398         int left_root_level;
5399         int right_root_level;
5400         int left_level;
5401         int right_level;
5402         int left_end_reached;
5403         int right_end_reached;
5404         int advance_left;
5405         int advance_right;
5406         u64 left_blockptr;
5407         u64 right_blockptr;
5408         u64 left_gen;
5409         u64 right_gen;
5410
5411         left_path = btrfs_alloc_path();
5412         if (!left_path) {
5413                 ret = -ENOMEM;
5414                 goto out;
5415         }
5416         right_path = btrfs_alloc_path();
5417         if (!right_path) {
5418                 ret = -ENOMEM;
5419                 goto out;
5420         }
5421
5422         tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
5423         if (!tmp_buf) {
5424                 ret = -ENOMEM;
5425                 goto out;
5426         }
5427
5428         left_path->search_commit_root = 1;
5429         left_path->skip_locking = 1;
5430         right_path->search_commit_root = 1;
5431         right_path->skip_locking = 1;
5432
5433         /*
5434          * Strategy: Go to the first items of both trees. Then do
5435          *
5436          * If both trees are at level 0
5437          *   Compare keys of current items
5438          *     If left < right treat left item as new, advance left tree
5439          *       and repeat
5440          *     If left > right treat right item as deleted, advance right tree
5441          *       and repeat
5442          *     If left == right do deep compare of items, treat as changed if
5443          *       needed, advance both trees and repeat
5444          * If both trees are at the same level but not at level 0
5445          *   Compare keys of current nodes/leafs
5446          *     If left < right advance left tree and repeat
5447          *     If left > right advance right tree and repeat
5448          *     If left == right compare blockptrs of the next nodes/leafs
5449          *       If they match advance both trees but stay at the same level
5450          *         and repeat
5451          *       If they don't match advance both trees while allowing to go
5452          *         deeper and repeat
5453          * If tree levels are different
5454          *   Advance the tree that needs it and repeat
5455          *
5456          * Advancing a tree means:
5457          *   If we are at level 0, try to go to the next slot. If that's not
5458          *   possible, go one level up and repeat. Stop when we found a level
5459          *   where we could go to the next slot. We may at this point be on a
5460          *   node or a leaf.
5461          *
5462          *   If we are not at level 0 and not on shared tree blocks, go one
5463          *   level deeper.
5464          *
5465          *   If we are not at level 0 and on shared tree blocks, go one slot to
5466          *   the right if possible or go up and right.
5467          */
5468
5469         down_read(&fs_info->commit_root_sem);
5470         left_level = btrfs_header_level(left_root->commit_root);
5471         left_root_level = left_level;
5472         left_path->nodes[left_level] =
5473                         btrfs_clone_extent_buffer(left_root->commit_root);
5474         if (!left_path->nodes[left_level]) {
5475                 up_read(&fs_info->commit_root_sem);
5476                 ret = -ENOMEM;
5477                 goto out;
5478         }
5479         extent_buffer_get(left_path->nodes[left_level]);
5480
5481         right_level = btrfs_header_level(right_root->commit_root);
5482         right_root_level = right_level;
5483         right_path->nodes[right_level] =
5484                         btrfs_clone_extent_buffer(right_root->commit_root);
5485         if (!right_path->nodes[right_level]) {
5486                 up_read(&fs_info->commit_root_sem);
5487                 ret = -ENOMEM;
5488                 goto out;
5489         }
5490         extent_buffer_get(right_path->nodes[right_level]);
5491         up_read(&fs_info->commit_root_sem);
5492
5493         if (left_level == 0)
5494                 btrfs_item_key_to_cpu(left_path->nodes[left_level],
5495                                 &left_key, left_path->slots[left_level]);
5496         else
5497                 btrfs_node_key_to_cpu(left_path->nodes[left_level],
5498                                 &left_key, left_path->slots[left_level]);
5499         if (right_level == 0)
5500                 btrfs_item_key_to_cpu(right_path->nodes[right_level],
5501                                 &right_key, right_path->slots[right_level]);
5502         else
5503                 btrfs_node_key_to_cpu(right_path->nodes[right_level],
5504                                 &right_key, right_path->slots[right_level]);
5505
5506         left_end_reached = right_end_reached = 0;
5507         advance_left = advance_right = 0;
5508
5509         while (1) {
5510                 cond_resched();
5511                 if (advance_left && !left_end_reached) {
5512                         ret = tree_advance(fs_info, left_path, &left_level,
5513                                         left_root_level,
5514                                         advance_left != ADVANCE_ONLY_NEXT,
5515                                         &left_key);
5516                         if (ret == -1)
5517                                 left_end_reached = ADVANCE;
5518                         else if (ret < 0)
5519                                 goto out;
5520                         advance_left = 0;
5521                 }
5522                 if (advance_right && !right_end_reached) {
5523                         ret = tree_advance(fs_info, right_path, &right_level,
5524                                         right_root_level,
5525                                         advance_right != ADVANCE_ONLY_NEXT,
5526                                         &right_key);
5527                         if (ret == -1)
5528                                 right_end_reached = ADVANCE;
5529                         else if (ret < 0)
5530                                 goto out;
5531                         advance_right = 0;
5532                 }
5533
5534                 if (left_end_reached && right_end_reached) {
5535                         ret = 0;
5536                         goto out;
5537                 } else if (left_end_reached) {
5538                         if (right_level == 0) {
5539                                 ret = changed_cb(left_root, right_root,
5540                                                 left_path, right_path,
5541                                                 &right_key,
5542                                                 BTRFS_COMPARE_TREE_DELETED,
5543                                                 ctx);
5544                                 if (ret < 0)
5545                                         goto out;
5546                         }
5547                         advance_right = ADVANCE;
5548                         continue;
5549                 } else if (right_end_reached) {
5550                         if (left_level == 0) {
5551                                 ret = changed_cb(left_root, right_root,
5552                                                 left_path, right_path,
5553                                                 &left_key,
5554                                                 BTRFS_COMPARE_TREE_NEW,
5555                                                 ctx);
5556                                 if (ret < 0)
5557                                         goto out;
5558                         }
5559                         advance_left = ADVANCE;
5560                         continue;
5561                 }
5562
5563                 if (left_level == 0 && right_level == 0) {
5564                         cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5565                         if (cmp < 0) {
5566                                 ret = changed_cb(left_root, right_root,
5567                                                 left_path, right_path,
5568                                                 &left_key,
5569                                                 BTRFS_COMPARE_TREE_NEW,
5570                                                 ctx);
5571                                 if (ret < 0)
5572                                         goto out;
5573                                 advance_left = ADVANCE;
5574                         } else if (cmp > 0) {
5575                                 ret = changed_cb(left_root, right_root,
5576                                                 left_path, right_path,
5577                                                 &right_key,
5578                                                 BTRFS_COMPARE_TREE_DELETED,
5579                                                 ctx);
5580                                 if (ret < 0)
5581                                         goto out;
5582                                 advance_right = ADVANCE;
5583                         } else {
5584                                 enum btrfs_compare_tree_result result;
5585
5586                                 WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5587                                 ret = tree_compare_item(left_path, right_path,
5588                                                         tmp_buf);
5589                                 if (ret)
5590                                         result = BTRFS_COMPARE_TREE_CHANGED;
5591                                 else
5592                                         result = BTRFS_COMPARE_TREE_SAME;
5593                                 ret = changed_cb(left_root, right_root,
5594                                                  left_path, right_path,
5595                                                  &left_key, result, ctx);
5596                                 if (ret < 0)
5597                                         goto out;
5598                                 advance_left = ADVANCE;
5599                                 advance_right = ADVANCE;
5600                         }
5601                 } else if (left_level == right_level) {
5602                         cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5603                         if (cmp < 0) {
5604                                 advance_left = ADVANCE;
5605                         } else if (cmp > 0) {
5606                                 advance_right = ADVANCE;
5607                         } else {
5608                                 left_blockptr = btrfs_node_blockptr(
5609                                                 left_path->nodes[left_level],
5610                                                 left_path->slots[left_level]);
5611                                 right_blockptr = btrfs_node_blockptr(
5612                                                 right_path->nodes[right_level],
5613                                                 right_path->slots[right_level]);
5614                                 left_gen = btrfs_node_ptr_generation(
5615                                                 left_path->nodes[left_level],
5616                                                 left_path->slots[left_level]);
5617                                 right_gen = btrfs_node_ptr_generation(
5618                                                 right_path->nodes[right_level],
5619                                                 right_path->slots[right_level]);
5620                                 if (left_blockptr == right_blockptr &&
5621                                     left_gen == right_gen) {
5622                                         /*
5623                                          * As we're on a shared block, don't
5624                                          * allow to go deeper.
5625                                          */
5626                                         advance_left = ADVANCE_ONLY_NEXT;
5627                                         advance_right = ADVANCE_ONLY_NEXT;
5628                                 } else {
5629                                         advance_left = ADVANCE;
5630                                         advance_right = ADVANCE;
5631                                 }
5632                         }
5633                 } else if (left_level < right_level) {
5634                         advance_right = ADVANCE;
5635                 } else {
5636                         advance_left = ADVANCE;
5637                 }
5638         }
5639
5640 out:
5641         btrfs_free_path(left_path);
5642         btrfs_free_path(right_path);
5643         kvfree(tmp_buf);
5644         return ret;
5645 }
5646
/*
 * this is similar to btrfs_next_leaf, but does not try to preserve
 * and fixup the path.  It looks for and returns the next key in the
 * tree based on the current path and the min_trans parameters.
 *
 * 0 is returned if another key is found, < 0 if there are any errors
 * and 1 is returned if there are no higher keys in the tree
 *
 * path->keep_locks should be set to 1 on the search made before
 * calling this function.
 */
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
			struct btrfs_key *key, int level, u64 min_trans)
{
	int slot;
	struct extent_buffer *c;

	WARN_ON(!path->keep_locks);
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level])
			return 1;

		/* Start from the slot right of the current position. */
		slot = path->slots[level] + 1;
		c = path->nodes[level];
next:
		if (slot >= btrfs_header_nritems(c)) {
			int ret;
			int orig_lowest;
			struct btrfs_key cur_key;
			if (level + 1 >= BTRFS_MAX_LEVEL ||
			    !path->nodes[level + 1])
				return 1;

			/*
			 * If we hold a lock on the parent we can simply climb
			 * one level and look at its next slot; otherwise the
			 * parent may have changed and we must re-search below.
			 */
			if (path->locks[level + 1]) {
				level++;
				continue;
			}

			/*
			 * Parent is unlocked: remember the last key at this
			 * level, drop the path and re-search down to this
			 * level so the path (and locks) are valid again.
			 */
			slot = btrfs_header_nritems(c) - 1;
			if (level == 0)
				btrfs_item_key_to_cpu(c, &cur_key, slot);
			else
				btrfs_node_key_to_cpu(c, &cur_key, slot);

			orig_lowest = path->lowest_level;
			btrfs_release_path(path);
			path->lowest_level = level;
			ret = btrfs_search_slot(NULL, root, &cur_key, path,
						0, 0);
			path->lowest_level = orig_lowest;
			if (ret < 0)
				return ret;

			c = path->nodes[level];
			slot = path->slots[level];
			/* Exact match: the key itself was already visited. */
			if (ret == 0)
				slot++;
			goto next;
		}

		if (level == 0)
			btrfs_item_key_to_cpu(c, key, slot);
		else {
			u64 gen = btrfs_node_ptr_generation(c, slot);

			/* Skip child pointers older than min_trans. */
			if (gen < min_trans) {
				slot++;
				goto next;
			}
			btrfs_node_key_to_cpu(c, key, slot);
		}
		return 0;
	}
	return 1;
}
5722
/*
 * search the tree again to find a leaf with greater keys
 * returns 0 if it found something or 1 if there are no greater leaves.
 * returns < 0 on io errors.
 */
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
	/* time_seq == 0: walk the current tree, not an old version of it */
	return btrfs_next_old_leaf(root, path, 0);
}
5732
/*
 * Find the leaf to the right of the one the path currently points at.
 * If time_seq is non-zero the search is done with btrfs_search_old_slot()
 * (a past version of the tree), otherwise with a regular search.
 *
 * Returns 0 if a next leaf was found (path updated to its first slot),
 * 1 if there is none, and < 0 on error.
 */
int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
			u64 time_seq)
{
	int slot;
	int level;
	struct extent_buffer *c;
	struct extent_buffer *next;
	struct btrfs_key key;
	u32 nritems;
	int ret;
	int old_spinning = path->leave_spinning;
	int next_rw_lock = 0;	/* lock mode currently held on 'next', 0 = none */

	nritems = btrfs_header_nritems(path->nodes[0]);
	if (nritems == 0)
		return 1;

	/* Remember the last key of the current leaf so we can re-search it. */
	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
again:
	level = 1;
	next = NULL;
	next_rw_lock = 0;
	btrfs_release_path(path);

	path->keep_locks = 1;
	path->leave_spinning = 1;

	if (time_seq)
		ret = btrfs_search_old_slot(root, &key, path, time_seq);
	else
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	path->keep_locks = 0;

	if (ret < 0)
		return ret;

	nritems = btrfs_header_nritems(path->nodes[0]);
	/*
	 * by releasing the path above we dropped all our locks.  A balance
	 * could have added more items next to the key that used to be
	 * at the very end of the block.  So, check again here and
	 * advance the path if there are now more items available.
	 */
	if (nritems > 0 && path->slots[0] < nritems - 1) {
		if (ret == 0)
			path->slots[0]++;
		ret = 0;
		goto done;
	}
	/*
	 * So the above check misses one case:
	 * - after releasing the path above, someone has removed the item that
	 *   used to be at the very end of the block, and balance between leafs
	 *   gets another one with bigger key.offset to replace it.
	 *
	 * This one should be returned as well, or we can get leaf corruption
	 * later(esp. in __btrfs_drop_extents()).
	 *
	 * And a bit more explanation about this check,
	 * with ret > 0, the key isn't found, the path points to the slot
	 * where it should be inserted, so the path->slots[0] item must be the
	 * bigger one.
	 */
	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
		ret = 0;
		goto done;
	}

	/*
	 * Walk upwards until we find a node that has a slot to the right of
	 * the one the path descended through; that slot leads to the subtree
	 * containing the next leaf.
	 */
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level]) {
			ret = 1;
			goto done;
		}

		slot = path->slots[level] + 1;
		c = path->nodes[level];
		if (slot >= btrfs_header_nritems(c)) {
			level++;
			if (level == BTRFS_MAX_LEVEL) {
				ret = 1;
				goto done;
			}
			continue;
		}

		/* Drop the block picked up by a previous loop iteration. */
		if (next) {
			btrfs_tree_unlock_rw(next, next_rw_lock);
			free_extent_buffer(next);
		}

		next = c;
		next_rw_lock = path->locks[level];
		ret = read_block_for_search(root, path, &next, level,
					    slot, &key);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret && time_seq) {
				/*
				 * If we don't get the lock, we may be racing
				 * with push_leaf_left, holding that lock while
				 * itself waiting for the leaf we've currently
				 * locked. To solve this situation, we give up
				 * on our lock and cycle.
				 */
				free_extent_buffer(next);
				btrfs_release_path(path);
				cond_resched();
				goto again;
			}
			if (!ret) {
				/* Try-lock failed: block and wait for it. */
				btrfs_set_path_blocking(path);
				btrfs_tree_read_lock(next);
				btrfs_clear_path_blocking(path, next,
							  BTRFS_READ_LOCK);
			}
			next_rw_lock = BTRFS_READ_LOCK;
		}
		break;
	}
	path->slots[level] = slot;
	/*
	 * Now walk back down the left-most edge of that subtree, installing
	 * each child into the path, until we reach the new leaf at level 0.
	 */
	while (1) {
		level--;
		c = path->nodes[level];
		if (path->locks[level])
			btrfs_tree_unlock_rw(c, path->locks[level]);

		free_extent_buffer(c);
		path->nodes[level] = next;
		path->slots[level] = 0;
		if (!path->skip_locking)
			path->locks[level] = next_rw_lock;
		if (!level)
			break;

		ret = read_block_for_search(root, path, &next, level,
					    0, &key);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret) {
				btrfs_set_path_blocking(path);
				btrfs_tree_read_lock(next);
				btrfs_clear_path_blocking(path, next,
							  BTRFS_READ_LOCK);
			}
			next_rw_lock = BTRFS_READ_LOCK;
		}
	}
	ret = 0;
done:
	unlock_up(path, 0, 1, 0, NULL);
	path->leave_spinning = old_spinning;
	if (!old_spinning)
		btrfs_set_path_blocking(path);

	return ret;
}
5905
5906 /*
5907  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5908  * searching until it gets past min_objectid or finds an item of 'type'
5909  *
5910  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5911  */
5912 int btrfs_previous_item(struct btrfs_root *root,
5913                         struct btrfs_path *path, u64 min_objectid,
5914                         int type)
5915 {
5916         struct btrfs_key found_key;
5917         struct extent_buffer *leaf;
5918         u32 nritems;
5919         int ret;
5920
5921         while (1) {
5922                 if (path->slots[0] == 0) {
5923                         btrfs_set_path_blocking(path);
5924                         ret = btrfs_prev_leaf(root, path);
5925                         if (ret != 0)
5926                                 return ret;
5927                 } else {
5928                         path->slots[0]--;
5929                 }
5930                 leaf = path->nodes[0];
5931                 nritems = btrfs_header_nritems(leaf);
5932                 if (nritems == 0)
5933                         return 1;
5934                 if (path->slots[0] == nritems)
5935                         path->slots[0]--;
5936
5937                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5938                 if (found_key.objectid < min_objectid)
5939                         break;
5940                 if (found_key.type == type)
5941                         return 0;
5942                 if (found_key.objectid == min_objectid &&
5943                     found_key.type < type)
5944                         break;
5945         }
5946         return 1;
5947 }
5948
5949 /*
5950  * search in extent tree to find a previous Metadata/Data extent item with
5951  * min objecitd.
5952  *
5953  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5954  */
5955 int btrfs_previous_extent_item(struct btrfs_root *root,
5956                         struct btrfs_path *path, u64 min_objectid)
5957 {
5958         struct btrfs_key found_key;
5959         struct extent_buffer *leaf;
5960         u32 nritems;
5961         int ret;
5962
5963         while (1) {
5964                 if (path->slots[0] == 0) {
5965                         btrfs_set_path_blocking(path);
5966                         ret = btrfs_prev_leaf(root, path);
5967                         if (ret != 0)
5968                                 return ret;
5969                 } else {
5970                         path->slots[0]--;
5971                 }
5972                 leaf = path->nodes[0];
5973                 nritems = btrfs_header_nritems(leaf);
5974                 if (nritems == 0)
5975                         return 1;
5976                 if (path->slots[0] == nritems)
5977                         path->slots[0]--;
5978
5979                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5980                 if (found_key.objectid < min_objectid)
5981                         break;
5982                 if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5983                     found_key.type == BTRFS_METADATA_ITEM_KEY)
5984                         return 0;
5985                 if (found_key.objectid == min_objectid &&
5986                     found_key.type < BTRFS_EXTENT_ITEM_KEY)
5987                         break;
5988         }
5989         return 1;
5990 }