/* -*- mode: c; c-basic-offset: 8; -*-
 * vim: noexpandtab sw=8 ts=8 sts=0:
 *
 * alloc.c
 *
 * Extent allocs and frees
 *
 * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/swap.h>

#define MLOG_MASK_PREFIX ML_DISK_ALLOC
#include <cluster/masklog.h>

#include "ocfs2.h"

#include "alloc.h"
#include "aops.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "inode.h"
#include "journal.h"
#include "localalloc.h"
#include "suballoc.h"
#include "sysfile.h"
#include "file.h"
#include "super.h"
#include "uptodate.h"

#include "buffer_head_io.h"

static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
                                         struct ocfs2_extent_block *eb);

/*
 * Structures which describe a path through a btree, and functions to
 * manipulate them.
 *
 * The idea here is to be as generic as possible with the tree
 * manipulation code.
 */
struct ocfs2_path_item {
        struct buffer_head              *bh;
        struct ocfs2_extent_list        *el;
};

#define OCFS2_MAX_PATH_DEPTH    5

struct ocfs2_path {
        int                     p_tree_depth;
        struct ocfs2_path_item  p_node[OCFS2_MAX_PATH_DEPTH];
};

#define path_root_bh(_path) ((_path)->p_node[0].bh)
#define path_root_el(_path) ((_path)->p_node[0].el)
#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
#define path_num_items(_path) ((_path)->p_tree_depth + 1)

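/*
 * Editorial example (not part of the original source): a minimal sketch
 * showing how the macros above index into a populated path.  p_node[0]
 * always holds the root extent list and p_node[p_tree_depth] the leaf;
 * the helper name below is hypothetical.
 */
static inline void ocfs2_path_dump_example(struct ocfs2_path *path)
{
        int i;

        for (i = 0; i < path_num_items(path); i++) {
                struct buffer_head *bh = path->p_node[i].bh;

                /* The root list may live inside the dinode; interior and
                 * leaf levels come from extent blocks read off disk. */
                if (bh)
                        mlog(0, "path level %d: block %llu\n", i,
                             (unsigned long long)bh->b_blocknr);
        }
}
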
/*
 * Reset the actual path elements so that we can re-use the structure
 * to build another path. Generally, this involves freeing the buffer
 * heads.
 */
static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
{
        int i, start = 0, depth = 0;
        struct ocfs2_path_item *node;

        if (keep_root)
                start = 1;

        for(i = start; i < path_num_items(path); i++) {
                node = &path->p_node[i];

                brelse(node->bh);
                node->bh = NULL;
                node->el = NULL;
        }

        /*
         * Tree depth may change during truncate, or insert. If we're
         * keeping the root extent list, then make sure that our path
         * structure reflects the proper depth.
         */
        if (keep_root)
                depth = le16_to_cpu(path_root_el(path)->l_tree_depth);

        path->p_tree_depth = depth;
}

static void ocfs2_free_path(struct ocfs2_path *path)
{
        if (path) {
                ocfs2_reinit_path(path, 0);
                kfree(path);
        }
}

/*
 * Copy all the elements of src into dest. After this call, src could
 * be freed without affecting dest.
 *
 * Both paths should have the same root. Any non-root elements of dest
 * will be freed.
 */
static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
{
        int i;

        BUG_ON(path_root_bh(dest) != path_root_bh(src));
        BUG_ON(path_root_el(dest) != path_root_el(src));

        ocfs2_reinit_path(dest, 1);

        for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
                dest->p_node[i].bh = src->p_node[i].bh;
                dest->p_node[i].el = src->p_node[i].el;

                if (dest->p_node[i].bh)
                        get_bh(dest->p_node[i].bh);
        }
}

/*
 * Make the *dest path the same as src and re-initialize src path to
 * have a root only.
 */
static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
{
        int i;

        BUG_ON(path_root_bh(dest) != path_root_bh(src));

        for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
                brelse(dest->p_node[i].bh);

                dest->p_node[i].bh = src->p_node[i].bh;
                dest->p_node[i].el = src->p_node[i].el;

                src->p_node[i].bh = NULL;
                src->p_node[i].el = NULL;
        }
}

/*
 * Insert an extent block at given index.
 *
 * This will not take an additional reference on eb_bh.
 */
static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
                                        struct buffer_head *eb_bh)
{
        struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;

        /*
         * Right now, no root bh is an extent block, so this helps
         * catch code errors with dinode trees. The assertion can be
         * safely removed if we ever need to insert extent block
         * structures at the root.
         */
        BUG_ON(index == 0);

        path->p_node[index].bh = eb_bh;
        path->p_node[index].el = &eb->h_list;
}

static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
                                         struct ocfs2_extent_list *root_el)
{
        struct ocfs2_path *path;

        BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);

        path = kzalloc(sizeof(*path), GFP_NOFS);
        if (path) {
                path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
                get_bh(root_bh);
                path_root_bh(path) = root_bh;
                path_root_el(path) = root_el;
        }

        return path;
}

/*
 * Allocate and initialize a new path based on a disk inode tree.
 */
static struct ocfs2_path *ocfs2_new_inode_path(struct buffer_head *di_bh)
{
        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
        struct ocfs2_extent_list *el = &di->id2.i_list;

        return ocfs2_new_path(di_bh, el);
}

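/*
 * Editorial example (not part of the original source): a minimal sketch
 * of the usual path lifecycle - allocate from an inode's dinode buffer,
 * use it, then free it.  The helper name is hypothetical and error
 * handling is reduced to the essentials.
 */
static inline int ocfs2_path_lifecycle_example(struct buffer_head *di_bh)
{
        struct ocfs2_path *path;

        path = ocfs2_new_inode_path(di_bh);     /* takes its own ref on di_bh */
        if (!path)
                return -ENOMEM;

        /* ... fill the path, e.g. with ocfs2_find_path(), and use it ... */

        ocfs2_free_path(path);  /* releases every buffer_head reference */
        return 0;
}
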
/*
 * Convenience function to journal all components in a path.
 */
static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
                                     struct ocfs2_path *path)
{
        int i, ret = 0;

        if (!path)
                goto out;

        for(i = 0; i < path_num_items(path); i++) {
                ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh,
                                           OCFS2_JOURNAL_ACCESS_WRITE);
                if (ret < 0) {
                        mlog_errno(ret);
                        goto out;
                }
        }

out:
        return ret;
}

/*
 * Return the index of the extent record which contains cluster #v_cluster.
 * -1 is returned if it was not found.
 *
 * Should work fine on interior and exterior nodes.
 */
int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
{
        int ret = -1;
        int i;
        struct ocfs2_extent_rec *rec;
        u32 rec_end, rec_start, clusters;

        for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
                rec = &el->l_recs[i];

                rec_start = le32_to_cpu(rec->e_cpos);
                clusters = ocfs2_rec_clusters(el, rec);

                rec_end = rec_start + clusters;

                if (v_cluster >= rec_start && v_cluster < rec_end) {
                        ret = i;
                        break;
                }
        }

        return ret;
}

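/*
 * Editorial example (not part of the original source): a minimal sketch
 * showing how a caller might turn the index returned by
 * ocfs2_search_extent_list() into a record pointer.  The helper name is
 * hypothetical.
 */
static inline struct ocfs2_extent_rec *
ocfs2_lookup_rec_example(struct ocfs2_extent_list *el, u32 v_cluster)
{
        int index = ocfs2_search_extent_list(el, v_cluster);

        if (index == -1)
                return NULL;    /* v_cluster falls in a hole */

        return &el->l_recs[index];
}
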
enum ocfs2_contig_type {
        CONTIG_NONE = 0,
        CONTIG_LEFT,
        CONTIG_RIGHT,
        CONTIG_LEFTRIGHT,
};


/*
 * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
 * ocfs2_extent_contig only work properly against leaf nodes!
 */
static int ocfs2_block_extent_contig(struct super_block *sb,
                                     struct ocfs2_extent_rec *ext,
                                     u64 blkno)
{
        u64 blk_end = le64_to_cpu(ext->e_blkno);

        blk_end += ocfs2_clusters_to_blocks(sb,
                                    le16_to_cpu(ext->e_leaf_clusters));

        return blkno == blk_end;
}

static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
                                  struct ocfs2_extent_rec *right)
{
        u32 left_range;

        left_range = le32_to_cpu(left->e_cpos) +
                le16_to_cpu(left->e_leaf_clusters);

        return (left_range == le32_to_cpu(right->e_cpos));
}

static enum ocfs2_contig_type
        ocfs2_extent_contig(struct inode *inode,
                            struct ocfs2_extent_rec *ext,
                            struct ocfs2_extent_rec *insert_rec)
{
        u64 blkno = le64_to_cpu(insert_rec->e_blkno);

        /*
         * Refuse to coalesce extent records with different flag
         * fields - we don't want to mix unwritten extents with user
         * data.
         */
        if (ext->e_flags != insert_rec->e_flags)
                return CONTIG_NONE;

        if (ocfs2_extents_adjacent(ext, insert_rec) &&
            ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
                        return CONTIG_RIGHT;

        blkno = le64_to_cpu(ext->e_blkno);
        if (ocfs2_extents_adjacent(insert_rec, ext) &&
            ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno))
                return CONTIG_LEFT;

        return CONTIG_NONE;
}

/*
 * NOTE: We can have pretty much any combination of contiguousness and
 * appending.
 *
 * The usefulness of APPEND_TAIL is more in that it lets us know that
 * we'll have to update the path to that leaf.
 */
enum ocfs2_append_type {
        APPEND_NONE = 0,
        APPEND_TAIL,
};

enum ocfs2_split_type {
        SPLIT_NONE = 0,
        SPLIT_LEFT,
        SPLIT_RIGHT,
};

struct ocfs2_insert_type {
        enum ocfs2_split_type   ins_split;
        enum ocfs2_append_type  ins_appending;
        enum ocfs2_contig_type  ins_contig;
        int                     ins_contig_index;
        int                     ins_tree_depth;
};

struct ocfs2_merge_ctxt {
        enum ocfs2_contig_type  c_contig_type;
        int                     c_has_empty_extent;
        int                     c_split_covers_rec;
};

/*
 * How many free extents have we got before we need more meta data?
 */
int ocfs2_num_free_extents(struct ocfs2_super *osb,
                           struct inode *inode,
                           struct ocfs2_dinode *fe)
{
        int retval;
        struct ocfs2_extent_list *el;
        struct ocfs2_extent_block *eb;
        struct buffer_head *eb_bh = NULL;

        mlog_entry_void();

        if (!OCFS2_IS_VALID_DINODE(fe)) {
                OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
                retval = -EIO;
                goto bail;
        }

        if (fe->i_last_eb_blk) {
                retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
                                          &eb_bh, OCFS2_BH_CACHED, inode);
                if (retval < 0) {
                        mlog_errno(retval);
                        goto bail;
                }
                eb = (struct ocfs2_extent_block *) eb_bh->b_data;
                el = &eb->h_list;
        } else
                el = &fe->id2.i_list;

        BUG_ON(el->l_tree_depth != 0);

        retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
bail:
        if (eb_bh)
                brelse(eb_bh);

        mlog_exit(retval);
        return retval;
}

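/*
 * Editorial example (not part of the original source): a minimal sketch
 * of how a caller could use ocfs2_num_free_extents() to decide whether
 * metadata blocks must be reserved before inserting an extent.  The
 * helper name and the "one free record is enough" assumption are
 * hypothetical.
 */
static inline int ocfs2_wants_meta_example(struct ocfs2_super *osb,
                                           struct inode *inode,
                                           struct ocfs2_dinode *fe)
{
        int free_extents = ocfs2_num_free_extents(osb, inode, fe);

        if (free_extents < 0)
                return free_extents;    /* propagate -EIO and friends */

        /* A simple, non-splitting insert needs at least one free record. */
        return free_extents == 0;
}
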
/* expects array to already be allocated
 *
 * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
 * l_count for you
 */
static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
                                     handle_t *handle,
                                     struct inode *inode,
                                     int wanted,
                                     struct ocfs2_alloc_context *meta_ac,
                                     struct buffer_head *bhs[])
{
        int count, status, i;
        u16 suballoc_bit_start;
        u32 num_got;
        u64 first_blkno;
        struct ocfs2_extent_block *eb;

        mlog_entry_void();

        count = 0;
        while (count < wanted) {
                status = ocfs2_claim_metadata(osb,
                                              handle,
                                              meta_ac,
                                              wanted - count,
                                              &suballoc_bit_start,
                                              &num_got,
                                              &first_blkno);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
                }

                for(i = count;  i < (num_got + count); i++) {
                        bhs[i] = sb_getblk(osb->sb, first_blkno);
                        if (bhs[i] == NULL) {
                                status = -EIO;
                                mlog_errno(status);
                                goto bail;
                        }
                        ocfs2_set_new_buffer_uptodate(inode, bhs[i]);

                        status = ocfs2_journal_access(handle, inode, bhs[i],
                                                      OCFS2_JOURNAL_ACCESS_CREATE);
                        if (status < 0) {
                                mlog_errno(status);
                                goto bail;
                        }

                        memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
                        eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
                        /* Ok, setup the minimal stuff here. */
                        strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
                        eb->h_blkno = cpu_to_le64(first_blkno);
                        eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
                        eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
                        eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
                        eb->h_list.l_count =
                                cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));

                        suballoc_bit_start++;
                        first_blkno++;

                        /* We'll also be dirtied by the caller, so
                         * this isn't absolutely necessary. */
                        status = ocfs2_journal_dirty(handle, bhs[i]);
                        if (status < 0) {
                                mlog_errno(status);
                                goto bail;
                        }
                }

                count += num_got;
        }

        status = 0;
bail:
        if (status < 0) {
                for(i = 0; i < wanted; i++) {
                        if (bhs[i])
                                brelse(bhs[i]);
                        bhs[i] = NULL;
                }
        }
        mlog_exit(status);
        return status;
}

/*
 * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
 *
 * Returns the sum of the rightmost extent rec logical offset and
 * cluster count.
 *
 * ocfs2_add_branch() uses this to determine what logical cluster
 * value should be populated into the leftmost new branch records.
 *
 * ocfs2_shift_tree_depth() uses this to determine the # clusters
 * value for the new topmost tree record.
 */
static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
{
        int i;

        i = le16_to_cpu(el->l_next_free_rec) - 1;

        return le32_to_cpu(el->l_recs[i].e_cpos) +
                ocfs2_rec_clusters(el, &el->l_recs[i]);
}

/*
 * Add an entire tree branch to our inode. eb_bh is the extent block
 * to start at, if we don't want to start the branch at the dinode
 * structure.
 *
 * last_eb_bh is required as we have to update its next_leaf pointer
 * for the new last extent block.
 *
 * the new branch will be 'empty' in the sense that every block will
 * contain a single record with cluster count == 0.
 */
static int ocfs2_add_branch(struct ocfs2_super *osb,
                            handle_t *handle,
                            struct inode *inode,
                            struct buffer_head *fe_bh,
                            struct buffer_head *eb_bh,
                            struct buffer_head **last_eb_bh,
                            struct ocfs2_alloc_context *meta_ac)
{
        int status, new_blocks, i;
        u64 next_blkno, new_last_eb_blk;
        struct buffer_head *bh;
        struct buffer_head **new_eb_bhs = NULL;
        struct ocfs2_dinode *fe;
        struct ocfs2_extent_block *eb;
        struct ocfs2_extent_list  *eb_el;
        struct ocfs2_extent_list  *el;
        u32 new_cpos;

        mlog_entry_void();

        BUG_ON(!last_eb_bh || !*last_eb_bh);

        fe = (struct ocfs2_dinode *) fe_bh->b_data;

        if (eb_bh) {
                eb = (struct ocfs2_extent_block *) eb_bh->b_data;
                el = &eb->h_list;
        } else
                el = &fe->id2.i_list;

        /* we never add a branch to a leaf. */
        BUG_ON(!el->l_tree_depth);

        new_blocks = le16_to_cpu(el->l_tree_depth);

        /* allocate the number of new eb blocks we need */
        new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
                             GFP_KERNEL);
        if (!new_eb_bhs) {
                status = -ENOMEM;
                mlog_errno(status);
                goto bail;
        }

        status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
                                           meta_ac, new_eb_bhs);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }

        eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
        new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);

        /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
         * linked with the rest of the tree.
         * conversely, new_eb_bhs[0] is the new bottommost leaf.
         *
         * when we leave the loop, new_last_eb_blk will point to the
         * newest leaf, and next_blkno will point to the topmost extent
         * block. */
        next_blkno = new_last_eb_blk = 0;
        for(i = 0; i < new_blocks; i++) {
                bh = new_eb_bhs[i];
                eb = (struct ocfs2_extent_block *) bh->b_data;
                if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
                        OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
                        status = -EIO;
                        goto bail;
                }
                eb_el = &eb->h_list;

                status = ocfs2_journal_access(handle, inode, bh,
                                              OCFS2_JOURNAL_ACCESS_CREATE);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
                }

                eb->h_next_leaf_blk = 0;
                eb_el->l_tree_depth = cpu_to_le16(i);
                eb_el->l_next_free_rec = cpu_to_le16(1);
                /*
                 * This actually counts as an empty extent as
                 * c_clusters == 0
                 */
                eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
                eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
                /*
                 * eb_el isn't always an interior node, but even leaf
                 * nodes want a zero'd flags and reserved field so
                 * this gets the whole 32 bits regardless of use.
                 */
                eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
                if (!eb_el->l_tree_depth)
                        new_last_eb_blk = le64_to_cpu(eb->h_blkno);

                status = ocfs2_journal_dirty(handle, bh);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
                }

                next_blkno = le64_to_cpu(eb->h_blkno);
        }

        /* This is a bit hairy. We want to update up to three blocks
         * here without leaving any of them in an inconsistent state
         * in case of error. We don't have to worry about
         * journal_dirty erroring as it won't unless we've aborted the
         * handle (in which case we would never be here) so reserving
         * the write with journal_access is all we need to do. */
        status = ocfs2_journal_access(handle, inode, *last_eb_bh,
                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }
        status = ocfs2_journal_access(handle, inode, fe_bh,
                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }
        if (eb_bh) {
                status = ocfs2_journal_access(handle, inode, eb_bh,
                                              OCFS2_JOURNAL_ACCESS_WRITE);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
                }
        }

        /* Link the new branch into the rest of the tree (el will
         * either be on the fe, or the extent block passed in. */
        i = le16_to_cpu(el->l_next_free_rec);
        el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
        el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
        el->l_recs[i].e_int_clusters = 0;
        le16_add_cpu(&el->l_next_free_rec, 1);

        /* fe needs a new last extent block pointer, as does the
         * next_leaf on the previously last-extent-block. */
        fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);

        eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
        eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);

        status = ocfs2_journal_dirty(handle, *last_eb_bh);
        if (status < 0)
                mlog_errno(status);
        status = ocfs2_journal_dirty(handle, fe_bh);
        if (status < 0)
                mlog_errno(status);
        if (eb_bh) {
                status = ocfs2_journal_dirty(handle, eb_bh);
                if (status < 0)
                        mlog_errno(status);
        }

        /*
         * Some callers want to track the rightmost leaf so pass it
         * back here.
         */
        brelse(*last_eb_bh);
        get_bh(new_eb_bhs[0]);
        *last_eb_bh = new_eb_bhs[0];

        status = 0;
bail:
        if (new_eb_bhs) {
                for (i = 0; i < new_blocks; i++)
                        if (new_eb_bhs[i])
                                brelse(new_eb_bhs[i]);
                kfree(new_eb_bhs);
        }

        mlog_exit(status);
        return status;
}

/*
 * adds another level to the allocation tree.
 * returns back the new extent block so you can add a branch to it
 * after this call.
 */
static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
                                  handle_t *handle,
                                  struct inode *inode,
                                  struct buffer_head *fe_bh,
                                  struct ocfs2_alloc_context *meta_ac,
                                  struct buffer_head **ret_new_eb_bh)
{
        int status, i;
        u32 new_clusters;
        struct buffer_head *new_eb_bh = NULL;
        struct ocfs2_dinode *fe;
        struct ocfs2_extent_block *eb;
        struct ocfs2_extent_list  *fe_el;
        struct ocfs2_extent_list  *eb_el;

        mlog_entry_void();

        status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
                                           &new_eb_bh);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }

        eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
        if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
                OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
                status = -EIO;
                goto bail;
        }

        eb_el = &eb->h_list;
        fe = (struct ocfs2_dinode *) fe_bh->b_data;
        fe_el = &fe->id2.i_list;

        status = ocfs2_journal_access(handle, inode, new_eb_bh,
                                      OCFS2_JOURNAL_ACCESS_CREATE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }

        /* copy the fe data into the new extent block */
        eb_el->l_tree_depth = fe_el->l_tree_depth;
        eb_el->l_next_free_rec = fe_el->l_next_free_rec;
        for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
                eb_el->l_recs[i] = fe_el->l_recs[i];

        status = ocfs2_journal_dirty(handle, new_eb_bh);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }

        status = ocfs2_journal_access(handle, inode, fe_bh,
                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }

        new_clusters = ocfs2_sum_rightmost_rec(eb_el);

        /* update fe now */
        le16_add_cpu(&fe_el->l_tree_depth, 1);
        fe_el->l_recs[0].e_cpos = 0;
        fe_el->l_recs[0].e_blkno = eb->h_blkno;
        fe_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
        for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
                memset(&fe_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
        fe_el->l_next_free_rec = cpu_to_le16(1);

        /* If this is our 1st tree depth shift, then last_eb_blk
         * becomes the allocated extent block */
        if (fe_el->l_tree_depth == cpu_to_le16(1))
                fe->i_last_eb_blk = eb->h_blkno;

        status = ocfs2_journal_dirty(handle, fe_bh);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }

        *ret_new_eb_bh = new_eb_bh;
        new_eb_bh = NULL;
        status = 0;
bail:
        if (new_eb_bh)
                brelse(new_eb_bh);

        mlog_exit(status);
        return status;
}

/*
 * Should only be called when there is no space left in any of the
 * leaf nodes. What we want to do is find the lowest tree depth
 * non-leaf extent block with room for new records. There are three
 * valid results of this search:
 *
 * 1) a lowest extent block is found, then we pass it back in
 *    *lowest_eb_bh and return '0'
 *
 * 2) the search fails to find anything, but the dinode has room. We
 *    pass NULL back in *lowest_eb_bh, but still return '0'
 *
 * 3) the search fails to find anything AND the dinode is full, in
 *    which case we return > 0
 *
 * return status < 0 indicates an error.
 */
static int ocfs2_find_branch_target(struct ocfs2_super *osb,
                                    struct inode *inode,
                                    struct buffer_head *fe_bh,
                                    struct buffer_head **target_bh)
{
        int status = 0, i;
        u64 blkno;
        struct ocfs2_dinode *fe;
        struct ocfs2_extent_block *eb;
        struct ocfs2_extent_list  *el;
        struct buffer_head *bh = NULL;
        struct buffer_head *lowest_bh = NULL;

        mlog_entry_void();

        *target_bh = NULL;

        fe = (struct ocfs2_dinode *) fe_bh->b_data;
        el = &fe->id2.i_list;

        while(le16_to_cpu(el->l_tree_depth) > 1) {
                if (le16_to_cpu(el->l_next_free_rec) == 0) {
                        ocfs2_error(inode->i_sb, "Dinode %llu has empty "
                                    "extent list (next_free_rec == 0)",
                                    (unsigned long long)OCFS2_I(inode)->ip_blkno);
                        status = -EIO;
                        goto bail;
                }
                i = le16_to_cpu(el->l_next_free_rec) - 1;
                blkno = le64_to_cpu(el->l_recs[i].e_blkno);
                if (!blkno) {
                        ocfs2_error(inode->i_sb, "Dinode %llu has extent "
                                    "list where extent # %d has no physical "
                                    "block start",
                                    (unsigned long long)OCFS2_I(inode)->ip_blkno, i);
                        status = -EIO;
                        goto bail;
                }

                if (bh) {
                        brelse(bh);
                        bh = NULL;
                }

                status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED,
                                          inode);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
                }

                eb = (struct ocfs2_extent_block *) bh->b_data;
                if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
                        OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
                        status = -EIO;
                        goto bail;
                }
                el = &eb->h_list;

                if (le16_to_cpu(el->l_next_free_rec) <
                    le16_to_cpu(el->l_count)) {
                        if (lowest_bh)
                                brelse(lowest_bh);
                        lowest_bh = bh;
                        get_bh(lowest_bh);
                }
        }

        /* If we didn't find one and the fe doesn't have any room,
         * then return '1' */
        if (!lowest_bh
            && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count))
                status = 1;

        *target_bh = lowest_bh;
bail:
        if (bh)
                brelse(bh);

        mlog_exit(status);
        return status;
}

/*
 * Grow a b-tree so that it has more records.
 *
 * We might shift the tree depth in which case existing paths should
 * be considered invalid.
 *
 * Tree depth after the grow is returned via *final_depth.
 *
 * *last_eb_bh will be updated by ocfs2_add_branch().
 */
static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
                           struct buffer_head *di_bh, int *final_depth,
                           struct buffer_head **last_eb_bh,
                           struct ocfs2_alloc_context *meta_ac)
{
        int ret, shift;
        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
        int depth = le16_to_cpu(di->id2.i_list.l_tree_depth);
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        struct buffer_head *bh = NULL;

        BUG_ON(meta_ac == NULL);

        shift = ocfs2_find_branch_target(osb, inode, di_bh, &bh);
        if (shift < 0) {
                ret = shift;
                mlog_errno(ret);
                goto out;
        }

        /* We traveled all the way to the bottom of the allocation tree
         * and didn't find room for any more extents - we need to add
         * another tree level */
        if (shift) {
                BUG_ON(bh);
                mlog(0, "need to shift tree depth (current = %d)\n", depth);

                /* ocfs2_shift_tree_depth will return us a buffer with
                 * the new extent block (so we can pass that to
                 * ocfs2_add_branch). */
                ret = ocfs2_shift_tree_depth(osb, handle, inode, di_bh,
                                             meta_ac, &bh);
                if (ret < 0) {
                        mlog_errno(ret);
                        goto out;
                }
                depth++;
                if (depth == 1) {
                        /*
                         * Special case: we have room now if we shifted from
                         * tree_depth 0, so no more work needs to be done.
                         *
                         * We won't be calling add_branch, so pass
                         * back *last_eb_bh as the new leaf. At depth
                         * zero, it should always be null so there's
                         * no reason to brelse.
                         */
                        BUG_ON(*last_eb_bh);
                        get_bh(bh);
                        *last_eb_bh = bh;
                        goto out;
                }
        }

        /* call ocfs2_add_branch to add the final part of the tree with
         * the new data. */
        mlog(0, "add branch. bh = %p\n", bh);
        ret = ocfs2_add_branch(osb, handle, inode, di_bh, bh, last_eb_bh,
                               meta_ac);
        if (ret < 0) {
                mlog_errno(ret);
                goto out;
        }

out:
        if (final_depth)
                *final_depth = depth;
        brelse(bh);
        return ret;
}

/*
 * This is only valid for leaf nodes, which are the only ones that can
 * have empty extents anyway.
 */
static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
{
        return !rec->e_leaf_clusters;
}

/*
 * This function will discard the rightmost extent record.
 */
static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
{
        int next_free = le16_to_cpu(el->l_next_free_rec);
        int count = le16_to_cpu(el->l_count);
        unsigned int num_bytes;

        BUG_ON(!next_free);
        /* This will cause us to go off the end of our extent list. */
        BUG_ON(next_free >= count);

        num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;

        memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
}

static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
                              struct ocfs2_extent_rec *insert_rec)
{
        int i, insert_index, next_free, has_empty, num_bytes;
        u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
        struct ocfs2_extent_rec *rec;

        next_free = le16_to_cpu(el->l_next_free_rec);
        has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);

        BUG_ON(!next_free);

        /* The tree code before us didn't allow enough room in the leaf. */
        if (el->l_next_free_rec == el->l_count && !has_empty)
                BUG();

        /*
         * The easiest way to approach this is to just remove the
         * empty extent and temporarily decrement next_free.
         */
        if (has_empty) {
                /*
                 * If next_free was 1 (only an empty extent), this
                 * loop won't execute, which is fine. We still want
                 * the decrement above to happen.
                 */
                for(i = 0; i < (next_free - 1); i++)
                        el->l_recs[i] = el->l_recs[i+1];

                next_free--;
        }

        /*
         * Figure out what the new record index should be.
         */
        for(i = 0; i < next_free; i++) {
                rec = &el->l_recs[i];

                if (insert_cpos < le32_to_cpu(rec->e_cpos))
                        break;
        }
        insert_index = i;

        mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n",
             insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count));

        BUG_ON(insert_index < 0);
        BUG_ON(insert_index >= le16_to_cpu(el->l_count));
        BUG_ON(insert_index > next_free);

        /*
         * No need to memmove if we're just adding to the tail.
         */
        if (insert_index != next_free) {
                BUG_ON(next_free >= le16_to_cpu(el->l_count));

                num_bytes = next_free - insert_index;
                num_bytes *= sizeof(struct ocfs2_extent_rec);
                memmove(&el->l_recs[insert_index + 1],
                        &el->l_recs[insert_index],
                        num_bytes);
        }

        /*
         * Either we had an empty extent, and need to re-increment or
         * there was no empty extent on a non full rightmost leaf node,
         * in which case we still need to increment.
         */
        next_free++;
        el->l_next_free_rec = cpu_to_le16(next_free);
        /*
         * Make sure none of the math above just messed up our tree.
         */
        BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));

        el->l_recs[insert_index] = *insert_rec;

}

static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
{
        int size, num_recs = le16_to_cpu(el->l_next_free_rec);

        BUG_ON(num_recs == 0);

        if (ocfs2_is_empty_extent(&el->l_recs[0])) {
                num_recs--;
                size = num_recs * sizeof(struct ocfs2_extent_rec);
                memmove(&el->l_recs[0], &el->l_recs[1], size);
                memset(&el->l_recs[num_recs], 0,
                       sizeof(struct ocfs2_extent_rec));
                el->l_next_free_rec = cpu_to_le16(num_recs);
        }
}

/*
 * Create an empty extent record.
 *
 * l_next_free_rec may be updated.
 *
 * If an empty extent already exists do nothing.
 */
static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
{
        int next_free = le16_to_cpu(el->l_next_free_rec);

        BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);

        if (next_free == 0)
                goto set_and_inc;

        if (ocfs2_is_empty_extent(&el->l_recs[0]))
                return;

        mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
                        "Asked to create an empty extent in a full list:\n"
                        "count = %u, tree depth = %u",
                        le16_to_cpu(el->l_count),
                        le16_to_cpu(el->l_tree_depth));

        ocfs2_shift_records_right(el);

set_and_inc:
        le16_add_cpu(&el->l_next_free_rec, 1);
        memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
}

/*
 * For a rotation which involves two leaf nodes, the "root node" is
 * the lowest level tree node which contains a path to both leaves. This
 * resulting set of information can be used to form a complete "subtree"
 *
 * This function is passed two full paths from the dinode down to a
 * pair of adjacent leaves. Its task is to figure out which path
 * index contains the subtree root - this can be the root index itself
 * in a worst-case rotation.
 *
 * The array index of the subtree root is passed back.
 */
static int ocfs2_find_subtree_root(struct inode *inode,
                                   struct ocfs2_path *left,
                                   struct ocfs2_path *right)
{
        int i = 0;

        /*
         * Check that the caller passed in two paths from the same tree.
         */
        BUG_ON(path_root_bh(left) != path_root_bh(right));

        do {
                i++;

                /*
                 * The caller didn't pass two adjacent paths.
                 */
                mlog_bug_on_msg(i > left->p_tree_depth,
                                "Inode %lu, left depth %u, right depth %u\n"
                                "left leaf blk %llu, right leaf blk %llu\n",
                                inode->i_ino, left->p_tree_depth,
                                right->p_tree_depth,
                                (unsigned long long)path_leaf_bh(left)->b_blocknr,
                                (unsigned long long)path_leaf_bh(right)->b_blocknr);
        } while (left->p_node[i].bh->b_blocknr ==
                 right->p_node[i].bh->b_blocknr);

        return i - 1;
}

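/*
 * Editorial example (not part of the original source): a minimal sketch
 * spelling out the invariant behind the value returned by
 * ocfs2_find_subtree_root() - the subtree root level is still shared by
 * both paths, while the level just below it is where they diverge.  The
 * helper name is hypothetical.
 */
static inline void ocfs2_subtree_root_check_example(struct ocfs2_path *left,
                                                    struct ocfs2_path *right,
                                                    int subtree_index)
{
        /* Both paths still reference the same block at the subtree root. */
        BUG_ON(left->p_node[subtree_index].bh->b_blocknr !=
               right->p_node[subtree_index].bh->b_blocknr);

        /* One level down they point at different (adjacent) children. */
        BUG_ON(left->p_node[subtree_index + 1].bh->b_blocknr ==
               right->p_node[subtree_index + 1].bh->b_blocknr);
}
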
typedef void (path_insert_t)(void *, struct buffer_head *);

/*
 * Traverse a btree path in search of cpos, starting at root_el.
 *
 * This code can be called with a cpos larger than the tree, in which
 * case it will return the rightmost path.
 */
static int __ocfs2_find_path(struct inode *inode,
                             struct ocfs2_extent_list *root_el, u32 cpos,
                             path_insert_t *func, void *data)
{
        int i, ret = 0;
        u32 range;
        u64 blkno;
        struct buffer_head *bh = NULL;
        struct ocfs2_extent_block *eb;
        struct ocfs2_extent_list *el;
        struct ocfs2_extent_rec *rec;
        struct ocfs2_inode_info *oi = OCFS2_I(inode);

        el = root_el;
        while (el->l_tree_depth) {
                if (le16_to_cpu(el->l_next_free_rec) == 0) {
                        ocfs2_error(inode->i_sb,
                                    "Inode %llu has empty extent list at "
                                    "depth %u\n",
                                    (unsigned long long)oi->ip_blkno,
                                    le16_to_cpu(el->l_tree_depth));
                        ret = -EROFS;
                        goto out;

                }

                for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
                        rec = &el->l_recs[i];

                        /*
                         * In the case that cpos is off the allocation
                         * tree, this should just wind up returning the
                         * rightmost record.
                         */
                        range = le32_to_cpu(rec->e_cpos) +
                                ocfs2_rec_clusters(el, rec);
                        if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
                            break;
                }

                blkno = le64_to_cpu(el->l_recs[i].e_blkno);
                if (blkno == 0) {
                        ocfs2_error(inode->i_sb,
                                    "Inode %llu has bad blkno in extent list "
                                    "at depth %u (index %d)\n",
                                    (unsigned long long)oi->ip_blkno,
                                    le16_to_cpu(el->l_tree_depth), i);
                        ret = -EROFS;
                        goto out;
                }

                brelse(bh);
                bh = NULL;
                ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno,
                                       &bh, OCFS2_BH_CACHED, inode);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }

                eb = (struct ocfs2_extent_block *) bh->b_data;
                el = &eb->h_list;
                if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
                        OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
                        ret = -EIO;
                        goto out;
                }

                if (le16_to_cpu(el->l_next_free_rec) >
                    le16_to_cpu(el->l_count)) {
                        ocfs2_error(inode->i_sb,
                                    "Inode %llu has bad count in extent list "
                                    "at block %llu (next free=%u, count=%u)\n",
                                    (unsigned long long)oi->ip_blkno,
                                    (unsigned long long)bh->b_blocknr,
                                    le16_to_cpu(el->l_next_free_rec),
                                    le16_to_cpu(el->l_count));
                        ret = -EROFS;
                        goto out;
                }

                if (func)
                        func(data, bh);
        }

out:
        /*
         * Catch any trailing bh that the loop didn't handle.
         */
        brelse(bh);

        return ret;
}

/*
 * Given an initialized path (that is, it has a valid root extent
 * list), this function will traverse the btree in search of the path
 * which would contain cpos.
 *
 * The path traveled is recorded in the path structure.
 *
 * Note that this will not do any comparisons on leaf node extent
 * records, so it will work fine in the case that we just added a tree
 * branch.
 */
struct find_path_data {
        int index;
        struct ocfs2_path *path;
};
static void find_path_ins(void *data, struct buffer_head *bh)
{
        struct find_path_data *fp = data;

        get_bh(bh);
        ocfs2_path_insert_eb(fp->path, fp->index, bh);
        fp->index++;
}
static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
                           u32 cpos)
{
        struct find_path_data data;

        data.index = 1;
        data.path = path;
        return __ocfs2_find_path(inode, path_root_el(path), cpos,
                                 find_path_ins, &data);
}

static void find_leaf_ins(void *data, struct buffer_head *bh)
{
        struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data;
        struct ocfs2_extent_list *el = &eb->h_list;
        struct buffer_head **ret = data;

        /* We want to retain only the leaf block. */
        if (le16_to_cpu(el->l_tree_depth) == 0) {
                get_bh(bh);
                *ret = bh;
        }
}
/*
 * Find the leaf block in the tree which would contain cpos. No
 * checking of the actual leaf is done.
 *
 * Some paths want to call this instead of allocating a path structure
 * and calling ocfs2_find_path().
 *
 * This function doesn't handle non btree extent lists.
 */
int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
                    u32 cpos, struct buffer_head **leaf_bh)
{
        int ret;
        struct buffer_head *bh = NULL;

        ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }

        *leaf_bh = bh;
out:
        return ret;
}

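/*
 * Editorial example (not part of the original source): a minimal sketch
 * of a cpos lookup against an inode btree using ocfs2_find_leaf().  The
 * helper name is hypothetical; note that for a depth-zero tree no leaf
 * buffer is returned because the dinode itself holds the leaf list.
 */
static inline int ocfs2_leaf_lookup_example(struct inode *inode,
                                            struct buffer_head *di_bh,
                                            u32 cpos)
{
        int ret;
        struct buffer_head *leaf_bh = NULL;
        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;

        ret = ocfs2_find_leaf(inode, &di->id2.i_list, cpos, &leaf_bh);
        if (ret == 0 && leaf_bh) {
                /* ... inspect ((struct ocfs2_extent_block *)
                 *      leaf_bh->b_data)->h_list here ... */
                brelse(leaf_bh);
        }

        return ret;
}
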
/*
1363
 * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
1364
 *
1365
 * Basically, we've moved stuff around at the bottom of the tree and
1366
 * we need to fix up the extent records above the changes to reflect
1367
 * the new changes.
1368
 *
1369
 * left_rec: the record on the left.
1370
 * left_child_el: is the child list pointed to by left_rec
1371
 * right_rec: the record to the right of left_rec
1372
 * right_child_el: is the child list pointed to by right_rec
1373
 *
1374
 * By definition, this only works on interior nodes.
1375
 */
1376
static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
1377
                                  struct ocfs2_extent_list *left_child_el,
1378
                                  struct ocfs2_extent_rec *right_rec,
1379
                                  struct ocfs2_extent_list *right_child_el)
1380
{
1381
        u32 left_clusters, right_end;
1382
 
1383
        /*
1384
         * Interior nodes never have holes. Their cpos is the cpos of
1385
         * the leftmost record in their child list. Their cluster
1386
         * count covers the full theoretical range of their child list
1387
         * - the range between their cpos and the cpos of the record
1388
         * immediately to their right.
1389
         */
1390
        left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
1391
        if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) {
1392
                BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
1393
                left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
1394
        }
1395
        left_clusters -= le32_to_cpu(left_rec->e_cpos);
1396
        left_rec->e_int_clusters = cpu_to_le32(left_clusters);
1397
 
1398
        /*
1399
         * Calculate the rightmost cluster count boundary before
1400
         * moving cpos - we will need to adjust clusters after
1401
         * updating e_cpos to keep the same highest cluster count.
1402
         */
1403
        right_end = le32_to_cpu(right_rec->e_cpos);
1404
        right_end += le32_to_cpu(right_rec->e_int_clusters);
1405
 
1406
        right_rec->e_cpos = left_rec->e_cpos;
1407
        le32_add_cpu(&right_rec->e_cpos, left_clusters);
1408
 
1409
        right_end -= le32_to_cpu(right_rec->e_cpos);
1410
        right_rec->e_int_clusters = cpu_to_le32(right_end);
1411
}
1412
 
1413
/*
1414
 * Adjust the adjacent root node records involved in a
1415
 * rotation. left_el_blkno is passed in as a key so that we can easily
1416
 * find it's index in the root list.
1417
 */
1418
static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
1419
                                      struct ocfs2_extent_list *left_el,
1420
                                      struct ocfs2_extent_list *right_el,
1421
                                      u64 left_el_blkno)
1422
{
1423
        int i;
1424
 
1425
        BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
1426
               le16_to_cpu(left_el->l_tree_depth));
1427
 
1428
        for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
1429
                if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
1430
                        break;
1431
        }
1432
 
1433
        /*
1434
         * The path walking code should have never returned a root and
1435
         * two paths which are not adjacent.
1436
         */
1437
        BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
1438
 
1439
        ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
1440
                                      &root_el->l_recs[i + 1], right_el);
1441
}
1442
 
1443
/*
1444
 * We've changed a leaf block (in right_path) and need to reflect that
1445
 * change back up the subtree.
1446
 *
1447
 * This happens in multiple places:
1448
 *   - When we've moved an extent record from the left path leaf to the right
1449
 *     path leaf to make room for an empty extent in the left path leaf.
1450
 *   - When our insert into the right path leaf is at the leftmost edge
1451
 *     and requires an update of the path immediately to it's left. This
1452
 *     can occur at the end of some types of rotation and appending inserts.
1453
 */
1454
static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
1455
                                       struct ocfs2_path *left_path,
1456
                                       struct ocfs2_path *right_path,
1457
                                       int subtree_index)
1458
{
1459
        int ret, i, idx;
1460
        struct ocfs2_extent_list *el, *left_el, *right_el;
1461
        struct ocfs2_extent_rec *left_rec, *right_rec;
1462
        struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
1463
 
1464
        /*
1465
         * Update the counts and position values within all the
1466
         * interior nodes to reflect the leaf rotation we just did.
1467
         *
1468
         * The root node is handled below the loop.
1469
         *
1470
         * We begin the loop with right_el and left_el pointing to the
1471
         * leaf lists and work our way up.
1472
         *
1473
         * NOTE: within this loop, left_el and right_el always refer
1474
         * to the *child* lists.
1475
         */
1476
        left_el = path_leaf_el(left_path);
1477
        right_el = path_leaf_el(right_path);
1478
        for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
1479
                mlog(0, "Adjust records at index %u\n", i);
1480
 
1481
                /*
1482
                 * One nice property of knowing that all of these
1483
                 * nodes are below the root is that we only deal with
1484
                 * the leftmost right node record and the rightmost
1485
                 * left node record.
1486
                 */
1487
                el = left_path->p_node[i].el;
1488
                idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
1489
                left_rec = &el->l_recs[idx];
1490
 
1491
                el = right_path->p_node[i].el;
1492
                right_rec = &el->l_recs[0];
1493
 
1494
                ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
1495
                                              right_el);
1496
 
1497
                ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
1498
                if (ret)
1499
                        mlog_errno(ret);
1500
 
1501
                ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
1502
                if (ret)
1503
                        mlog_errno(ret);
1504
 
1505
                /*
1506
                 * Set up our list pointers now so that the current
1507
                 * parents become children in the next iteration.
1508
                 */
1509
                left_el = left_path->p_node[i].el;
1510
                right_el = right_path->p_node[i].el;
1511
        }
1512
 
1513
        /*
1514
         * At the root node, adjust the two adjacent records which
1515
         * begin our path to the leaves.
1516
         */
1517
 
1518
        el = left_path->p_node[subtree_index].el;
1519
        left_el = left_path->p_node[subtree_index + 1].el;
1520
        right_el = right_path->p_node[subtree_index + 1].el;
1521
 
1522
        ocfs2_adjust_root_records(el, left_el, right_el,
1523
                                  left_path->p_node[subtree_index + 1].bh->b_blocknr);
1524
 
1525
        root_bh = left_path->p_node[subtree_index].bh;
1526
 
1527
        ret = ocfs2_journal_dirty(handle, root_bh);
1528
        if (ret)
1529
                mlog_errno(ret);
1530
}
1531
 
1532
static int ocfs2_rotate_subtree_right(struct inode *inode,
1533
                                      handle_t *handle,
1534
                                      struct ocfs2_path *left_path,
1535
                                      struct ocfs2_path *right_path,
1536
                                      int subtree_index)
1537
{
1538
        int ret, i;
1539
        struct buffer_head *right_leaf_bh;
1540
        struct buffer_head *left_leaf_bh = NULL;
1541
        struct buffer_head *root_bh;
1542
        struct ocfs2_extent_list *right_el, *left_el;
1543
        struct ocfs2_extent_rec move_rec;
1544
 
1545
        left_leaf_bh = path_leaf_bh(left_path);
1546
        left_el = path_leaf_el(left_path);
1547
 
1548
        if (left_el->l_next_free_rec != left_el->l_count) {
1549
                ocfs2_error(inode->i_sb,
1550
                            "Inode %llu has non-full interior leaf node %llu"
1551
                            "(next free = %u)",
1552
                            (unsigned long long)OCFS2_I(inode)->ip_blkno,
1553
                            (unsigned long long)left_leaf_bh->b_blocknr,
1554
                            le16_to_cpu(left_el->l_next_free_rec));
1555
                return -EROFS;
1556
        }
1557
 
1558
        /*
1559
         * This extent block may already have an empty record, so we
1560
         * return early if so.
1561
         */
1562
        if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
1563
                return 0;
1564
 
1565
        root_bh = left_path->p_node[subtree_index].bh;
1566
        BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
1567
 
1568
        ret = ocfs2_journal_access(handle, inode, root_bh,
1569
                                   OCFS2_JOURNAL_ACCESS_WRITE);
1570
        if (ret) {
1571
                mlog_errno(ret);
1572
                goto out;
1573
        }
1574
 
1575
        for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
1576
                ret = ocfs2_journal_access(handle, inode,
1577
                                           right_path->p_node[i].bh,
1578
                                           OCFS2_JOURNAL_ACCESS_WRITE);
1579
                if (ret) {
1580
                        mlog_errno(ret);
1581
                        goto out;
1582
                }
1583
 
1584
                ret = ocfs2_journal_access(handle, inode,
1585
                                           left_path->p_node[i].bh,
1586
                                           OCFS2_JOURNAL_ACCESS_WRITE);
1587
                if (ret) {
1588
                        mlog_errno(ret);
1589
                        goto out;
1590
                }
1591
        }
1592
 
1593
        right_leaf_bh = path_leaf_bh(right_path);
1594
        right_el = path_leaf_el(right_path);
1595
 
1596
        /* This is a code error, not a disk corruption. */
1597
        mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
1598
                        "because rightmost leaf block %llu is empty\n",
1599
                        (unsigned long long)OCFS2_I(inode)->ip_blkno,
1600
                        (unsigned long long)right_leaf_bh->b_blocknr);
1601
 
1602
        ocfs2_create_empty_extent(right_el);
1603
 
1604
        ret = ocfs2_journal_dirty(handle, right_leaf_bh);
1605
        if (ret) {
1606
                mlog_errno(ret);
1607
                goto out;
1608
        }
1609
 
1610
        /* Do the copy now. */
1611
        i = le16_to_cpu(left_el->l_next_free_rec) - 1;
1612
        move_rec = left_el->l_recs[i];
1613
        right_el->l_recs[0] = move_rec;
1614
 
1615
        /*
1616
         * Clear out the record we just copied and shift everything
1617
         * over, leaving an empty extent in the left leaf.
1618
         *
1619
         * We temporarily subtract from next_free_rec so that the
1620
         * shift will lose the tail record (which is now defunct).
1621
         */
1622
        le16_add_cpu(&left_el->l_next_free_rec, -1);
1623
        ocfs2_shift_records_right(left_el);
1624
        memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1625
        le16_add_cpu(&left_el->l_next_free_rec, 1);
1626
 
1627
        ret = ocfs2_journal_dirty(handle, left_leaf_bh);
1628
        if (ret) {
1629
                mlog_errno(ret);
1630
                goto out;
1631
        }
1632
 
1633
        ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
1634
                                subtree_index);
1635
 
1636
out:
1637
        return ret;
1638
}
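/*
 * Sketch of the subtree rotate above, with hypothetical leaf contents
 * and assuming the right leaf has room for one more record:
 *
 *   before:  left leaf [A][B][C][D]   right leaf [E][F]
 *   after:   left leaf [ ][A][B][C]   right leaf [D][E][F]
 *
 * The last record of the left leaf (D) is copied into the empty slot
 * created at the front of the right leaf, and the left leaf is shifted
 * right around a fresh empty record.  ocfs2_complete_edge_insert()
 * then updates the interior records to match the new leaf boundaries.
 */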
1639
 
1640
/*
1641
 * Given a full path, determine what cpos value would return us a path
1642
 * containing the leaf immediately to the left of the current one.
1643
 *
1644
 * Will return zero if the path passed in is already the leftmost path.
1645
 */
1646
static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
1647
                                         struct ocfs2_path *path, u32 *cpos)
1648
{
1649
        int i, j, ret = 0;
1650
        u64 blkno;
1651
        struct ocfs2_extent_list *el;
1652
 
1653
        BUG_ON(path->p_tree_depth == 0);
1654
 
1655
        *cpos = 0;
1656
 
1657
        blkno = path_leaf_bh(path)->b_blocknr;
1658
 
1659
        /* Start at the tree node just above the leaf and work our way up. */
1660
        i = path->p_tree_depth - 1;
1661
        while (i >= 0) {
1662
                el = path->p_node[i].el;
1663
 
1664
                /*
1665
                 * Find the extent record just before the one in our
1666
                 * path.
1667
                 */
1668
                for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
1669
                        if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
1670
                                if (j == 0) {
1671
                                        if (i == 0) {
1672
                                                /*
1673
                                                 * We've determined that the
1674
                                                 * path specified is already
1675
                                                 * the leftmost one - return a
1676
                                                 * cpos of zero.
1677
                                                 */
1678
                                                goto out;
1679
                                        }
1680
                                        /*
1681
                                         * The leftmost record points to our
1682
                                         * leaf - we need to travel up the
1683
                                         * tree one level.
1684
                                         */
1685
                                        goto next_node;
1686
                                }
1687
 
1688
                                *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
1689
                                *cpos = *cpos + ocfs2_rec_clusters(el,
1690
                                                           &el->l_recs[j - 1]);
1691
                                *cpos = *cpos - 1;
1692
                                goto out;
1693
                        }
1694
                }
1695
 
1696
                /*
1697
                 * If we got here, we never found a valid node where
1698
                 * the tree indicated one should be.
1699
                 */
1700
                ocfs2_error(sb,
1701
                            "Invalid extent tree at extent block %llu\n",
1702
                            (unsigned long long)blkno);
1703
                ret = -EROFS;
1704
                goto out;
1705
 
1706
next_node:
1707
                blkno = path->p_node[i].bh->b_blocknr;
1708
                i--;
1709
        }
1710
 
1711
out:
1712
        return ret;
1713
}
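/*
 * Example of the cpos calculation above (numbers are hypothetical):
 * if the interior record just before ours starts at cpos 64 and
 * covers 16 clusters, *cpos = 64 + 16 - 1 = 79.  Any cluster in that
 * record's range would do - the last one is simply a value that is
 * guaranteed to land in the leaf immediately to our left.
 */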
1714
 
1715
/*
1716
 * Extend the transaction by enough credits to complete the rotation,
1717
 * and still leave at least the original number of credits allocated
1718
 * to this transaction.
1719
 */
1720
static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
1721
                                           int op_credits,
1722
                                           struct ocfs2_path *path)
1723
{
1724
        int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
1725
 
1726
        if (handle->h_buffer_credits < credits)
1727
                return ocfs2_extend_trans(handle, credits);
1728
 
1729
        return 0;
1730
}
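/*
 * Credit arithmetic example (numbers are hypothetical): with a tree
 * of depth 3, a subtree root at depth 1 and op_credits == 10 we ask
 * for (3 - 1) * 2 + 1 + 10 = 15 credits - enough for the left and
 * right buffers at each level below the subtree root, the shared
 * subtree root itself, and the caller's original reservation.  The
 * transaction is only extended if it currently holds fewer credits
 * than that.
 */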
1731
 
1732
/*
1733
 * Trap the case where we're inserting into the theoretical range past
1734
 * the _actual_ left leaf range. Otherwise, we'll rotate a record
1735
 * whose cpos is less than ours into the right leaf.
1736
 *
1737
 * It's only necessary to look at the rightmost record of the left
1738
 * leaf because the logic that calls us should ensure that the
1739
 * theoretical ranges in the path components above the leaves are
1740
 * correct.
1741
 */
1742
static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
1743
                                                 u32 insert_cpos)
1744
{
1745
        struct ocfs2_extent_list *left_el;
1746
        struct ocfs2_extent_rec *rec;
1747
        int next_free;
1748
 
1749
        left_el = path_leaf_el(left_path);
1750
        next_free = le16_to_cpu(left_el->l_next_free_rec);
1751
        rec = &left_el->l_recs[next_free - 1];
1752
 
1753
        if (insert_cpos > le32_to_cpu(rec->e_cpos))
1754
                return 1;
1755
        return 0;
1756
}
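/*
 * For instance (hypothetical values): if the rightmost record of the
 * left leaf starts at cpos 40 and we are inserting at cpos 50, the
 * insert logically belongs to the right of everything in that leaf,
 * so we return 1 and leave the remaining work to ocfs2_insert_path()
 * rather than rotate a smaller-cpos record into the right leaf.
 */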
1757
 
1758
static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
1759
{
1760
        int next_free = le16_to_cpu(el->l_next_free_rec);
1761
        unsigned int range;
1762
        struct ocfs2_extent_rec *rec;
1763
 
1764
        if (next_free == 0)
1765
                return 0;
1766
 
1767
        rec = &el->l_recs[0];
1768
        if (ocfs2_is_empty_extent(rec)) {
1769
                /* Empty list. */
1770
                if (next_free == 1)
1771
                        return 0;
1772
                rec = &el->l_recs[1];
1773
        }
1774
 
1775
        range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
1776
        if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1777
                return 1;
1778
        return 0;
1779
}
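/*
 * Example (hypothetical values): with a leading empty record followed
 * by a record at cpos 20 covering 10 clusters, the live range is
 * [20, 30).  A cpos of 25 returns 1; a cpos of 19 or 30 returns 0.
 */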
1780
 
1781
/*
1782
 * Rotate all the records in a btree right one record, starting at insert_cpos.
1783
 *
1784
 * The path to the rightmost leaf should be passed in.
1785
 *
1786
 * The array is assumed to be large enough to hold an entire path (tree depth).
1787
 *
1788
 * Upon successful return from this function:
1789
 *
1790
 * - The 'right_path' array will contain a path to the leaf block
1791
 *   whose range contains insert_cpos.
1792
 * - That leaf block will have a single empty extent in list index 0.
1793
 * - In the case that the rotation requires a post-insert update,
1794
 *   *ret_left_path will contain a valid path which can be passed to
1795
 *   ocfs2_insert_path().
1796
 */
1797
static int ocfs2_rotate_tree_right(struct inode *inode,
1798
                                   handle_t *handle,
1799
                                   enum ocfs2_split_type split,
1800
                                   u32 insert_cpos,
1801
                                   struct ocfs2_path *right_path,
1802
                                   struct ocfs2_path **ret_left_path)
1803
{
1804
        int ret, start, orig_credits = handle->h_buffer_credits;
1805
        u32 cpos;
1806
        struct ocfs2_path *left_path = NULL;
1807
 
1808
        *ret_left_path = NULL;
1809
 
1810
        left_path = ocfs2_new_path(path_root_bh(right_path),
1811
                                   path_root_el(right_path));
1812
        if (!left_path) {
1813
                ret = -ENOMEM;
1814
                mlog_errno(ret);
1815
                goto out;
1816
        }
1817
 
1818
        ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos);
1819
        if (ret) {
1820
                mlog_errno(ret);
1821
                goto out;
1822
        }
1823
 
1824
        mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos);
1825
 
1826
        /*
1827
         * What we want to do here is:
1828
         *
1829
         * 1) Start with the rightmost path.
1830
         *
1831
         * 2) Determine a path to the leaf block directly to the left
1832
         *    of that leaf.
1833
         *
1834
         * 3) Determine the 'subtree root' - the lowest level tree node
1835
         *    which contains a path to both leaves.
1836
         *
1837
         * 4) Rotate the subtree.
1838
         *
1839
         * 5) Find the next subtree by considering the left path to be
1840
         *    the new right path.
1841
         *
1842
         * The check at the top of this while loop also accepts
1843
         * insert_cpos == cpos because cpos is only a _theoretical_
1844
         * value to get us the left path - insert_cpos might very well
1845
         * be filling that hole.
1846
         *
1847
         * Stop at a cpos of '0' because we either started at the
1848
         * leftmost branch (i.e., a tree with one branch and a
1849
         * rotation inside of it), or we've gone as far as we can in
1850
         * rotating subtrees.
1851
         */
1852
        while (cpos && insert_cpos <= cpos) {
1853
                mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
1854
                     insert_cpos, cpos);
1855
 
1856
                ret = ocfs2_find_path(inode, left_path, cpos);
1857
                if (ret) {
1858
                        mlog_errno(ret);
1859
                        goto out;
1860
                }
1861
 
1862
                mlog_bug_on_msg(path_leaf_bh(left_path) ==
1863
                                path_leaf_bh(right_path),
1864
                                "Inode %lu: error during insert of %u "
1865
                                "(left path cpos %u) results in two identical "
1866
                                "paths ending at %llu\n",
1867
                                inode->i_ino, insert_cpos, cpos,
1868
                                (unsigned long long)
1869
                                path_leaf_bh(left_path)->b_blocknr);
1870
 
1871
                if (split == SPLIT_NONE &&
1872
                    ocfs2_rotate_requires_path_adjustment(left_path,
1873
                                                          insert_cpos)) {
1874
 
1875
                        /*
1876
                         * We've rotated the tree as much as we
1877
                         * should. The rest is up to
1878
                         * ocfs2_insert_path() to complete, after the
1879
                         * record insertion. We indicate this
1880
                         * situation by returning the left path.
1881
                         *
1882
                         * The reason we don't adjust the records here
1883
                         * before the record insert is that an error
1884
                         * later might break the rule where a parent
1885
                         * record e_cpos will reflect the actual
1886
                         * e_cpos of the 1st nonempty record of the
1887
                         * child list.
1888
                         */
1889
                        *ret_left_path = left_path;
1890
                        goto out_ret_path;
1891
                }
1892
 
1893
                start = ocfs2_find_subtree_root(inode, left_path, right_path);
1894
 
1895
                mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
1896
                     start,
1897
                     (unsigned long long) right_path->p_node[start].bh->b_blocknr,
1898
                     right_path->p_tree_depth);
1899
 
1900
                ret = ocfs2_extend_rotate_transaction(handle, start,
1901
                                                      orig_credits, right_path);
1902
                if (ret) {
1903
                        mlog_errno(ret);
1904
                        goto out;
1905
                }
1906
 
1907
                ret = ocfs2_rotate_subtree_right(inode, handle, left_path,
1908
                                                 right_path, start);
1909
                if (ret) {
1910
                        mlog_errno(ret);
1911
                        goto out;
1912
                }
1913
 
1914
                if (split != SPLIT_NONE &&
1915
                    ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
1916
                                                insert_cpos)) {
1917
                        /*
1918
                         * A rotate moves the rightmost left leaf
1919
                         * record over to the leftmost right leaf
1920
                         * slot. If we're doing an extent split
1921
                         * instead of a real insert, then we have to
1922
                         * check that the extent to be split wasn't
1923
                         * just moved over. If it was, then we can
1924
                         * exit here, passing left_path back -
1925
                         * ocfs2_split_extent() is smart enough to
1926
                         * search both leaves.
1927
                         */
1928
                        *ret_left_path = left_path;
1929
                        goto out_ret_path;
1930
                }
1931
 
1932
                /*
1933
                 * There is no need to re-read the next right path
1934
                 * as we know that it'll be our current left
1935
                 * path. Optimize by copying values instead.
1936
                 */
1937
                ocfs2_mv_path(right_path, left_path);
1938
 
1939
                ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
1940
                                                    &cpos);
1941
                if (ret) {
1942
                        mlog_errno(ret);
1943
                        goto out;
1944
                }
1945
        }
1946
 
1947
out:
1948
        ocfs2_free_path(left_path);
1949
 
1950
out_ret_path:
1951
        return ret;
1952
}
1953
 
1954
static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
1955
                                      struct ocfs2_path *path)
1956
{
1957
        int i, idx;
1958
        struct ocfs2_extent_rec *rec;
1959
        struct ocfs2_extent_list *el;
1960
        struct ocfs2_extent_block *eb;
1961
        u32 range;
1962
 
1963
        /* Path should always be rightmost. */
1964
        eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
1965
        BUG_ON(eb->h_next_leaf_blk != 0ULL);
1966
 
1967
        el = &eb->h_list;
1968
        BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
1969
        idx = le16_to_cpu(el->l_next_free_rec) - 1;
1970
        rec = &el->l_recs[idx];
1971
        range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
1972
 
1973
        for (i = 0; i < path->p_tree_depth; i++) {
1974
                el = path->p_node[i].el;
1975
                idx = le16_to_cpu(el->l_next_free_rec) - 1;
1976
                rec = &el->l_recs[idx];
1977
 
1978
                rec->e_int_clusters = cpu_to_le32(range);
1979
                le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));
1980
 
1981
                ocfs2_journal_dirty(handle, path->p_node[i].bh);
1982
        }
1983
}
1984
 
1985
static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
1986
                              struct ocfs2_cached_dealloc_ctxt *dealloc,
1987
                              struct ocfs2_path *path, int unlink_start)
1988
{
1989
        int ret, i;
1990
        struct ocfs2_extent_block *eb;
1991
        struct ocfs2_extent_list *el;
1992
        struct buffer_head *bh;
1993
 
1994
        for(i = unlink_start; i < path_num_items(path); i++) {
1995
                bh = path->p_node[i].bh;
1996
 
1997
                eb = (struct ocfs2_extent_block *)bh->b_data;
1998
                /*
1999
                 * Not all nodes might have had their final count
2000
                 * decremented by the caller - handle this here.
2001
                 */
2002
                el = &eb->h_list;
2003
                if (le16_to_cpu(el->l_next_free_rec) > 1) {
2004
                        mlog(ML_ERROR,
2005
                             "Inode %llu, attempted to remove extent block "
2006
                             "%llu with %u records\n",
2007
                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
2008
                             (unsigned long long)le64_to_cpu(eb->h_blkno),
2009
                             le16_to_cpu(el->l_next_free_rec));
2010
 
2011
                        ocfs2_journal_dirty(handle, bh);
2012
                        ocfs2_remove_from_cache(inode, bh);
2013
                        continue;
2014
                }
2015
 
2016
                el->l_next_free_rec = 0;
2017
                memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2018
 
2019
                ocfs2_journal_dirty(handle, bh);
2020
 
2021
                ret = ocfs2_cache_extent_block_free(dealloc, eb);
2022
                if (ret)
2023
                        mlog_errno(ret);
2024
 
2025
                ocfs2_remove_from_cache(inode, bh);
2026
        }
2027
}
2028
 
2029
static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle,
2030
                                 struct ocfs2_path *left_path,
2031
                                 struct ocfs2_path *right_path,
2032
                                 int subtree_index,
2033
                                 struct ocfs2_cached_dealloc_ctxt *dealloc)
2034
{
2035
        int i;
2036
        struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2037
        struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
2038
        struct ocfs2_extent_list *el;
2039
        struct ocfs2_extent_block *eb;
2040
 
2041
        el = path_leaf_el(left_path);
2042
 
2043
        eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
2044
 
2045
        for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
2046
                if (root_el->l_recs[i].e_blkno == eb->h_blkno)
2047
                        break;
2048
 
2049
        BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));
2050
 
2051
        memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
2052
        le16_add_cpu(&root_el->l_next_free_rec, -1);
2053
 
2054
        eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2055
        eb->h_next_leaf_blk = 0;
2056
 
2057
        ocfs2_journal_dirty(handle, root_bh);
2058
        ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2059
 
2060
        ocfs2_unlink_path(inode, handle, dealloc, right_path,
2061
                          subtree_index + 1);
2062
}
2063
 
2064
static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2065
                                     struct ocfs2_path *left_path,
2066
                                     struct ocfs2_path *right_path,
2067
                                     int subtree_index,
2068
                                     struct ocfs2_cached_dealloc_ctxt *dealloc,
2069
                                     int *deleted)
2070
{
2071
        int ret, i, del_right_subtree = 0, right_has_empty = 0;
2072
        struct buffer_head *root_bh, *di_bh = path_root_bh(right_path);
2073
        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2074
        struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
2075
        struct ocfs2_extent_block *eb;
2076
 
2077
        *deleted = 0;
2078
 
2079
        right_leaf_el = path_leaf_el(right_path);
2080
        left_leaf_el = path_leaf_el(left_path);
2081
        root_bh = left_path->p_node[subtree_index].bh;
2082
        BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2083
 
2084
        if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
2085
                return 0;
2086
 
2087
        eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
2088
        if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
2089
                /*
2090
                 * It's legal for us to proceed if the right leaf is
2091
                 * the rightmost one and it has an empty extent. There
2092
                 * are two cases to handle - whether the leaf will be
2093
                 * empty after removal or not. If the leaf isn't empty
2094
                 * then just remove the empty extent up front. The
2095
                 * next block will handle empty leaves by flagging
2096
                 * them for unlink.
2097
                 *
2098
                  * Non-rightmost leaves will throw -EAGAIN and the
2099
                 * caller can manually move the subtree and retry.
2100
                 */
2101
 
2102
                if (eb->h_next_leaf_blk != 0ULL)
2103
                        return -EAGAIN;
2104
 
2105
                if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
2106
                        ret = ocfs2_journal_access(handle, inode,
2107
                                                   path_leaf_bh(right_path),
2108
                                                   OCFS2_JOURNAL_ACCESS_WRITE);
2109
                        if (ret) {
2110
                                mlog_errno(ret);
2111
                                goto out;
2112
                        }
2113
 
2114
                        ocfs2_remove_empty_extent(right_leaf_el);
2115
                } else
2116
                        right_has_empty = 1;
2117
        }
2118
 
2119
        if (eb->h_next_leaf_blk == 0ULL &&
2120
            le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
2121
                /*
2122
                 * We have to update i_last_eb_blk during the meta
2123
                 * data delete.
2124
                 */
2125
                ret = ocfs2_journal_access(handle, inode, di_bh,
2126
                                           OCFS2_JOURNAL_ACCESS_WRITE);
2127
                if (ret) {
2128
                        mlog_errno(ret);
2129
                        goto out;
2130
                }
2131
 
2132
                del_right_subtree = 1;
2133
        }
2134
 
2135
        /*
2136
         * Getting here with an empty extent in the right path implies
2137
         * that it's the rightmost path and will be deleted.
2138
         */
2139
        BUG_ON(right_has_empty && !del_right_subtree);
2140
 
2141
        ret = ocfs2_journal_access(handle, inode, root_bh,
2142
                                   OCFS2_JOURNAL_ACCESS_WRITE);
2143
        if (ret) {
2144
                mlog_errno(ret);
2145
                goto out;
2146
        }
2147
 
2148
        for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2149
                ret = ocfs2_journal_access(handle, inode,
2150
                                           right_path->p_node[i].bh,
2151
                                           OCFS2_JOURNAL_ACCESS_WRITE);
2152
                if (ret) {
2153
                        mlog_errno(ret);
2154
                        goto out;
2155
                }
2156
 
2157
                ret = ocfs2_journal_access(handle, inode,
2158
                                           left_path->p_node[i].bh,
2159
                                           OCFS2_JOURNAL_ACCESS_WRITE);
2160
                if (ret) {
2161
                        mlog_errno(ret);
2162
                        goto out;
2163
                }
2164
        }
2165
 
2166
        if (!right_has_empty) {
2167
                /*
2168
                 * Only do this if we're moving a real
2169
                 * record. Otherwise, the action is delayed until
2170
                 * after removal of the right path in which case we
2171
                 * can do a simple shift to remove the empty extent.
2172
                 */
2173
                ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
2174
                memset(&right_leaf_el->l_recs[0], 0,
2175
                       sizeof(struct ocfs2_extent_rec));
2176
        }
2177
        if (eb->h_next_leaf_blk == 0ULL) {
2178
                /*
2179
                 * Move recs over to get rid of empty extent, decrease
2180
                 * next_free. This is allowed to remove the last
2181
                 * extent in our leaf (setting l_next_free_rec to
2182
                 * zero) - the delete code below won't care.
2183
                 */
2184
                ocfs2_remove_empty_extent(right_leaf_el);
2185
        }
2186
 
2187
        ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2188
        if (ret)
2189
                mlog_errno(ret);
2190
        ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2191
        if (ret)
2192
                mlog_errno(ret);
2193
 
2194
        if (del_right_subtree) {
2195
                ocfs2_unlink_subtree(inode, handle, left_path, right_path,
2196
                                     subtree_index, dealloc);
2197
                ocfs2_update_edge_lengths(inode, handle, left_path);
2198
 
2199
                eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2200
                di->i_last_eb_blk = eb->h_blkno;
2201
 
2202
                /*
2203
                 * Removal of the extent in the left leaf was skipped
2204
                 * above so we could delete the right path
2205
                 * 1st.
2206
                 */
2207
                if (right_has_empty)
2208
                        ocfs2_remove_empty_extent(left_leaf_el);
2209
 
2210
                ret = ocfs2_journal_dirty(handle, di_bh);
2211
                if (ret)
2212
                        mlog_errno(ret);
2213
 
2214
                *deleted = 1;
2215
        } else
2216
                ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
2217
                                           subtree_index);
2218
 
2219
out:
2220
        return ret;
2221
}
2222
 
2223
/*
2224
 * Given a full path, determine what cpos value would return us a path
2225
 * containing the leaf immediately to the right of the current one.
2226
 *
2227
 * Will return zero if the path passed in is already the rightmost path.
2228
 *
2229
 * This looks similar, but is subtly different to
2230
 * ocfs2_find_cpos_for_left_leaf().
2231
 */
2232
static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
2233
                                          struct ocfs2_path *path, u32 *cpos)
2234
{
2235
        int i, j, ret = 0;
2236
        u64 blkno;
2237
        struct ocfs2_extent_list *el;
2238
 
2239
        *cpos = 0;
2240
 
2241
        if (path->p_tree_depth == 0)
2242
                return 0;
2243
 
2244
        blkno = path_leaf_bh(path)->b_blocknr;
2245
 
2246
        /* Start at the tree node just above the leaf and work our way up. */
2247
        i = path->p_tree_depth - 1;
2248
        while (i >= 0) {
2249
                int next_free;
2250
 
2251
                el = path->p_node[i].el;
2252
 
2253
                /*
2254
                 * Find the extent record just after the one in our
2255
                 * path.
2256
                 */
2257
                next_free = le16_to_cpu(el->l_next_free_rec);
2258
                for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2259
                        if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2260
                                if (j == (next_free - 1)) {
2261
                                        if (i == 0) {
2262
                                                /*
2263
                                                 * We've determined that the
2264
                                                 * path specified is already
2265
                                                 * the rightmost one - return a
2266
                                                 * cpos of zero.
2267
                                                 */
2268
                                                goto out;
2269
                                        }
2270
                                        /*
2271
                                         * The rightmost record points to our
2272
                                         * leaf - we need to travel up the
2273
                                         * tree one level.
2274
                                         */
2275
                                        goto next_node;
2276
                                }
2277
 
2278
                                *cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
2279
                                goto out;
2280
                        }
2281
                }
2282
 
2283
                /*
2284
                 * If we got here, we never found a valid node where
2285
                 * the tree indicated one should be.
2286
                 */
2287
                ocfs2_error(sb,
2288
                            "Invalid extent tree at extent block %llu\n",
2289
                            (unsigned long long)blkno);
2290
                ret = -EROFS;
2291
                goto out;
2292
 
2293
next_node:
2294
                blkno = path->p_node[i].bh->b_blocknr;
2295
                i--;
2296
        }
2297
 
2298
out:
2299
        return ret;
2300
}
2301
 
2302
static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
2303
                                            handle_t *handle,
2304
                                            struct buffer_head *bh,
2305
                                            struct ocfs2_extent_list *el)
2306
{
2307
        int ret;
2308
 
2309
        if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2310
                return 0;
2311
 
2312
        ret = ocfs2_journal_access(handle, inode, bh,
2313
                                   OCFS2_JOURNAL_ACCESS_WRITE);
2314
        if (ret) {
2315
                mlog_errno(ret);
2316
                goto out;
2317
        }
2318
 
2319
        ocfs2_remove_empty_extent(el);
2320
 
2321
        ret = ocfs2_journal_dirty(handle, bh);
2322
        if (ret)
2323
                mlog_errno(ret);
2324
 
2325
out:
2326
        return ret;
2327
}
2328
 
2329
static int __ocfs2_rotate_tree_left(struct inode *inode,
2330
                                    handle_t *handle, int orig_credits,
2331
                                    struct ocfs2_path *path,
2332
                                    struct ocfs2_cached_dealloc_ctxt *dealloc,
2333
                                    struct ocfs2_path **empty_extent_path)
2334
{
2335
        int ret, subtree_root, deleted;
2336
        u32 right_cpos;
2337
        struct ocfs2_path *left_path = NULL;
2338
        struct ocfs2_path *right_path = NULL;
2339
 
2340
        BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
2341
 
2342
        *empty_extent_path = NULL;
2343
 
2344
        ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path,
2345
                                             &right_cpos);
2346
        if (ret) {
2347
                mlog_errno(ret);
2348
                goto out;
2349
        }
2350
 
2351
        left_path = ocfs2_new_path(path_root_bh(path),
2352
                                   path_root_el(path));
2353
        if (!left_path) {
2354
                ret = -ENOMEM;
2355
                mlog_errno(ret);
2356
                goto out;
2357
        }
2358
 
2359
        ocfs2_cp_path(left_path, path);
2360
 
2361
        right_path = ocfs2_new_path(path_root_bh(path),
2362
                                    path_root_el(path));
2363
        if (!right_path) {
2364
                ret = -ENOMEM;
2365
                mlog_errno(ret);
2366
                goto out;
2367
        }
2368
 
2369
        while (right_cpos) {
2370
                ret = ocfs2_find_path(inode, right_path, right_cpos);
2371
                if (ret) {
2372
                        mlog_errno(ret);
2373
                        goto out;
2374
                }
2375
 
2376
                subtree_root = ocfs2_find_subtree_root(inode, left_path,
2377
                                                       right_path);
2378
 
2379
                mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2380
                     subtree_root,
2381
                     (unsigned long long)
2382
                     right_path->p_node[subtree_root].bh->b_blocknr,
2383
                     right_path->p_tree_depth);
2384
 
2385
                ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
2386
                                                      orig_credits, left_path);
2387
                if (ret) {
2388
                        mlog_errno(ret);
2389
                        goto out;
2390
                }
2391
 
2392
                /*
2393
                 * Caller might still want to make changes to the
2394
                 * tree root, so re-add it to the journal here.
2395
                 */
2396
                ret = ocfs2_journal_access(handle, inode,
2397
                                           path_root_bh(left_path),
2398
                                           OCFS2_JOURNAL_ACCESS_WRITE);
2399
                if (ret) {
2400
                        mlog_errno(ret);
2401
                        goto out;
2402
                }
2403
 
2404
                ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
2405
                                                right_path, subtree_root,
2406
                                                dealloc, &deleted);
2407
                if (ret == -EAGAIN) {
2408
                        /*
2409
                         * The rotation has to temporarily stop due to
2410
                         * the right subtree having an empty
2411
                         * extent. Pass it back to the caller for a
2412
                         * fixup.
2413
                         */
2414
                        *empty_extent_path = right_path;
2415
                        right_path = NULL;
2416
                        goto out;
2417
                }
2418
                if (ret) {
2419
                        mlog_errno(ret);
2420
                        goto out;
2421
                }
2422
 
2423
                /*
2424
                 * The subtree rotate might have removed records on
2425
                 * the rightmost edge. If so, then rotation is
2426
                 * complete.
2427
                 */
2428
                if (deleted)
2429
                        break;
2430
 
2431
                ocfs2_mv_path(left_path, right_path);
2432
 
2433
                ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
2434
                                                     &right_cpos);
2435
                if (ret) {
2436
                        mlog_errno(ret);
2437
                        goto out;
2438
                }
2439
        }
2440
 
2441
out:
2442
        ocfs2_free_path(right_path);
2443
        ocfs2_free_path(left_path);
2444
 
2445
        return ret;
2446
}
2447
 
2448
static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
2449
                                       struct ocfs2_path *path,
2450
                                       struct ocfs2_cached_dealloc_ctxt *dealloc)
2451
{
2452
        int ret, subtree_index;
2453
        u32 cpos;
2454
        struct ocfs2_path *left_path = NULL;
2455
        struct ocfs2_dinode *di;
2456
        struct ocfs2_extent_block *eb;
2457
        struct ocfs2_extent_list *el;
2458
 
2459
        /*
2460
         * XXX: This code assumes that the root is an inode, which is
2461
         * true for now but may change as tree code gets generic.
2462
         */
2463
        di = (struct ocfs2_dinode *)path_root_bh(path)->b_data;
2464
        if (!OCFS2_IS_VALID_DINODE(di)) {
2465
                ret = -EIO;
2466
                ocfs2_error(inode->i_sb,
2467
                            "Inode %llu has invalid path root",
2468
                            (unsigned long long)OCFS2_I(inode)->ip_blkno);
2469
                goto out;
2470
        }
2471
 
2472
        /*
2473
         * There are two ways we handle this, depending on
2474
         * whether path is the only existing one.
2475
         */
2476
        ret = ocfs2_extend_rotate_transaction(handle, 0,
2477
                                              handle->h_buffer_credits,
2478
                                              path);
2479
        if (ret) {
2480
                mlog_errno(ret);
2481
                goto out;
2482
        }
2483
 
2484
        ret = ocfs2_journal_access_path(inode, handle, path);
2485
        if (ret) {
2486
                mlog_errno(ret);
2487
                goto out;
2488
        }
2489
 
2490
        ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
2491
        if (ret) {
2492
                mlog_errno(ret);
2493
                goto out;
2494
        }
2495
 
2496
        if (cpos) {
2497
                /*
2498
                 * We have a path to the left of this one - it needs
2499
                 * an update too.
2500
                 */
2501
                left_path = ocfs2_new_path(path_root_bh(path),
2502
                                           path_root_el(path));
2503
                if (!left_path) {
2504
                        ret = -ENOMEM;
2505
                        mlog_errno(ret);
2506
                        goto out;
2507
                }
2508
 
2509
                ret = ocfs2_find_path(inode, left_path, cpos);
2510
                if (ret) {
2511
                        mlog_errno(ret);
2512
                        goto out;
2513
                }
2514
 
2515
                ret = ocfs2_journal_access_path(inode, handle, left_path);
2516
                if (ret) {
2517
                        mlog_errno(ret);
2518
                        goto out;
2519
                }
2520
 
2521
                subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
2522
 
2523
                ocfs2_unlink_subtree(inode, handle, left_path, path,
2524
                                     subtree_index, dealloc);
2525
                ocfs2_update_edge_lengths(inode, handle, left_path);
2526
 
2527
                eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2528
                di->i_last_eb_blk = eb->h_blkno;
2529
        } else {
2530
                /*
2531
                 * 'path' is also the leftmost path which
2532
                 * means it must be the only one. This gets
2533
                 * handled differently because we want to
2534
                 * revert the inode back to having extents
2535
                 * in-line.
2536
                 */
2537
                ocfs2_unlink_path(inode, handle, dealloc, path, 1);
2538
 
2539
                el = &di->id2.i_list;
2540
                el->l_tree_depth = 0;
2541
                el->l_next_free_rec = 0;
2542
                memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2543
 
2544
                di->i_last_eb_blk = 0;
2545
        }
2546
 
2547
        ocfs2_journal_dirty(handle, path_root_bh(path));
2548
 
2549
out:
2550
        ocfs2_free_path(left_path);
2551
        return ret;
2552
}
2553
 
2554
/*
2555
 * Left rotation of btree records.
2556
 *
2557
 * In many ways, this is (unsurprisingly) the opposite of right
2558
 * rotation. We start at some non-rightmost path containing an empty
2559
 * extent in the leaf block. The code works its way to the rightmost
2560
 * path by rotating records to the left in every subtree.
2561
 *
2562
 * This is used by any code which reduces the number of extent records
2563
 * in a leaf. After removal, an empty record should be placed in the
2564
 * leftmost list position.
2565
 *
2566
 * This won't handle a length update of the rightmost path records if
2567
 * the rightmost tree leaf record is removed, so the caller is
2568
 * responsible for detecting and correcting that.
2569
 */
2570
static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
2571
                                  struct ocfs2_path *path,
2572
                                  struct ocfs2_cached_dealloc_ctxt *dealloc)
2573
{
2574
        int ret, orig_credits = handle->h_buffer_credits;
2575
        struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
2576
        struct ocfs2_extent_block *eb;
2577
        struct ocfs2_extent_list *el;
2578
 
2579
        el = path_leaf_el(path);
2580
        if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2581
                return 0;
2582
 
2583
        if (path->p_tree_depth == 0) {
2584
rightmost_no_delete:
2585
                /*
2586
                 * In-inode extents. This is trivially handled, so do
2587
                 * it up front.
2588
                 */
2589
                ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
2590
                                                       path_leaf_bh(path),
2591
                                                       path_leaf_el(path));
2592
                if (ret)
2593
                        mlog_errno(ret);
2594
                goto out;
2595
        }
2596
 
2597
        /*
2598
         * Handle the rightmost branch now. There are several cases:
2599
         *  1) simple rotation leaving records in there. That's trivial.
2600
         *  2) rotation requiring a branch delete - there's no more
2601
         *     records left. Two cases of this:
2602
         *     a) There are branches to the left.
2603
         *     b) This is also the leftmost (the only) branch.
2604
         *
2605
         *  1) is handled via ocfs2_rotate_rightmost_leaf_left()
2606
         *  2a) we need the left branch so that we can update it with the unlink
2607
         *  2b) we need to bring the inode back to inline extents.
2608
         */
2609
 
2610
        eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2611
        el = &eb->h_list;
2612
        if (eb->h_next_leaf_blk == 0) {
2613
                /*
2614
                 * This gets a bit tricky if we're going to delete the
2615
                 * rightmost path. Get the other cases out of the way
2616
                 * 1st.
2617
                 */
2618
                if (le16_to_cpu(el->l_next_free_rec) > 1)
2619
                        goto rightmost_no_delete;
2620
 
2621
                if (le16_to_cpu(el->l_next_free_rec) == 0) {
2622
                        ret = -EIO;
2623
                        ocfs2_error(inode->i_sb,
2624
                                    "Inode %llu has empty extent block at %llu",
2625
                                    (unsigned long long)OCFS2_I(inode)->ip_blkno,
2626
                                    (unsigned long long)le64_to_cpu(eb->h_blkno));
2627
                        goto out;
2628
                }
2629
 
2630
                /*
2631
                 * XXX: The caller cannot trust "path" any more after
2632
                 * this as it will have been deleted. What do we do?
2633
                 *
2634
                 * In theory the rotate-for-merge code will never get
2635
                 * here because it'll always ask for a rotate in a
2636
                 * nonempty list.
2637
                 */
2638
 
2639
                ret = ocfs2_remove_rightmost_path(inode, handle, path,
2640
                                                  dealloc);
2641
                if (ret)
2642
                        mlog_errno(ret);
2643
                goto out;
2644
        }
2645
 
2646
        /*
2647
         * Now we can loop, remembering the path we get from -EAGAIN
2648
         * and restarting from there.
2649
         */
2650
try_rotate:
2651
        ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path,
2652
                                       dealloc, &restart_path);
2653
        if (ret && ret != -EAGAIN) {
2654
                mlog_errno(ret);
2655
                goto out;
2656
        }
2657
 
2658
        while (ret == -EAGAIN) {
2659
                tmp_path = restart_path;
2660
                restart_path = NULL;
2661
 
2662
                ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits,
2663
                                               tmp_path, dealloc,
2664
                                               &restart_path);
2665
                if (ret && ret != -EAGAIN) {
2666
                        mlog_errno(ret);
2667
                        goto out;
2668
                }
2669
 
2670
                ocfs2_free_path(tmp_path);
2671
                tmp_path = NULL;
2672
 
2673
                if (ret == 0)
2674
                        goto try_rotate;
2675
        }
2676
 
2677
out:
2678
        ocfs2_free_path(tmp_path);
2679
        ocfs2_free_path(restart_path);
2680
        return ret;
2681
}
2682
 
2683
static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
2684
                                int index)
2685
{
2686
        struct ocfs2_extent_rec *rec = &el->l_recs[index];
2687
        unsigned int size;
2688
 
2689
        if (rec->e_leaf_clusters == 0) {
2690
                /*
2691
                 * We consumed all of the merged-from record. An empty
2692
                 * extent cannot exist anywhere but the 1st array
2693
                 * position, so move things over if the merged-from
2694
                 * record doesn't occupy that position.
2695
                 *
2696
                 * This creates a new empty extent so the caller
2697
                 * should be smart enough to have removed any existing
2698
                 * ones.
2699
                 */
2700
                if (index > 0) {
2701
                        BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
2702
                        size = index * sizeof(struct ocfs2_extent_rec);
2703
                        memmove(&el->l_recs[1], &el->l_recs[0], size);
2704
                }
2705
 
2706
                /*
2707
                 * Always memset - the caller doesn't check whether it
2708
                 * created an empty extent, so there could be junk in
2709
                 * the other fields.
2710
                 */
2711
                memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2712
        }
2713
}
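/*
 * For example (hypothetical layout): if the record at index 2 was
 * fully consumed by the merge, the memmove turns [A][B][ ][C] into
 * [A][A][B][C] and the memset then clears slot 0, leaving
 * [ ][A][B][C] - the single empty record ends up in position 0,
 * where the rest of the code expects to find it.
 */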
2714
 
2715
/*
2716
 * Remove split_rec clusters from the record at index and merge them
2717
 * onto the beginning of the record at index + 1.
2718
 */
2719
static int ocfs2_merge_rec_right(struct inode *inode, struct buffer_head *bh,
2720
                                handle_t *handle,
2721
                                struct ocfs2_extent_rec *split_rec,
2722
                                struct ocfs2_extent_list *el, int index)
2723
{
2724
        int ret;
2725
        unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
2726
        struct ocfs2_extent_rec *left_rec;
2727
        struct ocfs2_extent_rec *right_rec;
2728
 
2729
        BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
2730
 
2731
        left_rec = &el->l_recs[index];
2732
        right_rec = &el->l_recs[index + 1];
2733
 
2734
        ret = ocfs2_journal_access(handle, inode, bh,
2735
                                   OCFS2_JOURNAL_ACCESS_WRITE);
2736
        if (ret) {
2737
                mlog_errno(ret);
2738
                goto out;
2739
        }
2740
 
2741
        le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);
2742
 
2743
        le32_add_cpu(&right_rec->e_cpos, -split_clusters);
2744
        le64_add_cpu(&right_rec->e_blkno,
2745
                     -ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
2746
        le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
2747
 
2748
        ocfs2_cleanup_merge(el, index);
2749
 
2750
        ret = ocfs2_journal_dirty(handle, bh);
2751
        if (ret)
2752
                mlog_errno(ret);
2753
 
2754
out:
2755
        return ret;
2756
}
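/*
 * Worked example (hypothetical values): merging a 4-cluster split_rec
 * to the right when left_rec is (cpos 0, 12 clusters) and right_rec
 * is (cpos 12, 8 clusters) leaves left_rec with 8 clusters and turns
 * right_rec into (cpos 8, 12 clusters), with e_blkno pulled back by
 * ocfs2_clusters_to_blocks(sb, 4) so it still points at the first
 * block of the record's new range.
 */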
2757
 
2758
/*
2759
 * Remove split_rec clusters from the record at index and merge them
2760
 * onto the tail of the record at index - 1.
2761
 */
2762
static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh,
2763
                                handle_t *handle,
2764
                                struct ocfs2_extent_rec *split_rec,
2765
                                struct ocfs2_extent_list *el, int index)
2766
{
2767
        int ret, has_empty_extent = 0;
2768
        unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
2769
        struct ocfs2_extent_rec *left_rec;
2770
        struct ocfs2_extent_rec *right_rec;
2771
 
2772
        BUG_ON(index <= 0);
2773
 
2774
        left_rec = &el->l_recs[index - 1];
2775
        right_rec = &el->l_recs[index];
2776
        if (ocfs2_is_empty_extent(&el->l_recs[0]))
2777
                has_empty_extent = 1;
2778
 
2779
        ret = ocfs2_journal_access(handle, inode, bh,
2780
                                   OCFS2_JOURNAL_ACCESS_WRITE);
2781
        if (ret) {
2782
                mlog_errno(ret);
2783
                goto out;
2784
        }
2785
 
2786
        if (has_empty_extent && index == 1) {
2787
                /*
2788
                 * The easy case - we can just plop the record right in.
2789
                 */
2790
                *left_rec = *split_rec;
2791
 
2792
                has_empty_extent = 0;
2793
        } else {
2794
                le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
2795
        }
2796
 
2797
        le32_add_cpu(&right_rec->e_cpos, split_clusters);
2798
        le64_add_cpu(&right_rec->e_blkno,
2799
                     ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
2800
        le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
2801
 
2802
        ocfs2_cleanup_merge(el, index);
2803
 
2804
        ret = ocfs2_journal_dirty(handle, bh);
2805
        if (ret)
2806
                mlog_errno(ret);
2807
 
2808
out:
2809
        return ret;
2810
}
2811
 
2812
static int ocfs2_try_to_merge_extent(struct inode *inode,
2813
                                     handle_t *handle,
2814
                                     struct ocfs2_path *left_path,
2815
                                     int split_index,
2816
                                     struct ocfs2_extent_rec *split_rec,
2817
                                     struct ocfs2_cached_dealloc_ctxt *dealloc,
2818
                                     struct ocfs2_merge_ctxt *ctxt)
2819
 
2820
{
2821
        int ret = 0;
2822
        struct ocfs2_extent_list *el = path_leaf_el(left_path);
2823
        struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
2824
 
2825
        BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
2826
 
2827
        if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
2828
                /*
2829
                 * The merge code will need to create an empty
2830
                 * extent to take the place of the newly
2831
                 * emptied slot. Remove any pre-existing empty
2832
                 * extents - having more than one in a leaf is
2833
                 * illegal.
2834
                 */
2835
                ret = ocfs2_rotate_tree_left(inode, handle, left_path,
2836
                                             dealloc);
2837
                if (ret) {
2838
                        mlog_errno(ret);
2839
                        goto out;
2840
                }
2841
                split_index--;
2842
                rec = &el->l_recs[split_index];
2843
        }
2844
 
2845
        if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
2846
                /*
2847
                 * Left-right contig implies this.
2848
                 */
2849
                BUG_ON(!ctxt->c_split_covers_rec);
2850
                BUG_ON(split_index == 0);
2851
 
2852
                /*
2853
                 * Since the leftright insert always covers the entire
2854
                 * extent, this call will delete the insert record
2855
                 * entirely, resulting in an empty extent record added to
2856
                 * the extent block.
2857
                 *
2858
                 * Since the adding of an empty extent shifts
2859
                 * everything back to the right, there's no need to
2860
                 * update split_index here.
2861
                 */
2862
                ret = ocfs2_merge_rec_left(inode, path_leaf_bh(left_path),
2863
                                           handle, split_rec, el, split_index);
2864
                if (ret) {
2865
                        mlog_errno(ret);
2866
                        goto out;
2867
                }
2868
 
2869
                /*
2870
                 * We can only get this from a logic error above.
2871
                 */
2872
                BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
2873
 
2874
                /*
2875
                 * The left merge left us with an empty extent, remove
2876
                 * it.
2877
                 */
2878
                ret = ocfs2_rotate_tree_left(inode, handle, left_path, dealloc);
2879
                if (ret) {
2880
                        mlog_errno(ret);
2881
                        goto out;
2882
                }
2883
                split_index--;
2884
                rec = &el->l_recs[split_index];
2885
 
2886
                /*
2887
                 * Note that we don't pass split_rec here on purpose -
2888
                 * we've merged it into the left side.
2889
                 */
2890
                ret = ocfs2_merge_rec_right(inode, path_leaf_bh(left_path),
2891
                                            handle, rec, el, split_index);
2892
                if (ret) {
2893
                        mlog_errno(ret);
2894
                        goto out;
2895
                }
2896
 
2897
                BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
2898
 
2899
                ret = ocfs2_rotate_tree_left(inode, handle, left_path,
2900
                                             dealloc);
2901
                /*
2902
                 * Error from this last rotate is not critical, so
2903
                 * print but don't bubble it up.
2904
                 */
2905
                if (ret)
2906
                        mlog_errno(ret);
2907
                ret = 0;
2908
        } else {
2909
                /*
2910
                 * Merge a record to the left or right.
2911
                 *
2912
                 * 'contig_type' is relative to the existing record,
2913
                 * so for example, if we're "right contig", it's to
2914
                 * the record on the left (hence the left merge).
2915
                 */
2916
                if (ctxt->c_contig_type == CONTIG_RIGHT) {
2917
                        ret = ocfs2_merge_rec_left(inode,
2918
                                                   path_leaf_bh(left_path),
2919
                                                   handle, split_rec, el,
2920
                                                   split_index);
2921
                        if (ret) {
2922
                                mlog_errno(ret);
2923
                                goto out;
2924
                        }
2925
                } else {
2926
                        ret = ocfs2_merge_rec_right(inode,
2927
                                                    path_leaf_bh(left_path),
2928
                                                    handle, split_rec, el,
2929
                                                    split_index);
2930
                        if (ret) {
2931
                                mlog_errno(ret);
2932
                                goto out;
2933
                        }
2934
                }
2935
 
2936
                if (ctxt->c_split_covers_rec) {
2937
                        /*
2938
                         * The merge may have left an empty extent in
2939
                         * our leaf. Try to rotate it away.
2940
                         */
2941
                        ret = ocfs2_rotate_tree_left(inode, handle, left_path,
2942
                                                     dealloc);
2943
                        if (ret)
2944
                                mlog_errno(ret);
2945
                        ret = 0;
2946
                }
2947
        }
2948
 
2949
out:
2950
        return ret;
2951
}
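/*
 * Rough example of the CONTIG_LEFTRIGHT branch above, with invented
 * numbers: records [100, 110), [110, 120), [120, 140) where the
 * middle record is entirely covered by split_rec and contiguous with
 * both neighbors. The left merge grows the first record to
 * [100, 120), the emptied slot is rotated away, and the right merge
 * then collapses everything into a single [100, 140) record,
 * followed by one more best-effort rotate to clear the empty slot
 * that the second merge leaves behind.
 */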
2952
 
2953
static void ocfs2_subtract_from_rec(struct super_block *sb,
2954
                                    enum ocfs2_split_type split,
2955
                                    struct ocfs2_extent_rec *rec,
2956
                                    struct ocfs2_extent_rec *split_rec)
2957
{
2958
        u64 len_blocks;
2959
 
2960
        len_blocks = ocfs2_clusters_to_blocks(sb,
2961
                                le16_to_cpu(split_rec->e_leaf_clusters));
2962
 
2963
        if (split == SPLIT_LEFT) {
2964
                /*
2965
                 * Region is on the left edge of the existing
2966
                 * record.
2967
                 */
2968
                le32_add_cpu(&rec->e_cpos,
2969
                             le16_to_cpu(split_rec->e_leaf_clusters));
2970
                le64_add_cpu(&rec->e_blkno, len_blocks);
2971
                le16_add_cpu(&rec->e_leaf_clusters,
2972
                             -le16_to_cpu(split_rec->e_leaf_clusters));
2973
        } else {
2974
                /*
2975
                 * Region is on the right edge of the existing
2976
                 * record.
2977
                 */
2978
                le16_add_cpu(&rec->e_leaf_clusters,
2979
                             -le16_to_cpu(split_rec->e_leaf_clusters));
2980
        }
2981
}
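/*
 * Example of the adjustment above with made-up values: for a record
 * covering [50, 62) and a split_rec of 3 clusters, SPLIT_LEFT trims
 * the front so the record becomes [53, 62) with e_blkno advanced by
 * 3 clusters of blocks, while SPLIT_RIGHT trims the tail so the
 * record becomes [50, 59) and e_blkno is left alone. Either way the
 * clusters described by split_rec are no longer claimed by rec.
 */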
2982
 
2983
/*
2984
 * Do the final bits of extent record insertion at the target leaf
2985
 * list. If this leaf is part of an allocation tree, it is assumed
2986
 * that the tree above has been prepared.
2987
 */
2988
static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
2989
                                 struct ocfs2_extent_list *el,
2990
                                 struct ocfs2_insert_type *insert,
2991
                                 struct inode *inode)
2992
{
2993
        int i = insert->ins_contig_index;
2994
        unsigned int range;
2995
        struct ocfs2_extent_rec *rec;
2996
 
2997
        BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
2998
 
2999
        if (insert->ins_split != SPLIT_NONE) {
3000
                i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
3001
                BUG_ON(i == -1);
3002
                rec = &el->l_recs[i];
3003
                ocfs2_subtract_from_rec(inode->i_sb, insert->ins_split, rec,
3004
                                        insert_rec);
3005
                goto rotate;
3006
        }
3007
 
3008
        /*
3009
         * Contiguous insert - either left or right.
3010
         */
3011
        if (insert->ins_contig != CONTIG_NONE) {
3012
                rec = &el->l_recs[i];
3013
                if (insert->ins_contig == CONTIG_LEFT) {
3014
                        rec->e_blkno = insert_rec->e_blkno;
3015
                        rec->e_cpos = insert_rec->e_cpos;
3016
                }
3017
                le16_add_cpu(&rec->e_leaf_clusters,
3018
                             le16_to_cpu(insert_rec->e_leaf_clusters));
3019
                return;
3020
        }
3021
 
3022
        /*
3023
         * Handle insert into an empty leaf.
3024
         */
3025
        if (le16_to_cpu(el->l_next_free_rec) == 0 ||
3026
            ((le16_to_cpu(el->l_next_free_rec) == 1) &&
3027
             ocfs2_is_empty_extent(&el->l_recs[0]))) {
3028
                el->l_recs[0] = *insert_rec;
3029
                el->l_next_free_rec = cpu_to_le16(1);
3030
                return;
3031
        }
3032
 
3033
        /*
3034
         * Appending insert.
3035
         */
3036
        if (insert->ins_appending == APPEND_TAIL) {
3037
                i = le16_to_cpu(el->l_next_free_rec) - 1;
3038
                rec = &el->l_recs[i];
3039
                range = le32_to_cpu(rec->e_cpos)
3040
                        + le16_to_cpu(rec->e_leaf_clusters);
3041
                BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
3042
 
3043
                mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
3044
                                le16_to_cpu(el->l_count),
3045
                                "inode %lu, depth %u, count %u, next free %u, "
3046
                                "rec.cpos %u, rec.clusters %u, "
3047
                                "insert.cpos %u, insert.clusters %u\n",
3048
                                inode->i_ino,
3049
                                le16_to_cpu(el->l_tree_depth),
3050
                                le16_to_cpu(el->l_count),
3051
                                le16_to_cpu(el->l_next_free_rec),
3052
                                le32_to_cpu(el->l_recs[i].e_cpos),
3053
                                le16_to_cpu(el->l_recs[i].e_leaf_clusters),
3054
                                le32_to_cpu(insert_rec->e_cpos),
3055
                                le16_to_cpu(insert_rec->e_leaf_clusters));
3056
                i++;
3057
                el->l_recs[i] = *insert_rec;
3058
                le16_add_cpu(&el->l_next_free_rec, 1);
3059
                return;
3060
        }
3061
 
3062
rotate:
3063
        /*
3064
         * Ok, we have to rotate.
3065
         *
3066
         * At this point, it is safe to assume that inserting into an
3067
         * empty leaf and appending to a leaf have both been handled
3068
         * above.
3069
         *
3070
         * This leaf needs to have space, either by the empty 1st
3071
         * extent record, or by virtue of l_next_free_rec < l_count.
3072
         */
3073
        ocfs2_rotate_leaf(el, insert_rec);
3074
}
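/*
 * Small example of the contiguous case above (hypothetical numbers):
 * with an existing leaf record covering [110, 130) and an insert_rec
 * for [100, 110) that is physically adjacent, CONTIG_LEFT rewrites
 * the existing record's e_cpos and e_blkno to those of insert_rec
 * and bumps e_leaf_clusters to 30, so the leaf ends up with one
 * record covering [100, 130) instead of two.
 */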
3075
 
3076
static inline void ocfs2_update_dinode_clusters(struct inode *inode,
3077
                                                struct ocfs2_dinode *di,
3078
                                                u32 clusters)
3079
{
3080
        le32_add_cpu(&di->i_clusters, clusters);
3081
        spin_lock(&OCFS2_I(inode)->ip_lock);
3082
        OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
3083
        spin_unlock(&OCFS2_I(inode)->ip_lock);
3084
}
3085
 
3086
static void ocfs2_adjust_rightmost_records(struct inode *inode,
3087
                                           handle_t *handle,
3088
                                           struct ocfs2_path *path,
3089
                                           struct ocfs2_extent_rec *insert_rec)
3090
{
3091
        int ret, i, next_free;
3092
        struct buffer_head *bh;
3093
        struct ocfs2_extent_list *el;
3094
        struct ocfs2_extent_rec *rec;
3095
 
3096
        /*
3097
         * Update everything except the leaf block.
3098
         */
3099
        for (i = 0; i < path->p_tree_depth; i++) {
3100
                bh = path->p_node[i].bh;
3101
                el = path->p_node[i].el;
3102
 
3103
                next_free = le16_to_cpu(el->l_next_free_rec);
3104
                if (next_free == 0) {
3105
                        ocfs2_error(inode->i_sb,
3106
                                    "Dinode %llu has a bad extent list",
3107
                                    (unsigned long long)OCFS2_I(inode)->ip_blkno);
3108
                        ret = -EIO;
3109
                        return;
3110
                }
3111
 
3112
                rec = &el->l_recs[next_free - 1];
3113
 
3114
                rec->e_int_clusters = insert_rec->e_cpos;
3115
                le32_add_cpu(&rec->e_int_clusters,
3116
                             le16_to_cpu(insert_rec->e_leaf_clusters));
3117
                le32_add_cpu(&rec->e_int_clusters,
3118
                             -le32_to_cpu(rec->e_cpos));
3119
 
3120
                ret = ocfs2_journal_dirty(handle, bh);
3121
                if (ret)
3122
                        mlog_errno(ret);
3123
 
3124
        }
3125
}
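/*
 * The arithmetic above amounts to stretching the rightmost record in
 * each interior node so that it ends where the new insert ends:
 * e_int_clusters becomes (insert_rec->e_cpos +
 * insert_rec->e_leaf_clusters) - rec->e_cpos. For example (made-up
 * numbers), an interior record starting at cpos 0 that used to cover
 * 500 clusters grows to cover 508 when an 8-cluster extent is
 * appended at cpos 500.
 */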
3126
 
3127
static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
3128
                                    struct ocfs2_extent_rec *insert_rec,
3129
                                    struct ocfs2_path *right_path,
3130
                                    struct ocfs2_path **ret_left_path)
3131
{
3132
        int ret, next_free;
3133
        struct ocfs2_extent_list *el;
3134
        struct ocfs2_path *left_path = NULL;
3135
 
3136
        *ret_left_path = NULL;
3137
 
3138
        /*
3139
         * This shouldn't happen for non-trees. The extent rec cluster
3140
         * count manipulation below only works for interior nodes.
3141
         */
3142
        BUG_ON(right_path->p_tree_depth == 0);
3143
 
3144
        /*
3145
         * If our appending insert is at the leftmost edge of a leaf,
3146
         * then we might need to update the rightmost records of the
3147
         * neighboring path.
3148
         */
3149
        el = path_leaf_el(right_path);
3150
        next_free = le16_to_cpu(el->l_next_free_rec);
3151
        if (next_free == 0 ||
3152
            (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
3153
                u32 left_cpos;
3154
 
3155
                ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
3156
                                                    &left_cpos);
3157
                if (ret) {
3158
                        mlog_errno(ret);
3159
                        goto out;
3160
                }
3161
 
3162
                mlog(0, "Append may need a left path update. cpos: %u, "
3163
                     "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
3164
                     left_cpos);
3165
 
3166
                /*
3167
                 * No need to worry if the append is already in the
3168
                 * leftmost leaf.
3169
                 */
3170
                if (left_cpos) {
3171
                        left_path = ocfs2_new_path(path_root_bh(right_path),
3172
                                                   path_root_el(right_path));
3173
                        if (!left_path) {
3174
                                ret = -ENOMEM;
3175
                                mlog_errno(ret);
3176
                                goto out;
3177
                        }
3178
 
3179
                        ret = ocfs2_find_path(inode, left_path, left_cpos);
3180
                        if (ret) {
3181
                                mlog_errno(ret);
3182
                                goto out;
3183
                        }
3184
 
3185
                        /*
3186
                         * ocfs2_insert_path() will pass the left_path to the
3187
                         * journal for us.
3188
                         */
3189
                }
3190
        }
3191
 
3192
        ret = ocfs2_journal_access_path(inode, handle, right_path);
3193
        if (ret) {
3194
                mlog_errno(ret);
3195
                goto out;
3196
        }
3197
 
3198
        ocfs2_adjust_rightmost_records(inode, handle, right_path, insert_rec);
3199
 
3200
        *ret_left_path = left_path;
3201
        ret = 0;
3202
out:
3203
        if (ret != 0)
3204
                ocfs2_free_path(left_path);
3205
 
3206
        return ret;
3207
}
3208
 
3209
static void ocfs2_split_record(struct inode *inode,
3210
                               struct ocfs2_path *left_path,
3211
                               struct ocfs2_path *right_path,
3212
                               struct ocfs2_extent_rec *split_rec,
3213
                               enum ocfs2_split_type split)
3214
{
3215
        int index;
3216
        u32 cpos = le32_to_cpu(split_rec->e_cpos);
3217
        struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
3218
        struct ocfs2_extent_rec *rec, *tmprec;
3219
 
3220
        right_el = path_leaf_el(right_path);
3221
        if (left_path)
3222
                left_el = path_leaf_el(left_path);
3223
 
3224
        el = right_el;
3225
        insert_el = right_el;
3226
        index = ocfs2_search_extent_list(el, cpos);
3227
        if (index != -1) {
3228
                if (index == 0 && left_path) {
3229
                        BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
3230
 
3231
                        /*
3232
                         * This typically means that the record
3233
                         * started in the left path but moved to the
3234
                         * right as a result of rotation. We either
3235
                         * move the existing record to the left, or we
3236
                         * do the later insert there.
3237
                         *
3238
                         * In this case, the left path should always
3239
                         * exist as the rotate code will have passed
3240
                         * it back for a post-insert update.
3241
                         */
3242
 
3243
                        if (split == SPLIT_LEFT) {
3244
                                /*
3245
                                 * It's a left split. Since we know
3246
                                 * that the rotate code gave us an
3247
                                 * empty extent in the left path, we
3248
                                 * can just do the insert there.
3249
                                 */
3250
                                insert_el = left_el;
3251
                        } else {
3252
                                /*
3253
                                 * Right split - we have to move the
3254
                                 * existing record over to the left
3255
                                 * leaf. The insert will be into the
3256
                                 * newly created empty extent in the
3257
                                 * right leaf.
3258
                                 */
3259
                                tmprec = &right_el->l_recs[index];
3260
                                ocfs2_rotate_leaf(left_el, tmprec);
3261
                                el = left_el;
3262
 
3263
                                memset(tmprec, 0, sizeof(*tmprec));
3264
                                index = ocfs2_search_extent_list(left_el, cpos);
3265
                                BUG_ON(index == -1);
3266
                        }
3267
                }
3268
        } else {
3269
                BUG_ON(!left_path);
3270
                BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
3271
                /*
3272
                 * Left path is easy - we can just allow the insert to
3273
                 * happen.
3274
                 */
3275
                el = left_el;
3276
                insert_el = left_el;
3277
                index = ocfs2_search_extent_list(el, cpos);
3278
                BUG_ON(index == -1);
3279
        }
3280
 
3281
        rec = &el->l_recs[index];
3282
        ocfs2_subtract_from_rec(inode->i_sb, split, rec, split_rec);
3283
        ocfs2_rotate_leaf(insert_el, split_rec);
3284
}
3285
 
3286
/*
3287
 * This function only does inserts on an allocation b-tree. For dinode
3288
 * lists, ocfs2_insert_at_leaf() is called directly.
3289
 *
3290
 * right_path is the path we want to do the actual insert
3291
 * in. left_path should only be passed in if we need to update that
3292
 * portion of the tree after an edge insert.
3293
 */
3294
static int ocfs2_insert_path(struct inode *inode,
3295
                             handle_t *handle,
3296
                             struct ocfs2_path *left_path,
3297
                             struct ocfs2_path *right_path,
3298
                             struct ocfs2_extent_rec *insert_rec,
3299
                             struct ocfs2_insert_type *insert)
3300
{
3301
        int ret, subtree_index;
3302
        struct buffer_head *leaf_bh = path_leaf_bh(right_path);
3303
 
3304
        if (left_path) {
3305
                int credits = handle->h_buffer_credits;
3306
 
3307
                /*
3308
                 * There's a chance that left_path got passed back to
3309
                 * us without being accounted for in the
3310
                 * journal. Extend our transaction here to be sure we
3311
                 * can change those blocks.
3312
                 */
3313
                credits += left_path->p_tree_depth;
3314
 
3315
                ret = ocfs2_extend_trans(handle, credits);
3316
                if (ret < 0) {
3317
                        mlog_errno(ret);
3318
                        goto out;
3319
                }
3320
 
3321
                ret = ocfs2_journal_access_path(inode, handle, left_path);
3322
                if (ret < 0) {
3323
                        mlog_errno(ret);
3324
                        goto out;
3325
                }
3326
        }
3327
 
3328
        /*
3329
         * Pass both paths to the journal. The majority of inserts
3330
         * will be touching all components anyway.
3331
         */
3332
        ret = ocfs2_journal_access_path(inode, handle, right_path);
3333
        if (ret < 0) {
3334
                mlog_errno(ret);
3335
                goto out;
3336
        }
3337
 
3338
        if (insert->ins_split != SPLIT_NONE) {
3339
                /*
3340
                 * We could call ocfs2_insert_at_leaf() for some types
3341
                 * of splits, but it's easier to just let one separate
3342
                 * function sort it all out.
3343
                 */
3344
                ocfs2_split_record(inode, left_path, right_path,
3345
                                   insert_rec, insert->ins_split);
3346
 
3347
                /*
3348
                 * Split might have modified either leaf and we don't
3349
                 * have a guarantee that the later edge insert will
3350
                 * dirty this for us.
3351
                 */
3352
                if (left_path) {
3353
                        ret = ocfs2_journal_dirty(handle,
3354
                                                  path_leaf_bh(left_path));
3355
                        if (ret)
3356
                                mlog_errno(ret);
                }
3357
        } else
3358
                ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path),
3359
                                     insert, inode);
3360
 
3361
        ret = ocfs2_journal_dirty(handle, leaf_bh);
3362
        if (ret)
3363
                mlog_errno(ret);
3364
 
3365
        if (left_path) {
3366
                /*
3367
                 * The rotate code has indicated that we need to fix
3368
                 * up portions of the tree after the insert.
3369
                 *
3370
                 * XXX: Should we extend the transaction here?
3371
                 */
3372
                subtree_index = ocfs2_find_subtree_root(inode, left_path,
3373
                                                        right_path);
3374
                ocfs2_complete_edge_insert(inode, handle, left_path,
3375
                                           right_path, subtree_index);
3376
        }
3377
 
3378
        ret = 0;
3379
out:
3380
        return ret;
3381
}
3382
 
3383
static int ocfs2_do_insert_extent(struct inode *inode,
3384
                                  handle_t *handle,
3385
                                  struct buffer_head *di_bh,
3386
                                  struct ocfs2_extent_rec *insert_rec,
3387
                                  struct ocfs2_insert_type *type)
3388
{
3389
        int ret, rotate = 0;
3390
        u32 cpos;
3391
        struct ocfs2_path *right_path = NULL;
3392
        struct ocfs2_path *left_path = NULL;
3393
        struct ocfs2_dinode *di;
3394
        struct ocfs2_extent_list *el;
3395
 
3396
        di = (struct ocfs2_dinode *) di_bh->b_data;
3397
        el = &di->id2.i_list;
3398
 
3399
        ret = ocfs2_journal_access(handle, inode, di_bh,
3400
                                   OCFS2_JOURNAL_ACCESS_WRITE);
3401
        if (ret) {
3402
                mlog_errno(ret);
3403
                goto out;
3404
        }
3405
 
3406
        if (le16_to_cpu(el->l_tree_depth) == 0) {
3407
                ocfs2_insert_at_leaf(insert_rec, el, type, inode);
3408
                goto out_update_clusters;
3409
        }
3410
 
3411
        right_path = ocfs2_new_inode_path(di_bh);
3412
        if (!right_path) {
3413
                ret = -ENOMEM;
3414
                mlog_errno(ret);
3415
                goto out;
3416
        }
3417
 
3418
        /*
3419
         * Determine the path to start with. Rotations need the
3420
         * rightmost path, everything else can go directly to the
3421
         * target leaf.
3422
         */
3423
        cpos = le32_to_cpu(insert_rec->e_cpos);
3424
        if (type->ins_appending == APPEND_NONE &&
3425
            type->ins_contig == CONTIG_NONE) {
3426
                rotate = 1;
3427
                cpos = UINT_MAX;
3428
        }
3429
 
3430
        ret = ocfs2_find_path(inode, right_path, cpos);
3431
        if (ret) {
3432
                mlog_errno(ret);
3433
                goto out;
3434
        }
3435
 
3436
        /*
3437
         * Rotations and appends need special treatment - they modify
3438
         * parts of the tree above them.
3439
         *
3440
         * Both might pass back a path immediately to the left of the
3441
         * one being inserted to. This will cause
3442
         * ocfs2_insert_path() to modify the rightmost records of
3443
         * left_path to account for an edge insert.
3444
         *
3445
         * XXX: When modifying this code, keep in mind that an insert
3446
         * can wind up skipping both of these two special cases...
3447
         */
3448
        if (rotate) {
3449
                ret = ocfs2_rotate_tree_right(inode, handle, type->ins_split,
3450
                                              le32_to_cpu(insert_rec->e_cpos),
3451
                                              right_path, &left_path);
3452
                if (ret) {
3453
                        mlog_errno(ret);
3454
                        goto out;
3455
                }
3456
 
3457
                /*
3458
                 * ocfs2_rotate_tree_right() might have extended the
3459
                 * transaction without re-journaling our tree root.
3460
                 */
3461
                ret = ocfs2_journal_access(handle, inode, di_bh,
3462
                                           OCFS2_JOURNAL_ACCESS_WRITE);
3463
                if (ret) {
3464
                        mlog_errno(ret);
3465
                        goto out;
3466
                }
3467
        } else if (type->ins_appending == APPEND_TAIL
3468
                   && type->ins_contig != CONTIG_LEFT) {
3469
                ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
3470
                                               right_path, &left_path);
3471
                if (ret) {
3472
                        mlog_errno(ret);
3473
                        goto out;
3474
                }
3475
        }
3476
 
3477
        ret = ocfs2_insert_path(inode, handle, left_path, right_path,
3478
                                insert_rec, type);
3479
        if (ret) {
3480
                mlog_errno(ret);
3481
                goto out;
3482
        }
3483
 
3484
out_update_clusters:
3485
        if (type->ins_split == SPLIT_NONE)
3486
                ocfs2_update_dinode_clusters(inode, di,
3487
                                             le16_to_cpu(insert_rec->e_leaf_clusters));
3488
 
3489
        ret = ocfs2_journal_dirty(handle, di_bh);
3490
        if (ret)
3491
                mlog_errno(ret);
3492
 
3493
out:
3494
        ocfs2_free_path(left_path);
3495
        ocfs2_free_path(right_path);
3496
 
3497
        return ret;
3498
}
3499
 
3500
static enum ocfs2_contig_type
3501
ocfs2_figure_merge_contig_type(struct inode *inode,
3502
                               struct ocfs2_extent_list *el, int index,
3503
                               struct ocfs2_extent_rec *split_rec)
3504
{
3505
        struct ocfs2_extent_rec *rec;
3506
        enum ocfs2_contig_type ret = CONTIG_NONE;
3507
 
3508
        /*
3509
         * We're careful to check for an empty extent record here -
3510
         * the merge code will know what to do if it sees one.
3511
         */
3512
 
3513
        if (index > 0) {
3514
                rec = &el->l_recs[index - 1];
3515
                if (index == 1 && ocfs2_is_empty_extent(rec)) {
3516
                        if (split_rec->e_cpos == el->l_recs[index].e_cpos)
3517
                                ret = CONTIG_RIGHT;
3518
                } else {
3519
                        ret = ocfs2_extent_contig(inode, rec, split_rec);
3520
                }
3521
        }
3522
 
3523
        if (index < (le16_to_cpu(el->l_next_free_rec) - 1)) {
3524
                enum ocfs2_contig_type contig_type;
3525
 
3526
                rec = &el->l_recs[index + 1];
3527
                contig_type = ocfs2_extent_contig(inode, rec, split_rec);
3528
 
3529
                if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
3530
                        ret = CONTIG_LEFTRIGHT;
3531
                else if (ret == CONTIG_NONE)
3532
                        ret = contig_type;
3533
        }
3534
 
3535
        return ret;
3536
}
3537
 
3538
static void ocfs2_figure_contig_type(struct inode *inode,
3539
                                     struct ocfs2_insert_type *insert,
3540
                                     struct ocfs2_extent_list *el,
3541
                                     struct ocfs2_extent_rec *insert_rec)
3542
{
3543
        int i;
3544
        enum ocfs2_contig_type contig_type = CONTIG_NONE;
3545
 
3546
        BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
3547
 
3548
        for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
3549
                contig_type = ocfs2_extent_contig(inode, &el->l_recs[i],
3550
                                                  insert_rec);
3551
                if (contig_type != CONTIG_NONE) {
3552
                        insert->ins_contig_index = i;
3553
                        break;
3554
                }
3555
        }
3556
        insert->ins_contig = contig_type;
3557
}
3558
 
3559
/*
3560
 * This should only be called against the rightmost leaf extent list.
3561
 *
3562
 * ocfs2_figure_appending_type() will figure out whether we'll have to
3563
 * insert at the tail of the rightmost leaf.
3564
 *
3565
 * This should also work against the dinode list for trees with 0
3566
 * depth. If we consider the dinode list to be the rightmost leaf node
3567
 * then the logic here makes sense.
3568
 */
3569
static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
3570
                                        struct ocfs2_extent_list *el,
3571
                                        struct ocfs2_extent_rec *insert_rec)
3572
{
3573
        int i;
3574
        u32 cpos = le32_to_cpu(insert_rec->e_cpos);
3575
        struct ocfs2_extent_rec *rec;
3576
 
3577
        insert->ins_appending = APPEND_NONE;
3578
 
3579
        BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
3580
 
3581
        if (!el->l_next_free_rec)
3582
                goto set_tail_append;
3583
 
3584
        if (ocfs2_is_empty_extent(&el->l_recs[0])) {
3585
                /* Were all records empty? */
3586
                if (le16_to_cpu(el->l_next_free_rec) == 1)
3587
                        goto set_tail_append;
3588
        }
3589
 
3590
        i = le16_to_cpu(el->l_next_free_rec) - 1;
3591
        rec = &el->l_recs[i];
3592
 
3593
        if (cpos >=
3594
            (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
3595
                goto set_tail_append;
3596
 
3597
        return;
3598
 
3599
set_tail_append:
3600
        insert->ins_appending = APPEND_TAIL;
3601
}
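/*
 * A quick example of the check above (illustrative numbers only): if
 * the rightmost used record covers [200, 216), an insert at cpos 216
 * or beyond is flagged APPEND_TAIL, while an insert whose cpos falls
 * before that record's end leaves ins_appending as APPEND_NONE.
 */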
3602
 
3603
/*
3604
 * Helper function called at the beginning of an insert.
3605
 *
3606
 * This computes a few things that are commonly used in the process of
3607
 * inserting into the btree:
3608
 *   - Whether the new extent is contiguous with an existing one.
3609
 *   - The current tree depth.
3610
 *   - Whether the insert is an appending one.
3611
 *   - The total # of free records in the tree.
3612
 *
3613
 * All of the information is stored on the ocfs2_insert_type
3614
 * structure.
3615
 */
3616
static int ocfs2_figure_insert_type(struct inode *inode,
3617
                                    struct buffer_head *di_bh,
3618
                                    struct buffer_head **last_eb_bh,
3619
                                    struct ocfs2_extent_rec *insert_rec,
3620
                                    int *free_records,
3621
                                    struct ocfs2_insert_type *insert)
3622
{
3623
        int ret;
3624
        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
3625
        struct ocfs2_extent_block *eb;
3626
        struct ocfs2_extent_list *el;
3627
        struct ocfs2_path *path = NULL;
3628
        struct buffer_head *bh = NULL;
3629
 
3630
        insert->ins_split = SPLIT_NONE;
3631
 
3632
        el = &di->id2.i_list;
3633
        insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
3634
 
3635
        if (el->l_tree_depth) {
3636
                /*
3637
                 * If we have tree depth, we read in the
3638
                 * rightmost extent block ahead of time as
3639
                 * ocfs2_figure_insert_type() and ocfs2_add_branch()
3640
                 * may want it later.
3641
                 */
3642
                ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
3643
                                       le64_to_cpu(di->i_last_eb_blk), &bh,
3644
                                       OCFS2_BH_CACHED, inode);
3645
                if (ret) {
3646
                        mlog_exit(ret);
3647
                        goto out;
3648
                }
3649
                eb = (struct ocfs2_extent_block *) bh->b_data;
3650
                el = &eb->h_list;
3651
        }
3652
 
3653
        /*
3654
         * Unless we have a contiguous insert, we'll need to know if
3655
         * there is room left in our allocation tree for another
3656
         * extent record.
3657
         *
3658
         * XXX: This test is simplistic; we could also search for empty
3659
         * extent records.
3660
         */
3661
        *free_records = le16_to_cpu(el->l_count) -
3662
                le16_to_cpu(el->l_next_free_rec);
3663
 
3664
        if (!insert->ins_tree_depth) {
3665
                ocfs2_figure_contig_type(inode, insert, el, insert_rec);
3666
                ocfs2_figure_appending_type(insert, el, insert_rec);
3667
                return 0;
3668
        }
3669
 
3670
        path = ocfs2_new_inode_path(di_bh);
3671
        if (!path) {
3672
                ret = -ENOMEM;
3673
                mlog_errno(ret);
3674
                goto out;
3675
        }
3676
 
3677
        /*
3678
         * In the case that we're inserting past what the tree
3679
         * currently accounts for, ocfs2_find_path() will return for
3680
         * us the rightmost tree path. This is accounted for below in
3681
         * the appending code.
3682
         */
3683
        ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos));
3684
        if (ret) {
3685
                mlog_errno(ret);
3686
                goto out;
3687
        }
3688
 
3689
        el = path_leaf_el(path);
3690
 
3691
        /*
3692
         * Now that we have the path, there are two things we want to determine:
3693
         * 1) Contiguousness (also set contig_index if this is so)
3694
         *
3695
         * 2) Are we doing an append? We can trivially break this up
3696
         *     into two types of appends: simple record append, or a
3697
         *     rotate inside the tail leaf.
3698
         */
3699
        ocfs2_figure_contig_type(inode, insert, el, insert_rec);
3700
 
3701
        /*
3702
         * The insert code isn't quite ready to deal with all cases of
3703
         * left contiguousness. Specifically, if it's an insert into
3704
         * the 1st record in a leaf, it will require the adjustment of
3705
         * cluster count on the last record of the path directly to its
3706
         * left. For now, just catch that case and fool the layers
3707
         * above us. This works just fine for tree_depth == 0, which
3708
         * is why we allow that above.
3709
         */
3710
        if (insert->ins_contig == CONTIG_LEFT &&
3711
            insert->ins_contig_index == 0)
3712
                insert->ins_contig = CONTIG_NONE;
3713
 
3714
        /*
3715
         * Ok, so we can simply compare against last_eb to figure out
3716
         * whether the path doesn't exist. This will only happen in
3717
         * the case that we're doing a tail append, so maybe we can
3718
         * take advantage of that information somehow.
3719
         */
3720
        if (le64_to_cpu(di->i_last_eb_blk) == path_leaf_bh(path)->b_blocknr) {
3721
                /*
3722
                 * Ok, ocfs2_find_path() returned us the rightmost
3723
                 * tree path. This might be an appending insert. There are
3724
                 * two cases:
3725
                 *    1) We're doing a true append at the tail:
3726
                 *      -This might even be off the end of the leaf
3727
                 *    2) We're "appending" by rotating in the tail
3728
                 */
3729
                ocfs2_figure_appending_type(insert, el, insert_rec);
3730
        }
3731
 
3732
out:
3733
        ocfs2_free_path(path);
3734
 
3735
        if (ret == 0)
3736
                *last_eb_bh = bh;
3737
        else
3738
                brelse(bh);
3739
        return ret;
3740
}
3741
 
3742
/*
3743
 * Insert an extent into an inode btree.
3744
 *
3745
 * The caller needs to update fe->i_clusters
3746
 */
3747
int ocfs2_insert_extent(struct ocfs2_super *osb,
3748
                        handle_t *handle,
3749
                        struct inode *inode,
3750
                        struct buffer_head *fe_bh,
3751
                        u32 cpos,
3752
                        u64 start_blk,
3753
                        u32 new_clusters,
3754
                        u8 flags,
3755
                        struct ocfs2_alloc_context *meta_ac)
3756
{
3757
        int status;
3758
        int uninitialized_var(free_records);
3759
        struct buffer_head *last_eb_bh = NULL;
3760
        struct ocfs2_insert_type insert = {0, };
3761
        struct ocfs2_extent_rec rec;
3762
 
3763
        BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
3764
 
3765
        mlog(0, "add %u clusters at position %u to inode %llu\n",
3766
             new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
3767
 
3768
        mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
3769
                        (OCFS2_I(inode)->ip_clusters != cpos),
3770
                        "Device %s, asking for sparse allocation: inode %llu, "
3771
                        "cpos %u, clusters %u\n",
3772
                        osb->dev_str,
3773
                        (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos,
3774
                        OCFS2_I(inode)->ip_clusters);
3775
 
3776
        memset(&rec, 0, sizeof(rec));
3777
        rec.e_cpos = cpu_to_le32(cpos);
3778
        rec.e_blkno = cpu_to_le64(start_blk);
3779
        rec.e_leaf_clusters = cpu_to_le16(new_clusters);
3780
        rec.e_flags = flags;
3781
 
3782
        status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
3783
                                          &free_records, &insert);
3784
        if (status < 0) {
3785
                mlog_errno(status);
3786
                goto bail;
3787
        }
3788
 
3789
        mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
3790
             "Insert.contig_index: %d, Insert.free_records: %d, "
3791
             "Insert.tree_depth: %d\n",
3792
             insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
3793
             free_records, insert.ins_tree_depth);
3794
 
3795
        if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
3796
                status = ocfs2_grow_tree(inode, handle, fe_bh,
3797
                                         &insert.ins_tree_depth, &last_eb_bh,
3798
                                         meta_ac);
3799
                if (status) {
3800
                        mlog_errno(status);
3801
                        goto bail;
3802
                }
3803
        }
3804
 
3805
        /* Finally, we can add clusters. This might rotate the tree for us. */
3806
        status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert);
3807
        if (status < 0)
3808
                mlog_errno(status);
3809
        else
3810
                ocfs2_extent_map_insert_rec(inode, &rec);
3811
 
3812
bail:
3813
        if (last_eb_bh)
3814
                brelse(last_eb_bh);
3815
 
3816
        mlog_exit(status);
3817
        return status;
3818
}
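/*
 * A note on the calling contract, as far as it can be inferred from
 * the code above: the caller already holds a journal handle (which
 * the insert paths may extend), has reserved metadata blocks in
 * meta_ac in case ocfs2_grow_tree() is needed, and - unless sparse
 * allocation is enabled - passes a cpos equal to the inode's current
 * cluster count, as the mlog_bug_on_msg() at the top of
 * ocfs2_insert_extent() checks.
 */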
3819
 
3820
static void ocfs2_make_right_split_rec(struct super_block *sb,
3821
                                       struct ocfs2_extent_rec *split_rec,
3822
                                       u32 cpos,
3823
                                       struct ocfs2_extent_rec *rec)
3824
{
3825
        u32 rec_cpos = le32_to_cpu(rec->e_cpos);
3826
        u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters);
3827
 
3828
        memset(split_rec, 0, sizeof(struct ocfs2_extent_rec));
3829
 
3830
        split_rec->e_cpos = cpu_to_le32(cpos);
3831
        split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos);
3832
 
3833
        split_rec->e_blkno = rec->e_blkno;
3834
        le64_add_cpu(&split_rec->e_blkno,
3835
                     ocfs2_clusters_to_blocks(sb, cpos - rec_cpos));
3836
 
3837
        split_rec->e_flags = rec->e_flags;
3838
}
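/*
 * Worked example with invented numbers: for rec covering [100, 150)
 * and cpos == 120, the helper above produces a split_rec covering
 * [120, 150) (30 clusters), with e_blkno set to rec's e_blkno plus
 * the block equivalent of the 20 skipped clusters, and e_flags
 * copied unchanged.
 */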
3839
 
3840
static int ocfs2_split_and_insert(struct inode *inode,
3841
                                  handle_t *handle,
3842
                                  struct ocfs2_path *path,
3843
                                  struct buffer_head *di_bh,
3844
                                  struct buffer_head **last_eb_bh,
3845
                                  int split_index,
3846
                                  struct ocfs2_extent_rec *orig_split_rec,
3847
                                  struct ocfs2_alloc_context *meta_ac)
3848
{
3849
        int ret = 0, depth;
3850
        unsigned int insert_range, rec_range, do_leftright = 0;
3851
        struct ocfs2_extent_rec tmprec;
3852
        struct ocfs2_extent_list *rightmost_el;
3853
        struct ocfs2_extent_rec rec;
3854
        struct ocfs2_extent_rec split_rec = *orig_split_rec;
3855
        struct ocfs2_insert_type insert;
3856
        struct ocfs2_extent_block *eb;
3857
        struct ocfs2_dinode *di;
3858
 
3859
leftright:
3860
        /*
3861
         * Store a copy of the record on the stack - it might move
3862
         * around as the tree is manipulated below.
3863
         */
3864
        rec = path_leaf_el(path)->l_recs[split_index];
3865
 
3866
        di = (struct ocfs2_dinode *)di_bh->b_data;
3867
        rightmost_el = &di->id2.i_list;
3868
 
3869
        depth = le16_to_cpu(rightmost_el->l_tree_depth);
3870
        if (depth) {
3871
                BUG_ON(!(*last_eb_bh));
3872
                eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
3873
                rightmost_el = &eb->h_list;
3874
        }
3875
 
3876
        if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
3877
            le16_to_cpu(rightmost_el->l_count)) {
3878
                ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, last_eb_bh,
3879
                                      meta_ac);
3880
                if (ret) {
3881
                        mlog_errno(ret);
3882
                        goto out;
3883
                }
3884
        }
3885
 
3886
        memset(&insert, 0, sizeof(struct ocfs2_insert_type));
3887
        insert.ins_appending = APPEND_NONE;
3888
        insert.ins_contig = CONTIG_NONE;
3889
        insert.ins_tree_depth = depth;
3890
 
3891
        insert_range = le32_to_cpu(split_rec.e_cpos) +
3892
                le16_to_cpu(split_rec.e_leaf_clusters);
3893
        rec_range = le32_to_cpu(rec.e_cpos) +
3894
                le16_to_cpu(rec.e_leaf_clusters);
3895
 
3896
        if (split_rec.e_cpos == rec.e_cpos) {
3897
                insert.ins_split = SPLIT_LEFT;
3898
        } else if (insert_range == rec_range) {
3899
                insert.ins_split = SPLIT_RIGHT;
3900
        } else {
3901
                /*
3902
                 * Left/right split. We fake this as a right split
3903
                 * first and then make a second pass as a left split.
3904
                 */
3905
                insert.ins_split = SPLIT_RIGHT;
3906
 
3907
                ocfs2_make_right_split_rec(inode->i_sb, &tmprec, insert_range,
3908
                                           &rec);
3909
 
3910
                split_rec = tmprec;
3911
 
3912
                BUG_ON(do_leftright);
3913
                do_leftright = 1;
3914
        }
3915
 
3916
        ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec,
3917
                                     &insert);
3918
        if (ret) {
3919
                mlog_errno(ret);
3920
                goto out;
3921
        }
3922
 
3923
        if (do_leftright == 1) {
3924
                u32 cpos;
3925
                struct ocfs2_extent_list *el;
3926
 
3927
                do_leftright++;
3928
                split_rec = *orig_split_rec;
3929
 
3930
                ocfs2_reinit_path(path, 1);
3931
 
3932
                cpos = le32_to_cpu(split_rec.e_cpos);
3933
                ret = ocfs2_find_path(inode, path, cpos);
3934
                if (ret) {
3935
                        mlog_errno(ret);
3936
                        goto out;
3937
                }
3938
 
3939
                el = path_leaf_el(path);
3940
                split_index = ocfs2_search_extent_list(el, cpos);
3941
                goto leftright;
3942
        }
3943
out:
3944
 
3945
        return ret;
3946
}
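/*
 * End-to-end sketch of the two-pass case above, with hypothetical
 * numbers: splitting [110, 120) out of a record covering [100, 150)
 * first fakes a right split, inserting [120, 150) and shrinking the
 * original record to [100, 120); the second pass then re-walks the
 * path and splits [110, 120) off that shrunken record, leaving the
 * leaf with [100, 110), [110, 120) and [120, 150).
 */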
3947
 
3948
/*
3949
 * Mark part or all of the extent record at split_index in the leaf
3950
 * pointed to by path as written. This removes the unwritten
3951
 * extent flag.
3952
 *
3953
 * Care is taken to handle contiguousness so as to not grow the tree.
3954
 *
3955
 * meta_ac is not strictly necessary - we only truly need it if growth
3956
 * of the tree is required. All other cases will degrade into a less
3957
 * optimal tree layout.
3958
 *
3959
 * last_eb_bh should be the rightmost leaf block for any inode with a
3960
 * btree. Since a split may grow the tree or a merge might shrink it,
 * the caller cannot trust the contents of that buffer after this call.
3961
 *
3962
 * This code is optimized for readability - several passes might be
3963
 * made over certain portions of the tree. All of those blocks will
3964
 * have been brought into cache (and pinned via the journal), so the
3965
 * extra overhead is not expressed in terms of disk reads.
3966
 */
3967
static int __ocfs2_mark_extent_written(struct inode *inode,
3968
                                       struct buffer_head *di_bh,
3969
                                       handle_t *handle,
3970
                                       struct ocfs2_path *path,
3971
                                       int split_index,
3972
                                       struct ocfs2_extent_rec *split_rec,
3973
                                       struct ocfs2_alloc_context *meta_ac,
3974
                                       struct ocfs2_cached_dealloc_ctxt *dealloc)
3975
{
3976
        int ret = 0;
3977
        struct ocfs2_extent_list *el = path_leaf_el(path);
3978
        struct buffer_head *last_eb_bh = NULL;
3979
        struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
3980
        struct ocfs2_merge_ctxt ctxt;
3981
        struct ocfs2_extent_list *rightmost_el;
3982
 
3983
        if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) {
3984
                ret = -EIO;
3985
                mlog_errno(ret);
3986
                goto out;
3987
        }
3988
 
3989
        if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
3990
            ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
3991
             (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
3992
                ret = -EIO;
3993
                mlog_errno(ret);
3994
                goto out;
3995
        }
3996
 
3997
        ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, el,
3998
                                                            split_index,
3999
                                                            split_rec);
4000
 
4001
        /*
4002
         * The core merge / split code wants to know how much room is
4003
         * left in this inode's allocation tree, so we pass the
4004
         * rightmost extent list.
4005
         */
4006
        if (path->p_tree_depth) {
4007
                struct ocfs2_extent_block *eb;
4008
                struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4009
 
4010
                ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
4011
                                       le64_to_cpu(di->i_last_eb_blk),
4012
                                       &last_eb_bh, OCFS2_BH_CACHED, inode);
4013
                if (ret) {
4014
                        mlog_exit(ret);
4015
                        goto out;
4016
                }
4017
 
4018
                eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4019
                if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
4020
                        OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
4021
                        ret = -EROFS;
4022
                        goto out;
4023
                }
4024
 
4025
                rightmost_el = &eb->h_list;
4026
        } else
4027
                rightmost_el = path_root_el(path);
4028
 
4029
        if (rec->e_cpos == split_rec->e_cpos &&
4030
            rec->e_leaf_clusters == split_rec->e_leaf_clusters)
4031
                ctxt.c_split_covers_rec = 1;
4032
        else
4033
                ctxt.c_split_covers_rec = 0;
4034
 
4035
        ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
4036
 
4037
        mlog(0, "index: %d, contig: %u, has_empty: %u, split_covers: %u\n",
4038
             split_index, ctxt.c_contig_type, ctxt.c_has_empty_extent,
4039
             ctxt.c_split_covers_rec);
4040
 
4041
        if (ctxt.c_contig_type == CONTIG_NONE) {
4042
                if (ctxt.c_split_covers_rec)
4043
                        el->l_recs[split_index] = *split_rec;
4044
                else
4045
                        ret = ocfs2_split_and_insert(inode, handle, path, di_bh,
4046
                                                     &last_eb_bh, split_index,
4047
                                                     split_rec, meta_ac);
4048
                if (ret)
4049
                        mlog_errno(ret);
4050
        } else {
4051
                ret = ocfs2_try_to_merge_extent(inode, handle, path,
4052
                                                split_index, split_rec,
4053
                                                dealloc, &ctxt);
4054
                if (ret)
4055
                        mlog_errno(ret);
4056
        }
4057
 
4058
out:
4059
        brelse(last_eb_bh);
4060
        return ret;
4061
}
4062
 
4063
/*
4064
 * Mark the already-existing extent at cpos as written for len clusters.
4065
 *
4066
 * If the existing extent is larger than the request, initiate a
4067
 * split. An attempt will be made at merging with adjacent extents.
4068
 *
4069
 * The caller is responsible for passing down meta_ac if we'll need it.
4070
 */
4071
int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
4072
                              handle_t *handle, u32 cpos, u32 len, u32 phys,
4073
                              struct ocfs2_alloc_context *meta_ac,
4074
                              struct ocfs2_cached_dealloc_ctxt *dealloc)
4075
{
4076
        int ret, index;
4077
        u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys);
4078
        struct ocfs2_extent_rec split_rec;
4079
        struct ocfs2_path *left_path = NULL;
4080
        struct ocfs2_extent_list *el;
4081
 
4082
        mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n",
4083
             inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno);
4084
 
4085
        if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
4086
                ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
4087
                            "that are being written to, but the feature bit "
4088
                            "is not set in the super block.",
4089
                            (unsigned long long)OCFS2_I(inode)->ip_blkno);
4090
                ret = -EROFS;
4091
                goto out;
4092
        }
4093
 
4094
        /*
4095
         * XXX: This should be fixed up so that we just re-insert the
4096
         * next extent records.
4097
         */
4098
        ocfs2_extent_map_trunc(inode, 0);
4099
 
4100
        left_path = ocfs2_new_inode_path(di_bh);
4101
        if (!left_path) {
4102
                ret = -ENOMEM;
4103
                mlog_errno(ret);
4104
                goto out;
4105
        }
4106
 
4107
        ret = ocfs2_find_path(inode, left_path, cpos);
4108
        if (ret) {
4109
                mlog_errno(ret);
4110
                goto out;
4111
        }
4112
        el = path_leaf_el(left_path);
4113
 
4114
        index = ocfs2_search_extent_list(el, cpos);
4115
        if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
4116
                ocfs2_error(inode->i_sb,
4117
                            "Inode %llu has an extent at cpos %u which can no "
4118
                            "longer be found.\n",
4119
                            (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
4120
                ret = -EROFS;
4121
                goto out;
4122
        }
4123
 
4124
        memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
4125
        split_rec.e_cpos = cpu_to_le32(cpos);
4126
        split_rec.e_leaf_clusters = cpu_to_le16(len);
4127
        split_rec.e_blkno = cpu_to_le64(start_blkno);
4128
        split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
4129
        split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;
4130
 
4131
        ret = __ocfs2_mark_extent_written(inode, di_bh, handle, left_path,
4132
                                          index, &split_rec, meta_ac, dealloc);
4133
        if (ret)
4134
                mlog_errno(ret);
4135
 
4136
out:
4137
        ocfs2_free_path(left_path);
4138
        return ret;
4139
}
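/*
 * Putting the pieces together (a sketch with invented numbers):
 * marking clusters [110, 120) written inside an unwritten extent
 * covering [100, 150) builds a split_rec for [110, 120) with
 * OCFS2_EXT_UNWRITTEN cleared and hands it to
 * __ocfs2_mark_extent_written(), which either splits the record into
 * up to three pieces (as in ocfs2_split_and_insert() above) or, when
 * split_rec is contiguous with a neighboring record, merges it via
 * ocfs2_try_to_merge_extent() instead.
 */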
4140
 
4141
static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
4142
                            handle_t *handle, struct ocfs2_path *path,
4143
                            int index, u32 new_range,
4144
                            struct ocfs2_alloc_context *meta_ac)
4145
{
4146
        int ret, depth, credits = handle->h_buffer_credits;
4147
        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4148
        struct buffer_head *last_eb_bh = NULL;
4149
        struct ocfs2_extent_block *eb;
4150
        struct ocfs2_extent_list *rightmost_el, *el;
4151
        struct ocfs2_extent_rec split_rec;
4152
        struct ocfs2_extent_rec *rec;
4153
        struct ocfs2_insert_type insert;
4154
 
4155
        /*
4156
         * Setup the record to split before we grow the tree.
4157
         */
4158
        el = path_leaf_el(path);
4159
        rec = &el->l_recs[index];
4160
        ocfs2_make_right_split_rec(inode->i_sb, &split_rec, new_range, rec);
4161
 
4162
        depth = path->p_tree_depth;
4163
        if (depth > 0) {
4164
                ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
4165
                                       le64_to_cpu(di->i_last_eb_blk),
4166
                                       &last_eb_bh, OCFS2_BH_CACHED, inode);
4167
                if (ret < 0) {
4168
                        mlog_errno(ret);
4169
                        goto out;
4170
                }
4171
 
4172
                eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4173
                rightmost_el = &eb->h_list;
4174
        } else
4175
                rightmost_el = path_leaf_el(path);
4176
 
4177
        credits += path->p_tree_depth + ocfs2_extend_meta_needed(di);
4178
        ret = ocfs2_extend_trans(handle, credits);
4179
        if (ret) {
4180
                mlog_errno(ret);
4181
                goto out;
4182
        }
4183
 
4184
        if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4185
            le16_to_cpu(rightmost_el->l_count)) {
4186
                ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, &last_eb_bh,
4187
                                      meta_ac);
4188
                if (ret) {
4189
                        mlog_errno(ret);
4190
                        goto out;
4191
                }
4192
        }
4193
 
4194
        memset(&insert, 0, sizeof(struct ocfs2_insert_type));
4195
        insert.ins_appending = APPEND_NONE;
4196
        insert.ins_contig = CONTIG_NONE;
4197
        insert.ins_split = SPLIT_RIGHT;
4198
        insert.ins_tree_depth = depth;
4199
 
4200
        ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, &insert);
4201
        if (ret)
4202
                mlog_errno(ret);
4203
 
4204
out:
4205
        brelse(last_eb_bh);
4206
        return ret;
4207
}
4208
 
4209
static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
4210
                              struct ocfs2_path *path, int index,
4211
                              struct ocfs2_cached_dealloc_ctxt *dealloc,
4212
                              u32 cpos, u32 len)
4213
{
4214
        int ret;
4215
        u32 left_cpos, rec_range, trunc_range;
4216
        int wants_rotate = 0, is_rightmost_tree_rec = 0;
4217
        struct super_block *sb = inode->i_sb;
4218
        struct ocfs2_path *left_path = NULL;
4219
        struct ocfs2_extent_list *el = path_leaf_el(path);
4220
        struct ocfs2_extent_rec *rec;
4221
        struct ocfs2_extent_block *eb;
4222
 
4223
        if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
4224
                ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
4225
                if (ret) {
4226
                        mlog_errno(ret);
4227
                        goto out;
4228
                }
4229
 
4230
                index--;
4231
        }
4232
 
4233
        if (index == (le16_to_cpu(el->l_next_free_rec) - 1) &&
4234
            path->p_tree_depth) {
4235
                /*
4236
                 * Check whether this is the rightmost tree record. If
4237
                 * we remove all of this record or part of its right
4238
                 * edge then an update of the record lengths above it
4239
                 * will be required.
4240
                 */
4241
                eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
4242
                if (eb->h_next_leaf_blk == 0)
4243
                        is_rightmost_tree_rec = 1;
4244
        }
4245
 
4246
        rec = &el->l_recs[index];
4247
        if (index == 0 && path->p_tree_depth &&
4248
            le32_to_cpu(rec->e_cpos) == cpos) {
4249
                /*
4250
                 * Changing the leftmost offset (via partial or whole
4251
                 * record truncate) of an interior (or rightmost) path
4252
                 * means we have to update the subtree that is formed
4253
                 * by this leaf and the one to its left.
4254
                 *
4255
                 * There are two cases we can skip:
4256
                 *   1) Path is the leftmost one in our inode tree.
4257
                 *   2) The leaf is rightmost and will be empty after
4258
                 *      we remove the extent record - the rotate code
4259
                 *      knows how to update the newly formed edge.
4260
                 */
4261
 
4262
                ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path,
4263
                                                    &left_cpos);
4264
                if (ret) {
4265
                        mlog_errno(ret);
4266
                        goto out;
4267
                }
4268
 
4269
                if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
4270
                        left_path = ocfs2_new_path(path_root_bh(path),
4271
                                                   path_root_el(path));
4272
                        if (!left_path) {
4273
                                ret = -ENOMEM;
4274
                                mlog_errno(ret);
4275
                                goto out;
4276
                        }
4277
 
4278
                        ret = ocfs2_find_path(inode, left_path, left_cpos);
4279
                        if (ret) {
4280
                                mlog_errno(ret);
4281
                                goto out;
4282
                        }
4283
                }
4284
        }
4285
 
4286
        ret = ocfs2_extend_rotate_transaction(handle, 0,
4287
                                              handle->h_buffer_credits,
4288
                                              path);
4289
        if (ret) {
4290
                mlog_errno(ret);
4291
                goto out;
4292
        }
4293
 
4294
        ret = ocfs2_journal_access_path(inode, handle, path);
4295
        if (ret) {
4296
                mlog_errno(ret);
4297
                goto out;
4298
        }
4299
 
4300
        ret = ocfs2_journal_access_path(inode, handle, left_path);
4301
        if (ret) {
4302
                mlog_errno(ret);
4303
                goto out;
4304
        }
4305
 
4306
        rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
4307
        trunc_range = cpos + len;
4308
 
4309
        if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) {
4310
                int next_free;
4311
 
4312
                memset(rec, 0, sizeof(*rec));
4313
                ocfs2_cleanup_merge(el, index);
4314
                wants_rotate = 1;
4315
 
4316
                next_free = le16_to_cpu(el->l_next_free_rec);
4317
                if (is_rightmost_tree_rec && next_free > 1) {
4318
                        /*
4319
                         * We skip the edge update if this path will
4320
                         * be deleted by the rotate code.
4321
                         */
4322
                        rec = &el->l_recs[next_free - 1];
4323
                        ocfs2_adjust_rightmost_records(inode, handle, path,
4324
                                                       rec);
4325
                }
4326
        } else if (le32_to_cpu(rec->e_cpos) == cpos) {
4327
                /* Remove leftmost portion of the record. */
4328
                le32_add_cpu(&rec->e_cpos, len);
4329
                le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len));
4330
                le16_add_cpu(&rec->e_leaf_clusters, -len);
4331
        } else if (rec_range == trunc_range) {
4332
                /* Remove rightmost portion of the record */
4333
                le16_add_cpu(&rec->e_leaf_clusters, -len);
4334
                if (is_rightmost_tree_rec)
4335
                        ocfs2_adjust_rightmost_records(inode, handle, path, rec);
4336
        } else {
4337
                /* Caller should have trapped this. */
4338
                mlog(ML_ERROR, "Inode %llu: Invalid record truncate: (%u, %u) "
4339
                     "(%u, %u)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno,
4340
                     le32_to_cpu(rec->e_cpos),
4341
                     le16_to_cpu(rec->e_leaf_clusters), cpos, len);
4342
                BUG();
4343
        }
4344
 
4345
        if (left_path) {
4346
                int subtree_index;
4347
 
4348
                subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
4349
                ocfs2_complete_edge_insert(inode, handle, left_path, path,
4350
                                           subtree_index);
4351
        }
4352
 
4353
        ocfs2_journal_dirty(handle, path_leaf_bh(path));
4354
 
4355
        ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
4356
        if (ret) {
4357
                mlog_errno(ret);
4358
                goto out;
4359
        }
4360
 
4361
out:
4362
        ocfs2_free_path(left_path);
4363
        return ret;
4364
}
4365
 
4366
int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
4367
                        u32 cpos, u32 len, handle_t *handle,
4368
                        struct ocfs2_alloc_context *meta_ac,
4369
                        struct ocfs2_cached_dealloc_ctxt *dealloc)
4370
{
4371
        int ret, index;
4372
        u32 rec_range, trunc_range;
4373
        struct ocfs2_extent_rec *rec;
4374
        struct ocfs2_extent_list *el;
4375
        struct ocfs2_path *path;
4376
 
4377
        ocfs2_extent_map_trunc(inode, 0);
4378
 
4379
        path = ocfs2_new_inode_path(di_bh);
4380
        if (!path) {
4381
                ret = -ENOMEM;
4382
                mlog_errno(ret);
4383
                goto out;
4384
        }
4385
 
4386
        ret = ocfs2_find_path(inode, path, cpos);
4387
        if (ret) {
4388
                mlog_errno(ret);
4389
                goto out;
4390
        }
4391
 
4392
        el = path_leaf_el(path);
4393
        index = ocfs2_search_extent_list(el, cpos);
4394
        if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
4395
                ocfs2_error(inode->i_sb,
4396
                            "Inode %llu has an extent at cpos %u which can no "
4397
                            "longer be found.\n",
4398
                            (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
4399
                ret = -EROFS;
4400
                goto out;
4401
        }
4402
 
4403
        /*
4404
         * We have 3 cases of extent removal:
4405
         *   1) Range covers the entire extent rec
4406
         *   2) Range begins or ends on one edge of the extent rec
4407
         *   3) Range is in the middle of the extent rec (no shared edges)
4408
         *
4409
         * For case 1 we remove the extent rec and left rotate to
4410
         * fill the hole.
4411
         *
4412
         * For case 2 we just shrink the existing extent rec, with a
4413
         * tree update if the shrinking edge is also the edge of an
4414
         * extent block.
4415
         *
4416
         * For case 3 we do a right split to turn the extent rec into
4417
         * something case 2 can handle.
4418
         */
4419
        rec = &el->l_recs[index];
4420
        rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
4421
        trunc_range = cpos + len;
4422
 
4423
        BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
4424
 
4425
        mlog(0, "Inode %llu, remove (cpos %u, len %u). Existing index %d "
4426
             "(cpos %u, len %u)\n",
4427
             (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, len, index,
4428
             le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));
4429
 
4430
        if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
4431
                ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
4432
                                         cpos, len);
4433
                if (ret) {
4434
                        mlog_errno(ret);
4435
                        goto out;
4436
                }
4437
        } else {
4438
                ret = ocfs2_split_tree(inode, di_bh, handle, path, index,
4439
                                       trunc_range, meta_ac);
4440
                if (ret) {
4441
                        mlog_errno(ret);
4442
                        goto out;
4443
                }
4444
 
4445
                /*
4446
                 * The split could have manipulated the tree enough to
4447
                 * move the record location, so we have to look for it again.
4448
                 */
4449
                ocfs2_reinit_path(path, 1);
4450
 
4451
                ret = ocfs2_find_path(inode, path, cpos);
4452
                if (ret) {
4453
                        mlog_errno(ret);
4454
                        goto out;
4455
                }
4456
 
4457
                el = path_leaf_el(path);
4458
                index = ocfs2_search_extent_list(el, cpos);
4459
                if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
4460
                        ocfs2_error(inode->i_sb,
4461
                                    "Inode %llu: split at cpos %u lost record.",
4462
                                    (unsigned long long)OCFS2_I(inode)->ip_blkno,
4463
                                    cpos);
4464
                        ret = -EROFS;
4465
                        goto out;
4466
                }
4467
 
4468
                /*
4469
                 * Double check our values here. If anything is fishy,
4470
                 * it's easier to catch it at the top level.
4471
                 */
4472
                rec = &el->l_recs[index];
4473
                rec_range = le32_to_cpu(rec->e_cpos) +
4474
                        ocfs2_rec_clusters(el, rec);
4475
                if (rec_range != trunc_range) {
4476
                        ocfs2_error(inode->i_sb,
4477
                                    "Inode %llu: error after split at cpos %u"
4478
                                    "trunc len %u, existing record is (%u,%u)",
4479
                                    (unsigned long long)OCFS2_I(inode)->ip_blkno,
4480
                                    cpos, len, le32_to_cpu(rec->e_cpos),
4481
                                    ocfs2_rec_clusters(el, rec));
4482
                        ret = -EROFS;
4483
                        goto out;
4484
                }
4485
 
4486
                ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
4487
                                         cpos, len);
4488
                if (ret) {
4489
                        mlog_errno(ret);
4490
                        goto out;
4491
                }
4492
        }
4493
 
4494
out:
4495
        ocfs2_free_path(path);
4496
        return ret;
4497
}
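 
/*
 * Illustrative aside, not part of alloc.c: the three removal cases described
 * in ocfs2_remove_extent() above can be modelled in userspace.  The types
 * below are hypothetical simplifications of ocfs2_extent_rec; the sketch only
 * demonstrates the classification, not the tree manipulation.
 */
#include <stdint.h>

enum toy_remove_case {
        TOY_REMOVE_WHOLE_REC,   /* case 1: range covers the entire record         */
        TOY_REMOVE_ONE_EDGE,    /* case 2: range shares one edge, shrink in place */
        TOY_REMOVE_MIDDLE,      /* case 3: no shared edge, split right first      */
};

struct toy_extent_rec {
        uint32_t cpos;          /* logical start, in clusters */
        uint32_t clusters;      /* length, in clusters        */
};

static enum toy_remove_case toy_classify_remove(const struct toy_extent_rec *rec,
                                                uint32_t cpos, uint32_t len)
{
        uint32_t rec_range = rec->cpos + rec->clusters;
        uint32_t trunc_range = cpos + len;

        /* Caller must guarantee [cpos, cpos + len) lies inside the record. */
        if (rec->cpos == cpos && rec_range == trunc_range)
                return TOY_REMOVE_WHOLE_REC;
        if (rec->cpos == cpos || rec_range == trunc_range)
                return TOY_REMOVE_ONE_EDGE;
        return TOY_REMOVE_MIDDLE;
}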
4498
 
4499
int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
4500
{
4501
        struct buffer_head *tl_bh = osb->osb_tl_bh;
4502
        struct ocfs2_dinode *di;
4503
        struct ocfs2_truncate_log *tl;
4504
 
4505
        di = (struct ocfs2_dinode *) tl_bh->b_data;
4506
        tl = &di->id2.i_dealloc;
4507
 
4508
        mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
4509
                        "slot %d, invalid truncate log parameters: used = "
4510
                        "%u, count = %u\n", osb->slot_num,
4511
                        le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
4512
        return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
4513
}
4514
 
4515
static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
4516
                                           unsigned int new_start)
4517
{
4518
        unsigned int tail_index;
4519
        unsigned int current_tail;
4520
 
4521
        /* No records, nothing to coalesce */
4522
        if (!le16_to_cpu(tl->tl_used))
4523
                return 0;
4524
 
4525
        tail_index = le16_to_cpu(tl->tl_used) - 1;
4526
        current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
4527
        current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
4528
 
4529
        return current_tail == new_start;
4530
}
4531
 
4532
int ocfs2_truncate_log_append(struct ocfs2_super *osb,
4533
                              handle_t *handle,
4534
                              u64 start_blk,
4535
                              unsigned int num_clusters)
4536
{
4537
        int status, index;
4538
        unsigned int start_cluster, tl_count;
4539
        struct inode *tl_inode = osb->osb_tl_inode;
4540
        struct buffer_head *tl_bh = osb->osb_tl_bh;
4541
        struct ocfs2_dinode *di;
4542
        struct ocfs2_truncate_log *tl;
4543
 
4544
        mlog_entry("start_blk = %llu, num_clusters = %u\n",
4545
                   (unsigned long long)start_blk, num_clusters);
4546
 
4547
        BUG_ON(mutex_trylock(&tl_inode->i_mutex));
4548
 
4549
        start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
4550
 
4551
        di = (struct ocfs2_dinode *) tl_bh->b_data;
4552
        tl = &di->id2.i_dealloc;
4553
        if (!OCFS2_IS_VALID_DINODE(di)) {
4554
                OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
4555
                status = -EIO;
4556
                goto bail;
4557
        }
4558
 
4559
        tl_count = le16_to_cpu(tl->tl_count);
4560
        mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
4561
                        tl_count == 0,
4562
                        "Truncate record count on #%llu invalid "
4563
                        "wanted %u, actual %u\n",
4564
                        (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
4565
                        ocfs2_truncate_recs_per_inode(osb->sb),
4566
                        le16_to_cpu(tl->tl_count));
4567
 
4568
        /* Caller should have known to flush before calling us. */
4569
        index = le16_to_cpu(tl->tl_used);
4570
        if (index >= tl_count) {
4571
                status = -ENOSPC;
4572
                mlog_errno(status);
4573
                goto bail;
4574
        }
4575
 
4576
        status = ocfs2_journal_access(handle, tl_inode, tl_bh,
4577
                                      OCFS2_JOURNAL_ACCESS_WRITE);
4578
        if (status < 0) {
4579
                mlog_errno(status);
4580
                goto bail;
4581
        }
4582
 
4583
        mlog(0, "Log truncate of %u clusters starting at cluster %u to "
4584
             "%llu (index = %d)\n", num_clusters, start_cluster,
4585
             (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index);
4586
 
4587
        if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
4588
                /*
4589
                 * Move index back to the record we are coalescing with.
4590
                 * ocfs2_truncate_log_can_coalesce() guarantees tl_used is nonzero.
4591
                 */
4592
                index--;
4593
 
4594
                num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
4595
                mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
4596
                     index, le32_to_cpu(tl->tl_recs[index].t_start),
4597
                     num_clusters);
4598
        } else {
4599
                tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
4600
                tl->tl_used = cpu_to_le16(index + 1);
4601
        }
4602
        tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
4603
 
4604
        status = ocfs2_journal_dirty(handle, tl_bh);
4605
        if (status < 0) {
4606
                mlog_errno(status);
4607
                goto bail;
4608
        }
4609
 
4610
bail:
4611
        mlog_exit(status);
4612
        return status;
4613
}
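 
/*
 * Illustrative aside, not part of alloc.c: a userspace sketch of the
 * coalesce-or-append policy implemented by ocfs2_truncate_log_can_coalesce()
 * and ocfs2_truncate_log_append() above.  The types are hypothetical
 * simplifications; the real log lives in a system-file dinode and every
 * update is journaled.
 */
#include <stdint.h>

struct toy_tl_rec {
        uint32_t start;                 /* first cluster of the freed range */
        uint32_t clusters;              /* number of clusters freed         */
};

struct toy_truncate_log {
        uint16_t used;                  /* records currently in the log     */
        uint16_t count;                 /* capacity of the log (<= 16 here) */
        struct toy_tl_rec recs[16];
};

/* Append a freed range, merging it with the tail record when contiguous. */
static int toy_tl_append(struct toy_truncate_log *tl,
                         uint32_t start, uint32_t clusters)
{
        if (tl->used) {
                struct toy_tl_rec *tail = &tl->recs[tl->used - 1];

                if (tail->start + tail->clusters == start) {
                        /* Contiguous with the tail record: coalesce. */
                        tail->clusters += clusters;
                        return 0;
                }
        }

        if (tl->used == tl->count)
                return -1;              /* caller should have flushed first */

        tl->recs[tl->used].start = start;
        tl->recs[tl->used].clusters = clusters;
        tl->used++;
        return 0;
}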
4614
 
4615
static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
4616
                                         handle_t *handle,
4617
                                         struct inode *data_alloc_inode,
4618
                                         struct buffer_head *data_alloc_bh)
4619
{
4620
        int status = 0;
4621
        int i;
4622
        unsigned int num_clusters;
4623
        u64 start_blk;
4624
        struct ocfs2_truncate_rec rec;
4625
        struct ocfs2_dinode *di;
4626
        struct ocfs2_truncate_log *tl;
4627
        struct inode *tl_inode = osb->osb_tl_inode;
4628
        struct buffer_head *tl_bh = osb->osb_tl_bh;
4629
 
4630
        mlog_entry_void();
4631
 
4632
        di = (struct ocfs2_dinode *) tl_bh->b_data;
4633
        tl = &di->id2.i_dealloc;
4634
        i = le16_to_cpu(tl->tl_used) - 1;
4635
        while (i >= 0) {
4636
                /* Caller has given us at least enough credits to
4637
                 * update the truncate log dinode */
4638
                status = ocfs2_journal_access(handle, tl_inode, tl_bh,
4639
                                              OCFS2_JOURNAL_ACCESS_WRITE);
4640
                if (status < 0) {
4641
                        mlog_errno(status);
4642
                        goto bail;
4643
                }
4644
 
4645
                tl->tl_used = cpu_to_le16(i);
4646
 
4647
                status = ocfs2_journal_dirty(handle, tl_bh);
4648
                if (status < 0) {
4649
                        mlog_errno(status);
4650
                        goto bail;
4651
                }
4652
 
4653
                /* TODO: Perhaps we can calculate the bulk of the
4654
                 * credits up front rather than extending like
4655
                 * this. */
4656
                status = ocfs2_extend_trans(handle,
4657
                                            OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
4658
                if (status < 0) {
4659
                        mlog_errno(status);
4660
                        goto bail;
4661
                }
4662
 
4663
                rec = tl->tl_recs[i];
4664
                start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
4665
                                                    le32_to_cpu(rec.t_start));
4666
                num_clusters = le32_to_cpu(rec.t_clusters);
4667
 
4668
                /* if start_blk is not set, we ignore the record as
4669
                 * invalid. */
4670
                if (start_blk) {
4671
                        mlog(0, "free record %d, start = %u, clusters = %u\n",
4672
                             i, le32_to_cpu(rec.t_start), num_clusters);
4673
 
4674
                        status = ocfs2_free_clusters(handle, data_alloc_inode,
4675
                                                     data_alloc_bh, start_blk,
4676
                                                     num_clusters);
4677
                        if (status < 0) {
4678
                                mlog_errno(status);
4679
                                goto bail;
4680
                        }
4681
                }
4682
                i--;
4683
        }
4684
 
4685
bail:
4686
        mlog_exit(status);
4687
        return status;
4688
}
4689
 
4690
/* Expects you to already be holding tl_inode->i_mutex */
4691
int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
4692
{
4693
        int status;
4694
        unsigned int num_to_flush;
4695
        handle_t *handle;
4696
        struct inode *tl_inode = osb->osb_tl_inode;
4697
        struct inode *data_alloc_inode = NULL;
4698
        struct buffer_head *tl_bh = osb->osb_tl_bh;
4699
        struct buffer_head *data_alloc_bh = NULL;
4700
        struct ocfs2_dinode *di;
4701
        struct ocfs2_truncate_log *tl;
4702
 
4703
        mlog_entry_void();
4704
 
4705
        BUG_ON(mutex_trylock(&tl_inode->i_mutex));
4706
 
4707
        di = (struct ocfs2_dinode *) tl_bh->b_data;
4708
        tl = &di->id2.i_dealloc;
4709
        if (!OCFS2_IS_VALID_DINODE(di)) {
4710
                OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
4711
                status = -EIO;
4712
                goto out;
4713
        }
4714
 
4715
        num_to_flush = le16_to_cpu(tl->tl_used);
4716
        mlog(0, "Flush %u records from truncate log #%llu\n",
4717
             num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
4718
        if (!num_to_flush) {
4719
                status = 0;
4720
                goto out;
4721
        }
4722
 
4723
        data_alloc_inode = ocfs2_get_system_file_inode(osb,
4724
                                                       GLOBAL_BITMAP_SYSTEM_INODE,
4725
                                                       OCFS2_INVALID_SLOT);
4726
        if (!data_alloc_inode) {
4727
                status = -EINVAL;
4728
                mlog(ML_ERROR, "Could not get bitmap inode!\n");
4729
                goto out;
4730
        }
4731
 
4732
        mutex_lock(&data_alloc_inode->i_mutex);
4733
 
4734
        status = ocfs2_meta_lock(data_alloc_inode, &data_alloc_bh, 1);
4735
        if (status < 0) {
4736
                mlog_errno(status);
4737
                goto out_mutex;
4738
        }
4739
 
4740
        handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
4741
        if (IS_ERR(handle)) {
4742
                status = PTR_ERR(handle);
4743
                mlog_errno(status);
4744
                goto out_unlock;
4745
        }
4746
 
4747
        status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
4748
                                               data_alloc_bh);
4749
        if (status < 0)
4750
                mlog_errno(status);
4751
 
4752
        ocfs2_commit_trans(osb, handle);
4753
 
4754
out_unlock:
4755
        brelse(data_alloc_bh);
4756
        ocfs2_meta_unlock(data_alloc_inode, 1);
4757
 
4758
out_mutex:
4759
        mutex_unlock(&data_alloc_inode->i_mutex);
4760
        iput(data_alloc_inode);
4761
 
4762
out:
4763
        mlog_exit(status);
4764
        return status;
4765
}
4766
 
4767
int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
4768
{
4769
        int status;
4770
        struct inode *tl_inode = osb->osb_tl_inode;
4771
 
4772
        mutex_lock(&tl_inode->i_mutex);
4773
        status = __ocfs2_flush_truncate_log(osb);
4774
        mutex_unlock(&tl_inode->i_mutex);
4775
 
4776
        return status;
4777
}
4778
 
4779
static void ocfs2_truncate_log_worker(struct work_struct *work)
4780
{
4781
        int status;
4782
        struct ocfs2_super *osb =
4783
                container_of(work, struct ocfs2_super,
4784
                             osb_truncate_log_wq.work);
4785
 
4786
        mlog_entry_void();
4787
 
4788
        status = ocfs2_flush_truncate_log(osb);
4789
        if (status < 0)
4790
                mlog_errno(status);
4791
 
4792
        mlog_exit(status);
4793
}
4794
 
4795
#define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
4796
void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
4797
                                       int cancel)
4798
{
4799
        if (osb->osb_tl_inode) {
4800
                /* We want to push off log flushes while truncates are
4801
                 * still running. */
4802
                if (cancel)
4803
                        cancel_delayed_work(&osb->osb_truncate_log_wq);
4804
 
4805
                queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
4806
                                   OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
4807
        }
4808
}
4809
 
4810
static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
4811
                                       int slot_num,
4812
                                       struct inode **tl_inode,
4813
                                       struct buffer_head **tl_bh)
4814
{
4815
        int status;
4816
        struct inode *inode = NULL;
4817
        struct buffer_head *bh = NULL;
4818
 
4819
        inode = ocfs2_get_system_file_inode(osb,
4820
                                           TRUNCATE_LOG_SYSTEM_INODE,
4821
                                           slot_num);
4822
        if (!inode) {
4823
                status = -EINVAL;
4824
                mlog(ML_ERROR, "Could not get load truncate log inode!\n");
4825
                goto bail;
4826
        }
4827
 
4828
        status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
4829
                                  OCFS2_BH_CACHED, inode);
4830
        if (status < 0) {
4831
                iput(inode);
4832
                mlog_errno(status);
4833
                goto bail;
4834
        }
4835
 
4836
        *tl_inode = inode;
4837
        *tl_bh    = bh;
4838
bail:
4839
        mlog_exit(status);
4840
        return status;
4841
}
4842
 
4843
/* called during the 1st stage of node recovery. we stamp a clean
4844
 * truncate log and pass back a copy for processing later. if the
4845
 * truncate log does not require processing, *tl_copy is set to
4846
 * NULL. */
4847
int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
4848
                                      int slot_num,
4849
                                      struct ocfs2_dinode **tl_copy)
4850
{
4851
        int status;
4852
        struct inode *tl_inode = NULL;
4853
        struct buffer_head *tl_bh = NULL;
4854
        struct ocfs2_dinode *di;
4855
        struct ocfs2_truncate_log *tl;
4856
 
4857
        *tl_copy = NULL;
4858
 
4859
        mlog(0, "recover truncate log from slot %d\n", slot_num);
4860
 
4861
        status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
4862
        if (status < 0) {
4863
                mlog_errno(status);
4864
                goto bail;
4865
        }
4866
 
4867
        di = (struct ocfs2_dinode *) tl_bh->b_data;
4868
        tl = &di->id2.i_dealloc;
4869
        if (!OCFS2_IS_VALID_DINODE(di)) {
4870
                OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
4871
                status = -EIO;
4872
                goto bail;
4873
        }
4874
 
4875
        if (le16_to_cpu(tl->tl_used)) {
4876
                mlog(0, "We'll have %u logs to recover\n",
4877
                     le16_to_cpu(tl->tl_used));
4878
 
4879
                *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
4880
                if (!(*tl_copy)) {
4881
                        status = -ENOMEM;
4882
                        mlog_errno(status);
4883
                        goto bail;
4884
                }
4885
 
4886
                /* Assuming the write-out below goes well, this copy
4887
                 * will be passed back to recovery for processing. */
4888
                memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);
4889
 
4890
                /* All we need to do to clear the truncate log is set
4891
                 * tl_used. */
4892
                tl->tl_used = 0;
4893
 
4894
                status = ocfs2_write_block(osb, tl_bh, tl_inode);
4895
                if (status < 0) {
4896
                        mlog_errno(status);
4897
                        goto bail;
4898
                }
4899
        }
4900
 
4901
bail:
4902
        if (tl_inode)
4903
                iput(tl_inode);
4904
        if (tl_bh)
4905
                brelse(tl_bh);
4906
 
4907
        if (status < 0 && (*tl_copy)) {
4908
                kfree(*tl_copy);
4909
                *tl_copy = NULL;
4910
        }
4911
 
4912
        mlog_exit(status);
4913
        return status;
4914
}
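 
/*
 * Illustrative aside, not part of alloc.c: ocfs2_begin_truncate_log_recovery()
 * above copies the dead slot's truncate log for later replay and then writes
 * the on-disk log back with tl_used cleared.  A minimal userspace sketch of
 * that "hand back a copy, stamp the original clean" step, with hypothetical
 * types (the real code also validates the dinode and does block I/O):
 */
#include <stdlib.h>
#include <string.h>

struct toy_log {
        unsigned int used;              /* number of valid records */
        unsigned int recs[16];
};

/* Returns a copy to replay later, or NULL if the log is empty (or on OOM);
 * on success the original is marked clean by zeroing its used count. */
static struct toy_log *toy_begin_log_recovery(struct toy_log *on_disk)
{
        struct toy_log *copy = NULL;

        if (on_disk->used) {
                copy = malloc(sizeof(*copy));
                if (!copy)
                        return NULL;
                memcpy(copy, on_disk, sizeof(*copy));
                on_disk->used = 0;      /* all that's needed to "clean" the log */
        }
        return copy;
}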
4915
 
4916
int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
4917
                                         struct ocfs2_dinode *tl_copy)
4918
{
4919
        int status = 0;
4920
        int i;
4921
        unsigned int clusters, num_recs, start_cluster;
4922
        u64 start_blk;
4923
        handle_t *handle;
4924
        struct inode *tl_inode = osb->osb_tl_inode;
4925
        struct ocfs2_truncate_log *tl;
4926
 
4927
        mlog_entry_void();
4928
 
4929
        if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
4930
                mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
4931
                return -EINVAL;
4932
        }
4933
 
4934
        tl = &tl_copy->id2.i_dealloc;
4935
        num_recs = le16_to_cpu(tl->tl_used);
4936
        mlog(0, "cleanup %u records from %llu\n", num_recs,
4937
             (unsigned long long)le64_to_cpu(tl_copy->i_blkno));
4938
 
4939
        mutex_lock(&tl_inode->i_mutex);
4940
        for(i = 0; i < num_recs; i++) {
4941
                if (ocfs2_truncate_log_needs_flush(osb)) {
4942
                        status = __ocfs2_flush_truncate_log(osb);
4943
                        if (status < 0) {
4944
                                mlog_errno(status);
4945
                                goto bail_up;
4946
                        }
4947
                }
4948
 
4949
                handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
4950
                if (IS_ERR(handle)) {
4951
                        status = PTR_ERR(handle);
4952
                        mlog_errno(status);
4953
                        goto bail_up;
4954
                }
4955
 
4956
                clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
4957
                start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
4958
                start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);
4959
 
4960
                status = ocfs2_truncate_log_append(osb, handle,
4961
                                                   start_blk, clusters);
4962
                ocfs2_commit_trans(osb, handle);
4963
                if (status < 0) {
4964
                        mlog_errno(status);
4965
                        goto bail_up;
4966
                }
4967
        }
4968
 
4969
bail_up:
4970
        mutex_unlock(&tl_inode->i_mutex);
4971
 
4972
        mlog_exit(status);
4973
        return status;
4974
}
4975
 
4976
void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
4977
{
4978
        int status;
4979
        struct inode *tl_inode = osb->osb_tl_inode;
4980
 
4981
        mlog_entry_void();
4982
 
4983
        if (tl_inode) {
4984
                cancel_delayed_work(&osb->osb_truncate_log_wq);
4985
                flush_workqueue(ocfs2_wq);
4986
 
4987
                status = ocfs2_flush_truncate_log(osb);
4988
                if (status < 0)
4989
                        mlog_errno(status);
4990
 
4991
                brelse(osb->osb_tl_bh);
4992
                iput(osb->osb_tl_inode);
4993
        }
4994
 
4995
        mlog_exit_void();
4996
}
4997
 
4998
int ocfs2_truncate_log_init(struct ocfs2_super *osb)
4999
{
5000
        int status;
5001
        struct inode *tl_inode = NULL;
5002
        struct buffer_head *tl_bh = NULL;
5003
 
5004
        mlog_entry_void();
5005
 
5006
        status = ocfs2_get_truncate_log_info(osb,
5007
                                             osb->slot_num,
5008
                                             &tl_inode,
5009
                                             &tl_bh);
5010
        if (status < 0)
5011
                mlog_errno(status);
5012
 
5013
        /* ocfs2_truncate_log_shutdown keys on the existence of
5014
         * osb->osb_tl_inode so we don't set any of the osb variables
5015
         * until we're sure all is well. */
5016
        INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
5017
                          ocfs2_truncate_log_worker);
5018
        osb->osb_tl_bh    = tl_bh;
5019
        osb->osb_tl_inode = tl_inode;
5020
 
5021
        mlog_exit(status);
5022
        return status;
5023
}
5024
 
5025
/*
5026
 * Delayed de-allocation of suballocator blocks.
5027
 *
5028
 * Some sets of block de-allocations might involve multiple suballocator inodes.
5029
 *
5030
 * The locking for this can get extremely complicated, especially when
5031
 * the suballocator inodes to delete from aren't known until deep
5032
 * within an unrelated codepath.
5033
 *
5034
 * ocfs2_extent_block structures are a good example of this - an inode
5035
 * btree could have been grown by any number of nodes each allocating
5036
 * out of their own suballoc inode.
5037
 *
5038
 * These structures allow the delay of block de-allocation until a
5039
 * later time, when locking of multiple cluster inodes won't cause
5040
 * deadlock.
5041
 */
5042
 
5043
/*
5044
 * Describes a single block free from a suballocator
5045
 */
5046
struct ocfs2_cached_block_free {
5047
        struct ocfs2_cached_block_free          *free_next;
5048
        u64                                     free_blk;
5049
        unsigned int                            free_bit;
5050
};
5051
 
5052
struct ocfs2_per_slot_free_list {
5053
        struct ocfs2_per_slot_free_list         *f_next_suballocator;
5054
        int                                     f_inode_type;
5055
        int                                     f_slot;
5056
        struct ocfs2_cached_block_free          *f_first;
5057
};
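 
/*
 * Illustrative aside, not part of alloc.c: the two structures above form a
 * list of per-(inode type, slot) lists, each holding individual suballocator
 * bits to free later.  A userspace sketch of caching one block free, loosely
 * mirroring ocfs2_find_per_slot_free_list() and ocfs2_cache_block_dealloc()
 * below; all names and types here are hypothetical simplifications.
 */
#include <stdint.h>
#include <stdlib.h>

struct toy_block_free {
        struct toy_block_free   *next;
        uint64_t                blkno;
        unsigned int            bit;
};

struct toy_slot_list {
        struct toy_slot_list    *next;
        int                     inode_type;
        int                     slot;
        struct toy_block_free   *first;
};

struct toy_dealloc_ctxt {
        struct toy_slot_list    *first;
};

static int toy_cache_block_dealloc(struct toy_dealloc_ctxt *ctxt, int type,
                                   int slot, uint64_t blkno, unsigned int bit)
{
        struct toy_slot_list *fl;
        struct toy_block_free *item;

        /* Find (or create) the list for this suballocator inode. */
        for (fl = ctxt->first; fl; fl = fl->next)
                if (fl->inode_type == type && fl->slot == slot)
                        break;
        if (!fl) {
                fl = calloc(1, sizeof(*fl));
                if (!fl)
                        return -1;
                fl->inode_type = type;
                fl->slot = slot;
                fl->next = ctxt->first;
                ctxt->first = fl;
        }

        /* Push the individual bit onto that list. */
        item = malloc(sizeof(*item));
        if (!item)
                return -1;
        item->blkno = blkno;
        item->bit = bit;
        item->next = fl->first;
        fl->first = item;
        return 0;
}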
5058
 
5059
static int ocfs2_free_cached_items(struct ocfs2_super *osb,
5060
                                   int sysfile_type,
5061
                                   int slot,
5062
                                   struct ocfs2_cached_block_free *head)
5063
{
5064
        int ret;
5065
        u64 bg_blkno;
5066
        handle_t *handle;
5067
        struct inode *inode;
5068
        struct buffer_head *di_bh = NULL;
5069
        struct ocfs2_cached_block_free *tmp;
5070
 
5071
        inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot);
5072
        if (!inode) {
5073
                ret = -EINVAL;
5074
                mlog_errno(ret);
5075
                goto out;
5076
        }
5077
 
5078
        mutex_lock(&inode->i_mutex);
5079
 
5080
        ret = ocfs2_meta_lock(inode, &di_bh, 1);
5081
        if (ret) {
5082
                mlog_errno(ret);
5083
                goto out_mutex;
5084
        }
5085
 
5086
        handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
5087
        if (IS_ERR(handle)) {
5088
                ret = PTR_ERR(handle);
5089
                mlog_errno(ret);
5090
                goto out_unlock;
5091
        }
5092
 
5093
        while (head) {
5094
                bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
5095
                                                      head->free_bit);
5096
                mlog(0, "Free bit: (bit %u, blkno %llu)\n",
5097
                     head->free_bit, (unsigned long long)head->free_blk);
5098
 
5099
                ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
5100
                                               head->free_bit, bg_blkno, 1);
5101
                if (ret) {
5102
                        mlog_errno(ret);
5103
                        goto out_journal;
5104
                }
5105
 
5106
                ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE);
5107
                if (ret) {
5108
                        mlog_errno(ret);
5109
                        goto out_journal;
5110
                }
5111
 
5112
                tmp = head;
5113
                head = head->free_next;
5114
                kfree(tmp);
5115
        }
5116
 
5117
out_journal:
5118
        ocfs2_commit_trans(osb, handle);
5119
 
5120
out_unlock:
5121
        ocfs2_meta_unlock(inode, 1);
5122
        brelse(di_bh);
5123
out_mutex:
5124
        mutex_unlock(&inode->i_mutex);
5125
        iput(inode);
5126
out:
5127
        while(head) {
5128
                /* Premature exit may have left some dangling items. */
5129
                tmp = head;
5130
                head = head->free_next;
5131
                kfree(tmp);
5132
        }
5133
 
5134
        return ret;
5135
}
5136
 
5137
int ocfs2_run_deallocs(struct ocfs2_super *osb,
5138
                       struct ocfs2_cached_dealloc_ctxt *ctxt)
5139
{
5140
        int ret = 0, ret2;
5141
        struct ocfs2_per_slot_free_list *fl;
5142
 
5143
        if (!ctxt)
5144
                return 0;
5145
 
5146
        while (ctxt->c_first_suballocator) {
5147
                fl = ctxt->c_first_suballocator;
5148
 
5149
                if (fl->f_first) {
5150
                        mlog(0, "Free items: (type %u, slot %d)\n",
5151
                             fl->f_inode_type, fl->f_slot);
5152
                        ret2 = ocfs2_free_cached_items(osb, fl->f_inode_type,
5153
                                                       fl->f_slot, fl->f_first);
5154
                        if (ret2)
5155
                                mlog_errno(ret2);
5156
                        if (!ret)
5157
                                ret = ret2;
5158
                }
5159
 
5160
                ctxt->c_first_suballocator = fl->f_next_suballocator;
5161
                kfree(fl);
5162
        }
5163
 
5164
        return ret;
5165
}
5166
 
5167
static struct ocfs2_per_slot_free_list *
5168
ocfs2_find_per_slot_free_list(int type,
5169
                              int slot,
5170
                              struct ocfs2_cached_dealloc_ctxt *ctxt)
5171
{
5172
        struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
5173
 
5174
        while (fl) {
5175
                if (fl->f_inode_type == type && fl->f_slot == slot)
5176
                        return fl;
5177
 
5178
                fl = fl->f_next_suballocator;
5179
        }
5180
 
5181
        fl = kmalloc(sizeof(*fl), GFP_NOFS);
5182
        if (fl) {
5183
                fl->f_inode_type = type;
5184
                fl->f_slot = slot;
5185
                fl->f_first = NULL;
5186
                fl->f_next_suballocator = ctxt->c_first_suballocator;
5187
 
5188
                ctxt->c_first_suballocator = fl;
5189
        }
5190
        return fl;
5191
}
5192
 
5193
static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
5194
                                     int type, int slot, u64 blkno,
5195
                                     unsigned int bit)
5196
{
5197
        int ret;
5198
        struct ocfs2_per_slot_free_list *fl;
5199
        struct ocfs2_cached_block_free *item;
5200
 
5201
        fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
5202
        if (fl == NULL) {
5203
                ret = -ENOMEM;
5204
                mlog_errno(ret);
5205
                goto out;
5206
        }
5207
 
5208
        item = kmalloc(sizeof(*item), GFP_NOFS);
5209
        if (item == NULL) {
5210
                ret = -ENOMEM;
5211
                mlog_errno(ret);
5212
                goto out;
5213
        }
5214
 
5215
        mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
5216
             type, slot, bit, (unsigned long long)blkno);
5217
 
5218
        item->free_blk = blkno;
5219
        item->free_bit = bit;
5220
        item->free_next = fl->f_first;
5221
 
5222
        fl->f_first = item;
5223
 
5224
        ret = 0;
5225
out:
5226
        return ret;
5227
}
5228
 
5229
static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
5230
                                         struct ocfs2_extent_block *eb)
5231
{
5232
        return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
5233
                                         le16_to_cpu(eb->h_suballoc_slot),
5234
                                         le64_to_cpu(eb->h_blkno),
5235
                                         le16_to_cpu(eb->h_suballoc_bit));
5236
}
5237
 
5238
/* This function will figure out whether the currently last extent
5239
 * block will be deleted, and if it will, what the new last extent
5240
 * block will be so we can update its h_next_leaf_blk field, as well
5241
 * as the dinode's i_last_eb_blk. */
5242
static int ocfs2_find_new_last_ext_blk(struct inode *inode,
5243
                                       unsigned int clusters_to_del,
5244
                                       struct ocfs2_path *path,
5245
                                       struct buffer_head **new_last_eb)
5246
{
5247
        int next_free, ret = 0;
5248
        u32 cpos;
5249
        struct ocfs2_extent_rec *rec;
5250
        struct ocfs2_extent_block *eb;
5251
        struct ocfs2_extent_list *el;
5252
        struct buffer_head *bh = NULL;
5253
 
5254
        *new_last_eb = NULL;
5255
 
5256
        /* we have no tree, so of course, no last_eb. */
5257
        if (!path->p_tree_depth)
5258
                goto out;
5259
 
5260
        /* trunc to zero special case - this makes tree_depth = 0
5261
         * regardless of what it is.  */
5262
        if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
5263
                goto out;
5264
 
5265
        el = path_leaf_el(path);
5266
        BUG_ON(!el->l_next_free_rec);
5267
 
5268
        /*
5269
         * Make sure that this extent list will actually be empty
5270
         * after we clear away the data. We can shortcut out if
5271
         * there's more than one non-empty extent in the
5272
         * list. Otherwise, a check of the remaining extent is
5273
         * necessary.
5274
         */
5275
        next_free = le16_to_cpu(el->l_next_free_rec);
5276
        rec = NULL;
5277
        if (ocfs2_is_empty_extent(&el->l_recs[0])) {
5278
                if (next_free > 2)
5279
                        goto out;
5280
 
5281
                /* We may have a valid extent in index 1, check it. */
5282
                if (next_free == 2)
5283
                        rec = &el->l_recs[1];
5284
 
5285
                /*
5286
                 * Fall through - no more nonempty extents, so we want
5287
                 * to delete this leaf.
5288
                 */
5289
        } else {
5290
                if (next_free > 1)
5291
                        goto out;
5292
 
5293
                rec = &el->l_recs[0];
5294
        }
5295
 
5296
        if (rec) {
5297
                /*
5298
                 * Check if we'll only be trimming off the end of this
5299
                 * cluster.
5300
                 */
5301
                if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
5302
                        goto out;
5303
        }
5304
 
5305
        ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
5306
        if (ret) {
5307
                mlog_errno(ret);
5308
                goto out;
5309
        }
5310
 
5311
        ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh);
5312
        if (ret) {
5313
                mlog_errno(ret);
5314
                goto out;
5315
        }
5316
 
5317
        eb = (struct ocfs2_extent_block *) bh->b_data;
5318
        el = &eb->h_list;
5319
        if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
5320
                OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
5321
                ret = -EROFS;
5322
                goto out;
5323
        }
5324
 
5325
        *new_last_eb = bh;
5326
        get_bh(*new_last_eb);
5327
        mlog(0, "returning block %llu, (cpos: %u)\n",
5328
             (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
5329
out:
5330
        brelse(bh);
5331
 
5332
        return ret;
5333
}
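 
/*
 * Illustrative aside, not part of alloc.c: the heart of
 * ocfs2_find_new_last_ext_blk() above is deciding whether the rightmost leaf
 * will be completely emptied by the truncate, which is what makes a new
 * "last extent block" necessary.  A userspace sketch of that decision with a
 * hypothetical simplified leaf (record lengths only, 0 meaning the empty
 * marker record):
 */
#include <stdint.h>

struct toy_leaf {
        uint16_t next_free;     /* number of records in use          */
        uint32_t clusters[8];   /* length of each record, 0 == empty */
};

/* Nonzero when trimming clusters_to_del off the tail empties this leaf. */
static int toy_leaf_will_empty(const struct toy_leaf *el,
                               uint32_t clusters_to_del)
{
        const uint32_t *rec = NULL;

        if (el->next_free && el->clusters[0] == 0) {
                if (el->next_free > 2)
                        return 0;               /* more than one real record  */
                if (el->next_free == 2)
                        rec = &el->clusters[1]; /* one real record to examine */
        } else {
                if (el->next_free > 1)
                        return 0;
                rec = &el->clusters[0];
        }

        /* Only part of the remaining record goes away: leaf stays non-empty. */
        if (rec && *rec > clusters_to_del)
                return 0;

        return 1;
}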
5334
 
5335
/*
5336
 * Trim some clusters off the rightmost edge of a tree. Only called
5337
 * during truncate.
5338
 *
5339
 * The caller needs to:
5340
 *   - start journaling of each path component.
5341
 *   - compute and fully set up any new last ext block
5342
 */
5343
static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
5344
                           handle_t *handle, struct ocfs2_truncate_context *tc,
5345
                           u32 clusters_to_del, u64 *delete_start)
5346
{
5347
        int ret, i, index = path->p_tree_depth;
5348
        u32 new_edge = 0;
5349
        u64 deleted_eb = 0;
5350
        struct buffer_head *bh;
5351
        struct ocfs2_extent_list *el;
5352
        struct ocfs2_extent_rec *rec;
5353
 
5354
        *delete_start = 0;
5355
 
5356
        while (index >= 0) {
5357
                bh = path->p_node[index].bh;
5358
                el = path->p_node[index].el;
5359
 
5360
                mlog(0, "traveling tree (index = %d, block = %llu)\n",
5361
                     index,  (unsigned long long)bh->b_blocknr);
5362
 
5363
                BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
5364
 
5365
                if (index !=
5366
                    (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
5367
                        ocfs2_error(inode->i_sb,
5368
                                    "Inode %lu has invalid ext. block %llu",
5369
                                    inode->i_ino,
5370
                                    (unsigned long long)bh->b_blocknr);
5371
                        ret = -EROFS;
5372
                        goto out;
5373
                }
5374
 
5375
find_tail_record:
5376
                i = le16_to_cpu(el->l_next_free_rec) - 1;
5377
                rec = &el->l_recs[i];
5378
 
5379
                mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
5380
                     "next = %u\n", i, le32_to_cpu(rec->e_cpos),
5381
                     ocfs2_rec_clusters(el, rec),
5382
                     (unsigned long long)le64_to_cpu(rec->e_blkno),
5383
                     le16_to_cpu(el->l_next_free_rec));
5384
 
5385
                BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
5386
 
5387
                if (le16_to_cpu(el->l_tree_depth) == 0) {
5388
                        /*
5389
                         * If the leaf block contains a single empty
5390
                         * extent and no records, we can just remove
5391
                         * the block.
5392
                         */
5393
                        if (i == 0 && ocfs2_is_empty_extent(rec)) {
5394
                                memset(rec, 0,
5395
                                       sizeof(struct ocfs2_extent_rec));
5396
                                el->l_next_free_rec = cpu_to_le16(0);
5397
 
5398
                                goto delete;
5399
                        }
5400
 
5401
                        /*
5402
                         * Remove any empty extents by shifting things
5403
                         * left. That should make life much easier on
5404
                         * the code below. This condition is rare
5405
                         * enough that we shouldn't see a performance
5406
                         * hit.
5407
                         */
5408
                        if (ocfs2_is_empty_extent(&el->l_recs[0])) {
5409
                                le16_add_cpu(&el->l_next_free_rec, -1);
5410
 
5411
                                for(i = 0;
5412
                                    i < le16_to_cpu(el->l_next_free_rec); i++)
5413
                                        el->l_recs[i] = el->l_recs[i + 1];
5414
 
5415
                                memset(&el->l_recs[i], 0,
5416
                                       sizeof(struct ocfs2_extent_rec));
5417
 
5418
                                /*
5419
                                 * We've modified our extent list. The
5420
                                 * simplest way to handle this change
5421
                                 * is to begin the search from the
5422
                                 * start again.
5423
                                 */
5424
                                goto find_tail_record;
5425
                        }
5426
 
5427
                        le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
5428
 
5429
                        /*
5430
                         * We'll use "new_edge" on our way back up the
5431
                         * tree to know what our rightmost cpos is.
5432
                         */
5433
                        new_edge = le16_to_cpu(rec->e_leaf_clusters);
5434
                        new_edge += le32_to_cpu(rec->e_cpos);
5435
 
5436
                        /*
5437
                         * The caller will use this to delete data blocks.
5438
                         */
5439
                        *delete_start = le64_to_cpu(rec->e_blkno)
5440
                                + ocfs2_clusters_to_blocks(inode->i_sb,
5441
                                        le16_to_cpu(rec->e_leaf_clusters));
5442
 
5443
                        /*
5444
                         * If it's now empty, remove this record.
5445
                         */
5446
                        if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
5447
                                memset(rec, 0,
5448
                                       sizeof(struct ocfs2_extent_rec));
5449
                                le16_add_cpu(&el->l_next_free_rec, -1);
5450
                        }
5451
                } else {
5452
                        if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
5453
                                memset(rec, 0,
5454
                                       sizeof(struct ocfs2_extent_rec));
5455
                                le16_add_cpu(&el->l_next_free_rec, -1);
5456
 
5457
                                goto delete;
5458
                        }
5459
 
5460
                        /* Can this actually happen? */
5461
                        if (le16_to_cpu(el->l_next_free_rec) == 0)
5462
                                goto delete;
5463
 
5464
                        /*
5465
                         * We never actually deleted any clusters
5466
                         * because our leaf was empty. There's no
5467
                         * reason to adjust the rightmost edge then.
5468
                         */
5469
                        if (new_edge == 0)
5470
                                goto delete;
5471
 
5472
                        rec->e_int_clusters = cpu_to_le32(new_edge);
5473
                        le32_add_cpu(&rec->e_int_clusters,
5474
                                     -le32_to_cpu(rec->e_cpos));
5475
 
5476
                         /*
5477
                          * A deleted child record should have been
5478
                          * caught above.
5479
                          */
5480
                         BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
5481
                }
5482
 
5483
delete:
5484
                ret = ocfs2_journal_dirty(handle, bh);
5485
                if (ret) {
5486
                        mlog_errno(ret);
5487
                        goto out;
5488
                }
5489
 
5490
                mlog(0, "extent list container %llu, after: record %d: "
5491
                     "(%u, %u, %llu), next = %u.\n",
5492
                     (unsigned long long)bh->b_blocknr, i,
5493
                     le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
5494
                     (unsigned long long)le64_to_cpu(rec->e_blkno),
5495
                     le16_to_cpu(el->l_next_free_rec));
5496
 
5497
                /*
5498
                 * We must be careful to only attempt delete of an
5499
                 * extent block (and not the root inode block).
5500
                 */
5501
                if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
5502
                        struct ocfs2_extent_block *eb =
5503
                                (struct ocfs2_extent_block *)bh->b_data;
5504
 
5505
                        /*
5506
                         * Save this for use when processing the
5507
                         * parent block.
5508
                         */
5509
                        deleted_eb = le64_to_cpu(eb->h_blkno);
5510
 
5511
                        mlog(0, "deleting this extent block.\n");
5512
 
5513
                        ocfs2_remove_from_cache(inode, bh);
5514
 
5515
                        BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
5516
                        BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
5517
                        BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
5518
 
5519
                        ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
5520
                        /* An error here is not fatal. */
5521
                        if (ret < 0)
5522
                                mlog_errno(ret);
5523
                } else {
5524
                        deleted_eb = 0;
5525
                }
5526
 
5527
                index--;
5528
        }
5529
 
5530
        ret = 0;
5531
out:
5532
        return ret;
5533
}
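 
/*
 * Illustrative aside, not part of alloc.c: before trimming a leaf,
 * ocfs2_trim_tree() above shifts a leading empty record out of the list so
 * that the tail record is the one to shrink.  A userspace sketch of that
 * shift with a hypothetical simplified extent list:
 */
#include <stdint.h>
#include <string.h>

struct toy_trim_rec {
        uint32_t cpos;
        uint32_t clusters;      /* 0 == the "empty" marker record */
        uint64_t blkno;
};

struct toy_extent_list {
        uint16_t next_free;
        struct toy_trim_rec recs[8];
};

/* Drop a leading empty record by shifting the remaining records left. */
static void toy_strip_leading_empty(struct toy_extent_list *el)
{
        int i;

        if (!el->next_free || el->recs[0].clusters != 0)
                return;

        el->next_free--;
        for (i = 0; i < el->next_free; i++)
                el->recs[i] = el->recs[i + 1];
        memset(&el->recs[i], 0, sizeof(el->recs[i]));
}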
5534
 
5535
static int ocfs2_do_truncate(struct ocfs2_super *osb,
                             unsigned int clusters_to_del,
                             struct inode *inode,
                             struct buffer_head *fe_bh,
                             handle_t *handle,
                             struct ocfs2_truncate_context *tc,
                             struct ocfs2_path *path)
{
        int status;
        struct ocfs2_dinode *fe;
        struct ocfs2_extent_block *last_eb = NULL;
        struct ocfs2_extent_list *el;
        struct buffer_head *last_eb_bh = NULL;
        u64 delete_blk = 0;

        fe = (struct ocfs2_dinode *) fe_bh->b_data;

        status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
                                             path, &last_eb_bh);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }

        /*
         * Each component will be touched, so we might as well journal
         * here to avoid having to handle errors later.
         */
        status = ocfs2_journal_access_path(inode, handle, path);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }

        if (last_eb_bh) {
                status = ocfs2_journal_access(handle, inode, last_eb_bh,
                                              OCFS2_JOURNAL_ACCESS_WRITE);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
                }

                last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
        }

        el = &(fe->id2.i_list);

        /*
         * Lower levels depend on this never happening, but it's best
         * to check it up here before changing the tree.
         */
        if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
                ocfs2_error(inode->i_sb,
                            "Inode %lu has an empty extent record, depth %u\n",
                            inode->i_ino, le16_to_cpu(el->l_tree_depth));
                status = -EROFS;
                goto bail;
        }

        spin_lock(&OCFS2_I(inode)->ip_lock);
        OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
                                      clusters_to_del;
        spin_unlock(&OCFS2_I(inode)->ip_lock);
        le32_add_cpu(&fe->i_clusters, -clusters_to_del);
        inode->i_blocks = ocfs2_inode_sector_count(inode);

        status = ocfs2_trim_tree(inode, path, handle, tc,
                                 clusters_to_del, &delete_blk);
        if (status) {
                mlog_errno(status);
                goto bail;
        }

        if (le32_to_cpu(fe->i_clusters) == 0) {
                /* trunc to zero is a special case. */
                el->l_tree_depth = 0;
                fe->i_last_eb_blk = 0;
        } else if (last_eb)
                fe->i_last_eb_blk = last_eb->h_blkno;

        status = ocfs2_journal_dirty(handle, fe_bh);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }

        if (last_eb) {
                /* If there will be a new last extent block, then by
                 * definition, there cannot be any leaves to the right
                 * of it. */
                last_eb->h_next_leaf_blk = 0;
                status = ocfs2_journal_dirty(handle, last_eb_bh);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
                }
        }

        if (delete_blk) {
                status = ocfs2_truncate_log_append(osb, handle, delete_blk,
                                                   clusters_to_del);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
                }
        }
        status = 0;
bail:

        mlog_exit(status);
        return status;
}

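/*
 * Per-buffer callbacks handed to walk_page_buffers() by
 * ocfs2_map_and_dirty_page() below. The ordered variant also adds
 * the buffer to the journal's ordered-data list via
 * ocfs2_journal_dirty_data().
 */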
static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh)
{
        set_buffer_uptodate(bh);
        mark_buffer_dirty(bh);
        return 0;
}

static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
{
        set_buffer_uptodate(bh);
        mark_buffer_dirty(bh);
        return ocfs2_journal_dirty_data(handle, bh);
}

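/*
 * Map the buffers backing 'page' to the physical block in *phys,
 * optionally zero the byte range [from, to), and dirty the affected
 * buffers using the callback appropriate for the inode's data
 * journaling mode.
 */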
static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
                                     unsigned int from, unsigned int to,
                                     struct page *page, int zero, u64 *phys)
{
        int ret, partial = 0;

        ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0);
        if (ret)
                mlog_errno(ret);

        if (zero)
                zero_user_page(page, from, to - from, KM_USER0);

        /*
         * Need to set the buffers we zero'd into uptodate
         * here if they aren't - ocfs2_map_page_blocks()
         * might've skipped some
         */
        if (ocfs2_should_order_data(inode)) {
                ret = walk_page_buffers(handle,
                                        page_buffers(page),
                                        from, to, &partial,
                                        ocfs2_ordered_zero_func);
                if (ret < 0)
                        mlog_errno(ret);
        } else {
                ret = walk_page_buffers(handle, page_buffers(page),
                                        from, to, &partial,
                                        ocfs2_writeback_zero_func);
                if (ret < 0)
                        mlog_errno(ret);
        }

        if (!partial)
                SetPageUptodate(page);

        flush_dcache_page(page);
}

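/*
 * Zero and dirty the byte range [start, end) across the pages in
 * 'pages', all of which lie within the single cluster starting at
 * physical block 'phys'. The pages are unlocked and released before
 * returning.
 */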
static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
                                     loff_t end, struct page **pages,
                                     int numpages, u64 phys, handle_t *handle)
{
        int i;
        struct page *page;
        unsigned int from, to = PAGE_CACHE_SIZE;
        struct super_block *sb = inode->i_sb;

        BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));

        if (numpages == 0)
                goto out;

        to = PAGE_CACHE_SIZE;
        for(i = 0; i < numpages; i++) {
                page = pages[i];

                from = start & (PAGE_CACHE_SIZE - 1);
                if ((end >> PAGE_CACHE_SHIFT) == page->index)
                        to = end & (PAGE_CACHE_SIZE - 1);

                BUG_ON(from > PAGE_CACHE_SIZE);
                BUG_ON(to > PAGE_CACHE_SIZE);

                ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1,
                                         &phys);

                start = (page->index + 1) << PAGE_CACHE_SHIFT;
        }
out:
        if (pages)
                ocfs2_unlock_and_free_pages(pages, numpages);
}

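/*
 * Grab and lock the page cache pages covering the byte range
 * [start, end), which must fall within a single cluster. The number
 * of pages grabbed is returned in *num; on error, all grabbed pages
 * are released.
 */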
static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
                                struct page **pages, int *num)
{
        int numpages, ret = 0;
        struct super_block *sb = inode->i_sb;
        struct address_space *mapping = inode->i_mapping;
        unsigned long index;
        loff_t last_page_bytes;

        BUG_ON(start > end);

        BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
               (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);

        numpages = 0;
        last_page_bytes = PAGE_ALIGN(end);
        index = start >> PAGE_CACHE_SHIFT;
        do {
                pages[numpages] = grab_cache_page(mapping, index);
                if (!pages[numpages]) {
                        ret = -ENOMEM;
                        mlog_errno(ret);
                        goto out;
                }

                numpages++;
                index++;
        } while (index < (last_page_bytes >> PAGE_CACHE_SHIFT));

out:
        if (ret != 0) {
                if (pages)
                        ocfs2_unlock_and_free_pages(pages, numpages);
                numpages = 0;
        }

        *num = numpages;

        return ret;
}

/*
 * Zero the area past i_size but still within an allocated
 * cluster. This avoids exposing nonzero data on subsequent file
 * extends.
 *
 * We need to call this before i_size is updated on the inode because
 * otherwise block_write_full_page() will skip writeout of pages past
 * i_size. The range to zero is passed in via range_start and
 * range_end for this reason.
 */
int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
                                  u64 range_start, u64 range_end)
{
        int ret = 0, numpages;
        struct page **pages = NULL;
        u64 phys;
        unsigned int ext_flags;
        struct super_block *sb = inode->i_sb;

        /*
         * File systems which don't support sparse files zero on every
         * extend.
         */
        if (!ocfs2_sparse_alloc(OCFS2_SB(sb)))
                return 0;

        pages = kcalloc(ocfs2_pages_per_cluster(sb),
                        sizeof(struct page *), GFP_NOFS);
        if (pages == NULL) {
                ret = -ENOMEM;
                mlog_errno(ret);
                goto out;
        }

        if (range_start == range_end)
                goto out;

        ret = ocfs2_extent_map_get_blocks(inode,
                                          range_start >> sb->s_blocksize_bits,
                                          &phys, NULL, &ext_flags);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }

        /*
         * Tail is a hole, or is marked unwritten. In either case, we
         * can count on read and write to return/push zeros.
         */
        if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN)
                goto out;

        ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
                                   &numpages);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }

        ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
                                 numpages, phys, handle);

        /*
         * Initiate writeout of the pages we zero'd here. We don't
         * wait on them - the truncate_inode_pages() call later will
         * do that for us.
         */
        ret = do_sync_mapping_range(inode->i_mapping, range_start,
                                    range_end - 1, SYNC_FILE_RANGE_WRITE);
        if (ret)
                mlog_errno(ret);

out:
        if (pages)
                kfree(pages);

        return ret;
}

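/*
 * Clear the id2 union at the tail of the dinode - everything from
 * the id2 offset to the end of the inode block.
 */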
static void ocfs2_zero_dinode_id2(struct inode *inode, struct ocfs2_dinode *di)
{
        unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits;

        memset(&di->id2, 0, blocksize - offsetof(struct ocfs2_dinode, id2));
}

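/*
 * Reinitialize the dinode's id2 area as an empty, zero-depth extent
 * list sized to the space available in the inode block.
 */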
void ocfs2_dinode_new_extent_list(struct inode *inode,
                                  struct ocfs2_dinode *di)
{
        ocfs2_zero_dinode_id2(inode, di);
        di->id2.i_list.l_tree_depth = 0;
        di->id2.i_list.l_next_free_rec = 0;
        di->id2.i_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(inode->i_sb));
}

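/*
 * Mark the inode as inline-data, both in the in-memory dynamic
 * feature flags and on disk, and reinitialize id2 as an inline data
 * area of the maximum size the inode block allows.
 */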
void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
{
        struct ocfs2_inode_info *oi = OCFS2_I(inode);
        struct ocfs2_inline_data *idata = &di->id2.i_data;

        spin_lock(&oi->ip_lock);
        oi->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
        di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
        spin_unlock(&oi->ip_lock);

        /*
         * We clear the entire i_data structure here so that all
         * fields can be properly initialized.
         */
        ocfs2_zero_dinode_id2(inode, di);

        idata->id_count = cpu_to_le16(ocfs2_max_inline_data(inode->i_sb));
}

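/*
 * Switch an inline-data inode over to an extent list. If the file
 * has data, a cluster is reserved and claimed, the inline bytes are
 * copied out through the page cache, the new extent list is
 * initialized and a single extent pointing at the new cluster is
 * inserted.
 */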
int ocfs2_convert_inline_data_to_extents(struct inode *inode,
                                         struct buffer_head *di_bh)
{
        int ret, i, has_data, num_pages = 0;
        handle_t *handle;
        u64 uninitialized_var(block);
        struct ocfs2_inode_info *oi = OCFS2_I(inode);
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
        struct ocfs2_alloc_context *data_ac = NULL;
        struct page **pages = NULL;
        loff_t end = osb->s_clustersize;

        has_data = i_size_read(inode) ? 1 : 0;

        if (has_data) {
                pages = kcalloc(ocfs2_pages_per_cluster(osb->sb),
                                sizeof(struct page *), GFP_NOFS);
                if (pages == NULL) {
                        ret = -ENOMEM;
                        mlog_errno(ret);
                        goto out;
                }

                ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }
        }

        handle = ocfs2_start_trans(osb, OCFS2_INLINE_TO_EXTENTS_CREDITS);
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
                mlog_errno(ret);
                goto out_unlock;
        }

        ret = ocfs2_journal_access(handle, inode, di_bh,
                                   OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
                goto out_commit;
        }

        if (has_data) {
                u32 bit_off, num;
                unsigned int page_end;
                u64 phys;

                ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
                                           &num);
                if (ret) {
                        mlog_errno(ret);
                        goto out_commit;
                }

                /*
                 * Save two copies, one for insert, and one that can
                 * be changed by ocfs2_map_and_dirty_page() below.
                 */
                block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);

                /*
                 * Non sparse file systems zero on extend, so no need
                 * to do that now.
                 */
                if (!ocfs2_sparse_alloc(osb) &&
                    PAGE_CACHE_SIZE < osb->s_clustersize)
                        end = PAGE_CACHE_SIZE;

                ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
                if (ret) {
                        mlog_errno(ret);
                        goto out_commit;
                }

                /*
                 * This should populate the 1st page for us and mark
                 * it up to date.
                 */
                ret = ocfs2_read_inline_data(inode, pages[0], di_bh);
                if (ret) {
                        mlog_errno(ret);
                        goto out_commit;
                }

                page_end = PAGE_CACHE_SIZE;
                if (PAGE_CACHE_SIZE > osb->s_clustersize)
                        page_end = osb->s_clustersize;

                for (i = 0; i < num_pages; i++)
                        ocfs2_map_and_dirty_page(inode, handle, 0, page_end,
                                                 pages[i], i > 0, &phys);
        }

        spin_lock(&oi->ip_lock);
        oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
        di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
        spin_unlock(&oi->ip_lock);

        ocfs2_dinode_new_extent_list(inode, di);

        ocfs2_journal_dirty(handle, di_bh);

        if (has_data) {
                /*
                 * An error at this point should be extremely rare. If
                 * this proves to be false, we could always re-build
                 * the in-inode data from our pages.
                 */
                ret = ocfs2_insert_extent(osb, handle, inode, di_bh,
                                          0, block, 1, 0, NULL);
                if (ret) {
                        mlog_errno(ret);
                        goto out_commit;
                }

                inode->i_blocks = ocfs2_inode_sector_count(inode);
        }

out_commit:
        ocfs2_commit_trans(osb, handle);

out_unlock:
        if (data_ac)
                ocfs2_free_alloc_context(data_ac);

out:
        if (pages) {
                ocfs2_unlock_and_free_pages(pages, num_pages);
                kfree(pages);
        }

        return ret;
}

/*
 * It is expected that by the time you call this function,
 * inode->i_size and fe->i_size have been adjusted.
 *
 * WARNING: This will kfree the truncate context
 */
int ocfs2_commit_truncate(struct ocfs2_super *osb,
                          struct inode *inode,
                          struct buffer_head *fe_bh,
                          struct ocfs2_truncate_context *tc)
{
        int status, i, credits, tl_sem = 0;
        u32 clusters_to_del, new_highest_cpos, range;
        struct ocfs2_extent_list *el;
        handle_t *handle = NULL;
        struct inode *tl_inode = osb->osb_tl_inode;
        struct ocfs2_path *path = NULL;

        mlog_entry_void();

        new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
                                                     i_size_read(inode));

        path = ocfs2_new_inode_path(fe_bh);
        if (!path) {
                status = -ENOMEM;
                mlog_errno(status);
                goto bail;
        }

        ocfs2_extent_map_trunc(inode, new_highest_cpos);

start:
        /*
         * Check that we still have allocation to delete.
         */
        if (OCFS2_I(inode)->ip_clusters == 0) {
                status = 0;
                goto bail;
        }

        /*
         * Truncate always works against the rightmost tree branch.
         */
        status = ocfs2_find_path(inode, path, UINT_MAX);
        if (status) {
                mlog_errno(status);
                goto bail;
        }

        mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n",
             OCFS2_I(inode)->ip_clusters, path->p_tree_depth);

        /*
         * By now, el will point to the extent list on the bottom most
         * portion of this tree. Only the tail record is considered in
         * each pass.
         *
         * We handle the following cases, in order:
         * - empty extent: delete the remaining branch
         * - remove the entire record
         * - remove a partial record
         * - no record needs to be removed (truncate has completed)
         */
        el = path_leaf_el(path);
        if (le16_to_cpu(el->l_next_free_rec) == 0) {
                ocfs2_error(inode->i_sb,
                            "Inode %llu has empty extent block at %llu\n",
                            (unsigned long long)OCFS2_I(inode)->ip_blkno,
                            (unsigned long long)path_leaf_bh(path)->b_blocknr);
                status = -EROFS;
                goto bail;
        }

        i = le16_to_cpu(el->l_next_free_rec) - 1;
        range = le32_to_cpu(el->l_recs[i].e_cpos) +
                ocfs2_rec_clusters(el, &el->l_recs[i]);
        if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
                clusters_to_del = 0;
        } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
                clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
        } else if (range > new_highest_cpos) {
                clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
                                   le32_to_cpu(el->l_recs[i].e_cpos)) -
                                  new_highest_cpos;
        } else {
                status = 0;
                goto bail;
        }

        mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
             clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);

        mutex_lock(&tl_inode->i_mutex);
        tl_sem = 1;
        /* ocfs2_truncate_log_needs_flush guarantees us at least one
         * record is free for use. If there isn't any, we flush to get
         * an empty truncate log.  */
        if (ocfs2_truncate_log_needs_flush(osb)) {
                status = __ocfs2_flush_truncate_log(osb);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
                }
        }

        credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
                                                (struct ocfs2_dinode *)fe_bh->b_data,
                                                el);
        handle = ocfs2_start_trans(osb, credits);
        if (IS_ERR(handle)) {
                status = PTR_ERR(handle);
                handle = NULL;
                mlog_errno(status);
                goto bail;
        }

        status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
                                   tc, path);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }

        mutex_unlock(&tl_inode->i_mutex);
        tl_sem = 0;

        ocfs2_commit_trans(osb, handle);
        handle = NULL;

        ocfs2_reinit_path(path, 1);

        /*
         * The check above will catch the case where we've truncated
         * away all allocation.
         */
        goto start;

bail:

        ocfs2_schedule_truncate_log_flush(osb, 1);

        if (tl_sem)
                mutex_unlock(&tl_inode->i_mutex);

        if (handle)
                ocfs2_commit_trans(osb, handle);

        ocfs2_run_deallocs(osb, &tc->tc_dealloc);

        ocfs2_free_path(path);

        /* This will drop the ext_alloc cluster lock for us */
        ocfs2_free_truncate_context(tc);

        mlog_exit(status);
        return status;
}

/*
 * Expects the inode to already be locked.
 */
int ocfs2_prepare_truncate(struct ocfs2_super *osb,
                           struct inode *inode,
                           struct buffer_head *fe_bh,
                           struct ocfs2_truncate_context **tc)
{
        int status;
        unsigned int new_i_clusters;
        struct ocfs2_dinode *fe;
        struct ocfs2_extent_block *eb;
        struct buffer_head *last_eb_bh = NULL;

        mlog_entry_void();

        *tc = NULL;

        new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
                                                  i_size_read(inode));
        fe = (struct ocfs2_dinode *) fe_bh->b_data;

        mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size = "
             "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
             (unsigned long long)le64_to_cpu(fe->i_size));

        *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
        if (!(*tc)) {
                status = -ENOMEM;
                mlog_errno(status);
                goto bail;
        }
        ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);

        if (fe->id2.i_list.l_tree_depth) {
                status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
                                          &last_eb_bh, OCFS2_BH_CACHED, inode);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
                }
                eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
                if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
                        OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);

                        brelse(last_eb_bh);
                        status = -EIO;
                        goto bail;
                }
        }

        (*tc)->tc_last_eb_bh = last_eb_bh;

        status = 0;
bail:
        if (status < 0) {
                if (*tc)
                        ocfs2_free_truncate_context(*tc);
                *tc = NULL;
        }
        mlog_exit_void();
        return status;
}

/*
 * 'start' is inclusive, 'end' is not.
 */
int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
                          unsigned int start, unsigned int end, int trunc)
{
        int ret;
        unsigned int numbytes;
        handle_t *handle;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
        struct ocfs2_inline_data *idata = &di->id2.i_data;

        if (end > i_size_read(inode))
                end = i_size_read(inode);

        BUG_ON(start >= end);

        if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
            !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
            !ocfs2_supports_inline_data(osb)) {
                ocfs2_error(inode->i_sb,
                            "Inline data flags for inode %llu don't agree! "
                            "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
                            (unsigned long long)OCFS2_I(inode)->ip_blkno,
                            le16_to_cpu(di->i_dyn_features),
                            OCFS2_I(inode)->ip_dyn_features,
                            osb->s_feature_incompat);
                ret = -EROFS;
                goto out;
        }

        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
                mlog_errno(ret);
                goto out;
        }

        ret = ocfs2_journal_access(handle, inode, di_bh,
                                   OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
                goto out_commit;
        }

        numbytes = end - start;
        memset(idata->id_data + start, 0, numbytes);

        /*
         * No need to worry about the data page here - it's been
         * truncated already and inline data doesn't need it for
         * pushing zeros to disk, so we'll let readpage pick it up
         * later.
         */
        if (trunc) {
                i_size_write(inode, start);
                di->i_size = cpu_to_le64(start);
        }

        inode->i_blocks = ocfs2_inode_sector_count(inode);
        inode->i_ctime = inode->i_mtime = CURRENT_TIME;

        di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
        di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);

        ocfs2_journal_dirty(handle, di_bh);

out_commit:
        ocfs2_commit_trans(osb, handle);

out:
        return ret;
}

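/*
 * Release the resources held by a truncate context. Deallocation
 * carried in tc_dealloc is expected to have been run already.
 */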
static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
{
        /*
         * The caller is responsible for completing deallocation
         * before freeing the context.
         */
        if (tc->tc_dealloc.c_first_suballocator != NULL)
                mlog(ML_NOTICE,
                     "Truncate completion has non-empty dealloc context\n");

        if (tc->tc_last_eb_bh)
                brelse(tc->tc_last_eb_bh);

        kfree(tc);
}
