/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_bit.h"
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_imap.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dir2.h"
#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
#include "xfs_dir2_sf.h"
#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_buf_item.h"
#include "xfs_inode_item.h"
#include "xfs_btree.h"
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_bmap.h"
#include "xfs_rw.h"
#include "xfs_error.h"
#include "xfs_utils.h"
#include "xfs_dir2_trace.h"
#include "xfs_quota.h"
#include "xfs_acl.h"
#include "xfs_filestream.h"
#include "xfs_vnodeops.h"

kmem_zone_t *xfs_ifork_zone;
kmem_zone_t *xfs_inode_zone;
kmem_zone_t *xfs_icluster_zone;

/*
 * Used in xfs_itruncate().  This is the maximum number of extents
 * freed from a file in a single transaction.
 */
#define XFS_ITRUNC_MAX_EXTENTS  2

STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *);
STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);

#ifdef DEBUG
/*
 * Make sure that the extents in the given memory buffer
 * are valid.
 */
STATIC void
xfs_validate_extents(
        xfs_ifork_t             *ifp,
        int                     nrecs,
        xfs_exntfmt_t           fmt)
{
        xfs_bmbt_irec_t         irec;
        xfs_bmbt_rec_host_t     rec;
        int                     i;

        for (i = 0; i < nrecs; i++) {
                xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
                rec.l0 = get_unaligned(&ep->l0);
                rec.l1 = get_unaligned(&ep->l1);
                xfs_bmbt_get_all(&rec, &irec);
                if (fmt == XFS_EXTFMT_NOSTATE)
                        ASSERT(irec.br_state == XFS_EXT_NORM);
        }
}
#else /* DEBUG */
#define xfs_validate_extents(ifp, nrecs, fmt)
#endif /* DEBUG */

/*
 * Check that none of the inodes in the buffer have a next
 * unlinked field of 0.
 */
#if defined(DEBUG)
void
xfs_inobp_check(
        xfs_mount_t     *mp,
        xfs_buf_t       *bp)
{
        int             i;
        int             j;
        xfs_dinode_t    *dip;

        j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;

        for (i = 0; i < j; i++) {
                dip = (xfs_dinode_t *)xfs_buf_offset(bp,
                                        i * mp->m_sb.sb_inodesize);
                if (!dip->di_next_unlinked)  {
                        xfs_fs_cmn_err(CE_ALERT, mp,
                                "Detected a bogus zero next_unlinked field in incore inode buffer 0x%p.  About to pop an ASSERT.",
                                bp);
                        ASSERT(dip->di_next_unlinked);
                }
        }
}
#endif
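
/*
 * For illustration (values are examples, set per filesystem): with an
 * 8K inode cluster (m_inode_cluster_size = 8192) and 256-byte inodes
 * (sb_inodelog = 8, sb_inodesize = 256), the check above walks
 * j = 8192 >> 8 = 32 on-disk inodes in the buffer.
 */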

/*
 * This routine is called to map an inode number within a file
 * system to the buffer containing the on-disk version of the
 * inode.  It returns a pointer to the buffer containing the
 * on-disk inode in the bpp parameter, and in the dipp parameter
 * it returns a pointer to the on-disk inode within that buffer.
 *
 * If a non-zero error is returned, then the contents of bpp and
 * dipp are undefined.
 *
 * Use xfs_imap() to determine the size and location of the
 * buffer to read from disk.
 */
STATIC int
xfs_inotobp(
        xfs_mount_t     *mp,
        xfs_trans_t     *tp,
        xfs_ino_t       ino,
        xfs_dinode_t    **dipp,
        xfs_buf_t       **bpp,
        int             *offset)
{
        int             di_ok;
        xfs_imap_t      imap;
        xfs_buf_t       *bp;
        int             error;
        xfs_dinode_t    *dip;

        /*
         * Call the space management code to find the location of the
         * inode on disk.
         */
        imap.im_blkno = 0;
        error = xfs_imap(mp, tp, ino, &imap, XFS_IMAP_LOOKUP);
        if (error != 0) {
                cmn_err(CE_WARN,
        "xfs_inotobp: xfs_imap()  returned an "
        "error %d on %s.  Returning error.", error, mp->m_fsname);
                return error;
        }

        /*
         * If the inode number maps to a block outside the bounds of the
         * file system then return NULL rather than calling read_buf
         * and panicking when we get an error from the driver.
         */
        if ((imap.im_blkno + imap.im_len) >
            XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
                cmn_err(CE_WARN,
        "xfs_inotobp: inode number (%llu + %d) maps to a block outside the bounds "
        "of the file system %s.  Returning EINVAL.",
                        (unsigned long long)imap.im_blkno,
                        imap.im_len, mp->m_fsname);
                return XFS_ERROR(EINVAL);
        }

        /*
         * Read in the buffer.  If tp is NULL, xfs_trans_read_buf() will
         * default to just a read_buf() call.
         */
        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno,
                                   (int)imap.im_len, XFS_BUF_LOCK, &bp);

        if (error) {
                cmn_err(CE_WARN,
        "xfs_inotobp: xfs_trans_read_buf()  returned an "
        "error %d on %s.  Returning error.", error, mp->m_fsname);
                return error;
        }
        dip = (xfs_dinode_t *)xfs_buf_offset(bp, 0);
        di_ok =
                be16_to_cpu(dip->di_core.di_magic) == XFS_DINODE_MAGIC &&
                XFS_DINODE_GOOD_VERSION(dip->di_core.di_version);
        if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP,
                        XFS_RANDOM_ITOBP_INOTOBP))) {
                XFS_CORRUPTION_ERROR("xfs_inotobp", XFS_ERRLEVEL_LOW, mp, dip);
                xfs_trans_brelse(tp, bp);
                cmn_err(CE_WARN,
        "xfs_inotobp: XFS_TEST_ERROR()  returned an "
        "error on %s.  Returning EFSCORRUPTED.",  mp->m_fsname);
                return XFS_ERROR(EFSCORRUPTED);
        }

        xfs_inobp_check(mp, bp);

        /*
         * Set *dipp to point to the on-disk inode in the buffer.
         */
        *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
        *bpp = bp;
        *offset = imap.im_boffset;
        return 0;
}
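
/*
 * Typical use of xfs_inotobp() (sketch only; error handling and
 * surrounding context elided): resolve a raw inode number to its
 * on-disk image when no in-core inode is at hand, for example:
 *
 *      xfs_dinode_t    *dip;
 *      xfs_buf_t       *bp;
 *      int             offset;
 *
 *      error = xfs_inotobp(mp, tp, ino, &dip, &bp, &offset);
 *      if (!error) {
 *              ...examine or modify *dip...
 *              xfs_trans_brelse(tp, bp);
 *      }
 */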


/*
 * This routine is called to map an inode to the buffer containing
 * the on-disk version of the inode.  It returns a pointer to the
 * buffer containing the on-disk inode in the bpp parameter, and in
 * the dipp parameter it returns a pointer to the on-disk inode within
 * that buffer.
 *
 * If a non-zero error is returned, then the contents of bpp and
 * dipp are undefined.
 *
 * If the inode is new and has not yet been initialized, use xfs_imap()
 * to determine the size and location of the buffer to read from disk.
 * If the inode has already been mapped to its buffer and read in once,
 * then use the mapping information stored in the inode rather than
 * calling xfs_imap().  This allows us to avoid the overhead of looking
 * at the inode btree for small block file systems (see xfs_dilocate()).
 * We can tell whether the inode has been mapped in before by comparing
 * its disk block address to 0.  Only uninitialized inodes will have
 * 0 for the disk block address.
 */
int
xfs_itobp(
        xfs_mount_t     *mp,
        xfs_trans_t     *tp,
        xfs_inode_t     *ip,
        xfs_dinode_t    **dipp,
        xfs_buf_t       **bpp,
        xfs_daddr_t     bno,
        uint            imap_flags)
{
        xfs_imap_t      imap;
        xfs_buf_t       *bp;
        int             error;
        int             i;
        int             ni;

        if (ip->i_blkno == (xfs_daddr_t)0) {
                /*
                 * Call the space management code to find the location of the
                 * inode on disk.
                 */
                imap.im_blkno = bno;
                if ((error = xfs_imap(mp, tp, ip->i_ino, &imap,
                                        XFS_IMAP_LOOKUP | imap_flags)))
                        return error;

                /*
                 * If the inode number maps to a block outside the bounds
                 * of the file system then return NULL rather than calling
                 * read_buf and panicking when we get an error from the
                 * driver.
                 */
                if ((imap.im_blkno + imap.im_len) >
                    XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
#ifdef DEBUG
                        xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: "
                                        "(imap.im_blkno (0x%llx) "
                                        "+ imap.im_len (0x%llx)) > "
                                        " XFS_FSB_TO_BB(mp, "
                                        "mp->m_sb.sb_dblocks) (0x%llx)",
                                        (unsigned long long) imap.im_blkno,
                                        (unsigned long long) imap.im_len,
                                        XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
#endif /* DEBUG */
                        return XFS_ERROR(EINVAL);
                }

                /*
                 * Fill in the fields in the inode that will be used to
                 * map the inode to its buffer from now on.
                 */
                ip->i_blkno = imap.im_blkno;
                ip->i_len = imap.im_len;
                ip->i_boffset = imap.im_boffset;
        } else {
                /*
                 * We've already mapped the inode once, so just use the
                 * mapping that we saved the first time.
                 */
                imap.im_blkno = ip->i_blkno;
                imap.im_len = ip->i_len;
                imap.im_boffset = ip->i_boffset;
        }
        ASSERT(bno == 0 || bno == imap.im_blkno);

        /*
         * Read in the buffer.  If tp is NULL, xfs_trans_read_buf() will
         * default to just a read_buf() call.
         */
        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno,
                                   (int)imap.im_len, XFS_BUF_LOCK, &bp);
        if (error) {
#ifdef DEBUG
                xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: "
                                "xfs_trans_read_buf() returned error %d, "
                                "imap.im_blkno 0x%llx, imap.im_len 0x%llx",
                                error, (unsigned long long) imap.im_blkno,
                                (unsigned long long) imap.im_len);
#endif /* DEBUG */
                return error;
        }

        /*
         * Validate the magic number and version of every inode in the buffer
         * (if DEBUG kernel) or the first inode in the buffer, otherwise.
         * No validation is done here in userspace (xfs_repair).
         */
#if !defined(__KERNEL__)
        ni = 0;
#elif defined(DEBUG)
        ni = BBTOB(imap.im_len) >> mp->m_sb.sb_inodelog;
#else   /* usual case */
        ni = 1;
#endif

        for (i = 0; i < ni; i++) {
                int             di_ok;
                xfs_dinode_t    *dip;

                dip = (xfs_dinode_t *)xfs_buf_offset(bp,
                                        (i << mp->m_sb.sb_inodelog));
                di_ok = be16_to_cpu(dip->di_core.di_magic) == XFS_DINODE_MAGIC &&
                            XFS_DINODE_GOOD_VERSION(dip->di_core.di_version);
                if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
                                                XFS_ERRTAG_ITOBP_INOTOBP,
                                                XFS_RANDOM_ITOBP_INOTOBP))) {
                        if (imap_flags & XFS_IMAP_BULKSTAT) {
                                xfs_trans_brelse(tp, bp);
                                return XFS_ERROR(EINVAL);
                        }
#ifdef DEBUG
                        cmn_err(CE_ALERT,
                                        "Device %s - bad inode magic/vsn "
                                        "daddr %lld #%d (magic=%x)",
                                XFS_BUFTARG_NAME(mp->m_ddev_targp),
                                (unsigned long long)imap.im_blkno, i,
                                be16_to_cpu(dip->di_core.di_magic));
#endif
                        XFS_CORRUPTION_ERROR("xfs_itobp", XFS_ERRLEVEL_HIGH,
                                             mp, dip);
                        xfs_trans_brelse(tp, bp);
                        return XFS_ERROR(EFSCORRUPTED);
                }
        }

        xfs_inobp_check(mp, bp);

        /*
         * Mark the buffer as an inode buffer now that it looks good
         */
        XFS_BUF_SET_VTYPE(bp, B_FS_INO);

        /*
         * Set *dipp to point to the on-disk inode in the buffer.
         */
        *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
        *bpp = bp;
        return 0;
}
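
/*
 * For illustration (sizes are examples): in a DEBUG kernel, a mapped
 * buffer of 16 basic blocks (BBTOB(16) = 8192 bytes) with 256-byte
 * inodes (sb_inodelog = 8) has ni = 8192 >> 8 = 32 inodes validated
 * above; a production kernel checks only the first inode in the
 * buffer.
 */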

/*
 * Move inode type and inode format specific information from the
 * on-disk inode to the in-core inode.  For fifos, devs, and sockets
 * this means set if_rdev to the proper value.  For files, directories,
 * and symlinks this means to bring in the in-line data or extent
 * pointers.  For a file in B-tree format, only the root is immediately
 * brought in-core.  The rest will be in-lined in if_extents when it
 * is first referenced (see xfs_iread_extents()).
 */
STATIC int
xfs_iformat(
        xfs_inode_t             *ip,
        xfs_dinode_t            *dip)
{
        xfs_attr_shortform_t    *atp;
        int                     size;
        int                     error;
        xfs_fsize_t             di_size;
        ip->i_df.if_ext_max =
                XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
        error = 0;

        if (unlikely(be32_to_cpu(dip->di_core.di_nextents) +
                     be16_to_cpu(dip->di_core.di_anextents) >
                     be64_to_cpu(dip->di_core.di_nblocks))) {
                xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
                        "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
                        (unsigned long long)ip->i_ino,
                        (int)(be32_to_cpu(dip->di_core.di_nextents) +
                              be16_to_cpu(dip->di_core.di_anextents)),
                        (unsigned long long)
                                be64_to_cpu(dip->di_core.di_nblocks));
                XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
                                     ip->i_mount, dip);
                return XFS_ERROR(EFSCORRUPTED);
        }

        if (unlikely(dip->di_core.di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
                xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
                        "corrupt dinode %Lu, forkoff = 0x%x.",
                        (unsigned long long)ip->i_ino,
                        dip->di_core.di_forkoff);
                XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
                                     ip->i_mount, dip);
                return XFS_ERROR(EFSCORRUPTED);
        }

        switch (ip->i_d.di_mode & S_IFMT) {
        case S_IFIFO:
        case S_IFCHR:
        case S_IFBLK:
        case S_IFSOCK:
                if (unlikely(dip->di_core.di_format != XFS_DINODE_FMT_DEV)) {
                        XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
                                              ip->i_mount, dip);
                        return XFS_ERROR(EFSCORRUPTED);
                }
                ip->i_d.di_size = 0;
                ip->i_size = 0;
                ip->i_df.if_u2.if_rdev = be32_to_cpu(dip->di_u.di_dev);
                break;

        case S_IFREG:
        case S_IFLNK:
        case S_IFDIR:
                switch (dip->di_core.di_format) {
                case XFS_DINODE_FMT_LOCAL:
                        /*
                         * no local regular files yet
                         */
                        if (unlikely((be16_to_cpu(dip->di_core.di_mode) & S_IFMT) == S_IFREG)) {
                                xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
                                        "corrupt inode %Lu "
                                        "(local format for regular file).",
                                        (unsigned long long) ip->i_ino);
                                XFS_CORRUPTION_ERROR("xfs_iformat(4)",
                                                     XFS_ERRLEVEL_LOW,
                                                     ip->i_mount, dip);
                                return XFS_ERROR(EFSCORRUPTED);
                        }

                        di_size = be64_to_cpu(dip->di_core.di_size);
                        if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
                                xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
                                        "corrupt inode %Lu "
                                        "(bad size %Ld for local inode).",
                                        (unsigned long long) ip->i_ino,
                                        (long long) di_size);
                                XFS_CORRUPTION_ERROR("xfs_iformat(5)",
                                                     XFS_ERRLEVEL_LOW,
                                                     ip->i_mount, dip);
                                return XFS_ERROR(EFSCORRUPTED);
                        }

                        size = (int)di_size;
                        error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size);
                        break;
                case XFS_DINODE_FMT_EXTENTS:
                        error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
                        break;
                case XFS_DINODE_FMT_BTREE:
                        error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
                        break;
                default:
                        XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW,
                                         ip->i_mount);
                        return XFS_ERROR(EFSCORRUPTED);
                }
                break;

        default:
                XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);
                return XFS_ERROR(EFSCORRUPTED);
        }
        if (error) {
                return error;
        }
        if (!XFS_DFORK_Q(dip))
                return 0;
        ASSERT(ip->i_afp == NULL);
        ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
        ip->i_afp->if_ext_max =
                XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
        switch (dip->di_core.di_aformat) {
        case XFS_DINODE_FMT_LOCAL:
                atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
                size = be16_to_cpu(atp->hdr.totsize);
                error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
                break;
        case XFS_DINODE_FMT_EXTENTS:
                error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK);
                break;
        case XFS_DINODE_FMT_BTREE:
                error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
                break;
        default:
                error = XFS_ERROR(EFSCORRUPTED);
                break;
        }
        if (error) {
                kmem_zone_free(xfs_ifork_zone, ip->i_afp);
                ip->i_afp = NULL;
                xfs_idestroy_fork(ip, XFS_DATA_FORK);
        }
        return error;
}

/*
 * The file is in-lined in the on-disk inode.
 * If it fits into if_inline_data, then copy
 * it there, otherwise allocate a buffer for it
 * and copy the data there.  Either way, set
 * if_data to point at the data.
 * If we allocate a buffer for the data, make
 * sure that its size is a multiple of 4 and
 * record the real size in if_real_bytes.
 */
STATIC int
xfs_iformat_local(
        xfs_inode_t     *ip,
        xfs_dinode_t    *dip,
        int             whichfork,
        int             size)
{
        xfs_ifork_t     *ifp;
        int             real_size;

        /*
         * If the size is unreasonable, then something
         * is wrong and we just bail out rather than crash in
         * kmem_alloc() or memcpy() below.
         */
        if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
                xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
                        "corrupt inode %Lu "
                        "(bad size %d for local fork, size = %d).",
                        (unsigned long long) ip->i_ino, size,
                        XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
                XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
                                     ip->i_mount, dip);
                return XFS_ERROR(EFSCORRUPTED);
        }
        ifp = XFS_IFORK_PTR(ip, whichfork);
        real_size = 0;
        if (size == 0)
                ifp->if_u1.if_data = NULL;
        else if (size <= sizeof(ifp->if_u2.if_inline_data))
                ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
        else {
                real_size = roundup(size, 4);
                ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
        }
        ifp->if_bytes = size;
        ifp->if_real_bytes = real_size;
        if (size)
                memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size);
        ifp->if_flags &= ~XFS_IFEXTENTS;
        ifp->if_flags |= XFS_IFINLINE;
        return 0;
}
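
/*
 * For illustration (length is an example): a 37-byte local fork that
 * does not fit in if_inline_data gets a heap buffer of
 * roundup(37, 4) = 40 bytes, leaving if_bytes = 37 and
 * if_real_bytes = 40; a fork that does fit inline keeps
 * if_real_bytes = 0.
 */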

/*
 * The file consists of a set of extents all
 * of which fit into the on-disk inode.
 * If there are few enough extents to fit into
 * the if_inline_ext, then copy them there.
 * Otherwise allocate a buffer for them and copy
 * them into it.  Either way, set if_extents
 * to point at the extents.
 */
STATIC int
xfs_iformat_extents(
        xfs_inode_t     *ip,
        xfs_dinode_t    *dip,
        int             whichfork)
{
        xfs_bmbt_rec_t  *dp;
        xfs_ifork_t     *ifp;
        int             nex;
        int             size;
        int             i;

        ifp = XFS_IFORK_PTR(ip, whichfork);
        nex = XFS_DFORK_NEXTENTS(dip, whichfork);
        size = nex * (uint)sizeof(xfs_bmbt_rec_t);

        /*
         * If the number of extents is unreasonable, then something
         * is wrong and we just bail out rather than crash in
         * kmem_alloc() or memcpy() below.
         */
        if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
                xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
                        "corrupt inode %Lu ((a)extents = %d).",
                        (unsigned long long) ip->i_ino, nex);
                XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
                                     ip->i_mount, dip);
                return XFS_ERROR(EFSCORRUPTED);
        }

        ifp->if_real_bytes = 0;
        if (nex == 0)
                ifp->if_u1.if_extents = NULL;
        else if (nex <= XFS_INLINE_EXTS)
                ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
        else
                xfs_iext_add(ifp, 0, nex);

        ifp->if_bytes = size;
        if (size) {
                dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
                xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip));
                for (i = 0; i < nex; i++, dp++) {
                        xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
                        ep->l0 = be64_to_cpu(get_unaligned(&dp->l0));
                        ep->l1 = be64_to_cpu(get_unaligned(&dp->l1));
                }
                XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork);
                if (whichfork != XFS_DATA_FORK ||
                        XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE)
                                if (unlikely(xfs_check_nostate_extents(
                                    ifp, 0, nex))) {
                                        XFS_ERROR_REPORT("xfs_iformat_extents(2)",
                                                         XFS_ERRLEVEL_LOW,
                                                         ip->i_mount);
                                        return XFS_ERROR(EFSCORRUPTED);
                                }
        }
        ifp->if_flags |= XFS_IFEXTENTS;
        return 0;
}
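
/*
 * For illustration: each on-disk extent record packs into the two
 * 64-bit words l0/l1 above, i.e. sizeof(xfs_bmbt_rec_t) is 16, so an
 * inode with nex = 3 data extents copies size = 48 bytes here; with
 * nex <= XFS_INLINE_EXTS the records land in if_inline_ext and no
 * separate allocation is made.
 */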

/*
 * The file has too many extents to fit into
 * the inode, so they are in B-tree format.
 * Allocate a buffer for the root of the B-tree
 * and copy the root into it.  The if_extents
 * field will remain NULL until all of the
 * extents are read in (when they are needed).
 */
STATIC int
xfs_iformat_btree(
        xfs_inode_t             *ip,
        xfs_dinode_t            *dip,
        int                     whichfork)
{
        xfs_bmdr_block_t        *dfp;
        xfs_ifork_t             *ifp;
        /* REFERENCED */
        int                     nrecs;
        int                     size;

        ifp = XFS_IFORK_PTR(ip, whichfork);
        dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
        size = XFS_BMAP_BROOT_SPACE(dfp);
        nrecs = XFS_BMAP_BROOT_NUMRECS(dfp);

        /*
         * blow out if -- fork has fewer extents than can fit in
         * fork (fork shouldn't be a btree format), root btree
         * block has more records than can fit into the fork,
         * or the number of extents is greater than the number of
         * blocks.
         */
        if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max
            || XFS_BMDR_SPACE_CALC(nrecs) >
                        XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)
            || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
                xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
                        "corrupt inode %Lu (btree).",
                        (unsigned long long) ip->i_ino);
                XFS_ERROR_REPORT("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
                                 ip->i_mount);
                return XFS_ERROR(EFSCORRUPTED);
        }

        ifp->if_broot_bytes = size;
        ifp->if_broot = kmem_alloc(size, KM_SLEEP);
        ASSERT(ifp->if_broot != NULL);
        /*
         * Copy and convert from the on-disk structure
         * to the in-memory structure.
         */
        xfs_bmdr_to_bmbt(dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
                ifp->if_broot, size);
        ifp->if_flags &= ~XFS_IFEXTENTS;
        ifp->if_flags |= XFS_IFBROOT;

        return 0;
}

void
xfs_dinode_from_disk(
        xfs_icdinode_t          *to,
        xfs_dinode_core_t       *from)
{
        to->di_magic = be16_to_cpu(from->di_magic);
        to->di_mode = be16_to_cpu(from->di_mode);
        to->di_version = from->di_version;
        to->di_format = from->di_format;
        to->di_onlink = be16_to_cpu(from->di_onlink);
        to->di_uid = be32_to_cpu(from->di_uid);
        to->di_gid = be32_to_cpu(from->di_gid);
        to->di_nlink = be32_to_cpu(from->di_nlink);
        to->di_projid = be16_to_cpu(from->di_projid);
        memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
        to->di_flushiter = be16_to_cpu(from->di_flushiter);
        to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
        to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec);
        to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec);
        to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec);
        to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec);
        to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec);
        to->di_size = be64_to_cpu(from->di_size);
        to->di_nblocks = be64_to_cpu(from->di_nblocks);
        to->di_extsize = be32_to_cpu(from->di_extsize);
        to->di_nextents = be32_to_cpu(from->di_nextents);
        to->di_anextents = be16_to_cpu(from->di_anextents);
        to->di_forkoff = from->di_forkoff;
        to->di_aformat  = from->di_aformat;
        to->di_dmevmask = be32_to_cpu(from->di_dmevmask);
        to->di_dmstate  = be16_to_cpu(from->di_dmstate);
        to->di_flags    = be16_to_cpu(from->di_flags);
        to->di_gen      = be32_to_cpu(from->di_gen);
}
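
/*
 * For illustration: the dinode core is stored big-endian on disk, so
 * on a little-endian CPU be16_to_cpu() swaps bytes; the on-disk magic
 * bytes 0x49 0x4e ("IN") become the in-core value 0x494e
 * (XFS_DINODE_MAGIC).  On a big-endian CPU the conversions compile to
 * no-ops.
 */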

void
xfs_dinode_to_disk(
        xfs_dinode_core_t       *to,
        xfs_icdinode_t          *from)
{
        to->di_magic = cpu_to_be16(from->di_magic);
        to->di_mode = cpu_to_be16(from->di_mode);
        to->di_version = from->di_version;
        to->di_format = from->di_format;
        to->di_onlink = cpu_to_be16(from->di_onlink);
        to->di_uid = cpu_to_be32(from->di_uid);
        to->di_gid = cpu_to_be32(from->di_gid);
        to->di_nlink = cpu_to_be32(from->di_nlink);
        to->di_projid = cpu_to_be16(from->di_projid);
        memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
        to->di_flushiter = cpu_to_be16(from->di_flushiter);
        to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
        to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
        to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
        to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
        to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
        to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
        to->di_size = cpu_to_be64(from->di_size);
        to->di_nblocks = cpu_to_be64(from->di_nblocks);
        to->di_extsize = cpu_to_be32(from->di_extsize);
        to->di_nextents = cpu_to_be32(from->di_nextents);
        to->di_anextents = cpu_to_be16(from->di_anextents);
        to->di_forkoff = from->di_forkoff;
        to->di_aformat = from->di_aformat;
        to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
        to->di_dmstate = cpu_to_be16(from->di_dmstate);
        to->di_flags = cpu_to_be16(from->di_flags);
        to->di_gen = cpu_to_be32(from->di_gen);
}

STATIC uint
_xfs_dic2xflags(
        __uint16_t              di_flags)
{
        uint                    flags = 0;

        if (di_flags & XFS_DIFLAG_ANY) {
                if (di_flags & XFS_DIFLAG_REALTIME)
                        flags |= XFS_XFLAG_REALTIME;
                if (di_flags & XFS_DIFLAG_PREALLOC)
                        flags |= XFS_XFLAG_PREALLOC;
                if (di_flags & XFS_DIFLAG_IMMUTABLE)
                        flags |= XFS_XFLAG_IMMUTABLE;
                if (di_flags & XFS_DIFLAG_APPEND)
                        flags |= XFS_XFLAG_APPEND;
                if (di_flags & XFS_DIFLAG_SYNC)
                        flags |= XFS_XFLAG_SYNC;
                if (di_flags & XFS_DIFLAG_NOATIME)
                        flags |= XFS_XFLAG_NOATIME;
                if (di_flags & XFS_DIFLAG_NODUMP)
                        flags |= XFS_XFLAG_NODUMP;
                if (di_flags & XFS_DIFLAG_RTINHERIT)
                        flags |= XFS_XFLAG_RTINHERIT;
                if (di_flags & XFS_DIFLAG_PROJINHERIT)
                        flags |= XFS_XFLAG_PROJINHERIT;
                if (di_flags & XFS_DIFLAG_NOSYMLINKS)
                        flags |= XFS_XFLAG_NOSYMLINKS;
                if (di_flags & XFS_DIFLAG_EXTSIZE)
                        flags |= XFS_XFLAG_EXTSIZE;
                if (di_flags & XFS_DIFLAG_EXTSZINHERIT)
                        flags |= XFS_XFLAG_EXTSZINHERIT;
                if (di_flags & XFS_DIFLAG_NODEFRAG)
                        flags |= XFS_XFLAG_NODEFRAG;
                if (di_flags & XFS_DIFLAG_FILESTREAM)
                        flags |= XFS_XFLAG_FILESTREAM;
        }

        return flags;
}

uint
xfs_ip2xflags(
        xfs_inode_t             *ip)
{
        xfs_icdinode_t          *dic = &ip->i_d;

        return _xfs_dic2xflags(dic->di_flags) |
                                (XFS_CFORK_Q(dic) ? XFS_XFLAG_HASATTR : 0);
}

uint
xfs_dic2xflags(
        xfs_dinode_core_t       *dic)
{
        return _xfs_dic2xflags(be16_to_cpu(dic->di_flags)) |
                                (XFS_CFORK_Q_DISK(dic) ? XFS_XFLAG_HASATTR : 0);
}
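
/*
 * For illustration: an on-disk inode with XFS_DIFLAG_IMMUTABLE and
 * XFS_DIFLAG_APPEND set, plus an attribute fork, translates to
 * XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND | XFS_XFLAG_HASATTR, the
 * extended-flag encoding reported to user space (e.g. via the
 * XFS_IOC_FSGETXATTR ioctl).
 */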

/*
 * Given a mount structure and an inode number, return a pointer
 * to a newly allocated in-core inode corresponding to the given
 * inode number.
 *
 * Initialize the inode's attributes and extent pointers if it
 * already has them (it will not if the inode has no links).
 */
int
xfs_iread(
        xfs_mount_t     *mp,
        xfs_trans_t     *tp,
        xfs_ino_t       ino,
        xfs_inode_t     **ipp,
        xfs_daddr_t     bno,
        uint            imap_flags)
{
        xfs_buf_t       *bp;
        xfs_dinode_t    *dip;
        xfs_inode_t     *ip;
        int             error;

        ASSERT(xfs_inode_zone != NULL);

        ip = kmem_zone_zalloc(xfs_inode_zone, KM_SLEEP);
        ip->i_ino = ino;
        ip->i_mount = mp;
        atomic_set(&ip->i_iocount, 0);
        spin_lock_init(&ip->i_flags_lock);

        /*
         * Get pointers to the on-disk inode and the buffer containing it.
         * If the inode number refers to a block outside the file system
         * then xfs_itobp() will return NULL.  In this case we should
         * return NULL as well.  Set i_blkno to 0 so that xfs_itobp() will
         * know that this is a new incore inode.
         */
        error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, imap_flags);
        if (error) {
                kmem_zone_free(xfs_inode_zone, ip);
                return error;
        }

        /*
         * Initialize inode's trace buffers.
         * Do this before xfs_iformat in case it adds entries.
         */
#ifdef  XFS_VNODE_TRACE
        ip->i_trace = ktrace_alloc(VNODE_TRACE_SIZE, KM_SLEEP);
#endif
#ifdef XFS_BMAP_TRACE
        ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_SLEEP);
#endif
#ifdef XFS_BMBT_TRACE
        ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_SLEEP);
#endif
#ifdef XFS_RW_TRACE
        ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_SLEEP);
#endif
#ifdef XFS_ILOCK_TRACE
        ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_SLEEP);
#endif
#ifdef XFS_DIR2_TRACE
        ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_SLEEP);
#endif

        /*
         * If we got something that isn't an inode it means someone
         * (nfs or dmi) has a stale handle.
         */
        if (be16_to_cpu(dip->di_core.di_magic) != XFS_DINODE_MAGIC) {
                kmem_zone_free(xfs_inode_zone, ip);
                xfs_trans_brelse(tp, bp);
#ifdef DEBUG
                xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: "
                                "dip->di_core.di_magic (0x%x) != "
                                "XFS_DINODE_MAGIC (0x%x)",
                                be16_to_cpu(dip->di_core.di_magic),
                                XFS_DINODE_MAGIC);
#endif /* DEBUG */
                return XFS_ERROR(EINVAL);
        }

        /*
         * If the on-disk inode is already linked to a directory
         * entry, copy all of the inode into the in-core inode.
         * xfs_iformat() handles copying in the inode format
         * specific information.
         * Otherwise, just get the truly permanent information.
         */
        if (dip->di_core.di_mode) {
                xfs_dinode_from_disk(&ip->i_d, &dip->di_core);
                error = xfs_iformat(ip, dip);
                if (error)  {
                        kmem_zone_free(xfs_inode_zone, ip);
                        xfs_trans_brelse(tp, bp);
#ifdef DEBUG
                        xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: "
                                        "xfs_iformat() returned error %d",
                                        error);
#endif /* DEBUG */
                        return error;
                }
        } else {
                ip->i_d.di_magic = be16_to_cpu(dip->di_core.di_magic);
                ip->i_d.di_version = dip->di_core.di_version;
                ip->i_d.di_gen = be32_to_cpu(dip->di_core.di_gen);
                ip->i_d.di_flushiter = be16_to_cpu(dip->di_core.di_flushiter);
                /*
                 * Make sure to pull in the mode here as well in
                 * case the inode is released without being used.
                 * This ensures that xfs_inactive() will see that
                 * the inode is already free and not try to mess
                 * with the uninitialized part of it.
                 */
                ip->i_d.di_mode = 0;
                /*
                 * Initialize the per-fork minima and maxima for a new
                 * inode here.  xfs_iformat will do it for old inodes.
                 */
                ip->i_df.if_ext_max =
                        XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
        }

        INIT_LIST_HEAD(&ip->i_reclaim);

        /*
         * The inode format changed when we moved the link count and
         * made it 32 bits long.  If this is an old format inode,
         * convert it in memory to look like a new one.  If it gets
         * flushed to disk we will convert back before flushing or
         * logging it.  We zero out the new projid field and the old link
         * count field.  We'll handle clearing the pad field (the remains
         * of the old uuid field) when we actually convert the inode to
         * the new format. We don't change the version number so that we
         * can distinguish this from a real new format inode.
         */
        if (ip->i_d.di_version == XFS_DINODE_VERSION_1) {
                ip->i_d.di_nlink = ip->i_d.di_onlink;
                ip->i_d.di_onlink = 0;
                ip->i_d.di_projid = 0;
        }

        ip->i_delayed_blks = 0;
        ip->i_size = ip->i_d.di_size;

        /*
         * Mark the buffer containing the inode as something to keep
         * around for a while.  This helps to keep recently accessed
         * meta-data in-core longer.
         */
        XFS_BUF_SET_REF(bp, XFS_INO_REF);

        /*
         * Use xfs_trans_brelse() to release the buffer containing the
         * on-disk inode, because it was acquired with xfs_trans_read_buf()
         * in xfs_itobp() above.  If tp is NULL, this is just a normal
         * brelse().  If we're within a transaction, then xfs_trans_brelse()
         * will only release the buffer if it is not dirty within the
         * transaction.  It will be OK to release the buffer in this case,
         * because inodes on disk are never destroyed and we will be
         * locking the new in-core inode before putting it in the hash
         * table where other processes can find it.  Thus we don't have
         * to worry about the inode being changed just because we released
         * the buffer.
         */
        xfs_trans_brelse(tp, bp);
        *ipp = ip;
        return 0;
}

/*
 * Read in extents from a btree-format inode.
 * Allocate and fill in if_extents.  Real work is done in xfs_bmap.c.
 */
int
xfs_iread_extents(
        xfs_trans_t     *tp,
        xfs_inode_t     *ip,
        int             whichfork)
{
        int             error;
        xfs_ifork_t     *ifp;
        xfs_extnum_t    nextents;
        size_t          size;

        if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
                XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
                                 ip->i_mount);
                return XFS_ERROR(EFSCORRUPTED);
        }
        nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
        size = nextents * sizeof(xfs_bmbt_rec_t);
        ifp = XFS_IFORK_PTR(ip, whichfork);

        /*
         * We know that the size is valid (it's checked in iformat_btree)
         */
        ifp->if_lastex = NULLEXTNUM;
        ifp->if_bytes = ifp->if_real_bytes = 0;
        ifp->if_flags |= XFS_IFEXTENTS;
        xfs_iext_add(ifp, 0, nextents);
        error = xfs_bmap_read_extents(tp, ip, whichfork);
        if (error) {
                xfs_iext_destroy(ifp);
                ifp->if_flags &= ~XFS_IFEXTENTS;
                return error;
        }
        xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip));
        return 0;
}

/*
 * Allocate an inode on disk and return a copy of its in-core version.
 * The in-core inode is locked exclusively.  Set mode, nlink, and rdev
 * appropriately within the inode.  The uid and gid for the inode are
 * set according to the contents of the given cred structure.
 *
 * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc()
 * has a free inode available, call xfs_iget()
 * to obtain the in-core version of the allocated inode.  Finally,
 * fill in the inode and log its initial contents.  In this case,
 * ialloc_context would be set to NULL and call_again set to false.
 *
 * If xfs_dialloc() does not have an available inode,
 * it will replenish its supply by doing an allocation. Since we can
 * only do one allocation within a transaction without deadlocks, we
 * must commit the current transaction before returning the inode itself.
 * In this case, therefore, we will set call_again to true and return.
 * The caller should then commit the current transaction, start a new
 * transaction, and call xfs_ialloc() again to actually get the inode.
 *
 * To ensure that some other process does not grab the inode that
 * was allocated during the first call to xfs_ialloc(), this routine
 * also returns the [locked] bp pointing to the head of the freelist
 * as ialloc_context.  The caller should hold this buffer across
 * the commit and pass it back into this routine on the second call.
 *
 * If we are allocating quota inodes, we do not have a parent inode
 * to attach to or associate with (i.e. pip == NULL) because they
 * are not linked into the directory structure - they are attached
 * directly to the superblock - and so have no parent.
 */
int
xfs_ialloc(
        xfs_trans_t     *tp,
        xfs_inode_t     *pip,
        mode_t          mode,
        xfs_nlink_t     nlink,
        xfs_dev_t       rdev,
        cred_t          *cr,
        xfs_prid_t      prid,
        int             okalloc,
        xfs_buf_t       **ialloc_context,
        boolean_t       *call_again,
        xfs_inode_t     **ipp)
{
        xfs_ino_t       ino;
        xfs_inode_t     *ip;
        bhv_vnode_t     *vp;
        uint            flags;
        int             error;

        /*
         * Call the space management code to pick
         * the on-disk inode to be allocated.
         */
        error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc,
                            ialloc_context, call_again, &ino);
        if (error != 0) {
                return error;
        }
        if (*call_again || ino == NULLFSINO) {
                *ipp = NULL;
                return 0;
        }
        ASSERT(*ialloc_context == NULL);

        /*
         * Get the in-core inode with the lock held exclusively.
         * This is because we're setting fields here we need
         * to prevent others from looking at until we're done.
         */
        error = xfs_trans_iget(tp->t_mountp, tp, ino,
                                XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip);
        if (error != 0) {
                return error;
        }
        ASSERT(ip != NULL);

        vp = XFS_ITOV(ip);
        ip->i_d.di_mode = (__uint16_t)mode;
        ip->i_d.di_onlink = 0;
        ip->i_d.di_nlink = nlink;
        ASSERT(ip->i_d.di_nlink == nlink);
        ip->i_d.di_uid = current_fsuid(cr);
        ip->i_d.di_gid = current_fsgid(cr);
        ip->i_d.di_projid = prid;
        memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));

        /*
         * If the superblock version is up to where we support new format
         * inodes and this is currently an old format inode, then change
         * the inode version number now.  This way we only do the conversion
         * here rather than here and in the flush/logging code.
         */
        if (XFS_SB_VERSION_HASNLINK(&tp->t_mountp->m_sb) &&
            ip->i_d.di_version == XFS_DINODE_VERSION_1) {
                ip->i_d.di_version = XFS_DINODE_VERSION_2;
                /*
                 * We've already zeroed the old link count, the projid field,
                 * and the pad field.
                 */
        }

        /*
         * Project ids won't be stored on disk if we are using a version 1 inode.
         */
        if ((prid != 0) && (ip->i_d.di_version == XFS_DINODE_VERSION_1))
                xfs_bump_ino_vers2(tp, ip);

        if (pip && XFS_INHERIT_GID(pip)) {
                ip->i_d.di_gid = pip->i_d.di_gid;
                if ((pip->i_d.di_mode & S_ISGID) && (mode & S_IFMT) == S_IFDIR) {
                        ip->i_d.di_mode |= S_ISGID;
                }
        }

        /*
         * If the group ID of the new file does not match the effective group
         * ID or one of the supplementary group IDs, the S_ISGID bit is cleared
         * (and only if the irix_sgid_inherit compatibility variable is set).
         */
        if ((irix_sgid_inherit) &&
            (ip->i_d.di_mode & S_ISGID) &&
            (!in_group_p((gid_t)ip->i_d.di_gid))) {
                ip->i_d.di_mode &= ~S_ISGID;
        }

        ip->i_d.di_size = 0;
        ip->i_size = 0;
        ip->i_d.di_nextents = 0;
        ASSERT(ip->i_d.di_nblocks == 0);
        xfs_ichgtime(ip, XFS_ICHGTIME_CHG|XFS_ICHGTIME_ACC|XFS_ICHGTIME_MOD);
        /*
         * di_gen will have been taken care of in xfs_iread.
         */
        ip->i_d.di_extsize = 0;
        ip->i_d.di_dmevmask = 0;
        ip->i_d.di_dmstate = 0;
        ip->i_d.di_flags = 0;
        flags = XFS_ILOG_CORE;
        switch (mode & S_IFMT) {
        case S_IFIFO:
        case S_IFCHR:
        case S_IFBLK:
        case S_IFSOCK:
                ip->i_d.di_format = XFS_DINODE_FMT_DEV;
                ip->i_df.if_u2.if_rdev = rdev;
                ip->i_df.if_flags = 0;
                flags |= XFS_ILOG_DEV;
                break;
        case S_IFREG:
                if (pip && xfs_inode_is_filestream(pip)) {
                        error = xfs_filestream_associate(pip, ip);
                        if (error < 0)
                                return -error;
                        if (!error)
                                xfs_iflags_set(ip, XFS_IFILESTREAM);
                }
                /* fall through */
        case S_IFDIR:
                if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
                        uint    di_flags = 0;

                        if ((mode & S_IFMT) == S_IFDIR) {
                                if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
                                        di_flags |= XFS_DIFLAG_RTINHERIT;
                                if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
                                        di_flags |= XFS_DIFLAG_EXTSZINHERIT;
                                        ip->i_d.di_extsize = pip->i_d.di_extsize;
                                }
                        } else if ((mode & S_IFMT) == S_IFREG) {
                                if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) {
                                        di_flags |= XFS_DIFLAG_REALTIME;
                                        ip->i_iocore.io_flags |= XFS_IOCORE_RT;
                                }
                                if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
                                        di_flags |= XFS_DIFLAG_EXTSIZE;
                                        ip->i_d.di_extsize = pip->i_d.di_extsize;
                                }
                        }
                        if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) &&
                            xfs_inherit_noatime)
                                di_flags |= XFS_DIFLAG_NOATIME;
                        if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) &&
                            xfs_inherit_nodump)
                                di_flags |= XFS_DIFLAG_NODUMP;
                        if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) &&
                            xfs_inherit_sync)
                                di_flags |= XFS_DIFLAG_SYNC;
                        if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) &&
                            xfs_inherit_nosymlinks)
                                di_flags |= XFS_DIFLAG_NOSYMLINKS;
                        if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
                                di_flags |= XFS_DIFLAG_PROJINHERIT;
                        if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
                            xfs_inherit_nodefrag)
                                di_flags |= XFS_DIFLAG_NODEFRAG;
                        if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
                                di_flags |= XFS_DIFLAG_FILESTREAM;
                        ip->i_d.di_flags |= di_flags;
                }
                /* FALLTHROUGH */
        case S_IFLNK:
                ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
                ip->i_df.if_flags = XFS_IFEXTENTS;
                ip->i_df.if_bytes = ip->i_df.if_real_bytes = 0;
                ip->i_df.if_u1.if_extents = NULL;
                break;
        default:
                ASSERT(0);
        }
        /*
         * Attribute fork settings for new inode.
         */
        ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
        ip->i_d.di_anextents = 0;

        /*
         * Log the new values stuffed into the inode.
         */
        xfs_trans_log_inode(tp, ip, flags);

        /* now that we have an i_mode we can setup inode ops and unlock */
        xfs_initialize_vnode(tp->t_mountp, vp, ip);

        *ipp = ip;
        return 0;
}
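
/*
 * For illustration, a caller drives the two-phase protocol described
 * in the comment above xfs_ialloc() roughly like this (sketch only;
 * real callers also handle quota reservation and errors):
 *
 *      error = xfs_ialloc(tp, dp, mode, 1, rdev, credp, prid, okalloc,
 *                         &ialloc_context, &call_again, &ip);
 *      if (!error && call_again) {
 *              ...commit tp while holding ialloc_context, start a
 *                 new transaction, then call again...
 *              error = xfs_ialloc(tp, dp, mode, 1, rdev, credp, prid,
 *                                 okalloc, &ialloc_context, &call_again,
 *                                 &ip);
 *      }
 */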

/*
 * Check to make sure that there are no blocks allocated to the
 * file beyond the size of the file.  We don't check this for
 * files with fixed size extents or real time extents, but we
 * at least do it for regular files.
 */
#ifdef DEBUG
void
xfs_isize_check(
        xfs_mount_t     *mp,
        xfs_inode_t     *ip,
        xfs_fsize_t     isize)
{
        xfs_fileoff_t   map_first;
        int             nimaps;
        xfs_bmbt_irec_t imaps[2];

        if ((ip->i_d.di_mode & S_IFMT) != S_IFREG)
                return;

        if (ip->i_d.di_flags & (XFS_DIFLAG_REALTIME | XFS_DIFLAG_EXTSIZE))
                return;

        nimaps = 2;
        map_first = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
        /*
         * The filesystem could be shutting down, so bmapi may return
         * an error.
         */
        if (xfs_bmapi(NULL, ip, map_first,
                         (XFS_B_TO_FSB(mp,
                                       (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) -
                          map_first),
                         XFS_BMAPI_ENTIRE, NULL, 0, imaps, &nimaps,
                         NULL, NULL))
            return;
        ASSERT(nimaps == 1);
        ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK);
}
#endif  /* DEBUG */

/*
 * Calculate the last possible buffered byte in a file.  This must
 * include data that was buffered beyond the EOF by the write code.
 * This also needs to deal with overflowing the xfs_fsize_t type
 * which can happen for sizes near the limit.
 *
 * We also need to take into account any blocks beyond the EOF.  It
 * may be the case that they were buffered by a write which failed.
 * In that case the pages will still be in memory, but the inode size
 * will never have been updated.
 */
xfs_fsize_t
xfs_file_last_byte(
        xfs_inode_t     *ip)
{
        xfs_mount_t     *mp;
        xfs_fsize_t     last_byte;
        xfs_fileoff_t   last_block;
        xfs_fileoff_t   size_last_block;
        int             error;

        ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE | MR_ACCESS));

        mp = ip->i_mount;
        /*
         * Only check for blocks beyond the EOF if the extents have
         * been read in.  This eliminates the need for the inode lock,
         * and it also saves us from looking when it really isn't
         * necessary.
         */
        if (ip->i_df.if_flags & XFS_IFEXTENTS) {
                error = xfs_bmap_last_offset(NULL, ip, &last_block,
                        XFS_DATA_FORK);
                if (error) {
                        last_block = 0;
                }
        } else {
                last_block = 0;
        }
        size_last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)ip->i_size);
        last_block = XFS_FILEOFF_MAX(last_block, size_last_block);

        last_byte = XFS_FSB_TO_B(mp, last_block);
        if (last_byte < 0) {
                return XFS_MAXIOFFSET(mp);
        }
        last_byte += (1 << mp->m_writeio_log);
        if (last_byte < 0) {
                return XFS_MAXIOFFSET(mp);
        }
        return last_byte;
}
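
/*
 * Editor's note: the two "last_byte < 0" tests above act as a
 * saturate-to-maximum guard for a signed 64-bit byte count.  A
 * standalone model of the pattern (hypothetical helper, plain C):
 */
#if 0
static long long
saturating_add(long long base, long long inc, long long limit)
{
        /* both inputs are non-negative, so a negative sum means wraparound */
        long long sum = base + inc;
        return (sum < 0 || sum > limit) ? limit : sum;
}
/* i.e. last_byte = saturating_add(XFS_FSB_TO_B(mp, last_block),
 *                                 1LL << mp->m_writeio_log,
 *                                 XFS_MAXIOFFSET(mp)); */
#endif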

#if defined(XFS_RW_TRACE)
STATIC void
xfs_itrunc_trace(
        int             tag,
        xfs_inode_t     *ip,
        int             flag,
        xfs_fsize_t     new_size,
        xfs_off_t       toss_start,
        xfs_off_t       toss_finish)
{
        if (ip->i_rwtrace == NULL) {
                return;
        }

        ktrace_enter(ip->i_rwtrace,
                     (void*)((long)tag),
                     (void*)ip,
                     (void*)(unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff),
                     (void*)(unsigned long)(ip->i_d.di_size & 0xffffffff),
                     (void*)((long)flag),
                     (void*)(unsigned long)((new_size >> 32) & 0xffffffff),
                     (void*)(unsigned long)(new_size & 0xffffffff),
                     (void*)(unsigned long)((toss_start >> 32) & 0xffffffff),
                     (void*)(unsigned long)(toss_start & 0xffffffff),
                     (void*)(unsigned long)((toss_finish >> 32) & 0xffffffff),
                     (void*)(unsigned long)(toss_finish & 0xffffffff),
                     (void*)(unsigned long)current_cpu(),
                     (void*)(unsigned long)current_pid(),
                     (void*)NULL,
                     (void*)NULL,
                     (void*)NULL);
}
#else
#define xfs_itrunc_trace(tag, ip, flag, new_size, toss_start, toss_finish)
#endif
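
/*
 * Editor's note: each ktrace slot holds a single (void *), so the trace
 * above splits every 64-bit quantity into two 32-bit halves.  A pair of
 * hypothetical helper macros would make the packing explicit:
 */
#if 0
#define KTRACE_HI32(x)  ((void *)(unsigned long)(((x) >> 32) & 0xffffffff))
#define KTRACE_LO32(x)  ((void *)(unsigned long)((x) & 0xffffffff))
/* ktrace_enter(..., KTRACE_HI32(new_size), KTRACE_LO32(new_size), ...); */
#endif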

/*
 * Start the truncation of the file to new_size.  The new size
 * must be smaller than the current size.  This routine will
 * clear the buffer and page caches of file data in the removed
 * range, and xfs_itruncate_finish() will remove the underlying
 * disk blocks.
 *
 * The inode must have its I/O lock locked EXCLUSIVELY, and it
 * must NOT have the inode lock held at all.  This is because we're
 * calling into the buffer/page cache code and we can't hold the
 * inode lock when we do so.
 *
 * We need to wait for any direct I/Os in flight to complete before we
 * proceed with the truncate. This is needed to prevent the extents
 * being read or written by the direct I/Os from being removed while the
 * I/O is in flight as there is no other method of synchronising
 * direct I/O with the truncate operation.  Also, because we hold
 * the IOLOCK in exclusive mode, we prevent new direct I/Os from being
 * started until the truncate completes and drops the lock. Essentially,
 * the vn_iowait() call forms an I/O barrier that provides strict ordering
 * between direct I/Os and the truncate operation.
 *
 * The flags parameter can have either the value XFS_ITRUNC_DEFINITE
 * or XFS_ITRUNC_MAYBE.  The XFS_ITRUNC_MAYBE value should be used
 * in the case that the caller is locking things out of order and
 * may not be able to call xfs_itruncate_finish() with the inode lock
 * held without dropping the I/O lock.  If the caller must drop the
 * I/O lock before calling xfs_itruncate_finish(), then xfs_itruncate_start()
 * must be called again with all the same restrictions as the initial
 * call.
 */
int
xfs_itruncate_start(
        xfs_inode_t     *ip,
        uint            flags,
        xfs_fsize_t     new_size)
{
        xfs_fsize_t     last_byte;
        xfs_off_t       toss_start;
        xfs_mount_t     *mp;
        bhv_vnode_t     *vp;
        int             error = 0;

        ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE) != 0);
        ASSERT((new_size == 0) || (new_size <= ip->i_size));
        ASSERT((flags == XFS_ITRUNC_DEFINITE) ||
               (flags == XFS_ITRUNC_MAYBE));

        mp = ip->i_mount;
        vp = XFS_ITOV(ip);

        /* wait for the completion of any pending DIOs */
        if (new_size < ip->i_size)
                vn_iowait(ip);

        /*
         * Call toss_pages or flushinval_pages to get rid of pages
         * overlapping the region being removed.  We have to use
         * the less efficient flushinval_pages in the case that the
         * caller may not be able to finish the truncate without
         * dropping the inode's I/O lock.  Make sure
         * to catch any pages brought in by buffers overlapping
         * the EOF by searching out beyond the isize by our
         * block size. We round new_size up to a block boundary
         * so that we don't toss things on the same block as
         * new_size but before it.
         *
         * Before calling toss_pages or flushinval_pages, make sure to
         * call remapf() over the same region if the file is mapped.
         * This frees up mapped file references to the pages in the
         * given range and for the flushinval_pages case it ensures
         * that we get the latest mapped changes flushed out.
         */
        toss_start = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
        toss_start = XFS_FSB_TO_B(mp, toss_start);
        if (toss_start < 0) {
                /*
                 * The place to start tossing is beyond our maximum
                 * file size, so there is no way that the data extended
                 * out there.
                 */
                return 0;
        }
        last_byte = xfs_file_last_byte(ip);
        xfs_itrunc_trace(XFS_ITRUNC_START, ip, flags, new_size, toss_start,
                         last_byte);
        if (last_byte > toss_start) {
                if (flags & XFS_ITRUNC_DEFINITE) {
                        xfs_tosspages(ip, toss_start,
                                        -1, FI_REMAPF_LOCKED);
                } else {
                        error = xfs_flushinval_pages(ip, toss_start,
                                        -1, FI_REMAPF_LOCKED);
                }
        }

#ifdef DEBUG
        if (new_size == 0) {
                ASSERT(VN_CACHED(vp) == 0);
        }
#endif
        return error;
}
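
/*
 * Editor's note: a sketch of the caller contract described above
 * (hypothetical caller; transaction setup and error handling elided).
 */
#if 0
        xfs_ilock(ip, XFS_IOLOCK_EXCL);         /* I/O lock only, no ilock */
        error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, new_size);
        /* ... allocate and reserve a permanent-log-res transaction ... */
        xfs_ilock(ip, XFS_ILOCK_EXCL);          /* now take the inode lock */
        error = xfs_itruncate_finish(&tp, ip, new_size, XFS_DATA_FORK, sync);
#endif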

/*
 * Shrink the file to the given new_size.  The new
 * size must be smaller than the current size.
 * This will free up the underlying blocks
 * in the removed range after a call to xfs_itruncate_start()
 * or xfs_atruncate_start().
 *
 * The transaction passed to this routine must have made
 * a permanent log reservation of at least XFS_ITRUNCATE_LOG_RES.
 * This routine may commit the given transaction and
 * start new ones, so make sure everything involved in
 * the transaction is tidy before calling here.
 * Some transaction will be returned to the caller to be
 * committed.  The incoming transaction must already include
 * the inode, and both inode locks must be held exclusively.
 * The inode must also be "held" within the transaction.  On
 * return the inode will be "held" within the returned transaction.
 * This routine does NOT require any disk space to be reserved
 * for it within the transaction.
 *
 * The fork parameter must be either xfs_attr_fork or xfs_data_fork,
 * and it indicates the fork which is to be truncated.  For the
 * attribute fork we only support truncation to size 0.
 *
 * We use the sync parameter to indicate whether or not the first
 * transaction we perform might have to be synchronous.  For the attr fork,
 * it needs to be so if the unlink of the inode is not yet known to be
 * permanent in the log.  This keeps us from freeing and reusing the
 * blocks of the attribute fork before the unlink of the inode becomes
 * permanent.
 *
 * For the data fork, we normally have to run synchronously if we're
 * being called out of the inactive path or we're being called
 * out of the create path where we're truncating an existing file.
 * Either way, the truncate needs to be sync so blocks don't reappear
 * in the file with altered data in case of a crash.  wsync filesystems
 * can run the first case async because anything that shrinks the inode
 * has to run sync so by the time we're called here from inactive, the
 * inode size is permanently set to 0.
 *
 * Calls from the truncate path always need to be sync unless we're
 * in a wsync filesystem and the file has already been unlinked.
 *
 * The caller is responsible for correctly setting the sync parameter.
 * It gets too hard for us to guess here which path we're being called
 * out of just based on inode state.
 */
int
xfs_itruncate_finish(
        xfs_trans_t     **tp,
        xfs_inode_t     *ip,
        xfs_fsize_t     new_size,
        int             fork,
        int             sync)
{
        xfs_fsblock_t   first_block;
        xfs_fileoff_t   first_unmap_block;
        xfs_fileoff_t   last_block;
        xfs_filblks_t   unmap_len=0;
        xfs_mount_t     *mp;
        xfs_trans_t     *ntp;
        int             done;
        int             committed;
        xfs_bmap_free_t free_list;
        int             error;

        ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE) != 0);
        ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE) != 0);
        ASSERT((new_size == 0) || (new_size <= ip->i_size));
        ASSERT(*tp != NULL);
        ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
        ASSERT(ip->i_transp == *tp);
        ASSERT(ip->i_itemp != NULL);
        ASSERT(ip->i_itemp->ili_flags & XFS_ILI_HOLD);


        ntp = *tp;
        mp = (ntp)->t_mountp;
        ASSERT(! XFS_NOT_DQATTACHED(mp, ip));

        /*
         * We only support truncating the entire attribute fork.
         */
        if (fork == XFS_ATTR_FORK) {
                new_size = 0LL;
        }
        first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
        xfs_itrunc_trace(XFS_ITRUNC_FINISH1, ip, 0, new_size, 0, 0);
        /*
         * The first thing we do is set the size to new_size permanently
         * on disk.  This way we don't have to worry about anyone ever
         * being able to look at the data being freed even in the face
         * of a crash.  What we're getting around here is the case where
         * we free a block, it is allocated to another file, it is written
         * to, and then we crash.  If the new data gets written to the
         * file but the log buffers containing the free and reallocation
         * don't, then we'd end up with garbage in the blocks being freed.
         * As long as we make the new_size permanent before actually
         * freeing any blocks it doesn't matter if they get written to.
         *
         * The callers must signal into us whether or not the size
         * setting here must be synchronous.  There are a few cases
         * where it doesn't have to be synchronous.  Those cases
         * occur if the file is unlinked and we know the unlink is
         * permanent or if the blocks being truncated are guaranteed
         * to be beyond the inode eof (regardless of the link count)
         * and the eof value is permanent.  Both of these cases occur
         * only on wsync-mounted filesystems.  In those cases, we're
         * guaranteed that no user will ever see the data in the blocks
         * that are being truncated so the truncate can run async.
         * In the free beyond eof case, the file may wind up with
         * more blocks allocated to it than it needs if we crash
         * and that won't get fixed until the next time the file
         * is re-opened and closed but that's ok as that shouldn't
         * be too many blocks.
         *
         * However, we can't just make all wsync xactions run async
         * because there's one call out of the create path that needs
         * to run sync where it's truncating an existing file to size
         * 0 whose size is > 0.
         *
         * It's probably possible to come up with a test in this
         * routine that would correctly distinguish all the above
         * cases from the values of the function parameters and the
         * inode state but for sanity's sake, I've decided to let the
         * layers above just tell us.  It's simpler to correctly figure
         * out in the layer above exactly under what conditions we
         * can run async and I think it's easier for others to read and
         * follow the logic in case something has to be changed.
         * cscope is your friend -- rcc.
         *
         * The attribute fork is much simpler.
         *
         * For the attribute fork we allow the caller to tell us whether
         * the unlink of the inode that led to this call is yet permanent
         * in the on disk log.  If it is not and we will be freeing extents
         * in this inode then we make the first transaction synchronous
         * to make sure that the unlink is permanent by the time we free
         * the blocks.
         */
        if (fork == XFS_DATA_FORK) {
                if (ip->i_d.di_nextents > 0) {
                        /*
                         * If we are not changing the file size then do
                         * not update the on-disk file size - we may be
                         * called from xfs_inactive_free_eofblocks().  If we
                         * update the on-disk file size and then the system
                         * crashes before the contents of the file are
                         * flushed to disk then the files may be full of
                         * holes (ie NULL files bug).
                         */
                        if (ip->i_size != new_size) {
                                ip->i_d.di_size = new_size;
                                ip->i_size = new_size;
                                xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
                        }
                }
        } else if (sync) {
                ASSERT(!(mp->m_flags & XFS_MOUNT_WSYNC));
                if (ip->i_d.di_anextents > 0)
                        xfs_trans_set_sync(ntp);
        }
        ASSERT(fork == XFS_DATA_FORK ||
                (fork == XFS_ATTR_FORK &&
                        ((sync && !(mp->m_flags & XFS_MOUNT_WSYNC)) ||
                         (sync == 0 && (mp->m_flags & XFS_MOUNT_WSYNC)))));

        /*
         * Since it is possible for space to become allocated beyond
         * the end of the file (in a crash where the space is allocated
         * but the inode size is not yet updated), simply remove any
         * blocks which show up between the new EOF and the maximum
         * possible file size.  If the first block to be removed is
         * beyond the maximum file size (ie it is the same as last_block),
         * then there is nothing to do.
         */
        last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
        ASSERT(first_unmap_block <= last_block);
        done = 0;
        if (last_block == first_unmap_block) {
                done = 1;
        } else {
                unmap_len = last_block - first_unmap_block + 1;
        }
        while (!done) {
                /*
                 * Free up to XFS_ITRUNC_MAX_EXTENTS.  xfs_bunmapi()
                 * will tell us whether it freed the entire range or
                 * not.  If this is a synchronous mount (wsync),
                 * then we can tell bunmapi to keep all the
                 * transactions asynchronous since the unlink
                 * transaction that made this inode inactive has
                 * already hit the disk.  There's no danger of
                 * the freed blocks being reused, there being a
                 * crash, and the reused blocks suddenly reappearing
                 * in this file with garbage in them once recovery
                 * runs.
                 */
                XFS_BMAP_INIT(&free_list, &first_block);
                error = XFS_BUNMAPI(mp, ntp, &ip->i_iocore,
                                    first_unmap_block, unmap_len,
                                    XFS_BMAPI_AFLAG(fork) |
                                      (sync ? 0 : XFS_BMAPI_ASYNC),
                                    XFS_ITRUNC_MAX_EXTENTS,
                                    &first_block, &free_list,
                                    NULL, &done);
                if (error) {
                        /*
                         * If the bunmapi call encounters an error,
                         * return to the caller where the transaction
                         * can be properly aborted.  We just need to
                         * make sure we're not holding any resources
                         * that we were not when we came in.
                         */
                        xfs_bmap_cancel(&free_list);
                        return error;
                }

                /*
                 * Duplicate the transaction that has the permanent
                 * reservation and commit the old transaction.
                 */
                error = xfs_bmap_finish(tp, &free_list, &committed);
                ntp = *tp;
                if (error) {
                        /*
                         * If the bmap finish call encounters an error,
                         * return to the caller where the transaction
                         * can be properly aborted.  We just need to
                         * make sure we're not holding any resources
                         * that we were not when we came in.
                         *
                         * Aborting from this point might lose some
                         * blocks in the file system, but oh well.
                         */
                        xfs_bmap_cancel(&free_list);
                        if (committed) {
                                /*
                                 * If the passed in transaction committed
                                 * in xfs_bmap_finish(), then we want to
                                 * add the inode to this one before returning.
                                 * This keeps things simple for the higher
                                 * level code, because it always knows that
                                 * the inode is locked and held in the
                                 * transaction that returns to it whether
                                 * errors occur or not.  We don't mark the
                                 * inode dirty so that this transaction can
                                 * be easily aborted if possible.
                                 */
                                xfs_trans_ijoin(ntp, ip,
                                        XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
                                xfs_trans_ihold(ntp, ip);
                        }
                        return error;
                }

                if (committed) {
                        /*
                         * The first xact was committed,
                         * so add the inode to the new one.
                         * Mark it dirty so it will be logged
                         * and moved forward in the log as
                         * part of every commit.
                         */
                        xfs_trans_ijoin(ntp, ip,
                                        XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
                        xfs_trans_ihold(ntp, ip);
                        xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
                }
                ntp = xfs_trans_dup(ntp);
                (void) xfs_trans_commit(*tp, 0);
                *tp = ntp;
                error = xfs_trans_reserve(ntp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
                                          XFS_TRANS_PERM_LOG_RES,
                                          XFS_ITRUNCATE_LOG_COUNT);
                /*
                 * Add the inode being truncated to the next chained
                 * transaction.
                 */
                xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
                xfs_trans_ihold(ntp, ip);
                if (error)
                        return (error);
        }
        /*
         * Only update the size in the case of the data fork, but
         * always re-log the inode so that our permanent transaction
         * can keep on rolling it forward in the log.
         */
        if (fork == XFS_DATA_FORK) {
                xfs_isize_check(mp, ip, new_size);
                /*
                 * If we are not changing the file size then do
                 * not update the on-disk file size - we may be
                 * called from xfs_inactive_free_eofblocks().  If we
                 * update the on-disk file size and then the system
                 * crashes before the contents of the file are
                 * flushed to disk then the files may be full of
                 * holes (ie NULL files bug).
                 */
                if (ip->i_size != new_size) {
                        ip->i_d.di_size = new_size;
                        ip->i_size = new_size;
                }
        }
        xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
        ASSERT((new_size != 0) ||
               (fork == XFS_ATTR_FORK) ||
               (ip->i_delayed_blks == 0));
        ASSERT((new_size != 0) ||
               (fork == XFS_ATTR_FORK) ||
               (ip->i_d.di_nextents == 0));
        xfs_itrunc_trace(XFS_ITRUNC_FINISH2, ip, 0, new_size, 0, 0);
        return 0;
}
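
/*
 * Editor's note: the commit-and-continue ("rolling") pattern used in
 * the loop above condenses to the four steps below; this is a
 * restatement of code already in the function, not new behaviour.
 */
#if 0
        ntp = xfs_trans_dup(ntp);               /* 1. dup, moving the permanent */
        (void) xfs_trans_commit(*tp, 0);        /*    log reservation; 2. commit */
        *tp = ntp;                              /*    the old transaction        */
        error = xfs_trans_reserve(ntp, 0,       /* 3. re-reserve log space       */
                        XFS_ITRUNCATE_LOG_RES(mp), 0,
                        XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT);
        xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
        xfs_trans_ihold(ntp, ip);               /* 4. rejoin and re-hold inode   */
#endif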


/*
 * xfs_igrow_start
 *
 * Do the first part of growing a file: zero any data in the last
 * block that is beyond the old EOF.  We need to do this before
 * the inode is joined to the transaction to modify the i_size.
 * That way we can drop the inode lock and call into the buffer
 * cache to get the buffer mapping the EOF.
 */
int
xfs_igrow_start(
        xfs_inode_t     *ip,
        xfs_fsize_t     new_size,
        cred_t          *credp)
{
        int             error;

        ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0);
        ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0);
        ASSERT(new_size > ip->i_size);

        /*
         * Zero any pages that may have been created by
         * xfs_write_file() beyond the end of the file
         * and any blocks between the old and new file sizes.
         */
        error = xfs_zero_eof(XFS_ITOV(ip), &ip->i_iocore, new_size,
                             ip->i_size);
        return error;
}

/*
 * xfs_igrow_finish
 *
 * This routine is called to extend the size of a file.
 * The inode must have both the iolock and the ilock locked
 * for update and it must be a part of the current transaction.
 * The xfs_igrow_start() function must have been called previously.
 * If the change_flag is not zero, the inode change timestamp will
 * be updated.
 */
void
xfs_igrow_finish(
        xfs_trans_t     *tp,
        xfs_inode_t     *ip,
        xfs_fsize_t     new_size,
        int             change_flag)
{
        ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0);
        ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0);
        ASSERT(ip->i_transp == tp);
        ASSERT(new_size > ip->i_size);

        /*
         * Update the file size.  Update the inode change timestamp
         * if change_flag set.
         */
        ip->i_d.di_size = new_size;
        ip->i_size = new_size;
        if (change_flag)
                xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

}
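
/*
 * Editor's note: a sketch of the two-phase grow sequence (hypothetical
 * caller; transaction allocation elided).  Phase one runs without the
 * inode joined to a transaction so the buffer cache can be used.
 */
#if 0
        error = xfs_igrow_start(ip, new_size, credp);   /* zero old EOF block */
        /* ... reserve a transaction, xfs_trans_ijoin(tp, ip, ...) ... */
        xfs_igrow_finish(tp, ip, new_size, 1);          /* set size, log core */
#endif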


/*
 * This is called when the inode's link count goes to 0.
 * We place the on-disk inode on a list in the AGI.  It
 * will be pulled from this list when the inode is freed.
 */
int
xfs_iunlink(
        xfs_trans_t     *tp,
        xfs_inode_t     *ip)
{
        xfs_mount_t     *mp;
        xfs_agi_t       *agi;
        xfs_dinode_t    *dip;
        xfs_buf_t       *agibp;
        xfs_buf_t       *ibp;
        xfs_agnumber_t  agno;
        xfs_daddr_t     agdaddr;
        xfs_agino_t     agino;
        short           bucket_index;
        int             offset;
        int             error;
        int             agi_ok;

        ASSERT(ip->i_d.di_nlink == 0);
        ASSERT(ip->i_d.di_mode != 0);
        ASSERT(ip->i_transp == tp);

        mp = tp->t_mountp;

        agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
        agdaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp));

        /*
         * Get the agi buffer first.  It ensures lock ordering
         * on the list.
         */
        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr,
                                   XFS_FSS_TO_BB(mp, 1), 0, &agibp);
        if (error)
                return error;

        /*
         * Validate the magic number of the agi block.
         */
        agi = XFS_BUF_TO_AGI(agibp);
        agi_ok =
                be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC &&
                XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum));
        if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IUNLINK,
                        XFS_RANDOM_IUNLINK))) {
                XFS_CORRUPTION_ERROR("xfs_iunlink", XFS_ERRLEVEL_LOW, mp, agi);
                xfs_trans_brelse(tp, agibp);
                return XFS_ERROR(EFSCORRUPTED);
        }
        /*
         * Get the index into the agi hash table for the
         * list this inode will go on.
         */
        agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
        ASSERT(agino != 0);
        bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
        ASSERT(agi->agi_unlinked[bucket_index]);
        ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino);

        error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0);
        if (error)
                return error;

        /*
         * Clear the on-disk di_nlink. This is to prevent xfs_bulkstat
         * from picking up this inode when it is reclaimed (its incore state
         * initialized but not flushed to disk yet). The in-core di_nlink is
         * already cleared in xfs_droplink() and a corresponding transaction
         * logged. The hack here just synchronizes the in-core to on-disk
         * di_nlink value in advance before the actual inode sync to disk.
         * This is OK because the inode is already unlinked and would never
         * change its di_nlink again for this inode generation.
         * This is a temporary hack that would require a proper fix
         * in the future.
         */
        dip->di_core.di_nlink = 0;

        if (be32_to_cpu(agi->agi_unlinked[bucket_index]) != NULLAGINO) {
                /*
                 * There is already another inode in the bucket we need
                 * to add ourselves to.  Add us at the front of the list.
                 * Here we put the head pointer into our next pointer,
                 * and then we fall through to point the head at us.
                 */
                ASSERT(be32_to_cpu(dip->di_next_unlinked) == NULLAGINO);
                /* both on-disk, don't endian flip twice */
                dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
                offset = ip->i_boffset +
                        offsetof(xfs_dinode_t, di_next_unlinked);
                xfs_trans_inode_buf(tp, ibp);
                xfs_trans_log_buf(tp, ibp, offset,
                                  (offset + sizeof(xfs_agino_t) - 1));
                xfs_inobp_check(mp, ibp);
        }

        /*
         * Point the bucket head pointer at the inode being inserted.
         */
        ASSERT(agino != 0);
        agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
        offset = offsetof(xfs_agi_t, agi_unlinked) +
                (sizeof(xfs_agino_t) * bucket_index);
        xfs_trans_log_buf(tp, agibp, offset,
                          (offset + sizeof(xfs_agino_t) - 1));
        return 0;
}
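
/*
 * Editor's note: the AGI unlinked list is an on-disk hash of
 * XFS_AGI_UNLINKED_BUCKETS singly linked lists; heads live in
 * agi_unlinked[] (NULLAGINO when a bucket is empty) and next pointers
 * in each inode's di_next_unlinked.  A standalone host-endian model of
 * the push-front above (hypothetical names and sizes, no logging,
 * initialisation elided):
 */
#if 0
#define NBUCKETS        64              /* XFS_AGI_UNLINKED_BUCKETS */
#define MAXAGINO        1024            /* toy AG size for the model */
unsigned int head[NBUCKETS];            /* agi_unlinked[] */
unsigned int next[MAXAGINO];            /* per-inode di_next_unlinked */

static void unlinked_push(unsigned int agino)
{
        int bucket = agino % NBUCKETS;
        next[agino] = head[bucket];     /* old head becomes our next */
        head[bucket] = agino;           /* bucket head now points at us */
}
#endif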

/*
 * Pull the on-disk inode from the AGI unlinked list.
 */
STATIC int
xfs_iunlink_remove(
        xfs_trans_t     *tp,
        xfs_inode_t     *ip)
{
        xfs_ino_t       next_ino;
        xfs_mount_t     *mp;
        xfs_agi_t       *agi;
        xfs_dinode_t    *dip;
        xfs_buf_t       *agibp;
        xfs_buf_t       *ibp;
        xfs_agnumber_t  agno;
        xfs_daddr_t     agdaddr;
        xfs_agino_t     agino;
        xfs_agino_t     next_agino;
        xfs_buf_t       *last_ibp;
        xfs_dinode_t    *last_dip = NULL;
        short           bucket_index;
        int             offset, last_offset = 0;
        int             error;
        int             agi_ok;

        /*
         * First pull the on-disk inode from the AGI unlinked list.
         */
        mp = tp->t_mountp;

        agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
        agdaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp));

        /*
         * Get the agi buffer first.  It ensures lock ordering
         * on the list.
         */
        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr,
                                   XFS_FSS_TO_BB(mp, 1), 0, &agibp);
        if (error) {
                cmn_err(CE_WARN,
                        "xfs_iunlink_remove: xfs_trans_read_buf()  returned an error %d on %s.  Returning error.",
                        error, mp->m_fsname);
                return error;
        }
        /*
         * Validate the magic number of the agi block.
         */
        agi = XFS_BUF_TO_AGI(agibp);
        agi_ok =
                be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC &&
                XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum));
        if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IUNLINK_REMOVE,
                        XFS_RANDOM_IUNLINK_REMOVE))) {
                XFS_CORRUPTION_ERROR("xfs_iunlink_remove", XFS_ERRLEVEL_LOW,
                                     mp, agi);
                xfs_trans_brelse(tp, agibp);
                cmn_err(CE_WARN,
                        "xfs_iunlink_remove: XFS_TEST_ERROR()  returned an error on %s.  Returning EFSCORRUPTED.",
                         mp->m_fsname);
                return XFS_ERROR(EFSCORRUPTED);
        }
        /*
         * Get the index into the agi hash table for the
         * list this inode will go on.
         */
        agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
        ASSERT(agino != 0);
        bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
        ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != NULLAGINO);
        ASSERT(agi->agi_unlinked[bucket_index]);

        if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) {
                /*
                 * We're at the head of the list.  Get the inode's
                 * on-disk buffer to see if there is anyone after us
                 * on the list.  Only modify our next pointer if it
                 * is not already NULLAGINO.  This saves us the overhead
                 * of dealing with the buffer when there is no need to
                 * change it.
                 */
                error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0);
                if (error) {
                        cmn_err(CE_WARN,
                                "xfs_iunlink_remove: xfs_itobp()  returned an error %d on %s.  Returning error.",
                                error, mp->m_fsname);
                        return error;
                }
                next_agino = be32_to_cpu(dip->di_next_unlinked);
                ASSERT(next_agino != 0);
                if (next_agino != NULLAGINO) {
                        dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
                        offset = ip->i_boffset +
                                offsetof(xfs_dinode_t, di_next_unlinked);
                        xfs_trans_inode_buf(tp, ibp);
                        xfs_trans_log_buf(tp, ibp, offset,
                                          (offset + sizeof(xfs_agino_t) - 1));
                        xfs_inobp_check(mp, ibp);
                } else {
                        xfs_trans_brelse(tp, ibp);
                }
                /*
                 * Point the bucket head pointer at the next inode.
                 */
                ASSERT(next_agino != 0);
                ASSERT(next_agino != agino);
                agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
                offset = offsetof(xfs_agi_t, agi_unlinked) +
                        (sizeof(xfs_agino_t) * bucket_index);
                xfs_trans_log_buf(tp, agibp, offset,
                                  (offset + sizeof(xfs_agino_t) - 1));
        } else {
                /*
                 * We need to search the list for the inode being freed.
                 */
                next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
                last_ibp = NULL;
                while (next_agino != agino) {
                        /*
                         * If the last inode wasn't the one pointing to
                         * us, then release its buffer since we're not
                         * going to do anything with it.
                         */
                        if (last_ibp != NULL) {
                                xfs_trans_brelse(tp, last_ibp);
                        }
                        next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
                        error = xfs_inotobp(mp, tp, next_ino, &last_dip,
                                            &last_ibp, &last_offset);
                        if (error) {
                                cmn_err(CE_WARN,
                        "xfs_iunlink_remove: xfs_inotobp()  returned an error %d on %s.  Returning error.",
                                        error, mp->m_fsname);
                                return error;
                        }
                        next_agino = be32_to_cpu(last_dip->di_next_unlinked);
                        ASSERT(next_agino != NULLAGINO);
                        ASSERT(next_agino != 0);
                }
                /*
                 * Now last_ibp points to the buffer previous to us on
                 * the unlinked list.  Pull us from the list.
                 */
                error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0);
                if (error) {
                        cmn_err(CE_WARN,
                                "xfs_iunlink_remove: xfs_itobp()  returned an error %d on %s.  Returning error.",
                                error, mp->m_fsname);
                        return error;
                }
                next_agino = be32_to_cpu(dip->di_next_unlinked);
                ASSERT(next_agino != 0);
                ASSERT(next_agino != agino);
                if (next_agino != NULLAGINO) {
                        dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
                        offset = ip->i_boffset +
                                offsetof(xfs_dinode_t, di_next_unlinked);
                        xfs_trans_inode_buf(tp, ibp);
                        xfs_trans_log_buf(tp, ibp, offset,
                                          (offset + sizeof(xfs_agino_t) - 1));
                        xfs_inobp_check(mp, ibp);
                } else {
                        xfs_trans_brelse(tp, ibp);
                }
                /*
                 * Point the previous inode on the list to the next inode.
                 */
                last_dip->di_next_unlinked = cpu_to_be32(next_agino);
                ASSERT(next_agino != 0);
                offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
                xfs_trans_inode_buf(tp, last_ibp);
                xfs_trans_log_buf(tp, last_ibp, offset,
                                  (offset + sizeof(xfs_agino_t) - 1));
                xfs_inobp_check(mp, last_ibp);
        }
        return 0;
}
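
/*
 * Editor's note: continuing the toy model above, removal is plain
 * singly linked list surgery; the real code must additionally read,
 * log, and release the buffer backing every inode it touches.
 */
#if 0
static void unlinked_remove(unsigned int agino, unsigned int nullagino)
{
        int bucket = agino % NBUCKETS;

        if (head[bucket] == agino) {            /* head of the list */
                head[bucket] = next[agino];
        } else {                                /* walk to our predecessor */
                unsigned int prev = head[bucket];
                while (next[prev] != agino)
                        prev = next[prev];
                next[prev] = next[agino];
        }
        next[agino] = nullagino;                /* we are off the list */
}
#endif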

STATIC_INLINE int xfs_inode_clean(xfs_inode_t *ip)
{
        return (((ip->i_itemp == NULL) ||
                !(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL)) &&
                (ip->i_update_core == 0));
}

STATIC void
xfs_ifree_cluster(
        xfs_inode_t     *free_ip,
        xfs_trans_t     *tp,
        xfs_ino_t       inum)
{
        xfs_mount_t             *mp = free_ip->i_mount;
        int                     blks_per_cluster;
        int                     nbufs;
        int                     ninodes;
        int                     i, j, found, pre_flushed;
        xfs_daddr_t             blkno;
        xfs_buf_t               *bp;
        xfs_inode_t             *ip, **ip_found;
        xfs_inode_log_item_t    *iip;
        xfs_log_item_t          *lip;
        xfs_perag_t             *pag = xfs_get_perag(mp, inum);
        SPLDECL(s);

        if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
                blks_per_cluster = 1;
                ninodes = mp->m_sb.sb_inopblock;
                nbufs = XFS_IALLOC_BLOCKS(mp);
        } else {
                blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
                                        mp->m_sb.sb_blocksize;
                ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
                nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster;
        }

        ip_found = kmem_alloc(ninodes * sizeof(xfs_inode_t *), KM_NOFS);

        for (j = 0; j < nbufs; j++, inum += ninodes) {
                blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
                                         XFS_INO_TO_AGBNO(mp, inum));


                /*
                 * Look for each inode in memory and attempt to lock it,
                 * we can be racing with flush and tail pushing here.
                 * Any inode we get the locks on, add to an array of
                 * inode items to process later.
                 *
                 * To get the buffer lock, we could beat a flush
                 * or tail pushing thread to the lock here, in which
                 * case they will go looking for the inode buffer
                 * and fail, so we need some other form of interlock
                 * here.
                 */
                found = 0;
                for (i = 0; i < ninodes; i++) {
                        read_lock(&pag->pag_ici_lock);
                        ip = radix_tree_lookup(&pag->pag_ici_root,
                                        XFS_INO_TO_AGINO(mp, (inum + i)));

                        /* Inode not in memory or we found it already,
                         * nothing to do
                         */
                        if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) {
                                read_unlock(&pag->pag_ici_lock);
                                continue;
                        }

                        if (xfs_inode_clean(ip)) {
                                read_unlock(&pag->pag_ici_lock);
                                continue;
                        }

                        /* If we can get the locks then add it to the
                         * list, otherwise by the time we get the bp lock
                         * below it will already be attached to the
                         * inode buffer.
                         */

                        /* This inode will already be locked - by us, let's
                         * keep it that way.
                         */

                        if (ip == free_ip) {
                                if (xfs_iflock_nowait(ip)) {
                                        xfs_iflags_set(ip, XFS_ISTALE);
                                        if (xfs_inode_clean(ip)) {
                                                xfs_ifunlock(ip);
                                        } else {
                                                ip_found[found++] = ip;
                                        }
                                }
                                read_unlock(&pag->pag_ici_lock);
                                continue;
                        }

                        if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
                                if (xfs_iflock_nowait(ip)) {
                                        xfs_iflags_set(ip, XFS_ISTALE);

                                        if (xfs_inode_clean(ip)) {
                                                xfs_ifunlock(ip);
                                                xfs_iunlock(ip, XFS_ILOCK_EXCL);
                                        } else {
                                                ip_found[found++] = ip;
                                        }
                                } else {
                                        xfs_iunlock(ip, XFS_ILOCK_EXCL);
                                }
                        }
                        read_unlock(&pag->pag_ici_lock);
                }

                bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
                                        mp->m_bsize * blks_per_cluster,
                                        XFS_BUF_LOCK);

                pre_flushed = 0;
                lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
                while (lip) {
                        if (lip->li_type == XFS_LI_INODE) {
                                iip = (xfs_inode_log_item_t *)lip;
                                ASSERT(iip->ili_logged == 1);
                                lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done;
                                AIL_LOCK(mp,s);
                                iip->ili_flush_lsn = iip->ili_item.li_lsn;
                                AIL_UNLOCK(mp, s);
                                xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
                                pre_flushed++;
                        }
                        lip = lip->li_bio_list;
                }

                for (i = 0; i < found; i++) {
                        ip = ip_found[i];
                        iip = ip->i_itemp;

                        if (!iip) {
                                ip->i_update_core = 0;
                                xfs_ifunlock(ip);
                                xfs_iunlock(ip, XFS_ILOCK_EXCL);
                                continue;
                        }

                        iip->ili_last_fields = iip->ili_format.ilf_fields;
                        iip->ili_format.ilf_fields = 0;
                        iip->ili_logged = 1;
                        AIL_LOCK(mp,s);
                        iip->ili_flush_lsn = iip->ili_item.li_lsn;
                        AIL_UNLOCK(mp, s);

                        xfs_buf_attach_iodone(bp,
                                (void(*)(xfs_buf_t*,xfs_log_item_t*))
                                xfs_istale_done, (xfs_log_item_t *)iip);
                        if (ip != free_ip) {
                                xfs_iunlock(ip, XFS_ILOCK_EXCL);
                        }
                }

                if (found || pre_flushed)
                        xfs_trans_stale_inode_buf(tp, bp);
                xfs_trans_binval(tp, bp);
        }

        kmem_free(ip_found, ninodes * sizeof(xfs_inode_t *));
        xfs_put_perag(mp, pag);
}
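
/*
 * Editor's note: a worked example of the cluster geometry computed at
 * the top of xfs_ifree_cluster, for a hypothetical 4096-byte block,
 * 256-byte inodes, and an 8192-byte inode cluster:
 *
 *      blks_per_cluster = 8192 / 4096                = 2
 *      ninodes          = 2 * (4096 / 256)           = 32 inodes per buffer
 *      nbufs            = XFS_IALLOC_BLOCKS(mp) / 2    buffers per chunk
 *
 * so each pass of the outer loop stales one two-block buffer covering
 * 32 inodes and advances inum by 32.
 */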

/*
 * This is called to return an inode to the inode free list.
 * The inode should already be truncated to 0 length and have
 * no pages associated with it.  This routine also assumes that
 * the inode is already a part of the transaction.
 *
 * The on-disk copy of the inode will have been added to the list
 * of unlinked inodes in the AGI. We need to remove the inode from
 * that list atomically with respect to freeing it here.
 */
int
xfs_ifree(
        xfs_trans_t     *tp,
        xfs_inode_t     *ip,
        xfs_bmap_free_t *flist)
{
        int                     error;
        int                     delete;
        xfs_ino_t               first_ino;

        ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));
        ASSERT(ip->i_transp == tp);
        ASSERT(ip->i_d.di_nlink == 0);
        ASSERT(ip->i_d.di_nextents == 0);
        ASSERT(ip->i_d.di_anextents == 0);
        ASSERT((ip->i_d.di_size == 0 && ip->i_size == 0) ||
               ((ip->i_d.di_mode & S_IFMT) != S_IFREG));
        ASSERT(ip->i_d.di_nblocks == 0);

        /*
         * Pull the on-disk inode from the AGI unlinked list.
         */
        error = xfs_iunlink_remove(tp, ip);
        if (error != 0) {
                return error;
        }

        error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino);
        if (error != 0) {
                return error;
        }
        ip->i_d.di_mode = 0;             /* mark incore inode as free */
        ip->i_d.di_flags = 0;
        ip->i_d.di_dmevmask = 0;
        ip->i_d.di_forkoff = 0;          /* mark the attr fork not in use */
        ip->i_df.if_ext_max =
                XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
        ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
        ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
        /*
         * Bump the generation count so no one will be confused
         * by reincarnations of this inode.
         */
        ip->i_d.di_gen++;
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

        if (delete) {
                xfs_ifree_cluster(ip, tp, first_ino);
        }

        return 0;
}
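
/*
 * Editor's note: the freeing sequence above, in order: pull the inode
 * off the AGI unlinked list (xfs_iunlink_remove), free it in the inode
 * allocation btree (xfs_difree), scrub the incore copy and bump di_gen,
 * and then, only if xfs_difree reported that the whole inode chunk went
 * free ("delete"), stale every incore inode and buffer in that cluster
 * via xfs_ifree_cluster.
 */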

/*
 * Reallocate the space for if_broot based on the number of records
 * being added or deleted as indicated in rec_diff.  Move the records
 * and pointers in if_broot to fit the new size.  When shrinking this
 * will eliminate holes between the records and pointers created by
 * the caller.  When growing this will create holes to be filled in
 * by the caller.
 *
 * The caller must not request to add more records than would fit in
 * the on-disk inode root.  If the if_broot is currently NULL, then
 * if we are adding records one will be allocated.  The caller must also
 * not request that the number of records go below zero, although
 * it can go to zero.
 *
 * ip -- the inode whose if_broot area is changing
 * rec_diff -- the change in the number of records, positive or negative,
 *       requested for the if_broot array.
 */
void
xfs_iroot_realloc(
        xfs_inode_t             *ip,
        int                     rec_diff,
        int                     whichfork)
{
        int                     cur_max;
        xfs_ifork_t             *ifp;
        xfs_bmbt_block_t        *new_broot;
        int                     new_max;
        size_t                  new_size;
        char                    *np;
        char                    *op;

        /*
         * Handle the degenerate case quietly.
         */
        if (rec_diff == 0) {
                return;
        }

        ifp = XFS_IFORK_PTR(ip, whichfork);
        if (rec_diff > 0) {
                /*
                 * If there wasn't any memory allocated before, just
                 * allocate it now and get out.
                 */
                if (ifp->if_broot_bytes == 0) {
                        new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff);
                        ifp->if_broot = (xfs_bmbt_block_t*)kmem_alloc(new_size,
                                                                     KM_SLEEP);
                        ifp->if_broot_bytes = (int)new_size;
                        return;
                }

                /*
                 * If there is already an existing if_broot, then we need
                 * to realloc() it and shift the pointers to their new
                 * location.  The records don't change location because
                 * they are kept butted up against the btree block header.
                 */
                cur_max = XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes);
                new_max = cur_max + rec_diff;
                new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
                ifp->if_broot = (xfs_bmbt_block_t *)
                  kmem_realloc(ifp->if_broot,
                                new_size,
                                (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */
                                KM_SLEEP);
                op = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1,
                                                      ifp->if_broot_bytes);
                np = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1,
                                                      (int)new_size);
                ifp->if_broot_bytes = (int)new_size;
                ASSERT(ifp->if_broot_bytes <=
                        XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
                memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t));
                return;
        }

        /*
         * rec_diff is less than 0.  In this case, we are shrinking the
         * if_broot buffer.  It must already exist.  If we go to zero
         * records, just get rid of the root and clear the status bit.
         */
        ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
        cur_max = XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes);
        new_max = cur_max + rec_diff;
        ASSERT(new_max >= 0);
        if (new_max > 0)
                new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
        else
                new_size = 0;
        if (new_size > 0) {
                new_broot = (xfs_bmbt_block_t *)kmem_alloc(new_size, KM_SLEEP);
                /*
                 * First copy over the btree block header.
                 */
                memcpy(new_broot, ifp->if_broot, sizeof(xfs_bmbt_block_t));
        } else {
                new_broot = NULL;
                ifp->if_flags &= ~XFS_IFBROOT;
        }

        /*
         * Only copy the records and pointers if there are any.
         */
        if (new_max > 0) {
                /*
                 * First copy the records.
                 */
                op = (char *)XFS_BMAP_BROOT_REC_ADDR(ifp->if_broot, 1,
                                                     ifp->if_broot_bytes);
                np = (char *)XFS_BMAP_BROOT_REC_ADDR(new_broot, 1,
                                                     (int)new_size);
                memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));

                /*
                 * Then copy the pointers.
                 */
                op = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1,
                                                     ifp->if_broot_bytes);
                np = (char *)XFS_BMAP_BROOT_PTR_ADDR(new_broot, 1,
                                                     (int)new_size);
                memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t));
        }
        kmem_free(ifp->if_broot, ifp->if_broot_bytes);
        ifp->if_broot = new_broot;
        ifp->if_broot_bytes = (int)new_size;
        ASSERT(ifp->if_broot_bytes <=
                XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
        return;
}
2552
 
2553
 
2554
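/*
 * Illustrative sketch, not part of this driver: a minimal userspace
 * analogue of the grow path above.  A root buffer holds a fixed
 * header, then records butted against it, then pointers at the tail;
 * growing it reallocs the buffer and memmove()s the pointer array out
 * to its new tail position, just as xfs_iroot_realloc() shifts the
 * pointers while the records stay put.  All names are hypothetical.
 */
#include <stdlib.h>
#include <string.h>

struct hdr { int nrecs; };

#define REC_SIZE        sizeof(long)
#define PTR_SIZE        sizeof(void *)

static char *
root_grow(char *buf, int cur_max, int new_max)
{
        size_t  new_size = sizeof(struct hdr) +
                           new_max * (REC_SIZE + PTR_SIZE);
        char    *nbuf = realloc(buf, new_size);

        if (nbuf == NULL)
                return NULL;
        /* pointers live at the tail: shift them out to the new tail */
        memmove(nbuf + sizeof(struct hdr) + new_max * REC_SIZE,
                nbuf + sizeof(struct hdr) + cur_max * REC_SIZE,
                cur_max * PTR_SIZE);
        return nbuf;
}
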
/*
 * This is called when the amount of space needed for if_data
 * is increased or decreased.  The change in size is indicated by
 * the number of bytes that need to be added or deleted in the
 * byte_diff parameter.
 *
 * If the amount of space needed has decreased below the size of the
 * inline buffer, then switch to using the inline buffer.  Otherwise,
 * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer
 * to what is needed.
 *
 * ip -- the inode whose if_data area is changing
 * byte_diff -- the change in the number of bytes, positive or negative,
 *       requested for the if_data array.
 */
void
xfs_idata_realloc(
        xfs_inode_t     *ip,
        int             byte_diff,
        int             whichfork)
{
        xfs_ifork_t     *ifp;
        int             new_size;
        int             real_size;

        if (byte_diff == 0) {
                return;
        }

        ifp = XFS_IFORK_PTR(ip, whichfork);
        new_size = (int)ifp->if_bytes + byte_diff;
        ASSERT(new_size >= 0);

        if (new_size == 0) {
                if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
                        kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes);
                }
                ifp->if_u1.if_data = NULL;
                real_size = 0;
        } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) {
                /*
                 * If the valid extents/data can fit in if_inline_ext/data,
                 * copy them from the malloc'd vector and free it.
                 */
                if (ifp->if_u1.if_data == NULL) {
                        ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
                } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
                        ASSERT(ifp->if_real_bytes != 0);
                        memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data,
                              new_size);
                        kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes);
                        ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
                }
                real_size = 0;
        } else {
                /*
                 * Stuck with malloc/realloc.
                 * For inline data, the underlying buffer must be
                 * a multiple of 4 bytes in size so that it can be
                 * logged and stay on word boundaries.  We enforce
                 * that here.
                 */
                real_size = roundup(new_size, 4);
                if (ifp->if_u1.if_data == NULL) {
                        ASSERT(ifp->if_real_bytes == 0);
                        ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
                } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
                        /*
                         * Only do the realloc if the underlying size
                         * is really changing.
                         */
                        if (ifp->if_real_bytes != real_size) {
                                ifp->if_u1.if_data =
                                        kmem_realloc(ifp->if_u1.if_data,
                                                        real_size,
                                                        ifp->if_real_bytes,
                                                        KM_SLEEP);
                        }
                } else {
                        ASSERT(ifp->if_real_bytes == 0);
                        ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
                        memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data,
                                ifp->if_bytes);
                }
        }
        ifp->if_real_bytes = real_size;
        ifp->if_bytes = new_size;
        ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
}




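/*
 * Illustrative sketch, not part of this driver: the same inline-buffer
 * trick in userspace.  Small payloads live in storage embedded in the
 * owning structure; only larger ones take a heap allocation, and
 * shrinking copies back inline and frees the heap copy.  Hypothetical
 * names; assumes f->data initially points at f->inline_data.
 */
#include <stdlib.h>
#include <string.h>

#define INLINE_BYTES    64

struct fork {
        char    *data;                  /* points inline or at heap */
        int     bytes;                  /* valid payload size */
        int     real_bytes;             /* heap size, 0 if inline */
        char    inline_data[INLINE_BYTES];
};

static int
fork_resize(struct fork *f, int new_size)
{
        if (new_size <= INLINE_BYTES) {
                if (f->real_bytes) {            /* shrink back inline */
                        memcpy(f->inline_data, f->data, new_size);
                        free(f->data);
                }
                f->data = f->inline_data;
                f->real_bytes = 0;
        } else {
                /* malloc on the inline -> heap switch, realloc otherwise */
                char *p = realloc(f->real_bytes ? f->data : NULL, new_size);

                if (p == NULL)
                        return -1;
                if (f->real_bytes == 0)
                        memcpy(p, f->inline_data, f->bytes);
                f->data = p;
                f->real_bytes = new_size;
        }
        f->bytes = new_size;
        return 0;
}
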
/*
 * Map inode to disk block and offset.
 *
 * mp -- the mount point structure for the current file system
 * tp -- the current transaction
 * ino -- the inode number of the inode to be located
 * imap -- this structure is filled in with the information necessary
 *       to retrieve the given inode from disk
 * flags -- flags to pass to xfs_dilocate indicating whether or not
 *       lookups in the inode btree are OK
 */
int
xfs_imap(
        xfs_mount_t     *mp,
        xfs_trans_t     *tp,
        xfs_ino_t       ino,
        xfs_imap_t      *imap,
        uint            flags)
{
        xfs_fsblock_t   fsbno;
        int             len;
        int             off;
        int             error;

        fsbno = imap->im_blkno ?
                XFS_DADDR_TO_FSB(mp, imap->im_blkno) : NULLFSBLOCK;
        error = xfs_dilocate(mp, tp, ino, &fsbno, &len, &off, flags);
        if (error != 0) {
                return error;
        }
        imap->im_blkno = XFS_FSB_TO_DADDR(mp, fsbno);
        imap->im_len = XFS_FSB_TO_BB(mp, len);
        imap->im_agblkno = XFS_FSB_TO_AGBNO(mp, fsbno);
        imap->im_ioffset = (ushort)off;
        imap->im_boffset = (ushort)(off << mp->m_sb.sb_inodelog);
        return 0;
}

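/*
 * Illustrative sketch, not part of this driver: the im_boffset
 * arithmetic above.  off is the inode's index within its cluster and
 * sb_inodelog is log2 of the inode size, so the byte offset is just
 * off << sb_inodelog; e.g. with 256-byte inodes (sb_inodelog = 8) the
 * inode at index 3 starts 3 << 8 = 768 bytes into the buffer.
 * Hypothetical helper name.
 */
static unsigned short
inode_byte_offset(unsigned short index, int inodelog)
{
        return (unsigned short)(index << inodelog);
}
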
void
xfs_idestroy_fork(
        xfs_inode_t     *ip,
        int             whichfork)
{
        xfs_ifork_t     *ifp;

        ifp = XFS_IFORK_PTR(ip, whichfork);
        if (ifp->if_broot != NULL) {
                kmem_free(ifp->if_broot, ifp->if_broot_bytes);
                ifp->if_broot = NULL;
        }

        /*
         * If the format is local, then we can't have an extents
         * array so just look for an inline data array.  If we're
         * not local then we may or may not have an extents list,
         * so check and free it up if we do.
         */
        if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
                if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) &&
                    (ifp->if_u1.if_data != NULL)) {
                        ASSERT(ifp->if_real_bytes != 0);
                        kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes);
                        ifp->if_u1.if_data = NULL;
                        ifp->if_real_bytes = 0;
                }
        } else if ((ifp->if_flags & XFS_IFEXTENTS) &&
                   ((ifp->if_flags & XFS_IFEXTIREC) ||
                    ((ifp->if_u1.if_extents != NULL) &&
                     (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) {
                ASSERT(ifp->if_real_bytes != 0);
                xfs_iext_destroy(ifp);
        }
        ASSERT(ifp->if_u1.if_extents == NULL ||
               ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext);
        ASSERT(ifp->if_real_bytes == 0);
        if (whichfork == XFS_ATTR_FORK) {
                kmem_zone_free(xfs_ifork_zone, ip->i_afp);
                ip->i_afp = NULL;
        }
}

/*
 * This is called to free all the memory associated with an inode.
 * It must free the inode itself and any buffers allocated for
 * if_extents/if_data and if_broot.  It must also free the lock
 * associated with the inode.
 */
void
xfs_idestroy(
        xfs_inode_t     *ip)
{

        switch (ip->i_d.di_mode & S_IFMT) {
        case S_IFREG:
        case S_IFDIR:
        case S_IFLNK:
                xfs_idestroy_fork(ip, XFS_DATA_FORK);
                break;
        }
        if (ip->i_afp)
                xfs_idestroy_fork(ip, XFS_ATTR_FORK);
        mrfree(&ip->i_lock);
        mrfree(&ip->i_iolock);
        freesema(&ip->i_flock);

#ifdef XFS_VNODE_TRACE
        ktrace_free(ip->i_trace);
#endif
#ifdef XFS_BMAP_TRACE
        ktrace_free(ip->i_xtrace);
#endif
#ifdef XFS_BMBT_TRACE
        ktrace_free(ip->i_btrace);
#endif
#ifdef XFS_RW_TRACE
        ktrace_free(ip->i_rwtrace);
#endif
#ifdef XFS_ILOCK_TRACE
        ktrace_free(ip->i_lock_trace);
#endif
#ifdef XFS_DIR2_TRACE
        ktrace_free(ip->i_dir_trace);
#endif
        if (ip->i_itemp) {
                /*
                 * Only if we are shutting down the fs will we see an
                 * inode still in the AIL. If it is there, we should remove
                 * it to prevent a use-after-free from occurring.
                 */
                xfs_mount_t     *mp = ip->i_mount;
                xfs_log_item_t  *lip = &ip->i_itemp->ili_item;
                int             s;

                ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) ||
                                       XFS_FORCED_SHUTDOWN(ip->i_mount));
                if (lip->li_flags & XFS_LI_IN_AIL) {
                        AIL_LOCK(mp, s);
                        if (lip->li_flags & XFS_LI_IN_AIL)
                                xfs_trans_delete_ail(mp, lip, s);
                        else
                                AIL_UNLOCK(mp, s);
                }
                xfs_inode_item_destroy(ip);
        }
        kmem_zone_free(xfs_inode_zone, ip);
}


/*
 * Increment the pin count of the given inode.
 * This value is protected by ipinlock spinlock in the mount structure.
 */
void
xfs_ipin(
        xfs_inode_t     *ip)
{
        ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));

        atomic_inc(&ip->i_pincount);
}

/*
 * Decrement the pin count of the given inode, and wake up
 * anyone in xfs_iunpin_wait() if the count goes to 0.  The
 * inode must have been previously pinned with a call to xfs_ipin().
 */
void
xfs_iunpin(
        xfs_inode_t     *ip)
{
        ASSERT(atomic_read(&ip->i_pincount) > 0);

        if (atomic_dec_and_lock(&ip->i_pincount, &ip->i_flags_lock)) {

                /*
                 * If the inode is currently being reclaimed, the link between
                 * the bhv_vnode and the xfs_inode will be broken after the
                 * XFS_IRECLAIM* flag is set. Hence, if these flags are not
                 * set, then we can move forward and mark the linux inode dirty
                 * knowing that it is still valid as it won't be freed until
                 * after the bhv_vnode<->xfs_inode link is broken in
                 * xfs_reclaim. The i_flags_lock is used to synchronise the
                 * setting of the XFS_IRECLAIM* flags and the breaking of the
                 * link, and so we can execute atomically w.r.t. reclaim by
                 * holding this lock here.
                 *
                 * However, we still need to issue the unpin wakeup call as the
                 * inode reclaim may be blocked waiting for the inode to become
                 * unpinned.
                 */

                if (!__xfs_iflags_test(ip, XFS_IRECLAIM|XFS_IRECLAIMABLE)) {
                        bhv_vnode_t     *vp = XFS_ITOV_NULL(ip);
                        struct inode *inode = NULL;

                        BUG_ON(vp == NULL);
                        inode = vn_to_inode(vp);
                        BUG_ON(inode->i_state & I_CLEAR);

                        /* make sync come back and flush this inode */
                        if (!(inode->i_state & (I_NEW|I_FREEING)))
                                mark_inode_dirty_sync(inode);
                }
                spin_unlock(&ip->i_flags_lock);
                wake_up(&ip->i_ipin_wait);
        }
}

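/*
 * Illustrative sketch, not part of this driver: a simplified
 * userspace analogue of the shape above (a dec-and-test followed by
 * taking a lock on the final drop; the real atomic_dec_and_lock() is
 * stronger, taking the lock before letting the count reach zero).
 * The common path stays lock-free, while the 1 -> 0 transition and
 * whatever it guards stay mutually exclusive.  Hypothetical names.
 */
#include <stdatomic.h>
#include <pthread.h>

struct obj {
        atomic_int      refs;
        pthread_mutex_t lock;
};

static void
obj_put(struct obj *o, void (*last_ref)(struct obj *))
{
        /* fast path: not the last reference */
        if (atomic_fetch_sub(&o->refs, 1) != 1)
                return;
        /* slow path: we dropped the last reference */
        pthread_mutex_lock(&o->lock);
        last_ref(o);                    /* runs under the lock, like the
                                         * wakeup path above */
        pthread_mutex_unlock(&o->lock);
}
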
/*
 * This is called to wait for the given inode to be unpinned.
 * It will sleep until this happens.  The caller must have the
 * inode locked in at least shared mode so that the inode cannot
 * be subsequently pinned once someone is waiting for it to be
 * unpinned.
 */
STATIC void
xfs_iunpin_wait(
        xfs_inode_t     *ip)
{
        xfs_inode_log_item_t    *iip;
        xfs_lsn_t       lsn;

        ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE | MR_ACCESS));

        if (atomic_read(&ip->i_pincount) == 0) {
                return;
        }

        iip = ip->i_itemp;
        if (iip && iip->ili_last_lsn) {
                lsn = iip->ili_last_lsn;
        } else {
                lsn = (xfs_lsn_t)0;
        }

        /*
         * Give the log a push so we don't wait here too long.
         */
        xfs_log_force(ip->i_mount, lsn, XFS_LOG_FORCE);

        wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0));
}


/*
 * xfs_iextents_copy()
 *
 * This is called to copy the REAL extents (as opposed to the delayed
 * allocation extents) from the inode into the given buffer.  It
 * returns the number of bytes copied into the buffer.
 *
 * If there are no delayed allocation extents, then we can just
 * memcpy() the extents into the buffer.  Otherwise, we need to
 * examine each extent in turn and skip those which are delayed.
 */
int
xfs_iextents_copy(
        xfs_inode_t             *ip,
        xfs_bmbt_rec_t          *dp,
        int                     whichfork)
{
        int                     copied;
        int                     i;
        xfs_ifork_t             *ifp;
        int                     nrecs;
        xfs_fsblock_t           start_block;

        ifp = XFS_IFORK_PTR(ip, whichfork);
        ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS));
        ASSERT(ifp->if_bytes > 0);

        nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
        XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork);
        ASSERT(nrecs > 0);

        /*
         * There are some delayed allocation extents in the
         * inode, so copy the extents one at a time and skip
         * the delayed ones.  There must be at least one
         * non-delayed extent.
         */
        copied = 0;
        for (i = 0; i < nrecs; i++) {
                xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
                start_block = xfs_bmbt_get_startblock(ep);
                if (ISNULLSTARTBLOCK(start_block)) {
                        /*
                         * It's a delayed allocation extent, so skip it.
                         */
                        continue;
                }

                /* Translate to on disk format */
                put_unaligned(cpu_to_be64(ep->l0), &dp->l0);
                put_unaligned(cpu_to_be64(ep->l1), &dp->l1);
                dp++;
                copied++;
        }
        ASSERT(copied != 0);
        xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip));

        return (copied * (uint)sizeof(xfs_bmbt_rec_t));
}

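/*
 * Illustrative sketch, not part of this driver: what the
 * cpu_to_be64()/put_unaligned() pair above accomplishes, written as
 * a portable userspace helper.  Storing byte by byte both fixes the
 * endianness of the on-disk image and avoids unaligned-access traps
 * on strict-alignment CPUs.
 */
#include <stdint.h>

static void
store_be64(uint8_t *dst, uint64_t v)
{
        int i;

        for (i = 7; i >= 0; i--) {      /* dst[0] gets the MSB */
                dst[i] = (uint8_t)(v & 0xff);
                v >>= 8;
        }
}
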
/*
 * Each of the following cases stores data into the same region
 * of the on-disk inode, so only one of them can be valid at
 * any given time. While it is possible to have conflicting formats
 * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is
 * in EXTENTS format, this can only happen when the fork has
 * changed formats after being modified but before being flushed.
 * In these cases, the format always takes precedence, because the
 * format indicates the current state of the fork.
 */
/*ARGSUSED*/
STATIC int
xfs_iflush_fork(
        xfs_inode_t             *ip,
        xfs_dinode_t            *dip,
        xfs_inode_log_item_t    *iip,
        int                     whichfork,
        xfs_buf_t               *bp)
{
        char                    *cp;
        xfs_ifork_t             *ifp;
        xfs_mount_t             *mp;
#ifdef XFS_TRANS_DEBUG
        int                     first;
#endif
        static const short      brootflag[2] =
                { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT };
        static const short      dataflag[2] =
                { XFS_ILOG_DDATA, XFS_ILOG_ADATA };
        static const short      extflag[2] =
                { XFS_ILOG_DEXT, XFS_ILOG_AEXT };

        if (iip == NULL)
                return 0;
        ifp = XFS_IFORK_PTR(ip, whichfork);
        /*
         * This can happen if we gave up in iformat in an error path,
         * for the attribute fork.
         */
        if (ifp == NULL) {
                ASSERT(whichfork == XFS_ATTR_FORK);
                return 0;
        }
        cp = XFS_DFORK_PTR(dip, whichfork);
        mp = ip->i_mount;
        switch (XFS_IFORK_FORMAT(ip, whichfork)) {
        case XFS_DINODE_FMT_LOCAL:
                if ((iip->ili_format.ilf_fields & dataflag[whichfork]) &&
                    (ifp->if_bytes > 0)) {
                        ASSERT(ifp->if_u1.if_data != NULL);
                        ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
                        memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
                }
                break;

        case XFS_DINODE_FMT_EXTENTS:
                ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
                       !(iip->ili_format.ilf_fields & extflag[whichfork]));
                ASSERT((xfs_iext_get_ext(ifp, 0) != NULL) ||
                        (ifp->if_bytes == 0));
                ASSERT((xfs_iext_get_ext(ifp, 0) == NULL) ||
                        (ifp->if_bytes > 0));
                if ((iip->ili_format.ilf_fields & extflag[whichfork]) &&
                    (ifp->if_bytes > 0)) {
                        ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
                        (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
                                whichfork);
                }
                break;

        case XFS_DINODE_FMT_BTREE:
                if ((iip->ili_format.ilf_fields & brootflag[whichfork]) &&
                    (ifp->if_broot_bytes > 0)) {
                        ASSERT(ifp->if_broot != NULL);
                        ASSERT(ifp->if_broot_bytes <=
                               (XFS_IFORK_SIZE(ip, whichfork) +
                                XFS_BROOT_SIZE_ADJ));
                        xfs_bmbt_to_bmdr(ifp->if_broot, ifp->if_broot_bytes,
                                (xfs_bmdr_block_t *)cp,
                                XFS_DFORK_SIZE(dip, mp, whichfork));
                }
                break;

        case XFS_DINODE_FMT_DEV:
                if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) {
                        ASSERT(whichfork == XFS_DATA_FORK);
                        dip->di_u.di_dev = cpu_to_be32(ip->i_df.if_u2.if_rdev);
                }
                break;

        case XFS_DINODE_FMT_UUID:
                if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) {
                        ASSERT(whichfork == XFS_DATA_FORK);
                        memcpy(&dip->di_u.di_muuid, &ip->i_df.if_u2.if_uuid,
                                sizeof(uuid_t));
                }
                break;

        default:
                ASSERT(0);
                break;
        }

        return 0;
}

/*
 * xfs_iflush() will write a modified inode's changes out to the
 * inode's on disk home.  The caller must have the inode lock held
 * in at least shared mode and the inode flush semaphore must be
 * held as well.  The inode lock will still be held upon return from
 * the call and the caller is free to unlock it.
 * The inode flush lock will be unlocked when the inode reaches the disk.
 * The flags indicate how the inode's buffer should be written out.
 */
int
xfs_iflush(
        xfs_inode_t             *ip,
        uint                    flags)
{
        xfs_inode_log_item_t    *iip;
        xfs_buf_t               *bp;
        xfs_dinode_t            *dip;
        xfs_mount_t             *mp;
        int                     error;
        /* REFERENCED */
        xfs_inode_t             *iq;
        int                     clcount;        /* count of inodes clustered */
        int                     bufwasdelwri;
        struct hlist_node       *entry;
        enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) };

        XFS_STATS_INC(xs_iflush_count);

        ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS));
        ASSERT(issemalocked(&(ip->i_flock)));
        ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
               ip->i_d.di_nextents > ip->i_df.if_ext_max);

        iip = ip->i_itemp;
        mp = ip->i_mount;

        /*
         * If the inode isn't dirty, then just release the inode
         * flush lock and do nothing.
         */
        if ((ip->i_update_core == 0) &&
            ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
                ASSERT((iip != NULL) ?
                         !(iip->ili_item.li_flags & XFS_LI_IN_AIL) : 1);
                xfs_ifunlock(ip);
                return 0;
        }

        /*
         * We can't flush the inode until it is unpinned, so
         * wait for it.  We know no one new can pin it, because
         * we are holding the inode lock shared and you need
         * to hold it exclusively to pin the inode.
         */
        xfs_iunpin_wait(ip);

        /*
         * This may have been unpinned because the filesystem is shutting
         * down forcibly. If that's the case we must not write this inode
         * to disk, because the log record didn't make it to disk!
         */
        if (XFS_FORCED_SHUTDOWN(mp)) {
                ip->i_update_core = 0;
                if (iip)
                        iip->ili_format.ilf_fields = 0;
                xfs_ifunlock(ip);
                return XFS_ERROR(EIO);
        }

        /*
         * Get the buffer containing the on-disk inode.
         */
        error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0, 0);
        if (error) {
                xfs_ifunlock(ip);
                return error;
        }

        /*
         * Decide how buffer will be flushed out.  This is done before
         * the call to xfs_iflush_int because this field is zeroed by it.
         */
        if (iip != NULL && iip->ili_format.ilf_fields != 0) {
                /*
                 * Flush out the inode buffer according to the directions
                 * of the caller.  In the cases where the caller has given
                 * us a choice choose the non-delwri case.  This is because
                 * the inode is in the AIL and we need to get it out soon.
                 */
                switch (flags) {
                case XFS_IFLUSH_SYNC:
                case XFS_IFLUSH_DELWRI_ELSE_SYNC:
                        flags = 0;
                        break;
                case XFS_IFLUSH_ASYNC:
                case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
                        flags = INT_ASYNC;
                        break;
                case XFS_IFLUSH_DELWRI:
                        flags = INT_DELWRI;
                        break;
                default:
                        ASSERT(0);
                        flags = 0;
                        break;
                }
        } else {
                switch (flags) {
                case XFS_IFLUSH_DELWRI_ELSE_SYNC:
                case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
                case XFS_IFLUSH_DELWRI:
                        flags = INT_DELWRI;
                        break;
                case XFS_IFLUSH_ASYNC:
                        flags = INT_ASYNC;
                        break;
                case XFS_IFLUSH_SYNC:
                        flags = 0;
                        break;
                default:
                        ASSERT(0);
                        flags = 0;
                        break;
                }
        }

        /*
         * First flush out the inode that xfs_iflush was called with.
         */
        error = xfs_iflush_int(ip, bp);
        if (error) {
                goto corrupt_out;
        }

        /*
         * inode clustering:
         * see if other inodes can be gathered into this write
         */
        spin_lock(&ip->i_cluster->icl_lock);
        ip->i_cluster->icl_buf = bp;

        clcount = 0;
        hlist_for_each_entry(iq, entry, &ip->i_cluster->icl_inodes, i_cnode) {
                if (iq == ip)
                        continue;

                /*
                 * Do an un-protected check to see if the inode is dirty and
                 * is a candidate for flushing.  These checks will be repeated
                 * later after the appropriate locks are acquired.
                 */
                iip = iq->i_itemp;
                if ((iq->i_update_core == 0) &&
                    ((iip == NULL) ||
                     !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)) &&
                      xfs_ipincount(iq) == 0) {
                        continue;
                }

                /*
                 * Try to get locks.  If any are unavailable,
                 * then this inode cannot be flushed and is skipped.
                 */

                /* get inode locks (just i_lock) */
                if (xfs_ilock_nowait(iq, XFS_ILOCK_SHARED)) {
                        /* get inode flush lock */
                        if (xfs_iflock_nowait(iq)) {
                                /* check if pinned */
                                if (xfs_ipincount(iq) == 0) {
                                        /* arriving here means that
                                         * this inode can be flushed.
                                         * first re-check that it's
                                         * dirty
                                         */
                                        iip = iq->i_itemp;
                                        if ((iq->i_update_core != 0) ||
                                            ((iip != NULL) &&
                                             (iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
                                                clcount++;
                                                error = xfs_iflush_int(iq, bp);
                                                if (error) {
                                                        xfs_iunlock(iq,
                                                                    XFS_ILOCK_SHARED);
                                                        goto cluster_corrupt_out;
                                                }
                                        } else {
                                                xfs_ifunlock(iq);
                                        }
                                } else {
                                        xfs_ifunlock(iq);
                                }
                        }
                        xfs_iunlock(iq, XFS_ILOCK_SHARED);
                }
        }
        spin_unlock(&ip->i_cluster->icl_lock);

        if (clcount) {
                XFS_STATS_INC(xs_icluster_flushcnt);
                XFS_STATS_ADD(xs_icluster_flushinode, clcount);
        }

        /*
         * If the buffer is pinned then push on the log so we won't
         * get stuck waiting in the write for too long.
         */
        if (XFS_BUF_ISPINNED(bp)) {
                xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
        }

        if (flags & INT_DELWRI) {
                xfs_bdwrite(mp, bp);
        } else if (flags & INT_ASYNC) {
                xfs_bawrite(mp, bp);
        } else {
                error = xfs_bwrite(mp, bp);
        }
        return error;

corrupt_out:
        xfs_buf_relse(bp);
        xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
        xfs_iflush_abort(ip);
        /*
         * Unlocks the flush lock
         */
        return XFS_ERROR(EFSCORRUPTED);

cluster_corrupt_out:
        /* Corruption detected in the clustering loop.  Invalidate the
         * inode buffer and shut down the filesystem.
         */
        spin_unlock(&ip->i_cluster->icl_lock);

        /*
         * Clean up the buffer.  If it was B_DELWRI, just release it --
         * brelse can handle it with no problems.  If not, shut down the
         * filesystem before releasing the buffer.
         */
        if ((bufwasdelwri = XFS_BUF_ISDELAYWRITE(bp))) {
                xfs_buf_relse(bp);
        }

        xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);

        if (!bufwasdelwri) {
                /*
                 * Just like incore_relse: if we have b_iodone functions,
                 * mark the buffer as an error and call them.  Otherwise
                 * mark it as stale and brelse.
                 */
                if (XFS_BUF_IODONE_FUNC(bp)) {
                        XFS_BUF_CLR_BDSTRAT_FUNC(bp);
                        XFS_BUF_UNDONE(bp);
                        XFS_BUF_STALE(bp);
                        XFS_BUF_SHUT(bp);
                        XFS_BUF_ERROR(bp, EIO);
                        xfs_biodone(bp);
                } else {
                        XFS_BUF_STALE(bp);
                        xfs_buf_relse(bp);
                }
        }

        xfs_iflush_abort(iq);
        /*
         * Unlocks the flush lock
         */
        return XFS_ERROR(EFSCORRUPTED);
}


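/*
 * Illustrative sketch, not part of this driver: the opportunistic
 * clustering idiom above, reduced to its shape in userspace.
 * Neighbours are gathered with trylocks only; anything contended is
 * skipped rather than waited for, so building the batch never blocks
 * on a lock, and dirtiness is re-checked under the lock exactly as
 * the loop above re-checks each inode.  Hypothetical names.
 */
#include <pthread.h>
#include <stddef.h>

struct item {
        pthread_mutex_t lock;
        int             dirty;
        struct item     *next;
};

static int
flush_batch(struct item *head, void (*flush)(struct item *))
{
        struct item     *it;
        int             count = 0;

        for (it = head; it != NULL; it = it->next) {
                if (!it->dirty)                 /* unlocked pre-check */
                        continue;
                if (pthread_mutex_trylock(&it->lock) != 0)
                        continue;               /* contended: skip it */
                if (it->dirty) {                /* re-check under lock */
                        flush(it);
                        it->dirty = 0;
                        count++;
                }
                pthread_mutex_unlock(&it->lock);
        }
        return count;
}
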
STATIC int
xfs_iflush_int(
        xfs_inode_t             *ip,
        xfs_buf_t               *bp)
{
        xfs_inode_log_item_t    *iip;
        xfs_dinode_t            *dip;
        xfs_mount_t             *mp;
#ifdef XFS_TRANS_DEBUG
        int                     first;
#endif
        SPLDECL(s);

        ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS));
        ASSERT(issemalocked(&(ip->i_flock)));
        ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
               ip->i_d.di_nextents > ip->i_df.if_ext_max);

        iip = ip->i_itemp;
        mp = ip->i_mount;

        /*
         * If the inode isn't dirty, then just release the inode
         * flush lock and do nothing.
         */
        if ((ip->i_update_core == 0) &&
            ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
                xfs_ifunlock(ip);
                return 0;
        }

        /* set *dip = inode's place in the buffer */
        dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_boffset);

        /*
         * Clear i_update_core before copying out the data.
         * This is for coordination with our timestamp updates
         * that don't hold the inode lock. They will always
         * update the timestamps BEFORE setting i_update_core,
         * so if we clear i_update_core after they set it we
         * are guaranteed to see their updates to the timestamps.
         * I believe that this depends on strongly ordered memory
         * semantics, but we have that.  We use the SYNCHRONIZE
         * macro to make sure that the compiler does not reorder
         * the i_update_core access below the data copy below.
         */
        ip->i_update_core = 0;
        SYNCHRONIZE();

        /*
         * Make sure to get the latest atime from the Linux inode.
         */
        xfs_synchronize_atime(ip);

        if (XFS_TEST_ERROR(be16_to_cpu(dip->di_core.di_magic) != XFS_DINODE_MAGIC,
                               mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
                xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
                    "xfs_iflush: Bad inode %Lu magic number 0x%x, ptr 0x%p",
                        ip->i_ino, be16_to_cpu(dip->di_core.di_magic), dip);
                goto corrupt_out;
        }
        if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC,
                                mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) {
                xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
                        "xfs_iflush: Bad inode %Lu, ptr 0x%p, magic number 0x%x",
                        ip->i_ino, ip, ip->i_d.di_magic);
                goto corrupt_out;
        }
        if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
                if (XFS_TEST_ERROR(
                    (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
                    (ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
                    mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) {
                        xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
                                "xfs_iflush: Bad regular inode %Lu, ptr 0x%p",
                                ip->i_ino, ip);
                        goto corrupt_out;
                }
        } else if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
                if (XFS_TEST_ERROR(
                    (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
                    (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
                    (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL),
                    mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) {
                        xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
                                "xfs_iflush: Bad directory inode %Lu, ptr 0x%p",
                                ip->i_ino, ip);
                        goto corrupt_out;
                }
        }
        if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents >
                                ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5,
                                XFS_RANDOM_IFLUSH_5)) {
                xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
                        "xfs_iflush: detected corrupt incore inode %Lu, total extents = %d, nblocks = %Ld, ptr 0x%p",
                        ip->i_ino,
                        ip->i_d.di_nextents + ip->i_d.di_anextents,
                        ip->i_d.di_nblocks,
                        ip);
                goto corrupt_out;
        }
        if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize,
                                mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) {
                xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
                        "xfs_iflush: bad inode %Lu, forkoff 0x%x, ptr 0x%p",
                        ip->i_ino, ip->i_d.di_forkoff, ip);
                goto corrupt_out;
        }
        /*
         * bump the flush iteration count, used to detect flushes which
         * postdate a log record during recovery.
         */

        ip->i_d.di_flushiter++;

        /*
         * Copy the dirty parts of the inode into the on-disk
         * inode.  We always copy out the core of the inode,
         * because if the inode is dirty at all the core must
         * be.
         */
        xfs_dinode_to_disk(&dip->di_core, &ip->i_d);

        /* Wrap, we never let the log put out DI_MAX_FLUSH */
        if (ip->i_d.di_flushiter == DI_MAX_FLUSH)
                ip->i_d.di_flushiter = 0;

        /*
         * If this is really an old format inode and the superblock version
         * has not been updated to support only new format inodes, then
         * convert back to the old inode format.  If the superblock version
         * has been updated, then make the conversion permanent.
         */
        ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1 ||
               XFS_SB_VERSION_HASNLINK(&mp->m_sb));
        if (ip->i_d.di_version == XFS_DINODE_VERSION_1) {
                if (!XFS_SB_VERSION_HASNLINK(&mp->m_sb)) {
                        /*
                         * Convert it back.
                         */
                        ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
                        dip->di_core.di_onlink = cpu_to_be16(ip->i_d.di_nlink);
                } else {
                        /*
                         * The superblock version has already been bumped,
                         * so just make the conversion to the new inode
                         * format permanent.
                         */
                        ip->i_d.di_version = XFS_DINODE_VERSION_2;
                        dip->di_core.di_version = XFS_DINODE_VERSION_2;
                        ip->i_d.di_onlink = 0;
                        dip->di_core.di_onlink = 0;
                        memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
                        memset(&(dip->di_core.di_pad[0]), 0,
                              sizeof(dip->di_core.di_pad));
                        ASSERT(ip->i_d.di_projid == 0);
                }
        }

        if (xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp) == EFSCORRUPTED) {
                goto corrupt_out;
        }

        if (XFS_IFORK_Q(ip)) {
                /*
                 * The only error from xfs_iflush_fork is on the data fork.
                 */
                (void) xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp);
        }
        xfs_inobp_check(mp, bp);

        /*
         * We've recorded everything logged in the inode, so we'd
         * like to clear the ilf_fields bits so we don't log and
         * flush things unnecessarily.  However, we can't stop
         * logging all this information until the data we've copied
         * into the disk buffer is written to disk.  If we did we might
         * overwrite the copy of the inode in the log with all the
         * data after re-logging only part of it, and in the face of
         * a crash we wouldn't have all the data we need to recover.
         *
         * What we do is move the bits to the ili_last_fields field.
         * When logging the inode, these bits are moved back to the
         * ilf_fields field.  In the xfs_iflush_done() routine we
         * clear ili_last_fields, since we know that the information
         * those bits represent is permanently on disk.  As long as
         * the flush completes before the inode is logged again, then
         * both ilf_fields and ili_last_fields will be cleared.
         *
         * We can play with the ilf_fields bits here, because the inode
         * lock must be held exclusively in order to set bits there
         * and the flush lock protects the ili_last_fields bits.
         * Set ili_logged so the flush done
         * routine can tell whether or not to look in the AIL.
         * Also, store the current LSN of the inode so that we can tell
         * whether the item has moved in the AIL from xfs_iflush_done().
         * In order to read the lsn we need the AIL lock, because
         * it is a 64 bit value that cannot be read atomically.
         */
        if (iip != NULL && iip->ili_format.ilf_fields != 0) {
                iip->ili_last_fields = iip->ili_format.ilf_fields;
                iip->ili_format.ilf_fields = 0;
                iip->ili_logged = 1;

                ASSERT(sizeof(xfs_lsn_t) == 8); /* don't lock if it shrinks */
                AIL_LOCK(mp,s);
                iip->ili_flush_lsn = iip->ili_item.li_lsn;
                AIL_UNLOCK(mp, s);

                /*
                 * Attach the function xfs_iflush_done to the inode's
                 * buffer.  This will remove the inode from the AIL
                 * and unlock the inode's flush lock when the inode is
                 * completely written to disk.
                 */
                xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t*,xfs_log_item_t*))
                                      xfs_iflush_done, (xfs_log_item_t *)iip);

                ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
                ASSERT(XFS_BUF_IODONE_FUNC(bp) != NULL);
        } else {
                /*
                 * We're flushing an inode which is not in the AIL and has
                 * not been logged but has i_update_core set.  For this
                 * case we can use a B_DELWRI flush and immediately drop
                 * the inode flush lock because we can avoid the whole
                 * AIL state thing.  It's OK to drop the flush lock now,
                 * because we've already locked the buffer and to do anything
                 * you really need both.
                 */
                if (iip != NULL) {
                        ASSERT(iip->ili_logged == 0);
                        ASSERT(iip->ili_last_fields == 0);
                        ASSERT((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0);
                }
                xfs_ifunlock(ip);
        }

        return 0;

corrupt_out:
        return XFS_ERROR(EFSCORRUPTED);
}


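/*
 * Illustrative sketch, not part of this driver: the publication
 * protocol that the i_update_core comment above relies on, written
 * with C11 atomics in userspace.  The writer updates the data before
 * setting the flag; the flusher clears the flag before reading the
 * data, so a racing update is either seen by this flush or leaves
 * the flag set for the next one.  Hypothetical names.
 */
#include <stdatomic.h>

static _Atomic int      update_core;    /* stands in for i_update_core */
static _Atomic long     stamp;          /* stands in for the timestamps */

static void
writer(long now)
{
        atomic_store_explicit(&stamp, now, memory_order_relaxed);
        atomic_store_explicit(&update_core, 1, memory_order_release);
}

static long
flusher(void)
{
        atomic_store_explicit(&update_core, 0, memory_order_relaxed);
        atomic_thread_fence(memory_order_seq_cst);  /* like SYNCHRONIZE() */
        return atomic_load_explicit(&stamp, memory_order_relaxed);
}
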
/*
 * Flush all inactive inodes in mp.
 */
void
xfs_iflush_all(
        xfs_mount_t     *mp)
{
        xfs_inode_t     *ip;
        bhv_vnode_t     *vp;

 again:
        XFS_MOUNT_ILOCK(mp);
        ip = mp->m_inodes;
        if (ip == NULL)
                goto out;

        do {
                /* Make sure we skip markers inserted by sync */
                if (ip->i_mount == NULL) {
                        ip = ip->i_mnext;
                        continue;
                }

                vp = XFS_ITOV_NULL(ip);
                if (!vp) {
                        XFS_MOUNT_IUNLOCK(mp);
                        xfs_finish_reclaim(ip, 0, XFS_IFLUSH_ASYNC);
                        goto again;
                }

                ASSERT(vn_count(vp) == 0);

                ip = ip->i_mnext;
        } while (ip != mp->m_inodes);
 out:
        XFS_MOUNT_IUNLOCK(mp);
}

/*
 * xfs_iaccess: check accessibility of inode for mode.
 */
int
xfs_iaccess(
        xfs_inode_t     *ip,
        mode_t          mode,
        cred_t          *cr)
{
        int             error;
        mode_t          orgmode = mode;
        struct inode    *inode = vn_to_inode(XFS_ITOV(ip));

        if (mode & S_IWUSR) {
                umode_t         imode = inode->i_mode;

                if (IS_RDONLY(inode) &&
                    (S_ISREG(imode) || S_ISDIR(imode) || S_ISLNK(imode)))
                        return XFS_ERROR(EROFS);

                if (IS_IMMUTABLE(inode))
                        return XFS_ERROR(EACCES);
        }

        /*
         * If there's an Access Control List it's used instead of
         * the mode bits.
         */
        if ((error = _ACL_XFS_IACCESS(ip, mode, cr)) != -1)
                return error ? XFS_ERROR(error) : 0;

        if (current_fsuid(cr) != ip->i_d.di_uid) {
                mode >>= 3;
                if (!in_group_p((gid_t)ip->i_d.di_gid))
                        mode >>= 3;
        }

        /*
         * If the DACs are ok we don't need any capability check.
         */
        if ((ip->i_d.di_mode & mode) == mode)
                return 0;
        /*
         * Read/write DACs are always overridable.
         * Executable DACs are overridable if at least one exec bit is set.
         */
        if (!(orgmode & S_IXUSR) ||
            (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
                if (capable_cred(cr, CAP_DAC_OVERRIDE))
                        return 0;

        if ((orgmode == S_IRUSR) ||
            (S_ISDIR(inode->i_mode) && (!(orgmode & S_IWUSR)))) {
                if (capable_cred(cr, CAP_DAC_READ_SEARCH))
                        return 0;
#ifdef  NOISE
                cmn_err(CE_NOTE, "Ick: mode=%o, orgmode=%o", mode, orgmode);
#endif  /* NOISE */
                return XFS_ERROR(EACCES);
        }
        return XFS_ERROR(EACCES);
}

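/*
 * Illustrative sketch, not part of this driver: the mode >>= 3 trick
 * above.  POSIX mode bits pack the owner/group/other classes three
 * bits apart, so shifting the requested rwx mask right by 3 retargets
 * the same comparison at the next class.  Hypothetical helper.
 */
#include <sys/stat.h>

static int
class_mode_ok(mode_t file_mode, mode_t want, int is_owner, int in_group)
{
        if (!is_owner) {
                want >>= 3;             /* owner bits -> group bits */
                if (!in_group)
                        want >>= 3;     /* group bits -> other bits */
        }
        return (file_mode & want) == want;      /* 1 if the DACs allow it */
}

/* e.g. class_mode_ok(0750, S_IRUSR|S_IXUSR, 0, 1) tests r-x against 0050 */
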
/*
 * xfs_iroundup: round up argument to next power of two
 */
uint
xfs_iroundup(
        uint    v)
{
        int i;
        uint m;

        if ((v & (v - 1)) == 0)
                return v;
        ASSERT((v & 0x80000000) == 0);
        if ((v & (v + 1)) == 0)
                return v + 1;
        for (i = 0, m = 1; i < 31; i++, m <<= 1) {
                if (v & m)
                        continue;
                v |= m;
                if ((v & (v + 1)) == 0)
                        return v + 1;
        }
        ASSERT(0);
        return 0;
}

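/*
 * Illustrative sketch, not part of this driver: the classic
 * bit-smearing alternative to the loop above.  Propagating the top
 * set bit into every lower position turns v - 1 into a mask of ones,
 * and adding one yields the next power of two; e.g. 100 -> 99 ->
 * 127 -> 128, while exact powers of two map to themselves.
 */
static unsigned int
roundup_pow2(unsigned int v)
{
        v--;                    /* so exact powers of two are unchanged */
        v |= v >> 1;
        v |= v >> 2;
        v |= v >> 4;
        v |= v >> 8;
        v |= v >> 16;
        return v + 1;
}
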
#ifdef XFS_ILOCK_TRACE
ktrace_t        *xfs_ilock_trace_buf;

void
xfs_ilock_trace(xfs_inode_t *ip, int lock, unsigned int lockflags, inst_t *ra)
{
        ktrace_enter(ip->i_lock_trace,
                     (void *)ip,
                     (void *)(unsigned long)lock, /* 1 = LOCK, 3=UNLOCK, etc */
                     (void *)(unsigned long)lockflags, /* XFS_ILOCK_EXCL etc */
                     (void *)ra,                /* caller of ilock */
                     (void *)(unsigned long)current_cpu(),
                     (void *)(unsigned long)current_pid(),
                     NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL);
}
#endif

/*
 * Return a pointer to the extent record at file index idx.
 */
xfs_bmbt_rec_host_t *
xfs_iext_get_ext(
        xfs_ifork_t     *ifp,           /* inode fork pointer */
        xfs_extnum_t    idx)            /* index of target extent */
{
        ASSERT(idx >= 0);
        if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) {
                return ifp->if_u1.if_ext_irec->er_extbuf;
        } else if (ifp->if_flags & XFS_IFEXTIREC) {
                xfs_ext_irec_t  *erp;           /* irec pointer */
                int             erp_idx = 0;    /* irec index */
                xfs_extnum_t    page_idx = idx; /* ext index in target list */

                erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
                return &erp->er_extbuf[page_idx];
        } else if (ifp->if_bytes) {
                return &ifp->if_u1.if_extents[idx];
        } else {
                return NULL;
        }
}

/*
 * Insert new item(s) into the extent records for incore inode
 * fork 'ifp'.  'count' new items are inserted at index 'idx'.
 */
void
xfs_iext_insert(
        xfs_ifork_t     *ifp,           /* inode fork pointer */
        xfs_extnum_t    idx,            /* starting index of new items */
        xfs_extnum_t    count,          /* number of inserted items */
        xfs_bmbt_irec_t *new)           /* items to insert */
{
        xfs_extnum_t    i;              /* extent record index */

        ASSERT(ifp->if_flags & XFS_IFEXTENTS);
        xfs_iext_add(ifp, idx, count);
        for (i = idx; i < idx + count; i++, new++)
                xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new);
}

/*
 * This is called when the amount of space required for incore file
 * extents needs to be increased. The ext_diff parameter stores the
 * number of new extents being added and the idx parameter contains
 * the extent index where the new extents will be added. If the new
 * extents are being appended, then we just need to (re)allocate and
 * initialize the space. Otherwise, if the new extents are being
 * inserted into the middle of the existing entries, a bit more work
 * is required to make room for the new extents to be inserted. The
 * caller is responsible for filling in the new extent entries upon
 * return.
 */
void
xfs_iext_add(
        xfs_ifork_t     *ifp,           /* inode fork pointer */
        xfs_extnum_t    idx,            /* index to begin adding exts */
        int             ext_diff)       /* number of extents to add */
{
        int             byte_diff;      /* new bytes being added */
        int             new_size;       /* size of extents after adding */
        xfs_extnum_t    nextents;       /* number of extents in file */

        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
        ASSERT((idx >= 0) && (idx <= nextents));
        byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t);
        new_size = ifp->if_bytes + byte_diff;
        /*
         * If the new number of extents (nextents + ext_diff)
         * fits inside the inode, then continue to use the inline
         * extent buffer.
         */
        if (nextents + ext_diff <= XFS_INLINE_EXTS) {
                if (idx < nextents) {
                        memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff],
                                &ifp->if_u2.if_inline_ext[idx],
                                (nextents - idx) * sizeof(xfs_bmbt_rec_t));
                        memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff);
                }
                ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
                ifp->if_real_bytes = 0;
                ifp->if_lastex = nextents + ext_diff;
        }
        /*
         * Otherwise use a linear (direct) extent list.
         * If the extents are currently inside the inode,
         * xfs_iext_realloc_direct will switch us from
         * inline to direct extent allocation mode.
         */
        else if (nextents + ext_diff <= XFS_LINEAR_EXTS) {
                xfs_iext_realloc_direct(ifp, new_size);
                if (idx < nextents) {
                        memmove(&ifp->if_u1.if_extents[idx + ext_diff],
                                &ifp->if_u1.if_extents[idx],
                                (nextents - idx) * sizeof(xfs_bmbt_rec_t));
                        memset(&ifp->if_u1.if_extents[idx], 0, byte_diff);
                }
        }
        /* Indirection array */
        else {
                xfs_ext_irec_t  *erp;
                int             erp_idx = 0;
                int             page_idx = idx;

                ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS);
                if (ifp->if_flags & XFS_IFEXTIREC) {
                        erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1);
                } else {
                        xfs_iext_irec_init(ifp);
                        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
                        erp = ifp->if_u1.if_ext_irec;
                }
                /* Extents fit in target extent page */
                if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) {
                        if (page_idx < erp->er_extcount) {
                                memmove(&erp->er_extbuf[page_idx + ext_diff],
                                        &erp->er_extbuf[page_idx],
                                        (erp->er_extcount - page_idx) *
                                        sizeof(xfs_bmbt_rec_t));
                                memset(&erp->er_extbuf[page_idx], 0, byte_diff);
                        }
                        erp->er_extcount += ext_diff;
                        xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
                }
                /* Insert a new extent page */
                else if (erp) {
                        xfs_iext_add_indirect_multi(ifp,
                                erp_idx, page_idx, ext_diff);
                }
                /*
                 * If extent(s) are being appended to the last page in
                 * the indirection array and the new extent(s) don't fit
                 * in the page, then erp is NULL and erp_idx is set to
                 * the next index needed in the indirection array.
                 */
                else {
                        int     count = ext_diff;

                        while (count) {
                                erp = xfs_iext_irec_new(ifp, erp_idx);
                                erp->er_extcount = count;
                                count -= MIN(count, (int)XFS_LINEAR_EXTS);
                                if (count) {
                                        erp_idx++;
                                }
                        }
                }
        }
        ifp->if_bytes = new_size;
}

/*
3875
 * This is called when incore extents are being added to the indirection
3876
 * array and the new extents do not fit in the target extent list. The
3877
 * erp_idx parameter contains the irec index for the target extent list
3878
 * in the indirection array, and the idx parameter contains the extent
3879
 * index within the list. The number of extents being added is stored
3880
 * in the count parameter.
3881
 *
3882
 *    |-------|   |-------|
3883
 *    |       |   |       |    idx - number of extents before idx
3884
 *    |  idx  |   | count |
3885
 *    |       |   |       |    count - number of extents being inserted at idx
3886
 *    |-------|   |-------|
3887
 *    | count |   | nex2  |    nex2 - number of extents after idx + count
3888
 *    |-------|   |-------|
3889
 */
3890
void
3891
xfs_iext_add_indirect_multi(
3892
        xfs_ifork_t     *ifp,                   /* inode fork pointer */
3893
        int             erp_idx,                /* target extent irec index */
3894
        xfs_extnum_t    idx,                    /* index within target list */
3895
        int             count)                  /* new extents being added */
3896
{
3897
        int             byte_diff;              /* new bytes being added */
3898
        xfs_ext_irec_t  *erp;                   /* pointer to irec entry */
3899
        xfs_extnum_t    ext_diff;               /* number of extents to add */
3900
        xfs_extnum_t    ext_cnt;                /* new extents still needed */
3901
        xfs_extnum_t    nex2;                   /* extents after idx + count */
3902
        xfs_bmbt_rec_t  *nex2_ep = NULL;        /* temp list for nex2 extents */
3903
        int             nlists;                 /* number of irec's (lists) */
3904
 
3905
        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3906
        erp = &ifp->if_u1.if_ext_irec[erp_idx];
3907
        nex2 = erp->er_extcount - idx;
3908
        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3909
 
3910
        /*
3911
         * Save second part of target extent list
3912
         * (all extents past */
3913
        if (nex2) {
3914
                byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
3915
                nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_SLEEP);
3916
                memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff);
3917
                erp->er_extcount -= nex2;
3918
                xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2);
3919
                memset(&erp->er_extbuf[idx], 0, byte_diff);
3920
        }
3921
 
3922
        /*
3923
         * Add the new extents to the end of the target
3924
         * list, then allocate new irec record(s) and
3925
         * extent buffer(s) as needed to store the rest
3926
         * of the new extents.
3927
         */
3928
        ext_cnt = count;
3929
        ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount);
3930
        if (ext_diff) {
3931
                erp->er_extcount += ext_diff;
3932
                xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
3933
                ext_cnt -= ext_diff;
3934
        }
3935
        while (ext_cnt) {
3936
                erp_idx++;
3937
                erp = xfs_iext_irec_new(ifp, erp_idx);
3938
                ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS);
3939
                erp->er_extcount = ext_diff;
3940
                xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
3941
                ext_cnt -= ext_diff;
3942
        }
3943
 
3944
        /* Add nex2 extents back to indirection array */
3945
        if (nex2) {
3946
                xfs_extnum_t    ext_avail;
3947
                int             i;
3948
 
3949
                byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
3950
                ext_avail = XFS_LINEAR_EXTS - erp->er_extcount;
3951
                i = 0;
3952
                /*
3953
                 * If nex2 extents fit in the current page, append
3954
                 * nex2_ep after the new extents.
3955
                 */
3956
                if (nex2 <= ext_avail) {
3957
                        i = erp->er_extcount;
3958
                }
3959
                /*
3960
                 * Otherwise, check if space is available in the
3961
                 * next page.
3962
                 */
3963
                else if ((erp_idx < nlists - 1) &&
3964
                         (nex2 <= (ext_avail = XFS_LINEAR_EXTS -
3965
                          ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) {
3966
                        erp_idx++;
3967
                        erp++;
3968
                        /* Create a hole for nex2 extents */
3969
                        memmove(&erp->er_extbuf[nex2], erp->er_extbuf,
3970
                                erp->er_extcount * sizeof(xfs_bmbt_rec_t));
3971
                }
3972
                /*
3973
                 * Final choice, create a new extent page for
3974
                 * nex2 extents.
3975
                 */
3976
                else {
3977
                        erp_idx++;
3978
                        erp = xfs_iext_irec_new(ifp, erp_idx);
3979
                }
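                /*
                 * i is still zero in the two cases above, so the saved
                 * extents land at the front of the page chosen for them.
                 */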
                memmove(&erp->er_extbuf[i], nex2_ep, byte_diff);
                kmem_free(nex2_ep, byte_diff);
                erp->er_extcount += nex2;
                xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2);
        }
}

/*
 * This is called when the amount of space required for incore file
 * extents needs to be decreased. The ext_diff parameter stores the
 * number of extents to be removed and the idx parameter contains
 * the extent index where the extents will be removed from.
 *
 * If the amount of space needed has decreased below the linear
 * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous
 * extent array.  Otherwise, use kmem_realloc() to adjust the
 * size to what is needed.
 */
void
xfs_iext_remove(
        xfs_ifork_t     *ifp,           /* inode fork pointer */
        xfs_extnum_t    idx,            /* index to begin removing exts */
        int             ext_diff)       /* number of extents to remove */
{
        xfs_extnum_t    nextents;       /* number of extents in file */
        int             new_size;       /* size of extents after removal */

        ASSERT(ext_diff > 0);
        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
        new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t);

        if (new_size == 0) {
                xfs_iext_destroy(ifp);
        } else if (ifp->if_flags & XFS_IFEXTIREC) {
                xfs_iext_remove_indirect(ifp, idx, ext_diff);
        } else if (ifp->if_real_bytes) {
                xfs_iext_remove_direct(ifp, idx, ext_diff);
        } else {
                xfs_iext_remove_inline(ifp, idx, ext_diff);
        }
        ifp->if_bytes = new_size;
}

/*
 * This removes ext_diff extents from the inline buffer, beginning
 * at extent index idx.
 */
void
xfs_iext_remove_inline(
        xfs_ifork_t     *ifp,           /* inode fork pointer */
        xfs_extnum_t    idx,            /* index to begin removing exts */
        int             ext_diff)       /* number of extents to remove */
{
        int             nextents;       /* number of extents in file */

        ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
        ASSERT(idx < XFS_INLINE_EXTS);
        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
        ASSERT(((nextents - ext_diff) > 0) &&
                (nextents - ext_diff) < XFS_INLINE_EXTS);

        if (idx + ext_diff < nextents) {
                memmove(&ifp->if_u2.if_inline_ext[idx],
                        &ifp->if_u2.if_inline_ext[idx + ext_diff],
                        (nextents - (idx + ext_diff)) *
                         sizeof(xfs_bmbt_rec_t));
                memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff],
                        0, ext_diff * sizeof(xfs_bmbt_rec_t));
        } else {
                memset(&ifp->if_u2.if_inline_ext[idx], 0,
                        ext_diff * sizeof(xfs_bmbt_rec_t));
        }
}

/*
 * This removes ext_diff extents from a linear (direct) extent list,
 * beginning at extent index idx. If the extents are being removed
 * from the end of the list (i.e. truncate) then we just need to re-
 * allocate the list to remove the extra space. Otherwise, if the
 * extents are being removed from the middle of the existing extent
 * entries, then we first need to move the extent records beginning
 * at idx + ext_diff up in the list to overwrite the records being
 * removed, then remove the extra space via kmem_realloc.
 */
void
xfs_iext_remove_direct(
        xfs_ifork_t     *ifp,           /* inode fork pointer */
        xfs_extnum_t    idx,            /* index to begin removing exts */
        int             ext_diff)       /* number of extents to remove */
{
        xfs_extnum_t    nextents;       /* number of extents in file */
        int             new_size;       /* size of extents after removal */

        ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
        new_size = ifp->if_bytes -
                (ext_diff * sizeof(xfs_bmbt_rec_t));
        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);

        if (new_size == 0) {
                xfs_iext_destroy(ifp);
                return;
        }
        /* Move extents up in the list (if needed) */
        if (idx + ext_diff < nextents) {
                memmove(&ifp->if_u1.if_extents[idx],
                        &ifp->if_u1.if_extents[idx + ext_diff],
                        (nextents - (idx + ext_diff)) *
                         sizeof(xfs_bmbt_rec_t));
        }
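        /* Zero the entries vacated at the tail of the direct list */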
        memset(&ifp->if_u1.if_extents[nextents - ext_diff],
                0, ext_diff * sizeof(xfs_bmbt_rec_t));
        /*
         * Reallocate the direct extent list. If the extents
         * will fit inside the inode then xfs_iext_realloc_direct
         * will switch from direct to inline extent allocation
         * mode for us.
         */
        xfs_iext_realloc_direct(ifp, new_size);
        ifp->if_bytes = new_size;
}

/*
 * This is called when incore extents are being removed from the
 * indirection array and the extents being removed span multiple extent
 * buffers. The idx parameter contains the file extent index where we
 * want to begin removing extents, and the count parameter contains
 * how many extents need to be removed.
 *
 *    |-------|   |-------|
 *    | nex1  |   |       |    nex1 - number of extents before idx
 *    |-------|   | count |
 *    |       |   |       |    count - number of extents being removed at idx
 *    | count |   |-------|
 *    |       |   | nex2  |    nex2 - number of extents after idx + count
 *    |-------|   |-------|
 */
void
xfs_iext_remove_indirect(
        xfs_ifork_t     *ifp,           /* inode fork pointer */
        xfs_extnum_t    idx,            /* index to begin removing extents */
        int             count)          /* number of extents to remove */
{
        xfs_ext_irec_t  *erp;           /* indirection array pointer */
        int             erp_idx = 0;    /* indirection array index */
        xfs_extnum_t    ext_cnt;        /* extents left to remove */
        xfs_extnum_t    ext_diff;       /* extents to remove in current list */
        xfs_extnum_t    nex1;           /* number of extents before idx */
        xfs_extnum_t    nex2;           /* extents after idx + count */
        int             nlists;         /* entries in indirection array */
        int             page_idx = idx; /* index in target extent list */

        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
        erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
        ASSERT(erp != NULL);
        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
        nex1 = page_idx;
        ext_cnt = count;
        while (ext_cnt) {
                nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0);
                ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1));
                /*
                 * Check for deletion of entire list;
                 * xfs_iext_irec_remove() updates extent offsets.
                 */
                if (ext_diff == erp->er_extcount) {
                        xfs_iext_irec_remove(ifp, erp_idx);
                        ext_cnt -= ext_diff;
                        nex1 = 0;
                        if (ext_cnt) {
                                ASSERT(erp_idx < ifp->if_real_bytes /
                                        XFS_IEXT_BUFSZ);
                                erp = &ifp->if_u1.if_ext_irec[erp_idx];
                                nex1 = 0;
                                continue;
                        } else {
                                break;
                        }
                }
                /* Move extents up (if needed) */
                if (nex2) {
                        memmove(&erp->er_extbuf[nex1],
                                &erp->er_extbuf[nex1 + ext_diff],
                                nex2 * sizeof(xfs_bmbt_rec_t));
                }
                /* Zero out rest of page */
                memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ -
                        ((nex1 + nex2) * sizeof(xfs_bmbt_rec_t))));
                /* Update remaining counters */
                erp->er_extcount -= ext_diff;
                xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff);
                ext_cnt -= ext_diff;
                nex1 = 0;
                erp_idx++;
                erp++;
        }
        ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t);
        xfs_iext_irec_compact(ifp);
}

/*
 * Create, destroy, or resize a linear (direct) block of extents.
 */
void
xfs_iext_realloc_direct(
        xfs_ifork_t     *ifp,           /* inode fork pointer */
        int             new_size)       /* new size of extents */
{
        int             rnew_size;      /* real new size of extents */

        rnew_size = new_size;

        ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) ||
                ((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) &&
                 (new_size != ifp->if_real_bytes)));

        /* Free extent records */
        if (new_size == 0) {
                xfs_iext_destroy(ifp);
        }
        /* Resize direct extent list and zero any new bytes */
        else if (ifp->if_real_bytes) {
                /* Check if extents will fit inside the inode */
                if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) {
                        xfs_iext_direct_to_inline(ifp, new_size /
                                (uint)sizeof(xfs_bmbt_rec_t));
                        ifp->if_bytes = new_size;
                        return;
                }
                if (!is_power_of_2(new_size)) {
                        rnew_size = xfs_iroundup(new_size);
                }
                if (rnew_size != ifp->if_real_bytes) {
                        ifp->if_u1.if_extents =
                                kmem_realloc(ifp->if_u1.if_extents,
                                                rnew_size,
                                                ifp->if_real_bytes,
                                                KM_SLEEP);
                }
                if (rnew_size > ifp->if_real_bytes) {
                        memset(&ifp->if_u1.if_extents[ifp->if_bytes /
                                (uint)sizeof(xfs_bmbt_rec_t)], 0,
                                rnew_size - ifp->if_real_bytes);
                }
        }
        /*
         * Switch from the inline extent buffer to a direct
         * extent list. Be sure to include the inline extent
         * bytes in new_size.
         */
        else {
                new_size += ifp->if_bytes;
                if (!is_power_of_2(new_size)) {
                        rnew_size = xfs_iroundup(new_size);
                }
                xfs_iext_inline_to_direct(ifp, rnew_size);
        }
        ifp->if_real_bytes = rnew_size;
        ifp->if_bytes = new_size;
}

/*
 * Switch from linear (direct) extent records to inline buffer.
 */
void
xfs_iext_direct_to_inline(
        xfs_ifork_t     *ifp,           /* inode fork pointer */
        xfs_extnum_t    nextents)       /* number of extents in file */
{
        ASSERT(ifp->if_flags & XFS_IFEXTENTS);
        ASSERT(nextents <= XFS_INLINE_EXTS);
        /*
         * The inline buffer was zeroed when we switched
         * from inline to direct extent allocation mode,
         * so we don't need to clear it here.
         */
        memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents,
                nextents * sizeof(xfs_bmbt_rec_t));
        kmem_free(ifp->if_u1.if_extents, ifp->if_real_bytes);
        ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
        ifp->if_real_bytes = 0;
}

/*
 * Switch from inline buffer to linear (direct) extent records.
 * new_size should already be rounded up to the next power of 2
 * by the caller (when appropriate), so use new_size as it is.
 * However, since new_size may be rounded up, we can't update
 * if_bytes here. It is the caller's responsibility to update
 * if_bytes upon return.
 */
void
xfs_iext_inline_to_direct(
        xfs_ifork_t     *ifp,           /* inode fork pointer */
        int             new_size)       /* new extent list size, in bytes */
{
        ifp->if_u1.if_extents = kmem_alloc(new_size, KM_SLEEP);
        memset(ifp->if_u1.if_extents, 0, new_size);
        if (ifp->if_bytes) {
                memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext,
                        ifp->if_bytes);
                memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
                        sizeof(xfs_bmbt_rec_t));
        }
        ifp->if_real_bytes = new_size;
}

/*
 * Resize an extent indirection array to new_size bytes.
 */
void
xfs_iext_realloc_indirect(
        xfs_ifork_t     *ifp,           /* inode fork pointer */
        int             new_size)       /* new indirection array size */
{
        int             nlists;         /* number of irec's (ex lists) */
        int             size;           /* current indirection array size */

        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
        size = nlists * sizeof(xfs_ext_irec_t);
        ASSERT(ifp->if_real_bytes);
        ASSERT((new_size >= 0) && (new_size != size));
        if (new_size == 0) {
                xfs_iext_destroy(ifp);
        } else {
                ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *)
                        kmem_realloc(ifp->if_u1.if_ext_irec,
                                new_size, size, KM_SLEEP);
        }
}

/*
 * Switch from indirection array to linear (direct) extent allocations.
 */
void
xfs_iext_indirect_to_direct(
        xfs_ifork_t     *ifp)           /* inode fork pointer */
{
        xfs_bmbt_rec_host_t *ep;        /* extent record pointer */
        xfs_extnum_t    nextents;       /* number of extents in file */
        int             size;           /* size of file extents */

        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
        ASSERT(nextents <= XFS_LINEAR_EXTS);
        size = nextents * sizeof(xfs_bmbt_rec_t);

        xfs_iext_irec_compact_full(ifp);
        ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ);
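
        /*
         * After full compaction all remaining extents live in the
         * first page, so its buffer can be reused as the direct list.
         */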
        ep = ifp->if_u1.if_ext_irec->er_extbuf;
        kmem_free(ifp->if_u1.if_ext_irec, sizeof(xfs_ext_irec_t));
        ifp->if_flags &= ~XFS_IFEXTIREC;
        ifp->if_u1.if_extents = ep;
        ifp->if_bytes = size;
        if (nextents < XFS_LINEAR_EXTS) {
                xfs_iext_realloc_direct(ifp, size);
        }
}

/*
 * Free incore file extents.
 */
void
xfs_iext_destroy(
        xfs_ifork_t     *ifp)           /* inode fork pointer */
{
        if (ifp->if_flags & XFS_IFEXTIREC) {
                int     erp_idx;
                int     nlists;

                nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
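                /*
                 * Remove pages from the back so xfs_iext_irec_remove()
                 * never has to shift the remaining records down.
                 */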
                for (erp_idx = nlists - 1; erp_idx >= 0; erp_idx--) {
                        xfs_iext_irec_remove(ifp, erp_idx);
                }
                ifp->if_flags &= ~XFS_IFEXTIREC;
        } else if (ifp->if_real_bytes) {
                kmem_free(ifp->if_u1.if_extents, ifp->if_real_bytes);
        } else if (ifp->if_bytes) {
                memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
                        sizeof(xfs_bmbt_rec_t));
        }
        ifp->if_u1.if_extents = NULL;
        ifp->if_real_bytes = 0;
        ifp->if_bytes = 0;
}

/*
 * Return a pointer to the extent record for file system block bno.
 */
xfs_bmbt_rec_host_t *                   /* pointer to found extent record */
xfs_iext_bno_to_ext(
        xfs_ifork_t     *ifp,           /* inode fork pointer */
        xfs_fileoff_t   bno,            /* block number to search for */
        xfs_extnum_t    *idxp)          /* index of target extent */
{
        xfs_bmbt_rec_host_t *base;      /* pointer to first extent */
        xfs_filblks_t   blockcount = 0; /* number of blocks in extent */
        xfs_bmbt_rec_host_t *ep = NULL; /* pointer to target extent */
        xfs_ext_irec_t  *erp = NULL;    /* indirection array pointer */
        int             high;           /* upper boundary in search */
        xfs_extnum_t    idx = 0;        /* index of target extent */
        int             low;            /* lower boundary in search */
        xfs_extnum_t    nextents;       /* number of file extents */
        xfs_fileoff_t   startoff = 0;   /* start offset of extent */

        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
        if (nextents == 0) {
                *idxp = 0;
                return NULL;
        }
        low = 0;
        if (ifp->if_flags & XFS_IFEXTIREC) {
                /* Find target extent list */
                int     erp_idx = 0;
                erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx);
                base = erp->er_extbuf;
                high = erp->er_extcount - 1;
        } else {
                base = ifp->if_u1.if_extents;
                high = nextents - 1;
        }
        /* Binary search extent records */
        while (low <= high) {
                idx = (low + high) >> 1;
                ep = base + idx;
                startoff = xfs_bmbt_get_startoff(ep);
                blockcount = xfs_bmbt_get_blockcount(ep);
                if (bno < startoff) {
                        high = idx - 1;
                } else if (bno >= startoff + blockcount) {
                        low = idx + 1;
                } else {
                        /* Convert back to file-based extent index */
                        if (ifp->if_flags & XFS_IFEXTIREC) {
                                idx += erp->er_extoff;
                        }
                        *idxp = idx;
                        return ep;
                }
        }
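        /*
         * The search failed, so bno falls in a hole.  Fall through to
         * return the first extent beyond the hole, or NULL if bno is
         * past the last extent in the fork.
         */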
        /* Convert back to file-based extent index */
        if (ifp->if_flags & XFS_IFEXTIREC) {
                idx += erp->er_extoff;
        }
        if (bno >= startoff + blockcount) {
                if (++idx == nextents) {
                        ep = NULL;
                } else {
                        ep = xfs_iext_get_ext(ifp, idx);
                }
        }
        *idxp = idx;
        return ep;
}

/*
 * Return a pointer to the indirection array entry containing the
 * extent record for filesystem block bno. Store the index of the
 * target irec in *erp_idxp.
 */
xfs_ext_irec_t *                        /* pointer to found extent record */
xfs_iext_bno_to_irec(
        xfs_ifork_t     *ifp,           /* inode fork pointer */
        xfs_fileoff_t   bno,            /* block number to search for */
        int             *erp_idxp)      /* irec index of target ext list */
{
        xfs_ext_irec_t  *erp = NULL;    /* indirection array pointer */
        xfs_ext_irec_t  *erp_next;      /* next indirection array entry */
        int             erp_idx;        /* indirection array index */
        int             nlists;         /* number of extent irec's (lists) */
        int             high;           /* binary search upper limit */
        int             low;            /* binary search lower limit */

        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
        erp_idx = 0;
        low = 0;
        high = nlists - 1;
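        /*
         * Binary search for the irec whose first extent starts at or
         * before bno while the next irec (if any) starts beyond it.
         */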
        while (low <= high) {
                erp_idx = (low + high) >> 1;
                erp = &ifp->if_u1.if_ext_irec[erp_idx];
                erp_next = erp_idx < nlists - 1 ? erp + 1 : NULL;
                if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) {
                        high = erp_idx - 1;
                } else if (erp_next && bno >=
                           xfs_bmbt_get_startoff(erp_next->er_extbuf)) {
                        low = erp_idx + 1;
                } else {
                        break;
                }
        }
        *erp_idxp = erp_idx;
        return erp;
}

/*
 * Return a pointer to the indirection array entry containing the
 * extent record at file extent index *idxp. Store the index of the
 * target irec in *erp_idxp and store the page index of the target
 * extent record in *idxp.
 */
xfs_ext_irec_t *
xfs_iext_idx_to_irec(
        xfs_ifork_t     *ifp,           /* inode fork pointer */
        xfs_extnum_t    *idxp,          /* extent index (file -> page) */
        int             *erp_idxp,      /* pointer to target irec */
        int             realloc)        /* new bytes were just added */
{
        xfs_ext_irec_t  *prev;          /* pointer to previous irec */
        xfs_ext_irec_t  *erp = NULL;    /* pointer to current irec */
        int             erp_idx;        /* indirection array index */
        int             nlists;         /* number of irec's (ex lists) */
        int             high;           /* binary search upper limit */
        int             low;            /* binary search lower limit */
        xfs_extnum_t    page_idx = *idxp; /* extent index in target list */

        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
        ASSERT(page_idx >= 0 && page_idx <=
                ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t));
        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
        erp_idx = 0;
        low = 0;
        high = nlists - 1;
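
        /*
         * When realloc is set the caller is about to insert at *idxp,
         * so an index that falls on a page boundary resolves to
         * whichever of the two neighbouring pages still has room.
         */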
        /* Binary search extent irec's */
        while (low <= high) {
                erp_idx = (low + high) >> 1;
                erp = &ifp->if_u1.if_ext_irec[erp_idx];
                prev = erp_idx > 0 ? erp - 1 : NULL;
                if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff &&
                     realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) {
                        high = erp_idx - 1;
                } else if (page_idx > erp->er_extoff + erp->er_extcount ||
                           (page_idx == erp->er_extoff + erp->er_extcount &&
                            !realloc)) {
                        low = erp_idx + 1;
                } else if (page_idx == erp->er_extoff + erp->er_extcount &&
                           erp->er_extcount == XFS_LINEAR_EXTS) {
                        ASSERT(realloc);
                        page_idx = 0;
                        erp_idx++;
                        erp = erp_idx < nlists ? erp + 1 : NULL;
                        break;
                } else {
                        page_idx -= erp->er_extoff;
                        break;
                }
        }
        *idxp = page_idx;
        *erp_idxp = erp_idx;
        return erp;
}

/*
 * Allocate and initialize an indirection array once the space needed
 * for incore extents increases above XFS_IEXT_BUFSZ.
 */
void
xfs_iext_irec_init(
        xfs_ifork_t     *ifp)           /* inode fork pointer */
{
        xfs_ext_irec_t  *erp;           /* indirection array pointer */
        xfs_extnum_t    nextents;       /* number of extents in file */

        ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
        ASSERT(nextents <= XFS_LINEAR_EXTS);

        erp = (xfs_ext_irec_t *)
                kmem_alloc(sizeof(xfs_ext_irec_t), KM_SLEEP);

        if (nextents == 0) {
                ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_SLEEP);
        } else if (!ifp->if_real_bytes) {
                xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ);
        } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) {
                xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ);
        }
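        /*
         * The direct buffer, now exactly XFS_IEXT_BUFSZ bytes, becomes
         * the first and only page of the new indirection array.
         */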
        erp->er_extbuf = ifp->if_u1.if_extents;
        erp->er_extcount = nextents;
        erp->er_extoff = 0;

        ifp->if_flags |= XFS_IFEXTIREC;
        ifp->if_real_bytes = XFS_IEXT_BUFSZ;
        ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t);
        ifp->if_u1.if_ext_irec = erp;
}

/*
 * Allocate and initialize a new entry in the indirection array.
 */
xfs_ext_irec_t *
xfs_iext_irec_new(
        xfs_ifork_t     *ifp,           /* inode fork pointer */
        int             erp_idx)        /* index for new irec */
{
        xfs_ext_irec_t  *erp;           /* indirection array pointer */
        int             i;              /* loop counter */
        int             nlists;         /* number of irec's (ex lists) */

        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;

        /* Resize indirection array */
        xfs_iext_realloc_indirect(ifp, ++nlists *
                                  sizeof(xfs_ext_irec_t));
        /*
         * Move records down in the array so the
         * new page can use erp_idx.
         */
        erp = ifp->if_u1.if_ext_irec;
        for (i = nlists - 1; i > erp_idx; i--) {
                memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t));
        }
        ASSERT(i == erp_idx);

        /* Initialize new extent record */
        erp = ifp->if_u1.if_ext_irec;
        erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_SLEEP);
        ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
        memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ);
        erp[erp_idx].er_extcount = 0;
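        /* The new page's extents begin where the previous page's end */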
        erp[erp_idx].er_extoff = erp_idx > 0 ?
                erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0;
        return &erp[erp_idx];
}

/*
 * Remove a record from the indirection array.
 */
void
xfs_iext_irec_remove(
        xfs_ifork_t     *ifp,           /* inode fork pointer */
        int             erp_idx)        /* irec index to remove */
{
        xfs_ext_irec_t  *erp;           /* indirection array pointer */
        int             i;              /* loop counter */
        int             nlists;         /* number of irec's (ex lists) */

        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
        erp = &ifp->if_u1.if_ext_irec[erp_idx];
        if (erp->er_extbuf) {
                xfs_iext_irec_update_extoffs(ifp, erp_idx + 1,
                        -erp->er_extcount);
                kmem_free(erp->er_extbuf, XFS_IEXT_BUFSZ);
        }
        /* Compact extent records */
        erp = ifp->if_u1.if_ext_irec;
        for (i = erp_idx; i < nlists - 1; i++) {
                memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t));
        }
        /*
         * Manually free the last extent record from the indirection
         * array.  A call to xfs_iext_realloc_indirect() with a size
         * of zero would result in a call to xfs_iext_destroy() which
         * would in turn call this function again, creating a nasty
         * infinite loop.
         */
        if (--nlists) {
                xfs_iext_realloc_indirect(ifp,
                        nlists * sizeof(xfs_ext_irec_t));
        } else {
                kmem_free(ifp->if_u1.if_ext_irec,
                        sizeof(xfs_ext_irec_t));
        }
        ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
}

/*
 * This is called to clean up large amounts of unused memory allocated
 * by the indirection array.  Before compacting anything though, verify
 * that the indirection array is still needed and switch back to the
 * linear extent list (or even the inline buffer) if possible.  The
 * compaction policy is as follows:
 *
 *    Full Compaction: Extents fit into a single page (or inline buffer)
 *    Full Compaction: Extents occupy less than 1/8 of allocated space
 * Partial Compaction: Extents occupy at least 1/8 but less than 1/2
 *      No Compaction: Extents occupy at least 1/2 of allocated space
 */
void
xfs_iext_irec_compact(
        xfs_ifork_t     *ifp)           /* inode fork pointer */
{
        xfs_extnum_t    nextents;       /* number of extents in file */
        int             nlists;         /* number of irec's (ex lists) */

        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);

        if (nextents == 0) {
                xfs_iext_destroy(ifp);
        } else if (nextents <= XFS_INLINE_EXTS) {
                xfs_iext_indirect_to_direct(ifp);
                xfs_iext_direct_to_inline(ifp, nextents);
        } else if (nextents <= XFS_LINEAR_EXTS) {
                xfs_iext_indirect_to_direct(ifp);
        } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 3) {
                xfs_iext_irec_compact_full(ifp);
        } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) {
                xfs_iext_irec_compact_pages(ifp);
        }
}

/*
 * Combine extents from neighboring extent pages.
 */
void
xfs_iext_irec_compact_pages(
        xfs_ifork_t     *ifp)           /* inode fork pointer */
{
        xfs_ext_irec_t  *erp, *erp_next; /* pointers to irec entries */
        int             erp_idx = 0;    /* indirection array index */
        int             nlists;         /* number of irec's (ex lists) */

        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
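        /* Fold each page into its predecessor whenever the pair fits */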
        while (erp_idx < nlists - 1) {
                erp = &ifp->if_u1.if_ext_irec[erp_idx];
                erp_next = erp + 1;
                if (erp_next->er_extcount <=
                    (XFS_LINEAR_EXTS - erp->er_extcount)) {
                        memmove(&erp->er_extbuf[erp->er_extcount],
                                erp_next->er_extbuf, erp_next->er_extcount *
                                sizeof(xfs_bmbt_rec_t));
                        erp->er_extcount += erp_next->er_extcount;
                        /*
                         * Free page before removing extent record
                         * so er_extoffs don't get modified in
                         * xfs_iext_irec_remove.
                         */
                        kmem_free(erp_next->er_extbuf, XFS_IEXT_BUFSZ);
                        erp_next->er_extbuf = NULL;
                        xfs_iext_irec_remove(ifp, erp_idx + 1);
                        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
                } else {
                        erp_idx++;
                }
        }
}

/*
 * Fully compact the extent records managed by the indirection array.
 */
void
xfs_iext_irec_compact_full(
        xfs_ifork_t     *ifp)                   /* inode fork pointer */
{
        xfs_bmbt_rec_host_t *ep, *ep_next;      /* extent record pointers */
        xfs_ext_irec_t  *erp, *erp_next;        /* extent irec pointers */
        int             erp_idx = 0;            /* extent irec index */
        int             ext_avail;              /* empty entries in ex list */
        int             ext_diff;               /* number of exts to add */
        int             nlists;                 /* number of irec's (ex lists) */

        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
        erp = ifp->if_u1.if_ext_irec;
        ep = &erp->er_extbuf[erp->er_extcount];
        erp_next = erp + 1;
        ep_next = erp_next->er_extbuf;
        while (erp_idx < nlists - 1) {
                ext_avail = XFS_LINEAR_EXTS - erp->er_extcount;
                ext_diff = MIN(ext_avail, erp_next->er_extcount);
                memcpy(ep, ep_next, ext_diff * sizeof(xfs_bmbt_rec_t));
                erp->er_extcount += ext_diff;
                erp_next->er_extcount -= ext_diff;
                /* Remove next page */
                if (erp_next->er_extcount == 0) {
                        /*
                         * Free page before removing extent record
                         * so er_extoffs don't get modified in
                         * xfs_iext_irec_remove.  The buffer is always
                         * a full XFS_IEXT_BUFSZ page, so free that
                         * much rather than the (now zero) extent count.
                         */
                        kmem_free(erp_next->er_extbuf, XFS_IEXT_BUFSZ);
                        erp_next->er_extbuf = NULL;
                        xfs_iext_irec_remove(ifp, erp_idx + 1);
                        erp = &ifp->if_u1.if_ext_irec[erp_idx];
                        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
                /* Update next page */
                } else {
                        /* Move rest of page up to become next new page */
                        memmove(erp_next->er_extbuf, ep_next,
                                erp_next->er_extcount * sizeof(xfs_bmbt_rec_t));
                        ep_next = erp_next->er_extbuf;
                        memset(&ep_next[erp_next->er_extcount], 0,
                                (XFS_LINEAR_EXTS - erp_next->er_extcount) *
                                sizeof(xfs_bmbt_rec_t));
                }
                if (erp->er_extcount == XFS_LINEAR_EXTS) {
                        erp_idx++;
                        if (erp_idx < nlists)
                                erp = &ifp->if_u1.if_ext_irec[erp_idx];
                        else
                                break;
                }
                ep = &erp->er_extbuf[erp->er_extcount];
                erp_next = erp + 1;
                ep_next = erp_next->er_extbuf;
        }
}

/*
 * This is called to update the er_extoff field in the indirection
 * array when extents have been added or removed from one of the
 * extent lists. erp_idx contains the irec index to begin updating
 * at and ext_diff contains the number of extents that were added
 * or removed.
 */
void
xfs_iext_irec_update_extoffs(
        xfs_ifork_t     *ifp,           /* inode fork pointer */
        int             erp_idx,        /* irec index to update */
        int             ext_diff)       /* number of new extents */
{
        int             i;              /* loop counter */
        int             nlists;         /* number of irec's (ex lists) */

        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
        for (i = erp_idx; i < nlists; i++) {
                ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
        }
}
