/*
2
 *  linux/fs/buffer.c
3
 *
4
 *  Copyright (C) 1991, 1992  Linus Torvalds
5
 */
6
 
7
/*
8
 *  'buffer.c' implements the buffer-cache functions. Race-conditions have
9
 * been avoided by NEVER letting an interrupt change a buffer (except for the
10
 * data, of course), but instead letting the caller do it.
11
 */
12
 
13
/* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
14
 
15
/* Removed a lot of unnecessary code and simplified things now that
16
 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
17
 */
18
 
19
/* Speed up hash, lru, and free list operations.  Use gfp() for allocating
20
 * hash table, use SLAB cache for buffer heads. -DaveM
21
 */
22
 
23
/* Added 32k buffer block sizes - these are required for older ARM systems.
24
 * - RMK
25
 */
26
 
27
/* Thread it... -DaveM */
28
 
29
/* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */
30
 
31
#include <linux/config.h>
32
#include <linux/sched.h>
33
#include <linux/fs.h>
34
#include <linux/slab.h>
35
#include <linux/locks.h>
36
#include <linux/errno.h>
37
#include <linux/swap.h>
38
#include <linux/swapctl.h>
39
#include <linux/smp_lock.h>
40
#include <linux/vmalloc.h>
41
#include <linux/blkdev.h>
42
#include <linux/sysrq.h>
43
#include <linux/file.h>
44
#include <linux/init.h>
45
#include <linux/quotaops.h>
46
#include <linux/iobuf.h>
47
#include <linux/highmem.h>
48
#include <linux/module.h>
49
#include <linux/completion.h>
50
 
51
#include <asm/uaccess.h>
52
#include <asm/io.h>
53
#include <asm/bitops.h>
54
#include <asm/mmu_context.h>
55
 
56
#define NR_RESERVED (10*MAX_BUF_PER_PAGE)
57
#define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this 
58
                                             number of unused buffer heads */
59
 
60
/* Anti-deadlock ordering:
61
 *      lru_list_lock > hash_table_lock > unused_list_lock
62
 */
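/*
 * In code, the ordering above means nested acquisition always looks like
 * the sketch below (see remove_from_queues() further down for a real
 * user), with unused_list_lock innermost when it is needed at all:
 *
 *      spin_lock(&lru_list_lock);
 *      write_lock(&hash_table_lock);
 *      ...
 *      write_unlock(&hash_table_lock);
 *      spin_unlock(&lru_list_lock);
 */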
63
 
64
#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_inode_buffers)
65
 
66
/*
67
 * Hash table gook..
68
 */
69
static unsigned int bh_hash_mask;
70
static unsigned int bh_hash_shift;
71
static struct buffer_head **hash_table;
72
static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
73
 
74
static struct buffer_head *lru_list[NR_LIST];
75
 
76
static spinlock_cacheline_t lru_list_lock_cacheline = {SPIN_LOCK_UNLOCKED};
77
#define lru_list_lock  lru_list_lock_cacheline.lock
78
 
79
static int nr_buffers_type[NR_LIST];
80
static unsigned long size_buffers_type[NR_LIST];
81
 
82
static struct buffer_head * unused_list;
83
static int nr_unused_buffer_heads;
84
static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
85
static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
86
 
87
static int grow_buffers(kdev_t dev, unsigned long block, int size);
88
static int osync_buffers_list(struct list_head *);
89
static void __refile_buffer(struct buffer_head *);
90
 
91
/*
92
 * A global sysctl-controlled flag which puts the machine into "laptop mode"
93
 */
94
int laptop_mode;
95
 
96
static DECLARE_WAIT_QUEUE_HEAD(kupdate_wait);
97
 
98
/* This is used by some architectures to estimate available memory. */
99
atomic_t buffermem_pages = ATOMIC_INIT(0);
100
 
101
/* Here is the parameter block for the bdflush process. If you add or
102
 * remove any of the parameters, make sure to update kernel/sysctl.c
103
 * and the documentation at linux/Documentation/sysctl/vm.txt.
104
 */
105
 
106
#define N_PARAM 9
107
 
108
/* The dummy values in this structure are left in there for compatibility
109
 * with old programs that play with the /proc entries.
110
 */
111
union bdflush_param {
112
        struct {
113
                int nfract;     /* Percentage of buffer cache dirty to
114
                                   activate bdflush */
115
                int ndirty;     /* Maximum number of dirty blocks to write out per
116
                                   wake-cycle */
117
                int dummy2;     /* old "nrefill" */
118
                int dummy3;     /* unused */
119
                int interval;   /* jiffies delay between kupdate flushes */
120
                int age_buffer; /* Time for normal buffer to age before we flush it */
121
                int nfract_sync;/* Percentage of buffer cache dirty to
122
                                   activate bdflush synchronously */
123
                int nfract_stop_bdflush; /* Percentage of buffer cache dirty to stop bdflush */
124
                int dummy5;     /* unused */
125
        } b_un;
126
        unsigned int data[N_PARAM];
127
} bdf_prm = {{30, 500, 0, 0, 5*HZ, 30*HZ, 60, 20, 0}};
128
 
129
/* These are the min and max parameter values that we will allow to be assigned */
130
int bdflush_min[N_PARAM] = {  0,  1,    0,   0,  0,   1*HZ,   0, 0, 0};
131
int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,10000*HZ, 10000*HZ, 100, 100, 0};
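/*
 * A minimal sketch (the helper name is illustrative, not from the kernel):
 * because bdf_prm is a union, sysctl-style code can treat the tunables as
 * the flat data[] array and clamp them against bdflush_min/bdflush_max,
 * while the rest of this file reads the named b_un fields.  The real
 * update path lives in kernel/sysctl.c, as the comment above notes.
 */
static int bdflush_set_param(unsigned int idx, int val)
{
        if (idx >= N_PARAM)
                return -EINVAL;
        if (val < bdflush_min[idx] || val > bdflush_max[idx])
                return -EINVAL;
        bdf_prm.data[idx] = val;        /* e.g. data[4] aliases b_un.interval */
        return 0;
}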
132
 
133
static inline int write_buffer_delay(struct buffer_head *bh)
134
{
135
        struct page *page = bh->b_page;
136
 
137
        if (!TryLockPage(page)) {
138
                spin_unlock(&lru_list_lock);
139
                unlock_buffer(bh);
140
                page->mapping->a_ops->writepage(page);
141
                return 1;
142
        }
143
 
144
        return 0;
145
}
146
 
147
static inline void write_buffer(struct buffer_head *bh)
148
{
149
        if (buffer_delay(bh)) {
150
                struct page *page = bh->b_page;
151
 
152
                lock_page(page);
153
                if (buffer_delay(bh)) {
154
                        page->mapping->a_ops->writepage(page);
155
                        return;
156
                }
157
                unlock_page(page);
158
        }
159
 
160
        ll_rw_block(WRITE, 1, &bh);
161
}
162
 
163
void unlock_buffer(struct buffer_head *bh)
164
{
165
        clear_bit(BH_Wait_IO, &bh->b_state);
166
        clear_bit(BH_Launder, &bh->b_state);
167
        /*
168
         * When a locked buffer is visible to the I/O layer BH_Launder
169
         * is set. This means before unlocking we must clear BH_Launder,
170
         * do an mb() (on alpha), and then clear BH_Lock, so no reader can see
171
         * BH_Launder set on an unlocked buffer and risk a deadlock.
172
         */
173
        smp_mb__after_clear_bit();
174
        clear_bit(BH_Lock, &bh->b_state);
175
        smp_mb__after_clear_bit();
176
        if (waitqueue_active(&bh->b_wait))
177
                wake_up(&bh->b_wait);
178
}
179
 
180
/*
181
 * Note that the real wait_on_buffer() is an inline function that checks
182
 * that the buffer is locked before calling this, so that unnecessary disk
183
 * unplugging does not occur.
184
 */
185
void __wait_on_buffer(struct buffer_head * bh)
186
{
187
        struct task_struct *tsk = current;
188
        DECLARE_WAITQUEUE(wait, tsk);
189
 
190
        get_bh(bh);
191
        add_wait_queue(&bh->b_wait, &wait);
192
        do {
193
                set_task_state(tsk, TASK_UNINTERRUPTIBLE);
194
                if (!buffer_locked(bh))
195
                        break;
196
                /*
197
                 * We must read tq_disk in TQ_ACTIVE after the
198
                 * add_wait_queue effect is visible to other cpus.
199
                 * We could unplug a few lines above; it wouldn't matter,
200
                 * but we can't do that right after add_wait_queue
201
                 * without an smp_mb() in between because spin_unlock
202
                 * has inclusive semantics.
203
                 * This is the most efficient place to do it, so we
204
                 * don't do a spurious unplug if we get a racy
205
                 * wakeup that makes buffer_locked return 0, and
206
                 * doing it here avoids an explicit smp_mb(); we
207
                 * rely on the implicit one in set_task_state.
208
                 */
209
                run_task_queue(&tq_disk);
210
                schedule();
211
        } while (buffer_locked(bh));
212
        tsk->state = TASK_RUNNING;
213
        remove_wait_queue(&bh->b_wait, &wait);
214
        put_bh(bh);
215
}
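/*
 * For reference, the inline wait_on_buffer() wrapper mentioned in the
 * comment above lives in <linux/locks.h>; it looks roughly like the
 * sketch below (the header is authoritative).  Testing BH_Lock first
 * avoids a needless tq_disk unplug for a buffer that is not locked:
 *
 *      static inline void wait_on_buffer(struct buffer_head * bh)
 *      {
 *              if (test_bit(BH_Lock, &bh->b_state))
 *                      __wait_on_buffer(bh);
 *      }
 */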
216
 
217
/*
218
 * Default synchronous end-of-IO handler..  Just mark it up-to-date and
219
 * unlock the buffer. This is what ll_rw_block uses too.
220
 */
221
void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
222
{
223
        mark_buffer_uptodate(bh, uptodate);
224
        unlock_buffer(bh);
225
        put_bh(bh);
226
}
227
 
228
/*
229
 * The buffers have been marked clean and locked.  Just submit the dang
230
 * things..
231
 */
232
static void write_locked_buffers(struct buffer_head **array, unsigned int count)
233
{
234
        do {
235
                struct buffer_head * bh = *array++;
236
                bh->b_end_io = end_buffer_io_sync;
237
                submit_bh(WRITE, bh);
238
        } while (--count);
239
}
240
 
241
/*
242
 * Write some buffers from the head of the dirty queue.
243
 *
244
 * This must be called with the LRU lock held, and will
245
 * return without it!
246
 */
247
#define NRSYNC (32)
248
static int write_some_buffers(kdev_t dev)
249
{
250
        struct buffer_head *next;
251
        struct buffer_head *array[NRSYNC];
252
        unsigned int count;
253
        int nr;
254
 
255
        next = lru_list[BUF_DIRTY];
256
        nr = nr_buffers_type[BUF_DIRTY];
257
        count = 0;
258
        while (next && --nr >= 0) {
259
                struct buffer_head * bh = next;
260
                next = bh->b_next_free;
261
 
262
                if (dev != NODEV && bh->b_dev != dev)
263
                        continue;
264
                if (test_and_set_bit(BH_Lock, &bh->b_state))
265
                        continue;
266
                if (buffer_delay(bh)) {
267
                        if (write_buffer_delay(bh)) {
268
                                if (count)
269
                                        write_locked_buffers(array, count);
270
                                return -EAGAIN;
271
                        }
272
                } else if (atomic_set_buffer_clean(bh)) {
273
                        __refile_buffer(bh);
274
                        get_bh(bh);
275
                        array[count++] = bh;
276
                        if (count < NRSYNC)
277
                                continue;
278
 
279
                        spin_unlock(&lru_list_lock);
280
                        write_locked_buffers(array, count);
281
                        return -EAGAIN;
282
                }
283
                unlock_buffer(bh);
284
                __refile_buffer(bh);
285
        }
286
        spin_unlock(&lru_list_lock);
287
 
288
        if (count)
289
                write_locked_buffers(array, count);
290
        return 0;
291
}
292
 
293
/*
294
 * Write out all buffers on the dirty list.
295
 */
296
static void write_unlocked_buffers(kdev_t dev)
297
{
298
        do
299
                spin_lock(&lru_list_lock);
300
        while (write_some_buffers(dev));
301
}
302
 
303
/*
304
 * Wait for a buffer on the proper list.
305
 *
306
 * This must be called with the LRU lock held, and
307
 * will return with it released.
308
 */
309
static int wait_for_buffers(kdev_t dev, int index, int refile)
310
{
311
        struct buffer_head * next;
312
        int nr;
313
 
314
        next = lru_list[index];
315
        nr = nr_buffers_type[index];
316
        while (next && --nr >= 0) {
317
                struct buffer_head *bh = next;
318
                next = bh->b_next_free;
319
 
320
                if (!buffer_locked(bh)) {
321
                        if (refile)
322
                                __refile_buffer(bh);
323
                        continue;
324
                }
325
                if (dev != NODEV && bh->b_dev != dev)
326
                        continue;
327
 
328
                get_bh(bh);
329
                spin_unlock(&lru_list_lock);
330
                wait_on_buffer (bh);
331
                put_bh(bh);
332
                return -EAGAIN;
333
        }
334
        spin_unlock(&lru_list_lock);
335
        return 0;
336
}
337
 
338
static int wait_for_locked_buffers(kdev_t dev, int index, int refile)
339
{
340
        do {
341
                spin_lock(&lru_list_lock);
342
        } while (wait_for_buffers(dev, index, refile));
343
        return 0;
344
}
345
 
346
/* Call sync_buffers with wait!=0 to ensure that the call does not
347
 * return until all buffer writes have completed.  Sync() may return
348
 * before the writes have finished; fsync() may not.
349
 */
350
 
351
/* Godamity-damn.  Some buffers (bitmaps for filesystems)
352
 * spontaneously dirty themselves without ever brelse being called.
353
 * We will ultimately want to put these in a separate list, but for
354
 * now we search all of the lists for dirty buffers.
355
 */
356
int sync_buffers(kdev_t dev, int wait)
357
{
358
        int err = 0;
359
 
360
        /* One pass for no-wait, three for wait:
361
         * 0) write out all dirty, unlocked buffers;
362
         * 1) wait for all dirty locked buffers;
363
         * 2) write out all dirty, unlocked buffers;
364
         * 3) wait for completion by waiting for all buffers to unlock.
365
         */
366
        write_unlocked_buffers(dev);
367
        if (wait) {
368
                err = wait_for_locked_buffers(dev, BUF_DIRTY, 0);
369
                write_unlocked_buffers(dev);
370
                err |= wait_for_locked_buffers(dev, BUF_LOCKED, 1);
371
        }
372
        return err;
373
}
374
 
375
int fsync_super(struct super_block *sb)
376
{
377
        kdev_t dev = sb->s_dev;
378
        sync_buffers(dev, 0);
379
 
380
        lock_kernel();
381
        sync_inodes_sb(sb);
382
        DQUOT_SYNC_SB(sb);
383
        lock_super(sb);
384
        if (sb->s_dirt && sb->s_op && sb->s_op->write_super)
385
                sb->s_op->write_super(sb);
386
        unlock_super(sb);
387
        if (sb->s_op && sb->s_op->sync_fs)
388
                sb->s_op->sync_fs(sb);
389
        unlock_kernel();
390
 
391
        return sync_buffers(dev, 1);
392
}
393
 
394
int fsync_no_super(kdev_t dev)
395
{
396
        sync_buffers(dev, 0);
397
        return sync_buffers(dev, 1);
398
}
399
 
400
int fsync_dev(kdev_t dev)
401
{
402
        sync_buffers(dev, 0);
403
 
404
        lock_kernel();
405
        sync_inodes(dev);
406
        DQUOT_SYNC_DEV(dev);
407
        sync_supers(dev, 1);
408
        unlock_kernel();
409
 
410
        return sync_buffers(dev, 1);
411
}
412
 
413
/*
414
 * There's no real reason to pretend we should
415
 * ever do anything differently
416
 */
417
void sync_dev(kdev_t dev)
418
{
419
        fsync_dev(dev);
420
}
421
 
422
asmlinkage long sys_sync(void)
423
{
424
        fsync_dev(0);
425
        return 0;
426
}
427
 
428
/*
429
 *      filp may be NULL if called via the msync of a vma.
430
 */
431
 
432
int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
433
{
434
        struct inode * inode = dentry->d_inode;
435
        struct super_block * sb;
436
        kdev_t dev;
437
        int ret;
438
 
439
        lock_kernel();
440
        /* sync the inode to buffers */
441
        write_inode_now(inode, 0);
442
 
443
        /* sync the superblock to buffers */
444
        sb = inode->i_sb;
445
        lock_super(sb);
446
        if (sb->s_op && sb->s_op->write_super)
447
                sb->s_op->write_super(sb);
448
        unlock_super(sb);
449
 
450
        /* .. finally sync the buffers to disk */
451
        dev = inode->i_dev;
452
        ret = sync_buffers(dev, 1);
453
        unlock_kernel();
454
        return ret;
455
}
456
 
457
asmlinkage long sys_fsync(unsigned int fd)
458
{
459
        struct file * file;
460
        struct dentry * dentry;
461
        struct inode * inode;
462
        int ret, err;
463
 
464
        ret = -EBADF;
465
        file = fget(fd);
466
        if (!file)
467
                goto out;
468
 
469
        dentry = file->f_dentry;
470
        inode = dentry->d_inode;
471
 
472
        ret = -EINVAL;
473
        if (!file->f_op || !file->f_op->fsync) {
474
                /* Why?  We can still call filemap_fdatasync */
475
                goto out_putf;
476
        }
477
 
478
        /* We need to protect against concurrent writers.. */
479
        down(&inode->i_sem);
480
        ret = filemap_fdatasync(inode->i_mapping);
481
        err = file->f_op->fsync(file, dentry, 0);
482
        if (err && !ret)
483
                ret = err;
484
        err = filemap_fdatawait(inode->i_mapping);
485
        if (err && !ret)
486
                ret = err;
487
        up(&inode->i_sem);
488
 
489
out_putf:
490
        fput(file);
491
out:
492
        return ret;
493
}
494
 
495
int do_fdatasync(struct file *file)
496
{
497
        int ret, err;
498
        struct dentry *dentry;
499
        struct inode *inode;
500
 
501
        if (unlikely(!file->f_op || !file->f_op->fsync))
502
                return -EINVAL;
503
 
504
        dentry = file->f_dentry;
505
        inode = dentry->d_inode;
506
 
507
        ret = filemap_fdatasync(inode->i_mapping);
508
        err = file->f_op->fsync(file, dentry, 1);
509
        if (err && !ret)
510
                ret = err;
511
        err = filemap_fdatawait(inode->i_mapping);
512
        if (err && !ret)
513
                ret = err;
514
        return ret;
515
}
516
 
517
asmlinkage long sys_fdatasync(unsigned int fd)
518
{
519
        struct file * file;
520
        struct inode *inode;
521
        int ret;
522
 
523
        ret = -EBADF;
524
        file = fget(fd);
525
        if (!file)
526
                goto out;
527
 
528
        inode = file->f_dentry->d_inode;
529
        down(&inode->i_sem);
530
        ret = do_fdatasync(file);
531
        up(&inode->i_sem);
532
 
533
        fput(file);
534
out:
535
        return ret;
536
}
537
 
538
/* After several hours of tedious analysis, the following hash
539
 * function won.  Do not mess with it... -DaveM
540
 */
541
#define _hashfn(dev,block)      \
542
        ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
543
         (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ \
544
          ((block) << (bh_hash_shift - 12))))
545
#define hash(dev,block) hash_table[(_hashfn(HASHDEV(dev),block) & bh_hash_mask)]
546
 
547
static inline void __insert_into_hash_list(struct buffer_head *bh)
548
{
549
        struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
550
        struct buffer_head *next = *head;
551
 
552
        *head = bh;
553
        bh->b_pprev = head;
554
        bh->b_next = next;
555
        if (next != NULL)
556
                next->b_pprev = &bh->b_next;
557
}
558
 
559
static __inline__ void __hash_unlink(struct buffer_head *bh)
560
{
561
        struct buffer_head **pprev = bh->b_pprev;
562
        if (pprev) {
563
                struct buffer_head *next = bh->b_next;
564
                if (next)
565
                        next->b_pprev = pprev;
566
                *pprev = next;
567
                bh->b_pprev = NULL;
568
        }
569
}
570
 
571
static void __insert_into_lru_list(struct buffer_head * bh, int blist)
572
{
573
        struct buffer_head **bhp = &lru_list[blist];
574
 
575
        if (bh->b_prev_free || bh->b_next_free) BUG();
576
 
577
        if(!*bhp) {
578
                *bhp = bh;
579
                bh->b_prev_free = bh;
580
        }
581
        bh->b_next_free = *bhp;
582
        bh->b_prev_free = (*bhp)->b_prev_free;
583
        (*bhp)->b_prev_free->b_next_free = bh;
584
        (*bhp)->b_prev_free = bh;
585
        nr_buffers_type[blist]++;
586
        size_buffers_type[blist] += bh->b_size;
587
}
588
 
589
static void __remove_from_lru_list(struct buffer_head * bh)
590
{
591
        struct buffer_head *next = bh->b_next_free;
592
        if (next) {
593
                struct buffer_head *prev = bh->b_prev_free;
594
                int blist = bh->b_list;
595
 
596
                prev->b_next_free = next;
597
                next->b_prev_free = prev;
598
                if (lru_list[blist] == bh) {
599
                        if (next == bh)
600
                                next = NULL;
601
                        lru_list[blist] = next;
602
                }
603
                bh->b_next_free = NULL;
604
                bh->b_prev_free = NULL;
605
                nr_buffers_type[blist]--;
606
                size_buffers_type[blist] -= bh->b_size;
607
        }
608
}
609
 
610
/* must be called with both the hash_table_lock and the lru_list_lock
611
   held */
612
static void __remove_from_queues(struct buffer_head *bh)
613
{
614
        __hash_unlink(bh);
615
        __remove_from_lru_list(bh);
616
}
617
 
618
static void remove_from_queues(struct buffer_head *bh)
619
{
620
        spin_lock(&lru_list_lock);
621
        write_lock(&hash_table_lock);
622
        __remove_from_queues(bh);
623
        write_unlock(&hash_table_lock);
624
        spin_unlock(&lru_list_lock);
625
}
626
 
627
struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
628
{
629
        struct buffer_head *bh, **p = &hash(dev, block);
630
 
631
        read_lock(&hash_table_lock);
632
 
633
        for (;;) {
634
                bh = *p;
635
                if (!bh)
636
                        break;
637
                p = &bh->b_next;
638
                if (bh->b_blocknr != block)
639
                        continue;
640
                if (bh->b_size != size)
641
                        continue;
642
                if (bh->b_dev != dev)
643
                        continue;
644
                get_bh(bh);
645
                break;
646
        }
647
 
648
        read_unlock(&hash_table_lock);
649
        return bh;
650
}
651
 
652
void buffer_insert_list(struct buffer_head *bh, struct list_head *list)
653
{
654
        spin_lock(&lru_list_lock);
655
        if (buffer_attached(bh))
656
                list_del(&bh->b_inode_buffers);
657
        set_buffer_attached(bh);
658
        list_add_tail(&bh->b_inode_buffers, list);
659
        spin_unlock(&lru_list_lock);
660
}
661
 
662
/*
663
 * The caller must have the lru_list lock before calling the
664
 * remove_inode_queue functions.
665
 */
666
static void __remove_inode_queue(struct buffer_head *bh)
667
{
668
        list_del(&bh->b_inode_buffers);
669
        clear_buffer_attached(bh);
670
}
671
 
672
static inline void remove_inode_queue(struct buffer_head *bh)
673
{
674
        if (buffer_attached(bh))
675
                __remove_inode_queue(bh);
676
}
677
 
678
int inode_has_buffers(struct inode *inode)
679
{
680
        int ret;
681
 
682
        spin_lock(&lru_list_lock);
683
        ret = !list_empty(&inode->i_dirty_buffers) || !list_empty(&inode->i_dirty_data_buffers);
684
        spin_unlock(&lru_list_lock);
685
 
686
        return ret;
687
}
688
 
689
/* If invalidate_buffers() will trash dirty buffers, it means some kind
690
   of fs corruption is going on. Trashing dirty data always implies losing
691
   information that was supposed to be just stored on the physical layer
692
   by the user.
693
 
694
   Thus invalidate_buffers in general usage is not allowed to trash
695
   dirty buffers. For example ioctl(BLKFLSBUF) expects dirty data to
696
   be preserved.  These buffers are simply skipped.
697
 
698
   We also skip buffers which are still in use.  For example this can
699
   happen if a userspace program is reading the block device.
700
 
701
   NOTE: In the case where the user removed a removable-media-disk even if
702
   there's still dirty data not synced to disk (due to a bug in the device driver
703
   or due to an error by the user), by not destroying the dirty buffers we could
704
   generate corruption also on the next medium inserted, thus a parameter is
705
   necessary to handle this case in the safest way possible (trying
706
   not to corrupt the newly inserted disk with the data belonging to
707
   the old now corrupted disk). Also for the ramdisk the natural thing
708
   to do in order to release the ramdisk memory is to destroy dirty buffers.
709
 
710
   These are two special cases. Normal usage implies that the device driver
711
   issues a sync on the device (without waiting for I/O completion) and
712
   then makes an invalidate_buffers call that doesn't trash dirty buffers.
713
 
714
   For handling cache coherency with the blkdev pagecache the 'update' case
715
   has been introduced. It is needed to re-read from disk any pinned
716
   buffer. NOTE: re-reading from disk is destructive so we can do it only
717
   when we assume nobody is changing the buffercache under our I/O and when
718
   we think the disk contains more recent information than the buffercache.
719
   The update == 1 pass marks the buffers we need to update, the update == 2
720
   pass does the actual I/O. */
721
void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
722
{
723
        int i, nlist, slept;
724
        struct buffer_head * bh, * bh_next;
725
        kdev_t dev = to_kdev_t(bdev->bd_dev);   /* will become bdev */
726
 
727
 retry:
728
        slept = 0;
729
        spin_lock(&lru_list_lock);
730
        for(nlist = 0; nlist < NR_LIST; nlist++) {
731
                bh = lru_list[nlist];
732
                if (!bh)
733
                        continue;
734
                for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
735
                        bh_next = bh->b_next_free;
736
 
737
                        /* Another device? */
738
                        if (bh->b_dev != dev)
739
                                continue;
740
                        /* Not hashed? */
741
                        if (!bh->b_pprev)
742
                                continue;
743
                        if (buffer_locked(bh)) {
744
                                get_bh(bh);
745
                                spin_unlock(&lru_list_lock);
746
                                wait_on_buffer(bh);
747
                                slept = 1;
748
                                spin_lock(&lru_list_lock);
749
                                put_bh(bh);
750
                        }
751
 
752
                        write_lock(&hash_table_lock);
753
                        /* All buffers in the lru lists are mapped */
754
                        if (!buffer_mapped(bh))
755
                                BUG();
756
                        if (buffer_dirty(bh) && destroy_dirty_buffers)
757
                                printk("invalidate: dirty buffer\n");
758
                        if (!atomic_read(&bh->b_count)) {
759
                                if (destroy_dirty_buffers || !buffer_dirty(bh)) {
760
                                        remove_inode_queue(bh);
761
                                }
762
                        } else if (!bdev->bd_openers)
763
                                printk("invalidate: busy buffer\n");
764
 
765
                        write_unlock(&hash_table_lock);
766
                        if (slept)
767
                                goto out;
768
                }
769
        }
770
out:
771
        spin_unlock(&lru_list_lock);
772
        if (slept)
773
                goto retry;
774
 
775
        /* Get rid of the page cache */
776
        invalidate_inode_pages(bdev->bd_inode);
777
}
778
 
779
void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
780
{
781
        struct block_device *bdev = bdget(dev);
782
        if (bdev) {
783
                invalidate_bdev(bdev, destroy_dirty_buffers);
784
                bdput(bdev);
785
        }
786
}
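/*
 * A minimal sketch of the "normal usage" described in the comment above
 * invalidate_bdev(): on a media change a driver would typically queue a
 * sync without waiting and then invalidate without trashing dirty
 * buffers.  The function name is illustrative only.
 */
static void example_media_change(kdev_t dev)
{
        sync_buffers(dev, 0);           /* queue the writes, don't wait */
        __invalidate_buffers(dev, 0);   /* drop only clean, unused buffers */
}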
787
 
788
static void free_more_memory(void)
789
{
790
        balance_dirty();
791
        wakeup_bdflush();
792
        try_to_free_pages(GFP_NOIO);
793
        run_task_queue(&tq_disk);
794
        yield();
795
}
796
 
797
void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
798
{
799
        bh->b_list = BUF_CLEAN;
800
        bh->b_end_io = handler;
801
        bh->b_private = private;
802
}
803
 
804
void end_buffer_io_async(struct buffer_head * bh, int uptodate)
805
{
806
        static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
807
        unsigned long flags;
808
        struct buffer_head *tmp;
809
        struct page *page;
810
        int fullup = 1;
811
 
812
        mark_buffer_uptodate(bh, uptodate);
813
 
814
        /* This is a temporary buffer used for page I/O. */
815
        page = bh->b_page;
816
 
817
        if (!uptodate)
818
                SetPageError(page);
819
 
820
        /*
821
         * Be _very_ careful from here on. Bad things can happen if
822
         * two buffer heads end IO at almost the same time and both
823
         * decide that the page is now completely done.
824
         *
825
         * Async buffer_heads are here only as labels for IO, and get
826
         * thrown away once the IO for this page is complete.  IO is
827
         * deemed complete once all buffers have been visited
828
         * (b_count==0) and are now unlocked. We must make sure that
829
         * only the _last_ buffer that decrements its count is the one
830
         * that unlocks the page.
831
         */
832
        spin_lock_irqsave(&page_uptodate_lock, flags);
833
        mark_buffer_async(bh, 0);
834
        unlock_buffer(bh);
835
        tmp = bh->b_this_page;
836
        while (tmp != bh) {
837
                if (buffer_locked(tmp)) {
838
                        if (buffer_async(tmp))
839
                                goto still_busy;
840
                } else if (!buffer_uptodate(tmp))
841
                        fullup = 0;
842
                tmp = tmp->b_this_page;
843
        }
844
 
845
        /* OK, the async IO on this page is complete. */
846
        spin_unlock_irqrestore(&page_uptodate_lock, flags);
847
 
848
        /*
849
         * If none of the buffers had errors and all were uptodate
850
         * then we can set the page uptodate:
851
         */
852
        if (fullup && !PageError(page))
853
                SetPageUptodate(page);
854
 
855
        UnlockPage(page);
856
 
857
        return;
858
 
859
still_busy:
860
        spin_unlock_irqrestore(&page_uptodate_lock, flags);
861
        return;
862
}
863
 
864
inline void set_buffer_async_io(struct buffer_head *bh)
865
{
866
        bh->b_end_io = end_buffer_io_async;
867
        mark_buffer_async(bh, 1);
868
}
869
 
870
/*
871
 * Synchronise all the inode's dirty buffers to the disk.
872
 *
873
 * We have conflicting pressures: we want to make sure that all
874
 * initially dirty buffers get waited on, but that any subsequently
875
 * dirtied buffers don't.  After all, we don't want fsync to last
876
 * forever if somebody is actively writing to the file.
877
 *
878
 * Do this in two main stages: first we copy dirty buffers to a
879
 * temporary inode list, queueing the writes as we go.  Then we clean
880
 * up, waiting for those writes to complete.
881
 *
882
 * During this second stage, any subsequent updates to the file may end
883
 * up refiling the buffer on the original inode's dirty list again, so
884
 * there is a chance we will end up with a buffer queued for write but
885
 * not yet completed on that list.  So, as a final cleanup we go through
886
 * the osync code to catch these locked, dirty buffers without requeuing
887
 * any newly dirty buffers for write.
888
 */
889
int fsync_buffers_list(struct list_head *list)
890
{
891
        struct buffer_head *bh;
892
        struct list_head tmp;
893
        int err = 0, err2;
894
 
895
        INIT_LIST_HEAD(&tmp);
896
 
897
        spin_lock(&lru_list_lock);
898
 
899
        while (!list_empty(list)) {
900
                bh = BH_ENTRY(list->next);
901
                list_del(&bh->b_inode_buffers);
902
                if (!buffer_dirty(bh) && !buffer_locked(bh))
903
                        clear_buffer_attached(bh);
904
                else {
905
                        set_buffer_attached(bh);
906
                        list_add(&bh->b_inode_buffers, &tmp);
907
                        if (buffer_dirty(bh)) {
908
                                get_bh(bh);
909
                                spin_unlock(&lru_list_lock);
910
                        /*
911
                         * Wait for I/O completion before submitting
912
                         * the buffer, to be sure the write will
913
                         * be effective on the latest data in
914
                         * the buffer. (otherwise - if there's old
915
                         * I/O in flight - write_buffer would become
916
                         * a noop)
917
                         */
918
                                wait_on_buffer(bh);
919
                                write_buffer(bh);
920
                                brelse(bh);
921
                                spin_lock(&lru_list_lock);
922
                        }
923
                }
924
        }
925
 
926
        while (!list_empty(&tmp)) {
927
                bh = BH_ENTRY(tmp.prev);
928
                remove_inode_queue(bh);
929
                get_bh(bh);
930
                spin_unlock(&lru_list_lock);
931
                wait_on_buffer(bh);
932
                if (!buffer_uptodate(bh))
933
                        err = -EIO;
934
                brelse(bh);
935
                spin_lock(&lru_list_lock);
936
        }
937
 
938
        spin_unlock(&lru_list_lock);
939
        err2 = osync_buffers_list(list);
940
 
941
        if (err)
942
                return err;
943
        else
944
                return err2;
945
}
946
 
947
/*
948
 * osync is designed to support O_SYNC io.  It waits synchronously for
949
 * all already-submitted IO to complete, but does not queue any new
950
 * writes to the disk.
951
 *
952
 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
953
 * you dirty the buffers, and then use osync_buffers_list to wait for
954
 * completion.  Any other dirty buffers which are not yet queued for
955
 * write will not be flushed to disk by the osync.
956
 */
957
static int osync_buffers_list(struct list_head *list)
958
{
959
        struct buffer_head *bh;
960
        struct list_head *p;
961
        int err = 0;
962
 
963
        spin_lock(&lru_list_lock);
964
 
965
 repeat:
966
        list_for_each_prev(p, list) {
967
                bh = BH_ENTRY(p);
968
                if (buffer_locked(bh)) {
969
                        get_bh(bh);
970
                        spin_unlock(&lru_list_lock);
971
                        wait_on_buffer(bh);
972
                        if (!buffer_uptodate(bh))
973
                                err = -EIO;
974
                        brelse(bh);
975
                        spin_lock(&lru_list_lock);
976
                        goto repeat;
977
                }
978
        }
979
 
980
        spin_unlock(&lru_list_lock);
981
        return err;
982
}
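/*
 * A minimal sketch of the O_SYNC pattern described above (names and the
 * exact call sequence are illustrative; the real callers are the
 * filesystem O_SYNC paths): queue the write with ll_rw_block() as the
 * buffer is dirtied, then let osync_buffers_list() wait for the I/O.
 */
static int example_osync_one_buffer(struct inode *inode, struct buffer_head *bh)
{
        mark_buffer_dirty(bh);
        buffer_insert_list(bh, &inode->i_dirty_buffers);
        ll_rw_block(WRITE, 1, &bh);                          /* queue the write */
        return osync_buffers_list(&inode->i_dirty_buffers);  /* wait for it */
}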
983
 
984
/*
985
 * Invalidate any and all dirty buffers on a given inode.  We are
986
 * probably unmounting the fs, but that doesn't mean we have already
987
 * done a sync().  Just drop the buffers from the inode list.
988
 */
989
void invalidate_inode_buffers(struct inode *inode)
990
{
991
        struct list_head * entry;
992
 
993
        spin_lock(&lru_list_lock);
994
        while ((entry = inode->i_dirty_buffers.next) != &inode->i_dirty_buffers)
995
                remove_inode_queue(BH_ENTRY(entry));
996
        while ((entry = inode->i_dirty_data_buffers.next) != &inode->i_dirty_data_buffers)
997
                remove_inode_queue(BH_ENTRY(entry));
998
        spin_unlock(&lru_list_lock);
999
}
1000
 
1001
 
1002
/*
1003
 * Ok, this is getblk, and it isn't very clear, again to hinder
1004
 * race-conditions. Most of the code is seldom used, (ie repeating),
1005
 * so it should be much more efficient than it looks.
1006
 *
1007
 * The algorithm is changed: hopefully better, and an elusive bug removed.
1008
 *
1009
 * 14.02.92: changed it to sync dirty buffers a bit: better performance
1010
 * when the filesystem starts to get full of dirty blocks (I hope).
1011
 */
1012
struct buffer_head * getblk(kdev_t dev, int block, int size)
1013
{
1014
        for (;;) {
1015
                struct buffer_head * bh;
1016
 
1017
                bh = get_hash_table(dev, block, size);
1018
                if (bh) {
1019
                        touch_buffer(bh);
1020
                        return bh;
1021
                }
1022
 
1023
                if (!grow_buffers(dev, block, size))
1024
                        free_more_memory();
1025
        }
1026
}
1027
 
1028
/* -1 -> no need to flush
1029
 
1030
    0 -> async flush (wake bdflush, don't wait)
    1 -> sync flush (wait for I/O completion) */
1031
static int balance_dirty_state(void)
1032
{
1033
        unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
1034
 
1035
        dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
1036
        tot = nr_free_buffer_pages();
1037
 
1038
        dirty *= 100;
1039
        soft_dirty_limit = tot * bdf_prm.b_un.nfract;
1040
        hard_dirty_limit = tot * bdf_prm.b_un.nfract_sync;
1041
 
1042
        /* First, check for the "real" dirty limit. */
1043
        if (dirty > soft_dirty_limit) {
1044
                if (dirty > hard_dirty_limit && !(current->flags & PF_NOIO))
1045
                        return 1;
1046
                return 0;
1047
        }
1048
 
1049
        return -1;
1050
}
1051
 
1052
static int bdflush_stop(void)
1053
{
1054
        unsigned long dirty, tot, dirty_limit;
1055
 
1056
        dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
1057
        tot = nr_free_buffer_pages();
1058
 
1059
        dirty *= 100;
1060
        dirty_limit = tot * bdf_prm.b_un.nfract_stop_bdflush;
1061
 
1062
        if (!laptop_mode && dirty > dirty_limit)
1063
                return 0;
1064
        return 1;
1065
}
1066
 
1067
/*
1068
 * if a new dirty buffer is created we need to balance bdflush.
1069
 *
1070
 * in the future we might want to make bdflush aware of different
1071
 * pressures on different devices - thus the (currently unused)
1072
 * 'dev' parameter.
1073
 */
1074
void balance_dirty(void)
1075
{
1076
        int state = balance_dirty_state();
1077
 
1078
        if (state < 0)
1079
                return;
1080
 
1081
        wakeup_bdflush();
1082
 
1083
        /*
1084
         * And if we're _really_ out of balance, wait for
1085
         * some of the dirty/locked buffers ourselves.
1086
         * This will throttle heavy writers.
1087
         */
1088
        if (state > 0) {
1089
                spin_lock(&lru_list_lock);
1090
                write_some_buffers(NODEV);
1091
        }
1092
}
1093
EXPORT_SYMBOL(balance_dirty);
1094
 
1095
inline void __mark_dirty(struct buffer_head *bh)
1096
{
1097
        bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
1098
        refile_buffer(bh);
1099
}
1100
 
1101
/* atomic version, the user must call balance_dirty() by hand
1102
   as soon as it becomes possible to block */
1103
void __mark_buffer_dirty(struct buffer_head *bh)
1104
{
1105
        if (!atomic_set_buffer_dirty(bh))
1106
                __mark_dirty(bh);
1107
}
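/*
 * A sketch of the contract stated above for __mark_buffer_dirty(): dirty
 * the buffer while atomic (here, under a caller-supplied spinlock), then
 * call balance_dirty() once blocking is allowed again.  The function and
 * lock names are illustrative.
 */
static void example_dirty_under_lock(struct buffer_head *bh, spinlock_t *lock)
{
        spin_lock(lock);
        __mark_buffer_dirty(bh);        /* must not block here */
        spin_unlock(lock);
        balance_dirty();                /* now it is safe to block */
}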
1108
 
1109
void mark_buffer_dirty(struct buffer_head *bh)
1110
{
1111
        if (!atomic_set_buffer_dirty(bh)) {
1112
                if (block_dump)
1113
                        printk("%s: dirtied buffer\n", current->comm);
1114
                __mark_dirty(bh);
1115
                balance_dirty();
1116
        }
1117
}
1118
 
1119
void set_buffer_flushtime(struct buffer_head *bh)
1120
{
1121
        bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
1122
}
1123
EXPORT_SYMBOL(set_buffer_flushtime);
1124
 
1125
inline int get_buffer_flushtime(void)
1126
{
1127
        return bdf_prm.b_un.interval;
1128
}
1129
EXPORT_SYMBOL(get_buffer_flushtime);
1130
 
1131
/*
1132
 * A buffer may need to be moved from one buffer list to another
1133
 * (e.g. in case it is not shared any more). Handle this.
1134
 */
1135
static void __refile_buffer(struct buffer_head *bh)
1136
{
1137
        int dispose = BUF_CLEAN;
1138
        if (buffer_locked(bh))
1139
                dispose = BUF_LOCKED;
1140
        if (buffer_dirty(bh))
1141
                dispose = BUF_DIRTY;
1142
        if (dispose != bh->b_list) {
1143
                __remove_from_lru_list(bh);
1144
                bh->b_list = dispose;
1145
                if (dispose == BUF_CLEAN)
1146
                        remove_inode_queue(bh);
1147
                __insert_into_lru_list(bh, dispose);
1148
        }
1149
}
1150
 
1151
void refile_buffer(struct buffer_head *bh)
1152
{
1153
        spin_lock(&lru_list_lock);
1154
        __refile_buffer(bh);
1155
        spin_unlock(&lru_list_lock);
1156
}
1157
 
1158
/*
1159
 * Release a buffer head
1160
 */
1161
void __brelse(struct buffer_head * buf)
1162
{
1163
        if (atomic_read(&buf->b_count)) {
1164
                put_bh(buf);
1165
                return;
1166
        }
1167
        printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1168
}
1169
 
1170
/*
1171
 * bforget() is like brelse(), except it discards any
1172
 * potentially dirty data.
1173
 */
1174
void __bforget(struct buffer_head * buf)
1175
{
1176
        mark_buffer_clean(buf);
1177
        __brelse(buf);
1178
}
1179
 
1180
/**
1181
 *      bread() - reads a specified block and returns the bh
1182
 *      @dev: device the block resides on
 *      @block: number of the block
1183
 *      @size: size (in bytes) to read
1184
 *
1185
 *      Reads a specified block, and returns buffer head that
1186
 *      contains it. It returns NULL if the block was unreadable.
1187
 */
1188
struct buffer_head * bread(kdev_t dev, int block, int size)
1189
{
1190
        struct buffer_head * bh;
1191
 
1192
        bh = getblk(dev, block, size);
1193
        if (buffer_uptodate(bh))
1194
                return bh;
1195
        set_bit(BH_Sync, &bh->b_state);
1196
        ll_rw_block(READ, 1, &bh);
1197
        wait_on_buffer(bh);
1198
        if (buffer_uptodate(bh))
1199
                return bh;
1200
        brelse(bh);
1201
        return NULL;
1202
}
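/*
 * A minimal usage sketch for bread()/brelse(): read one block, use its
 * contents, drop the reference.  The block number and size below are
 * placeholders.
 */
static int example_read_block(kdev_t dev)
{
        struct buffer_head *bh = bread(dev, 1 /* block */, 1024 /* size */);

        if (!bh)
                return -EIO;            /* the block was unreadable */
        /* ... bh->b_data now points at the block contents ... */
        brelse(bh);
        return 0;
}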
1203
 
1204
/*
1205
 * Note: the caller should wake up the buffer_wait list if needed.
1206
 */
1207
static void __put_unused_buffer_head(struct buffer_head * bh)
1208
{
1209
        if (unlikely(buffer_attached(bh)))
1210
                BUG();
1211
        if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
1212
                kmem_cache_free(bh_cachep, bh);
1213
        } else {
1214
                bh->b_dev = B_FREE;
1215
                bh->b_blocknr = -1;
1216
                bh->b_this_page = NULL;
1217
 
1218
                nr_unused_buffer_heads++;
1219
                bh->b_next_free = unused_list;
1220
                unused_list = bh;
1221
        }
1222
}
1223
 
1224
void put_unused_buffer_head(struct buffer_head *bh)
1225
{
1226
        spin_lock(&unused_list_lock);
1227
        __put_unused_buffer_head(bh);
1228
        spin_unlock(&unused_list_lock);
1229
}
1230
EXPORT_SYMBOL(put_unused_buffer_head);
1231
 
1232
/*
1233
 * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1234
 * no-buffer-head deadlock.  Return NULL on failure; waiting for
1235
 * buffer heads is now handled in create_buffers().
1236
 */
1237
struct buffer_head * get_unused_buffer_head(int async)
1238
{
1239
        struct buffer_head * bh;
1240
 
1241
        spin_lock(&unused_list_lock);
1242
        if (nr_unused_buffer_heads > NR_RESERVED) {
1243
                bh = unused_list;
1244
                unused_list = bh->b_next_free;
1245
                nr_unused_buffer_heads--;
1246
                spin_unlock(&unused_list_lock);
1247
                return bh;
1248
        }
1249
        spin_unlock(&unused_list_lock);
1250
 
1251
        /* This is critical.  We can't call out to the FS
1252
         * to get more buffer heads, because the FS may need
1253
         * more buffer-heads itself.  Thus SLAB_NOFS.
1254
         */
1255
        if((bh = kmem_cache_alloc(bh_cachep, SLAB_NOFS)) != NULL) {
1256
                bh->b_blocknr = -1;
1257
                bh->b_this_page = NULL;
1258
                return bh;
1259
        }
1260
 
1261
        /*
1262
         * If we need an async buffer, use the reserved buffer heads.
1263
         */
1264
        if (async) {
1265
                spin_lock(&unused_list_lock);
1266
                if (unused_list) {
1267
                        bh = unused_list;
1268
                        unused_list = bh->b_next_free;
1269
                        nr_unused_buffer_heads--;
1270
                        spin_unlock(&unused_list_lock);
1271
                        return bh;
1272
                }
1273
                spin_unlock(&unused_list_lock);
1274
        }
1275
 
1276
        return NULL;
1277
}
1278
EXPORT_SYMBOL(get_unused_buffer_head);
1279
 
1280
void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
1281
{
1282
        if (offset >= PAGE_SIZE)
1283
                BUG();
1284
 
1285
        if (PageHighMem(page)) {
1286
                bh->b_data = (char *)offset;
1287
        } else {
1288
                bh->b_data = page_address(page) + offset;
1289
        }
1290
        bh->b_page = page;
1291
}
1292
EXPORT_SYMBOL(set_bh_page);
1293
 
1294
/*
1295
 * Create the appropriate buffers when given a page for data area and
1296
 * the size of each buffer.. Use the bh->b_this_page linked list to
1297
 * follow the buffers created.  Return NULL if unable to create more
1298
 * buffers.
1299
 * The async flag is used to differentiate async IO (paging, swapping)
1300
 * from ordinary buffer allocations, and only async requests are allowed
1301
 * to sleep waiting for buffer heads.
1302
 */
1303
static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async)
1304
{
1305
        struct buffer_head *bh, *head;
1306
        long offset;
1307
 
1308
try_again:
1309
        head = NULL;
1310
        offset = PAGE_SIZE;
1311
        while ((offset -= size) >= 0) {
1312
                bh = get_unused_buffer_head(async);
1313
                if (!bh)
1314
                        goto no_grow;
1315
 
1316
                bh->b_dev = NODEV;
1317
                bh->b_this_page = head;
1318
                head = bh;
1319
 
1320
                bh->b_state = 0;
1321
                bh->b_next_free = NULL;
1322
                bh->b_pprev = NULL;
1323
                atomic_set(&bh->b_count, 0);
1324
                bh->b_size = size;
1325
 
1326
                set_bh_page(bh, page, offset);
1327
 
1328
                bh->b_list = BUF_CLEAN;
1329
                bh->b_end_io = NULL;
1330
        }
1331
        return head;
1332
/*
1333
 * In case anything failed, we just free everything we got.
1334
 */
1335
no_grow:
1336
        if (head) {
1337
                spin_lock(&unused_list_lock);
1338
                do {
1339
                        bh = head;
1340
                        head = head->b_this_page;
1341
                        __put_unused_buffer_head(bh);
1342
                } while (head);
1343
                spin_unlock(&unused_list_lock);
1344
 
1345
                /* Wake up any waiters ... */
1346
                wake_up(&buffer_wait);
1347
        }
1348
 
1349
        /*
1350
         * Return failure for non-async IO requests.  Async IO requests
1351
         * are not allowed to fail, so we have to wait until buffer heads
1352
         * become available.  But we don't want tasks sleeping with
1353
         * partially complete buffers, so all were released above.
1354
         */
1355
        if (!async)
1356
                return NULL;
1357
 
1358
        /* We're _really_ low on memory. Now we just
1359
         * wait for old buffer heads to become free due to
1360
         * finishing IO.  Since this is an async request and
1361
         * the reserve list is empty, we're sure there are
1362
         * async buffer heads in use.
1363
         */
1364
        run_task_queue(&tq_disk);
1365
 
1366
        free_more_memory();
1367
        goto try_again;
1368
}
1369
 
1370
/*
1371
 * Called when truncating a buffer on a page completely.
1372
 */
1373
static void discard_buffer(struct buffer_head * bh)
1374
{
1375
        if (buffer_mapped(bh) || buffer_delay(bh)) {
1376
                mark_buffer_clean(bh);
1377
                lock_buffer(bh);
1378
                clear_bit(BH_Uptodate, &bh->b_state);
1379
                clear_bit(BH_Mapped, &bh->b_state);
1380
                clear_bit(BH_Req, &bh->b_state);
1381
                clear_bit(BH_New, &bh->b_state);
1382
                clear_bit(BH_Delay, &bh->b_state);
1383
                remove_from_queues(bh);
1384
                unlock_buffer(bh);
1385
        }
1386
}
1387
 
1388
/**
1389
 * try_to_release_page - release old fs-specific metadata on a page
1390
 * @page: the locked page whose buffer metadata should be dropped
 * @gfp_mask: memory allocation flags, passed on to ->releasepage()
 *            and try_to_free_buffers()
1391
 */
1392
 
1393
int try_to_release_page(struct page * page, int gfp_mask)
1394
{
1395
        if (!PageLocked(page))
1396
                BUG();
1397
 
1398
        if (!page->mapping)
1399
                goto try_to_free;
1400
        if (!page->mapping->a_ops->releasepage)
1401
                goto try_to_free;
1402
        if (page->mapping->a_ops->releasepage(page, gfp_mask))
1403
                goto try_to_free;
1404
        /*
1405
         * We couldn't release buffer metadata; don't even bother trying
1406
         * to release buffers.
1407
         */
1408
        return 0;
1409
try_to_free:
1410
        return try_to_free_buffers(page, gfp_mask);
1411
}
1412
 
1413
/*
1414
 * We don't have to release all buffers here, but
1415
 * we have to be sure that no dirty buffer is left
1416
 * and no IO is going on (no buffer is locked), because
1417
 * we have truncated the file and are going to free the
1418
 * blocks on-disk..
1419
 */
1420
int discard_bh_page(struct page *page, unsigned long offset, int drop_pagecache)
1421
{
1422
        struct buffer_head *head, *bh, *next;
1423
        unsigned int curr_off = 0;
1424
 
1425
        if (!PageLocked(page))
1426
                BUG();
1427
        if (!page->buffers)
1428
                return 1;
1429
 
1430
        head = page->buffers;
1431
        bh = head;
1432
        do {
1433
                unsigned int next_off = curr_off + bh->b_size;
1434
                next = bh->b_this_page;
1435
 
1436
                /*
1437
                 * is this block fully flushed?
1438
                 */
1439
                if (offset <= curr_off)
1440
                        discard_buffer(bh);
1441
                curr_off = next_off;
1442
                bh = next;
1443
        } while (bh != head);
1444
 
1445
        /*
1446
         * subtle. We release buffer-heads only if this is
1447
         * the 'final' flushpage. We have invalidated the get_block
1448
         * cached value unconditionally, so real IO is not
1449
         * possible anymore.
1450
         *
1451
         * If the free doesn't work out, the buffers can be
1452
         * left around - they just turn into anonymous buffers
1453
         * instead.
1454
         */
1455
        if (!offset) {
1456
                if (!try_to_release_page(page, 0))
1457
                        return 0;
1458
        }
1459
 
1460
        return 1;
1461
}
1462
 
1463
void create_empty_buffers(struct page *page, kdev_t dev, unsigned long blocksize)
1464
{
1465
        struct buffer_head *bh, *head, *tail;
1466
 
1467
        /* FIXME: create_buffers should fail if there's not enough memory */
1468
        head = create_buffers(page, blocksize, 1);
1469
        if (page->buffers)
1470
                BUG();
1471
 
1472
        bh = head;
1473
        do {
1474
                bh->b_dev = dev;
1475
                bh->b_blocknr = 0;
1476
                bh->b_end_io = NULL;
1477
                tail = bh;
1478
                bh = bh->b_this_page;
1479
        } while (bh);
1480
        tail->b_this_page = head;
1481
        page->buffers = head;
1482
        page_cache_get(page);
1483
}
1484
EXPORT_SYMBOL(create_empty_buffers);
1485
 
1486
/*
1487
 * We are taking a block for data and we don't want any output from any
1488
 * buffer-cache aliases from the moment this function returns
1489
 * until the moment when something explicitly marks the buffer
1490
 * dirty (hopefully that will not happen until we will free that block ;-)
1491
 * We don't even need to mark it not-uptodate - nobody can expect
1492
 * anything from a newly allocated buffer anyway. We used to use
1493
 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1494
 * don't want to mark the alias unmapped, for example - it would confuse
1495
 * anyone who might pick it with bread() afterwards...
1496
 */
1497
 
1498
static void unmap_underlying_metadata(struct buffer_head * bh)
1499
{
1500
        struct buffer_head *old_bh;
1501
 
1502
        old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
1503
        if (old_bh) {
1504
                mark_buffer_clean(old_bh);
1505
                wait_on_buffer(old_bh);
1506
                clear_bit(BH_Req, &old_bh->b_state);
1507
                __brelse(old_bh);
1508
        }
1509
}
1510
 
1511
/*
1512
 * NOTE! All mapped/uptodate combinations are valid:
1513
 *
1514
 *      Mapped  Uptodate        Meaning
1515
 *
1516
 *      No      No              "unknown" - must do get_block()
1517
 *      No      Yes             "hole" - zero-filled
1518
 *      Yes     No              "allocated" - allocated on disk, not read in
1519
 *      Yes     Yes             "valid" - allocated and up-to-date in memory.
1520
 *
1521
 * "Dirty" is valid only with the last case (mapped+uptodate).
1522
 */
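/*
 * A small sketch spelling out the table above: classify a buffer by what
 * the caller still has to do before its contents may be used.  The macro
 * and function names are illustrative, not part of the buffer-cache API.
 */
#define BH_NEED_NOTHING   0     /* "hole" or "valid": usable as-is        */
#define BH_NEED_GET_BLOCK 1     /* "unknown":  must call get_block() first */
#define BH_NEED_READ      2     /* "allocated": on disk, must be read in   */

static int example_buffer_state(struct buffer_head *bh)
{
        if (!buffer_mapped(bh))
                return buffer_uptodate(bh) ? BH_NEED_NOTHING    /* hole */
                                           : BH_NEED_GET_BLOCK; /* unknown */
        if (!buffer_uptodate(bh))
                return BH_NEED_READ;                            /* allocated */
        return BH_NEED_NOTHING;                                 /* valid */
}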
1523
 
1524
/*
1525
 * block_write_full_page() is SMP threaded - the kernel lock is not held.
1526
 */
1527
static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block)
1528
{
1529
        int err, i;
1530
        unsigned long block;
1531
        struct buffer_head *bh, *head;
1532
        int need_unlock;
1533
 
1534
        if (!PageLocked(page))
1535
                BUG();
1536
 
1537
        if (!page->buffers)
1538
                create_empty_buffers(page, inode->i_dev, 1 << inode->i_blkbits);
1539
        head = page->buffers;
1540
 
1541
        block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1542
 
1543
        bh = head;
1544
        i = 0;
1545
 
1546
        /* Stage 1: make sure we have all the buffers mapped! */
1547
        do {
1548
                /*
1549
                 * If the buffer isn't up-to-date, we can't be sure
1550
                 * that the buffer has been initialized with the proper
1551
                 * block number information etc..
1552
                 *
1553
                 * Leave it to the low-level FS to make all those
1554
                 * decisions (block #0 may actually be a valid block)
1555
                 */
1556
                if (!buffer_mapped(bh)) {
1557
                        err = get_block(inode, block, bh, 1);
1558
                        if (err)
1559
                                goto out;
1560
                        if (buffer_new(bh))
1561
                                unmap_underlying_metadata(bh);
1562
                }
1563
                bh = bh->b_this_page;
1564
                block++;
1565
        } while (bh != head);
1566
 
1567
        /* Stage 2: lock the buffers, mark them clean */
1568
        do {
1569
                lock_buffer(bh);
1570
                set_buffer_async_io(bh);
1571
                set_bit(BH_Uptodate, &bh->b_state);
1572
                clear_bit(BH_Dirty, &bh->b_state);
1573
                bh = bh->b_this_page;
1574
        } while (bh != head);
1575
 
1576
        /* Stage 3: submit the IO */
1577
        do {
1578
                struct buffer_head *next = bh->b_this_page;
1579
                submit_bh(WRITE, bh);
1580
                bh = next;
1581
        } while (bh != head);
1582
 
1583
        /* Done - end_buffer_io_async will unlock */
1584
        SetPageUptodate(page);
1585
 
1586
        wakeup_page_waiters(page);
1587
 
1588
        return 0;
1589
 
1590
out:
1591
        /*
1592
         * ENOSPC, or some other error.  We may already have added some
1593
         * blocks to the file, so we need to write these out to avoid
1594
         * exposing stale data.
1595
         */
1596
        ClearPageUptodate(page);
1597
        bh = head;
1598
        need_unlock = 1;
1599
        /* Recovery: lock and submit the mapped buffers */
1600
        do {
1601
                if (buffer_mapped(bh)) {
1602
                        lock_buffer(bh);
1603
                        set_buffer_async_io(bh);
1604
                        need_unlock = 0;
1605
                }
1606
                bh = bh->b_this_page;
1607
        } while (bh != head);
1608
        do {
1609
                struct buffer_head *next = bh->b_this_page;
1610
                if (buffer_mapped(bh)) {
1611
                        set_bit(BH_Uptodate, &bh->b_state);
1612
                        clear_bit(BH_Dirty, &bh->b_state);
1613
                        submit_bh(WRITE, bh);
1614
                }
1615
                bh = next;
1616
        } while (bh != head);
1617
        if (need_unlock)
1618
                UnlockPage(page);
1619
        wakeup_page_waiters(page);
1620
        return err;
1621
}
1622
 
1623
static int __block_prepare_write(struct inode *inode, struct page *page,
1624
                unsigned from, unsigned to, get_block_t *get_block)
1625
{
1626
        unsigned block_start, block_end;
1627
        unsigned long block;
1628
        int err = 0;
1629
        unsigned blocksize, bbits;
1630
        struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1631
        char *kaddr = kmap(page);
1632
 
1633
        blocksize = 1 << inode->i_blkbits;
1634
        if (!page->buffers)
1635
                create_empty_buffers(page, inode->i_dev, blocksize);
1636
        head = page->buffers;
1637
 
1638
        bbits = inode->i_blkbits;
1639
        block = page->index << (PAGE_CACHE_SHIFT - bbits);
1640
 
1641
        for(bh = head, block_start = 0; bh != head || !block_start;
1642
            block++, block_start=block_end, bh = bh->b_this_page) {
1643
                if (!bh)
1644
                        BUG();
1645
                block_end = block_start+blocksize;
1646
                if (block_end <= from)
1647
                        continue;
1648
                if (block_start >= to)
1649
                        break;
1650
                clear_bit(BH_New, &bh->b_state);
1651
                if (!buffer_mapped(bh)) {
1652
                        err = get_block(inode, block, bh, 1);
1653
                        if (err)
1654
                                goto out;
1655
                        if (buffer_new(bh)) {
1656
                                unmap_underlying_metadata(bh);
1657
                                if (Page_Uptodate(page)) {
1658
                                        set_bit(BH_Uptodate, &bh->b_state);
1659
                                        continue;
1660
                                }
1661
                                if (block_end > to)
1662
                                        memset(kaddr+to, 0, block_end-to);
1663
                                if (block_start < from)
1664
                                        memset(kaddr+block_start, 0, from-block_start);
1665
                                if (block_end > to || block_start < from)
1666
                                        flush_dcache_page(page);
1667
                                continue;
1668
                        }
1669
                }
1670
                if (Page_Uptodate(page)) {
1671
                        set_bit(BH_Uptodate, &bh->b_state);
1672
                        continue;
1673
                }
1674
                if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1675
                     (block_start < from || block_end > to)) {
1676
                        ll_rw_block(READ, 1, &bh);
1677
                        *wait_bh++=bh;
1678
                }
1679
        }
1680
        /*
1681
         * If we issued read requests - let them complete.
1682
         */
1683
        while(wait_bh > wait) {
1684
                wait_on_buffer(*--wait_bh);
1685
                if (!buffer_uptodate(*wait_bh))
1686
                        return -EIO;
1687
        }
1688
        return 0;
1689
out:
1690
        /*
1691
         * Zero out any newly allocated blocks to avoid exposing stale
1692
         * data.  If BH_New is set, we know that the block was newly
1693
         * allocated in the above loop.
1694
         *
1695
         * In detail, the buffer can be new and uptodate because:
1696
         * 1) there was a hole in an uptodate page: get_block(create) allocates
1697
         *    the block, so the buffer is new and we also mark it uptodate
1698
         * 2) the buffer is not mapped but is uptodate due to a previous partial read.
1699
         *
1700
         * We can always ignore uptodate buffers here, if you mark a buffer
1701
         * uptodate you must make sure it contains the right data first.
1702
         *
1703
         * We must stop the "undo/clear" fixup pass not at the caller "to"
1704
         * but at the last block that we successfully reached in the main loop.
1705
         */
1706
        bh = head;
1707
        to = block_start; /* stop at the last successfully handled block */
1708
        block_start = 0;
1709
        do {
1710
                block_end = block_start+blocksize;
1711
                if (block_end <= from)
1712
                        goto next_bh;
1713
                if (block_start >= to)
1714
                        break;
1715
                if (buffer_new(bh) && !buffer_uptodate(bh)) {
1716
                        memset(kaddr+block_start, 0, bh->b_size);
1717
                        flush_dcache_page(page);
1718
                        set_bit(BH_Uptodate, &bh->b_state);
1719
                        mark_buffer_dirty(bh);
1720
                }
1721
next_bh:
1722
                block_start = block_end;
1723
                bh = bh->b_this_page;
1724
        } while (bh != head);
1725
        return err;
1726
}
1727
 
1728
static int __block_commit_write(struct inode *inode, struct page *page,
1729
                unsigned from, unsigned to)
1730
{
1731
        unsigned block_start, block_end;
1732
        int partial = 0, need_balance_dirty = 0;
1733
        unsigned blocksize;
1734
        struct buffer_head *bh, *head;
1735
 
1736
        blocksize = 1 << inode->i_blkbits;
1737
 
1738
        for(bh = head = page->buffers, block_start = 0;
1739
            bh != head || !block_start;
1740
            block_start=block_end, bh = bh->b_this_page) {
1741
                block_end = block_start + blocksize;
1742
                if (block_end <= from || block_start >= to) {
1743
                        if (!buffer_uptodate(bh))
1744
                                partial = 1;
1745
                } else {
1746
                        set_bit(BH_Uptodate, &bh->b_state);
1747
                        if (!atomic_set_buffer_dirty(bh)) {
1748
                                __mark_dirty(bh);
1749
                                buffer_insert_inode_data_queue(bh, inode);
1750
                                need_balance_dirty = 1;
1751
                        }
1752
                }
1753
        }
1754
 
1755
        if (need_balance_dirty)
1756
                balance_dirty();
1757
        /*
1758
         * If this is a partial write that happened to make all buffers
1759
         * uptodate, then we can optimize away a bogus readpage() for
1760
         * the next read(). Here we 'discover' whether the page went
1761
         * uptodate as a result of this (potentially partial) write.
1762
         */
1763
        if (!partial)
1764
                SetPageUptodate(page);
1765
        return 0;
1766
}
1767
 
1768
/*
1769
 * Generic "read page" function for block devices that have the normal
1770
 * get_block functionality. This is most of the block device filesystems.
1771
 * Reads the page asynchronously --- the unlock_buffer() and
1772
 * mark_buffer_uptodate() functions propagate buffer state into the
1773
 * page struct once IO has completed.
1774
 */
1775
int block_read_full_page(struct page *page, get_block_t *get_block)
1776
{
1777
        struct inode *inode = page->mapping->host;
1778
        unsigned long iblock, lblock;
1779
        struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1780
        unsigned int blocksize, blocks;
1781
        int nr, i;
1782
 
1783
        if (!PageLocked(page))
1784
                PAGE_BUG(page);
1785
        blocksize = 1 << inode->i_blkbits;
1786
        if (!page->buffers)
1787
                create_empty_buffers(page, inode->i_dev, blocksize);
1788
        head = page->buffers;
1789
 
1790
        blocks = PAGE_CACHE_SIZE >> inode->i_blkbits;
1791
        iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1792
        lblock = (inode->i_size+blocksize-1) >> inode->i_blkbits;
1793
        bh = head;
1794
        nr = 0;
1795
        i = 0;
1796
 
1797
        do {
1798
                if (buffer_uptodate(bh))
1799
                        continue;
1800
 
1801
                if (!buffer_mapped(bh)) {
1802
                        if (iblock < lblock) {
1803
                                if (get_block(inode, iblock, bh, 0))
1804
                                        SetPageError(page);
1805
                        }
1806
                        if (!buffer_mapped(bh)) {
1807
                                memset(kmap(page) + i*blocksize, 0, blocksize);
1808
                                flush_dcache_page(page);
1809
                                kunmap(page);
1810
                                set_bit(BH_Uptodate, &bh->b_state);
1811
                                continue;
1812
                        }
1813
                        /* get_block() might have updated the buffer synchronously */
1814
                        if (buffer_uptodate(bh))
1815
                                continue;
1816
                }
1817
 
1818
                arr[nr] = bh;
1819
                nr++;
1820
        } while (i++, iblock++, (bh = bh->b_this_page) != head);
1821
 
1822
        if (!nr) {
1823
                /*
1824
                 * All buffers are uptodate - we can set the page uptodate
1825
                 * as well. But not if get_block() returned an error.
1826
                 */
1827
                if (!PageError(page))
1828
                        SetPageUptodate(page);
1829
                UnlockPage(page);
1830
                return 0;
1831
        }
1832
 
1833
        /* Stage two: lock the buffers */
1834
        for (i = 0; i < nr; i++) {
1835
                struct buffer_head * bh = arr[i];
1836
                lock_buffer(bh);
1837
                set_buffer_async_io(bh);
1838
        }
1839
 
1840
        /* Stage 3: start the IO */
1841
        for (i = 0; i < nr; i++) {
1842
                struct buffer_head * bh = arr[i];
1843
                if (buffer_uptodate(bh))
1844
                        end_buffer_io_async(bh, 1);
1845
                else
1846
                        submit_bh(READ, bh);
1847
        }
1848
 
1849
        wakeup_page_waiters(page);
1850
 
1851
        return 0;
1852
}
1853
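/*
 * Sketch (not from the original file): a filesystem with a conventional
 * get_block would typically expose block_read_full_page() through its
 * ->readpage operation.  "example_get_block" below is a hypothetical
 * stand-in that models a flat, contiguous layout: file block N sits at
 * device block N.  A real filesystem would do its own block mapping here.
 */
static int example_get_block(struct inode *inode, long block,
                             struct buffer_head *bh_result, int create)
{
        bh_result->b_dev = inode->i_dev;
        bh_result->b_blocknr = block;                   /* trivial 1:1 mapping */
        bh_result->b_state |= (1UL << BH_Mapped);
        return 0;
}

static int example_readpage(struct file *file, struct page *page)
{
        return block_read_full_page(page, example_get_block);
}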
 
1854
/* utility function for filesystems that need to do work on expanding
1855
 * truncates.  Uses prepare/commit_write to allow the filesystem to
1856
 * deal with the hole.
1857
 */
1858
int generic_cont_expand(struct inode *inode, loff_t size)
1859
{
1860
        struct address_space *mapping = inode->i_mapping;
1861
        struct page *page;
1862
        unsigned long index, offset, limit;
1863
        int err;
1864
 
1865
        err = -EFBIG;
1866
        limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
1867
        if (limit != RLIM_INFINITY && size > (loff_t)limit) {
1868
                send_sig(SIGXFSZ, current, 0);
1869
                goto out;
1870
        }
1871
        if (size > inode->i_sb->s_maxbytes)
1872
                goto out;
1873
 
1874
        offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
1875
 
1876
        /* ugh.  in prepare/commit_write, if from==to==start of block, we
1877
        ** skip the prepare.  make sure we never send an offset for the start
1878
        ** of a block
1879
        */
1880
        if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
1881
                offset++;
1882
        }
1883
        index = size >> PAGE_CACHE_SHIFT;
1884
        err = -ENOMEM;
1885
        page = grab_cache_page(mapping, index);
1886
        if (!page)
1887
                goto out;
1888
        err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
1889
        if (!err) {
1890
                err = mapping->a_ops->commit_write(NULL, page, offset, offset);
1891
        }
1892
        UnlockPage(page);
1893
        page_cache_release(page);
1894
        if (err > 0)
1895
                err = 0;
1896
out:
1897
        return err;
1898
}
1899
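/*
 * Sketch: a filesystem that cannot represent holes might call
 * generic_cont_expand() from its truncate/setattr path when the new size
 * is larger than the current one, so the gap gets explicitly zero-filled.
 * "example_grow_file" is hypothetical.
 */
static int example_grow_file(struct inode *inode, loff_t new_size)
{
        if (new_size <= inode->i_size)
                return 0;       /* shrinking is handled by the normal truncate path */
        return generic_cont_expand(inode, new_size);
}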
 
1900
/*
1901
 * For moronic filesystems that do not allow holes in files.
1902
 * We may have to extend the file.
1903
 */
1904
 
1905
int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, unsigned long *bytes)
1906
{
1907
        struct address_space *mapping = page->mapping;
1908
        struct inode *inode = mapping->host;
1909
        struct page *new_page;
1910
        unsigned long pgpos;
1911
        long status;
1912
        unsigned zerofrom;
1913
        unsigned blocksize = 1 << inode->i_blkbits;
1914
        char *kaddr;
1915
 
1916
        while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
1917
                status = -ENOMEM;
1918
                new_page = grab_cache_page(mapping, pgpos);
1919
                if (!new_page)
1920
                        goto out;
1921
                /* we might sleep */
1922
                if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
1923
                        UnlockPage(new_page);
1924
                        page_cache_release(new_page);
1925
                        continue;
1926
                }
1927
                zerofrom = *bytes & ~PAGE_CACHE_MASK;
1928
                if (zerofrom & (blocksize-1)) {
1929
                        *bytes |= (blocksize-1);
1930
                        (*bytes)++;
1931
                }
1932
                status = __block_prepare_write(inode, new_page, zerofrom,
1933
                                                PAGE_CACHE_SIZE, get_block);
1934
                if (status)
1935
                        goto out_unmap;
1936
                kaddr = page_address(new_page);
1937
                memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
1938
                flush_dcache_page(new_page);
1939
                __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE);
1940
                kunmap(new_page);
1941
                UnlockPage(new_page);
1942
                page_cache_release(new_page);
1943
        }
1944
 
1945
        if (page->index < pgpos) {
1946
                /* completely inside the area */
1947
                zerofrom = offset;
1948
        } else {
1949
                /* page covers the boundary, find the boundary offset */
1950
                zerofrom = *bytes & ~PAGE_CACHE_MASK;
1951
 
1952
                /* if we are going to expand the file, the last block will be filled */
1953
                if (to > zerofrom && (zerofrom & (blocksize-1))) {
1954
                        *bytes |= (blocksize-1);
1955
                        (*bytes)++;
1956
                }
1957
 
1958
                /* starting below the boundary? Nothing to zero out */
1959
                if (offset <= zerofrom)
1960
                        zerofrom = offset;
1961
        }
1962
        status = __block_prepare_write(inode, page, zerofrom, to, get_block);
1963
        if (status)
1964
                goto out1;
1965
        kaddr = page_address(page);
1966
        if (zerofrom < offset) {
1967
                memset(kaddr+zerofrom, 0, offset-zerofrom);
1968
                flush_dcache_page(page);
1969
                __block_commit_write(inode, page, zerofrom, offset);
1970
        }
1971
        return 0;
1972
out1:
1973
        ClearPageUptodate(page);
1974
        kunmap(page);
1975
        return status;
1976
 
1977
out_unmap:
1978
        ClearPageUptodate(new_page);
1979
        kunmap(new_page);
1980
        UnlockPage(new_page);
1981
        page_cache_release(new_page);
1982
out:
1983
        return status;
1984
}
1985
 
1986
int block_prepare_write(struct page *page, unsigned from, unsigned to,
1987
                        get_block_t *get_block)
1988
{
1989
        struct inode *inode = page->mapping->host;
1990
        int err = __block_prepare_write(inode, page, from, to, get_block);
1991
        if (err) {
1992
                ClearPageUptodate(page);
1993
                kunmap(page);
1994
        }
1995
        return err;
1996
}
1997
 
1998
int block_commit_write(struct page *page, unsigned from, unsigned to)
1999
{
2000
        struct inode *inode = page->mapping->host;
2001
        __block_commit_write(inode,page,from,to);
2002
        kunmap(page);
2003
        return 0;
2004
}
2005
 
2006
int generic_commit_write(struct file *file, struct page *page,
2007
                unsigned from, unsigned to)
2008
{
2009
        struct inode *inode = page->mapping->host;
2010
        loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2011
        __block_commit_write(inode,page,from,to);
2012
        kunmap(page);
2013
        if (pos > inode->i_size) {
2014
                inode->i_size = pos;
2015
                mark_inode_dirty(inode);
2016
        }
2017
        return 0;
2018
}
2019
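/*
 * Sketch: the usual wiring for the generic write path.  A filesystem's
 * ->prepare_write and ->commit_write typically just forward to
 * block_prepare_write() and generic_commit_write(); the wrapper names are
 * hypothetical and example_get_block is the stand-in sketched earlier.
 */
static int example_prepare_write(struct file *file, struct page *page,
                                 unsigned from, unsigned to)
{
        return block_prepare_write(page, from, to, example_get_block);
}

static int example_commit_write(struct file *file, struct page *page,
                                unsigned from, unsigned to)
{
        return generic_commit_write(file, page, from, to);
}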
 
2020
int block_truncate_page(struct address_space *mapping, loff_t from, get_block_t *get_block)
2021
{
2022
        unsigned long index = from >> PAGE_CACHE_SHIFT;
2023
        unsigned offset = from & (PAGE_CACHE_SIZE-1);
2024
        unsigned blocksize, iblock, length, pos;
2025
        struct inode *inode = mapping->host;
2026
        struct page *page;
2027
        struct buffer_head *bh;
2028
        int err;
2029
 
2030
        blocksize = 1 << inode->i_blkbits;
2031
        length = offset & (blocksize - 1);
2032
 
2033
        /* Block boundary? Nothing to do */
2034
        if (!length)
2035
                return 0;
2036
 
2037
        length = blocksize - length;
2038
        iblock = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2039
 
2040
        page = grab_cache_page(mapping, index);
2041
        err = -ENOMEM;
2042
        if (!page)
2043
                goto out;
2044
 
2045
        if (!page->buffers)
2046
                create_empty_buffers(page, inode->i_dev, blocksize);
2047
 
2048
        /* Find the buffer that contains "offset" */
2049
        bh = page->buffers;
2050
        pos = blocksize;
2051
        while (offset >= pos) {
2052
                bh = bh->b_this_page;
2053
                iblock++;
2054
                pos += blocksize;
2055
        }
2056
 
2057
        err = 0;
2058
        if (!buffer_mapped(bh)) {
2059
                /* Hole? Nothing to do */
2060
                if (buffer_uptodate(bh))
2061
                        goto unlock;
2062
                get_block(inode, iblock, bh, 0);
2063
                /* Still unmapped? Nothing to do */
2064
                if (!buffer_mapped(bh))
2065
                        goto unlock;
2066
        }
2067
 
2068
        /* Ok, it's mapped. Make sure it's up-to-date */
2069
        if (Page_Uptodate(page))
2070
                set_bit(BH_Uptodate, &bh->b_state);
2071
 
2072
        if (!buffer_uptodate(bh) && !buffer_delay(bh)) {
2073
                err = -EIO;
2074
                ll_rw_block(READ, 1, &bh);
2075
                wait_on_buffer(bh);
2076
                /* Uhhuh. Read error. Complain and punt. */
2077
                if (!buffer_uptodate(bh))
2078
                        goto unlock;
2079
        }
2080
 
2081
        memset(kmap(page) + offset, 0, length);
2082
        flush_dcache_page(page);
2083
        kunmap(page);
2084
 
2085
        if (!atomic_set_buffer_dirty(bh)) {
2086
                __mark_dirty(bh);
2087
                buffer_insert_inode_data_queue(bh, inode);
2088
                balance_dirty();
2089
        }
2090
 
2091
        err = 0;
2092
 
2093
unlock:
2094
        UnlockPage(page);
2095
        page_cache_release(page);
2096
out:
2097
        return err;
2098
}
2099
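/*
 * Sketch: on truncate, a filesystem typically uses block_truncate_page()
 * to zero the tail of the (now partial) last block before freeing the
 * blocks beyond the new i_size.  "example_truncate" is hypothetical;
 * example_get_block is the stand-in sketched earlier.
 */
static void example_truncate(struct inode *inode)
{
        block_truncate_page(inode->i_mapping, inode->i_size, example_get_block);
        /* ... filesystem-specific freeing of blocks past i_size would go here ... */
        inode->i_mtime = inode->i_ctime = CURRENT_TIME;
        mark_inode_dirty(inode);
}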
 
2100
int block_write_full_page(struct page *page, get_block_t *get_block)
2101
{
2102
        struct inode *inode = page->mapping->host;
2103
        unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
2104
        unsigned offset;
2105
        int err;
2106
 
2107
        /* easy case */
2108
        if (page->index < end_index)
2109
                return __block_write_full_page(inode, page, get_block);
2110
 
2111
        /* things got complicated... */
2112
        offset = inode->i_size & (PAGE_CACHE_SIZE-1);
2113
        /* OK, are we completely out? */
2114
        if (page->index >= end_index+1 || !offset) {
2115
                UnlockPage(page);
2116
                return -EIO;
2117
        }
2118
 
2119
        /* Sigh... will have to work, then... */
2120
        err = __block_prepare_write(inode, page, 0, offset, get_block);
2121
        if (!err) {
2122
                memset(page_address(page) + offset, 0, PAGE_CACHE_SIZE - offset);
2123
                flush_dcache_page(page);
2124
                __block_commit_write(inode,page,0,offset);
2125
done:
2126
                kunmap(page);
2127
                UnlockPage(page);
2128
                return err;
2129
        }
2130
        ClearPageUptodate(page);
2131
        goto done;
2132
}
2133
 
2134
/*
2135
 * Commence writeout of all the buffers against a page.  The
2136
 * page must be locked.   Returns zero on success or a negative
2137
 * errno.
2138
 */
2139
int writeout_one_page(struct page *page)
2140
{
2141
        struct buffer_head *bh, *head = page->buffers;
2142
 
2143
        if (!PageLocked(page))
2144
                BUG();
2145
        bh = head;
2146
        do {
2147
                if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh))
2148
                        continue;
2149
 
2150
                bh->b_flushtime = jiffies;
2151
                ll_rw_block(WRITE, 1, &bh);
2152
        } while ((bh = bh->b_this_page) != head);
2153
        return 0;
2154
}
2155
EXPORT_SYMBOL(writeout_one_page);
2156
 
2157
/*
2158
 * Wait for completion of I/O of all buffers against a page.  The page
2159
 * must be locked.  Returns zero on success or a negative errno.
2160
 */
2161
int waitfor_one_page(struct page *page)
2162
{
2163
        int error = 0;
2164
        struct buffer_head *bh, *head = page->buffers;
2165
 
2166
        bh = head;
2167
        do {
2168
                wait_on_buffer(bh);
2169
                if (buffer_req(bh) && !buffer_uptodate(bh))
2170
                        error = -EIO;
2171
        } while ((bh = bh->b_this_page) != head);
2172
        return error;
2173
}
2174
EXPORT_SYMBOL(waitfor_one_page);
2175
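/*
 * Sketch: the two helpers above are normally used back to back to flush a
 * locked page's buffers synchronously - start the writes, then wait for
 * them to finish.  The wrapper name is hypothetical.
 */
static int example_flush_one_page(struct page *page)
{
        int err = writeout_one_page(page);      /* page must already be locked */
        int err2 = waitfor_one_page(page);

        return err ? err : err2;
}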
 
2176
int generic_block_bmap(struct address_space *mapping, long block, get_block_t *get_block)
2177
{
2178
        struct buffer_head tmp;
2179
        struct inode *inode = mapping->host;
2180
        tmp.b_state = 0;
2181
        tmp.b_blocknr = 0;
2182
        get_block(inode, block, &tmp, 0);
2183
        return tmp.b_blocknr;
2184
}
2185
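/*
 * Sketch: ->bmap usually just forwards to generic_block_bmap() with the
 * filesystem's block-mapping routine (here the hypothetical
 * example_get_block sketched earlier).
 */
static int example_bmap(struct address_space *mapping, long block)
{
        return generic_block_bmap(mapping, block, example_get_block);
}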
 
2186
int generic_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize, get_block_t * get_block)
2187
{
2188
        int i, nr_blocks, retval;
2189
        unsigned long * blocks = iobuf->blocks;
2190
        int length;
2191
        int beyond_eof = 0;
2192
 
2193
        length = iobuf->length;
2194
        nr_blocks = length / blocksize;
2195
        /* build the blocklist */
2196
        for (i = 0; i < nr_blocks; i++, blocknr++) {
2197
                struct buffer_head bh;
2198
 
2199
                bh.b_state = 0;
2200
                bh.b_dev = inode->i_dev;
2201
                bh.b_size = blocksize;
2202
                bh.b_page = NULL;
2203
 
2204
                if (((loff_t) blocknr) * blocksize >= inode->i_size)
2205
                        beyond_eof = 1;
2206
 
2207
                /* Only allow get_block to create new blocks if we are safely
2208
                   beyond EOF.  O_DIRECT is unsafe inside sparse files. */
2209
                retval = get_block(inode, blocknr, &bh,
2210
                                   ((rw != READ) && beyond_eof));
2211
 
2212
                if (retval) {
2213
                        if (!i)
2214
                                /* report error to userspace */
2215
                                goto out;
2216
                        else
2217
                                /* do short I/O until 'i' */
2218
                                break;
2219
                }
2220
 
2221
                if (rw == READ) {
2222
                        if (buffer_new(&bh))
2223
                                BUG();
2224
                        if (!buffer_mapped(&bh)) {
2225
                                /* there was a hole in the filesystem */
2226
                                blocks[i] = -1UL;
2227
                                continue;
2228
                        }
2229
                } else {
2230
                        if (buffer_new(&bh))
2231
                                unmap_underlying_metadata(&bh);
2232
                        if (!buffer_mapped(&bh))
2233
                                /* upper layers need to pass the error on or
2234
                                 * fall back to buffered IO. */
2235
                                return -ENOTBLK;
2236
                }
2237
                blocks[i] = bh.b_blocknr;
2238
        }
2239
 
2240
        /* patch length to handle short I/O */
2241
        iobuf->length = i * blocksize;
2242
        if (!beyond_eof)
2243
                up(&inode->i_sem);
2244
        retval = brw_kiovec(rw, 1, &iobuf, inode->i_dev, iobuf->blocks, blocksize);
2245
        if (!beyond_eof)
2246
                down(&inode->i_sem);
2247
        /* restore orig length */
2248
        iobuf->length = length;
2249
 out:
2250
 
2251
        return retval;
2252
}
2253
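/*
 * Sketch: O_DIRECT support is typically a thin wrapper passing the
 * filesystem's get_block down to generic_direct_IO(); the parameters
 * mirror generic_direct_IO() above and the wrapper name is hypothetical.
 */
static int example_direct_IO(int rw, struct inode *inode, struct kiobuf *iobuf,
                             unsigned long blocknr, int blocksize)
{
        return generic_direct_IO(rw, inode, iobuf, blocknr, blocksize,
                                 example_get_block);
}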
 
2254
/*
2255
 * IO completion routine for a buffer_head being used for kiobuf IO: we
2256
 * can't dispatch the kiobuf callback until io_count reaches 0.
2257
 */
2258
 
2259
static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate)
2260
{
2261
        struct kiobuf *kiobuf;
2262
 
2263
        mark_buffer_uptodate(bh, uptodate);
2264
 
2265
        kiobuf = bh->b_private;
2266
        end_kio_request(kiobuf, uptodate);
2267
        unlock_buffer(bh);
2268
}
2269
 
2270
/*
2271
 * For brw_kiovec: submit a set of buffer_head temporary IOs and wait
2272
 * for them to complete.  Clean up the buffer_heads afterwards.
2273
 */
2274
 
2275
static int wait_kio(int rw, int nr, struct buffer_head *bh[], int size)
2276
{
2277
        int iosize, err;
2278
        int i;
2279
        struct buffer_head *tmp;
2280
 
2281
        iosize = 0;
2282
        err = 0;
2283
 
2284
        for (i = nr; --i >= 0; ) {
2285
                iosize += size;
2286
                tmp = bh[i];
2287
                wait_on_buffer(tmp);
2288
 
2289
                if (!buffer_uptodate(tmp)) {
2290
                        /* We are traversing bh'es in reverse order so
2291
                           clearing iosize on error calculates the
2292
                           amount of IO before the first error. */
2293
                        iosize = 0;
2294
                        err = -EIO;
2295
                }
2296
        }
2297
 
2298
        if (iosize)
2299
                return iosize;
2300
        return err;
2301
}
2302
 
2303
/*
2304
 * Start I/O on a physical range of kernel memory, defined by a vector
2305
 * of kiobuf structs (much like a user-space iovec list).
2306
 *
2307
 * The kiobuf must already be locked for IO.  IO is submitted
2308
 * asynchronously: you need to check page->locked and page->uptodate.
2309
 *
2310
 * It is up to the caller to make sure that there are enough blocks
2311
 * passed in to completely map the iobufs to disk.
2312
 */
2313
 
2314
int brw_kiovec(int rw, int nr, struct kiobuf *iovec[],
2315
               kdev_t dev, unsigned long b[], int size)
2316
{
2317
        int             err;
2318
        int             length;
2319
        int             transferred;
2320
        int             i;
2321
        int             bufind;
2322
        int             pageind;
2323
        int             bhind;
2324
        int             offset;
2325
        unsigned long   blocknr;
2326
        struct kiobuf * iobuf = NULL;
2327
        struct page *   map;
2328
        struct buffer_head *tmp, **bhs = NULL;
2329
 
2330
        if (!nr)
2331
                return 0;
2332
 
2333
        /*
2334
         * First, do some alignment and validity checks
2335
         */
2336
        for (i = 0; i < nr; i++) {
2337
                iobuf = iovec[i];
2338
                if ((iobuf->offset & (size-1)) ||
2339
                    (iobuf->length & (size-1)))
2340
                        return -EINVAL;
2341
                if (!iobuf->nr_pages)
2342
                        panic("brw_kiovec: iobuf not initialised");
2343
        }
2344
 
2345
        /*
2346
         * OK to walk down the iovec doing page IO on each page we find.
2347
         */
2348
        bufind = bhind = transferred = err = 0;
2349
        for (i = 0; i < nr; i++) {
2350
                iobuf = iovec[i];
2351
                offset = iobuf->offset;
2352
                length = iobuf->length;
2353
                iobuf->errno = 0;
2354
                if (!bhs)
2355
                        bhs = iobuf->bh;
2356
 
2357
                for (pageind = 0; pageind < iobuf->nr_pages; pageind++) {
2358
                        map  = iobuf->maplist[pageind];
2359
                        if (!map) {
2360
                                err = -EFAULT;
2361
                                goto finished;
2362
                        }
2363
 
2364
                        while (length > 0) {
2365
                                blocknr = b[bufind++];
2366
                                if (blocknr == -1UL) {
2367
                                        if (rw == READ) {
2368
                                                /* there was a hole in the filesystem */
2369
                                                memset(kmap(map) + offset, 0, size);
2370
                                                flush_dcache_page(map);
2371
                                                kunmap(map);
2372
 
2373
                                                transferred += size;
2374
                                                goto skip_block;
2375
                                        } else
2376
                                                BUG();
2377
                                }
2378
                                tmp = bhs[bhind++];
2379
 
2380
                                tmp->b_size = size;
2381
                                set_bh_page(tmp, map, offset);
2382
                                tmp->b_this_page = tmp;
2383
 
2384
                                init_buffer(tmp, end_buffer_io_kiobuf, iobuf);
2385
                                tmp->b_dev = dev;
2386
                                tmp->b_blocknr = blocknr;
2387
                                tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | (1 << BH_Req);
2388
 
2389
                                if (rw == WRITE) {
2390
                                        set_bit(BH_Uptodate, &tmp->b_state);
2391
                                        clear_bit(BH_Dirty, &tmp->b_state);
2392
                                } else
2393
                                        set_bit(BH_Uptodate, &tmp->b_state);
2394
 
2395
                                atomic_inc(&iobuf->io_count);
2396
                                submit_bh(rw, tmp);
2397
                                /*
2398
                                 * Wait for IO if we have got too much
2399
                                 */
2400
                                if (bhind >= KIO_MAX_SECTORS) {
2401
                                        kiobuf_wait_for_io(iobuf); /* wake-one */
2402
                                        err = wait_kio(rw, bhind, bhs, size);
2403
                                        if (err >= 0)
2404
                                                transferred += err;
2405
                                        else
2406
                                                goto finished;
2407
                                        bhind = 0;
2408
                                }
2409
 
2410
                        skip_block:
2411
                                length -= size;
2412
                                offset += size;
2413
 
2414
                                if (offset >= PAGE_SIZE) {
2415
                                        offset = 0;
2416
                                        break;
2417
                                }
2418
                        } /* End of block loop */
2419
                } /* End of page loop */
2420
        } /* End of iovec loop */
2421
 
2422
        /* Is there any IO still left to submit? */
2423
        if (bhind) {
2424
                kiobuf_wait_for_io(iobuf); /* wake-one */
2425
                err = wait_kio(rw, bhind, bhs, size);
2426
                if (err >= 0)
2427
                        transferred += err;
2428
                else
2429
                        goto finished;
2430
        }
2431
 
2432
 finished:
2433
        if (transferred)
2434
                return transferred;
2435
        return err;
2436
}
2437
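/*
 * Sketch of a brw_kiovec() call, mirroring how generic_direct_IO() above
 * uses it: the caller hands in an already mapped and locked kiobuf, the
 * target device, and one device-block number per blocksize chunk of the
 * iobuf (-1UL marking a hole on reads).  The wrapper name is hypothetical.
 */
static int example_rw_blocks(int rw, struct kiobuf *iobuf, kdev_t dev,
                             unsigned long *blocks, int blocksize)
{
        /* returns the number of bytes transferred, or a negative errno */
        return brw_kiovec(rw, 1, &iobuf, dev, blocks, blocksize);
}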
 
2438
/*
2439
 * Start I/O on a page.
2440
 * This function expects the page to be locked and may return
2441
 * before I/O is complete. You then have to check page->locked
2442
 * and page->uptodate.
2443
 *
2444
 * brw_page() is SMP-safe, although it's being called with the
2445
 * kernel lock held - but the code is ready.
2446
 *
2447
 * FIXME: we need a swapper_inode->get_block function to remove
2448
 *        some of the bmap kludges and interface ugliness here.
2449
 */
2450
int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size)
2451
{
2452
        struct buffer_head *head, *bh;
2453
 
2454
        if (!PageLocked(page))
2455
                panic("brw_page: page not locked for I/O");
2456
 
2457
        if (!page->buffers)
2458
                create_empty_buffers(page, dev, size);
2459
        head = bh = page->buffers;
2460
 
2461
        /* Stage 1: lock all the buffers */
2462
        do {
2463
                lock_buffer(bh);
2464
                bh->b_blocknr = *(b++);
2465
                set_bit(BH_Mapped, &bh->b_state);
2466
                set_buffer_async_io(bh);
2467
                bh = bh->b_this_page;
2468
        } while (bh != head);
2469
 
2470
        /* Stage 2: start the IO */
2471
        do {
2472
                struct buffer_head *next = bh->b_this_page;
2473
                submit_bh(rw, bh);
2474
                bh = next;
2475
        } while (bh != head);
2476
        wakeup_page_waiters(page);
2477
        return 0;
2478
}
2479
 
2480
int block_symlink(struct inode *inode, const char *symname, int len)
2481
{
2482
        struct address_space *mapping = inode->i_mapping;
2483
        struct page *page = grab_cache_page(mapping, 0);
2484
        int err = -ENOMEM;
2485
        char *kaddr;
2486
 
2487
        if (!page)
2488
                goto fail;
2489
        err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
2490
        if (err)
2491
                goto fail_map;
2492
        kaddr = page_address(page);
2493
        memcpy(kaddr, symname, len-1);
2494
        mapping->a_ops->commit_write(NULL, page, 0, len-1);
2495
        /*
2496
         * Notice that we are _not_ going to block here - end of page is
2497
         * unmapped, so this will only try to map the rest of page, see
2498
         * that it is unmapped (typically even will not look into inode -
2499
         * ->i_size will be enough for everything) and zero it out.
2500
         * OTOH it's obviously correct and should make the page up-to-date.
2501
         */
2502
        err = mapping->a_ops->readpage(NULL, page);
2503
        wait_on_page(page);
2504
        page_cache_release(page);
2505
        if (err < 0)
2506
                goto fail;
2507
        mark_inode_dirty(inode);
2508
        return 0;
2509
fail_map:
2510
        UnlockPage(page);
2511
        page_cache_release(page);
2512
fail:
2513
        return err;
2514
}
2515
 
2516
static inline void link_dev_buffers(struct page * page, struct buffer_head *head)
2517
{
2518
        struct buffer_head *bh, *tail;
2519
 
2520
        bh = head;
2521
        do {
2522
                tail = bh;
2523
                bh = bh->b_this_page;
2524
        } while (bh);
2525
        tail->b_this_page = head;
2526
        page->buffers = head;
2527
        page_cache_get(page);
2528
}
2529
 
2530
/*
2531
 * Create the page-cache page that contains the requested block
2532
 */
2533
static struct page * grow_dev_page(struct block_device *bdev, unsigned long index, int size)
2534
{
2535
        struct page * page;
2536
        struct buffer_head *bh;
2537
 
2538
        page = find_or_create_page(bdev->bd_inode->i_mapping, index, GFP_NOFS);
2539
        if (!page)
2540
                return NULL;
2541
 
2542
        if (!PageLocked(page))
2543
                BUG();
2544
 
2545
        bh = page->buffers;
2546
        if (bh) {
2547
                if (bh->b_size == size)
2548
                        return page;
2549
                if (!try_to_free_buffers(page, GFP_NOFS))
2550
                        goto failed;
2551
        }
2552
 
2553
        bh = create_buffers(page, size, 0);
2554
        if (!bh)
2555
                goto failed;
2556
        link_dev_buffers(page, bh);
2557
        return page;
2558
 
2559
failed:
2560
        UnlockPage(page);
2561
        page_cache_release(page);
2562
        return NULL;
2563
}
2564
 
2565
static void hash_page_buffers(struct page *page, kdev_t dev, int block, int size)
2566
{
2567
        struct buffer_head *head = page->buffers;
2568
        struct buffer_head *bh = head;
2569
        unsigned int uptodate;
2570
 
2571
        uptodate = 1 << BH_Mapped;
2572
        if (Page_Uptodate(page))
2573
                uptodate |= 1 << BH_Uptodate;
2574
 
2575
        write_lock(&hash_table_lock);
2576
        do {
2577
                if (!(bh->b_state & (1 << BH_Mapped))) {
2578
                        init_buffer(bh, NULL, NULL);
2579
                        bh->b_dev = dev;
2580
                        bh->b_blocknr = block;
2581
                        bh->b_state = uptodate;
2582
                }
2583
 
2584
                /* Insert the buffer into the hash lists if necessary */
2585
                if (!bh->b_pprev)
2586
                        __insert_into_hash_list(bh);
2587
 
2588
                block++;
2589
                bh = bh->b_this_page;
2590
        } while (bh != head);
2591
        write_unlock(&hash_table_lock);
2592
}
2593
 
2594
/*
2595
 * Try to increase the number of buffers available: the size argument
2596
 * is used to determine what kind of buffers we want.
2597
 */
2598
static int grow_buffers(kdev_t dev, unsigned long block, int size)
2599
{
2600
        struct page * page;
2601
        struct block_device *bdev;
2602
        unsigned long index;
2603
        int sizebits;
2604
 
2605
        /* Size must be multiple of hard sectorsize */
2606
        if (size & (get_hardsect_size(dev)-1))
2607
                BUG();
2608
        /* Size must be between 512 bytes and PAGE_SIZE */
2609
        if (size < 512 || size > PAGE_SIZE)
2610
                BUG();
2611
 
2612
        sizebits = -1;
2613
        do {
2614
                sizebits++;
2615
        } while ((size << sizebits) < PAGE_SIZE);
2616
 
2617
        index = block >> sizebits;
2618
        block = index << sizebits;
2619
 
2620
        bdev = bdget(kdev_t_to_nr(dev));
2621
        if (!bdev) {
2622
                printk("No block device for %s\n", kdevname(dev));
2623
                BUG();
2624
        }
2625
 
2626
        /* Create a page with the proper size buffers.. */
2627
        page = grow_dev_page(bdev, index, size);
2628
 
2629
        /* This is "wrong" - talk to Al Viro */
2630
        atomic_dec(&bdev->bd_count);
2631
        if (!page)
2632
                return 0;
2633
 
2634
        /* Hash in the buffers on the hash list */
2635
        hash_page_buffers(page, dev, block, size);
2636
        UnlockPage(page);
2637
        page_cache_release(page);
2638
 
2639
        /* We hashed up this page, so increment buffermem */
2640
        atomic_inc(&buffermem_pages);
2641
        return 1;
2642
}
2643
 
2644
/*
2645
 * The first time the VM inspects a page which has locked buffers, it
2646
 * will just mark it as needing to be waited upon during the scan of the page LRU.
2647
 * BH_Wait_IO is used for this.
2648
 *
2649
 * The second time the VM visits the page, if it still has locked
2650
 * buffers, it is time to start writing them out.  (BH_Wait_IO was set).
2651
 *
2652
 * The third time the VM visits the page, if the I/O hasn't completed
2653
 * then it's time to wait upon writeout.  BH_Lock and BH_Launder are
2654
 * used for this.
2655
 *
2656
 * There is also the case of buffers which were locked by someone else
2657
 * - write(2) callers, bdflush, etc.  There can be a huge number of these
2658
 * and we don't want to just skip them all and fail the page allocation.
2659
 * We want to be able to wait on these buffers as well.
2660
 *
2661
 * The BH_Launder bit is set in submit_bh() to indicate that I/O is
2662
 * underway against the buffer, doesn't matter who started it - we know
2663
 * that the buffer will eventually come unlocked, and so it's safe to
2664
 * wait on it.
2665
 *
2666
 * The caller holds the page lock and the caller will free this page
2667
 * into current->local_page, so by waiting on the page's buffers the
2668
 * caller is guaranteed to obtain this page.
2669
 *
2670
 * sync_page_buffers() will sort-of return true if all the buffers
2671
 * against this page are freeable, so try_to_free_buffers() should
2672
 * try to free the page's buffers a second time.  This is a bit
2673
 * broken for blocksize < PAGE_CACHE_SIZE, but not very importantly.
2674
 */
2675
static int sync_page_buffers(struct buffer_head *head)
2676
{
2677
        struct buffer_head * bh = head;
2678
        int tryagain = 1;
2679
 
2680
        do {
2681
                if (!buffer_dirty(bh) && !buffer_locked(bh))
2682
                        continue;
2683
 
2684
                /* Don't start IO first time around.. */
2685
                if (!test_and_set_bit(BH_Wait_IO, &bh->b_state)) {
2686
                        tryagain = 0;
2687
                        continue;
2688
                }
2689
 
2690
                /* Second time through we start actively writing out.. */
2691
                if (test_and_set_bit(BH_Lock, &bh->b_state)) {
2692
                        if (unlikely(!buffer_launder(bh))) {
2693
                                tryagain = 0;
2694
                                continue;
2695
                        }
2696
                        wait_on_buffer(bh);
2697
                        tryagain = 1;
2698
                        continue;
2699
                }
2700
 
2701
                if (!atomic_set_buffer_clean(bh)) {
2702
                        unlock_buffer(bh);
2703
                        continue;
2704
                }
2705
 
2706
                __mark_buffer_clean(bh);
2707
                get_bh(bh);
2708
                bh->b_end_io = end_buffer_io_sync;
2709
                submit_bh(WRITE, bh);
2710
                tryagain = 0;
2711
        } while ((bh = bh->b_this_page) != head);
2712
 
2713
        return tryagain;
2714
}
2715
 
2716
/*
2717
 * Can the buffer be thrown out?
2718
 */
2719
#define BUFFER_BUSY_BITS        ((1<<BH_Dirty) | (1<<BH_Lock))
2720
#define buffer_busy(bh)         (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
2721
 
2722
/*
2723
 * try_to_free_buffers() checks if all the buffers on this particular page
2724
 * are unused, and frees the page if so.
2725
 *
2726
 * Wake up bdflush() if this fails - if we're running low on memory due
2727
 * to dirty buffers, we need to flush them out as quickly as possible.
2728
 *
2729
 * NOTE: There are quite a number of ways that threads of control can
2730
 *       obtain a reference to a buffer head within a page.  So we must
2731
 *       lock out all of these paths to cleanly toss the page.
2732
 */
2733
int try_to_free_buffers(struct page * page, unsigned int gfp_mask)
2734
{
2735
        struct buffer_head * tmp, * bh = page->buffers;
2736
 
2737
cleaned_buffers_try_again:
2738
        spin_lock(&lru_list_lock);
2739
        write_lock(&hash_table_lock);
2740
        tmp = bh;
2741
        do {
2742
                if (buffer_busy(tmp))
2743
                        goto busy_buffer_page;
2744
                tmp = tmp->b_this_page;
2745
        } while (tmp != bh);
2746
 
2747
        spin_lock(&unused_list_lock);
2748
        tmp = bh;
2749
 
2750
        /* if this buffer was hashed, this page counts as buffermem */
2751
        if (bh->b_pprev)
2752
                atomic_dec(&buffermem_pages);
2753
        do {
2754
                struct buffer_head * p = tmp;
2755
                tmp = tmp->b_this_page;
2756
 
2757
                if (p->b_dev == B_FREE) BUG();
2758
 
2759
                remove_inode_queue(p);
2760
                __remove_from_queues(p);
2761
                __put_unused_buffer_head(p);
2762
        } while (tmp != bh);
2763
        spin_unlock(&unused_list_lock);
2764
 
2765
        /* Wake up anyone waiting for buffer heads */
2766
        wake_up(&buffer_wait);
2767
 
2768
        /* And free the page */
2769
        page->buffers = NULL;
2770
        page_cache_release(page);
2771
        write_unlock(&hash_table_lock);
2772
        spin_unlock(&lru_list_lock);
2773
        return 1;
2774
 
2775
busy_buffer_page:
2776
        /* Uhhuh, start writeback so that we don't end up with all dirty pages */
2777
        write_unlock(&hash_table_lock);
2778
        spin_unlock(&lru_list_lock);
2779
        gfp_mask = pf_gfp_mask(gfp_mask);
2780
        if (gfp_mask & __GFP_IO) {
2781
                if ((gfp_mask & __GFP_HIGHIO) || !PageHighMem(page)) {
2782
                        if (sync_page_buffers(bh)) {
2783
                                /* no IO or waiting next time */
2784
                                gfp_mask = 0;
2785
                                goto cleaned_buffers_try_again;
2786
                        }
2787
                }
2788
        }
2789
        if (balance_dirty_state() >= 0)
2790
                wakeup_bdflush();
2791
        return 0;
2792
}
2793
EXPORT_SYMBOL(try_to_free_buffers);
2794
 
2795
/* ================== Debugging =================== */
2796
 
2797
void show_buffers(void)
2798
{
2799
#ifdef CONFIG_SMP
2800
        struct buffer_head * bh;
2801
        int delalloc = 0, found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
2802
        int nlist;
2803
        static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", };
2804
#endif
2805
 
2806
        printk("Buffer memory:   %6dkB\n",
2807
                atomic_read(&buffermem_pages) << (PAGE_SHIFT-10));
2808
 
2809
        printk("Cache memory:   %6ldkB\n",
2810
                (page_cache_size - atomic_read(&buffermem_pages)) << (PAGE_SHIFT-10));
2811
 
2812
#ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */
2813
        if (!spin_trylock(&lru_list_lock))
2814
                return;
2815
        for(nlist = 0; nlist < NR_LIST; nlist++) {
2816
                delalloc = found = locked = dirty = used = lastused = 0;
2817
                bh = lru_list[nlist];
2818
                if(!bh) continue;
2819
 
2820
                do {
2821
                        found++;
2822
                        if (buffer_locked(bh))
2823
                                locked++;
2824
                        if (buffer_dirty(bh))
2825
                                dirty++;
2826
                        if (buffer_delay(bh))
2827
                                delalloc++;
2828
                        if (atomic_read(&bh->b_count))
2829
                                used++, lastused = found;
2830
                        bh = bh->b_next_free;
2831
                } while (bh != lru_list[nlist]);
2832
                {
2833
                        int tmp = nr_buffers_type[nlist];
2834
                        if (found != tmp)
2835
                                printk("%9s: BUG -> found %d, reported %d\n",
2836
                                       buf_types[nlist], found, tmp);
2837
                }
2838
                printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), "
2839
                       "%d locked, %d dirty %d delay\n",
2840
                       buf_types[nlist], found, size_buffers_type[nlist]>>10,
2841
                       used, lastused, locked, dirty, delalloc);
2842
        }
2843
        spin_unlock(&lru_list_lock);
2844
#endif
2845
}
2846
 
2847
/* ===================== Init ======================= */
2848
 
2849
/*
2850
 * allocate the hash table and init the free list
2851
 * Use gfp() for the hash table to decrease TLB misses, use
2852
 * SLAB cache for buffer heads.
2853
 */
2854
void __init buffer_init(unsigned long mempages)
2855
{
2856
        int order, i;
2857
        unsigned int nr_hash;
2858
 
2859
        /* The buffer cache hash table is less important these days,
2860
         * trim it a bit.
2861
         */
2862
        mempages >>= 14;
2863
 
2864
        mempages *= sizeof(struct buffer_head *);
2865
 
2866
        for (order = 0; (1 << order) < mempages; order++)
2867
                ;
2868
 
2869
        /* try to allocate something until we get it or we're asking
2870
           for something that is really too small */
2871
 
2872
        do {
2873
                unsigned long tmp;
2874
 
2875
                nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
2876
                bh_hash_mask = (nr_hash - 1);
2877
 
2878
                tmp = nr_hash;
2879
                bh_hash_shift = 0;
2880
                while((tmp >>= 1UL) != 0UL)
2881
                        bh_hash_shift++;
2882
 
2883
                hash_table = (struct buffer_head **)
2884
                    __get_free_pages(GFP_ATOMIC, order);
2885
        } while (hash_table == NULL && --order > 0);
2886
        printk(KERN_INFO "Buffer cache hash table entries: %d (order: %d, %ld bytes)\n",
2887
               nr_hash, order, (PAGE_SIZE << order));
2888
 
2889
        if (!hash_table)
2890
                panic("Failed to allocate buffer hash table\n");
2891
 
2892
        /* Setup hash chains. */
2893
        for(i = 0; i < nr_hash; i++)
2894
                hash_table[i] = NULL;
2895
 
2896
        /* Setup lru lists. */
2897
        for(i = 0; i < NR_LIST; i++)
2898
                lru_list[i] = NULL;
2899
 
2900
}
2901
 
2902
 
2903
/* ====================== bdflush support =================== */
2904
 
2905
/* This is a simple kernel daemon, whose job it is to provide a dynamic
2906
 * response to dirty buffers.  Once this process is activated, we write back
2907
 * a limited number of buffers to the disks and then go back to sleep again.
2908
 */
2909
 
2910
DECLARE_WAIT_QUEUE_HEAD(bdflush_wait);
2911
 
2912
void wakeup_bdflush(void)
2913
{
2914
        wake_up_interruptible(&bdflush_wait);
2915
}
2916
 
2917
void wakeup_kupdate(void)
2918
{
2919
        if (waitqueue_active(&kupdate_wait))
2920
                wake_up(&kupdate_wait);
2921
}
2922
 
2923
/*
2924
 * Here we attempt to write back old buffers.  We also try to flush inodes
2925
 * and supers as well, since this function is essentially "update", and
2926
 * otherwise there would be no way of ensuring that these quantities ever
2927
 * get written back.  Ideally, we would have a timestamp on the inodes
2928
 * and superblocks so that we could write back only the old ones as well
2929
 */
2930
 
2931
static int sync_old_buffers(void)
2932
{
2933
        lock_kernel();
2934
        sync_unlocked_inodes();
2935
        sync_supers(0, 0);
2936
        unlock_kernel();
2937
 
2938
        for (;;) {
2939
                struct buffer_head *bh;
2940
 
2941
                spin_lock(&lru_list_lock);
2942
                bh = lru_list[BUF_DIRTY];
2943
                if (!bh)
2944
                        break;
2945
                if (time_before(jiffies, bh->b_flushtime) && !laptop_mode)
2946
                        break;
2947
                if (write_some_buffers(NODEV))
2948
                        continue;
2949
                return 0;
2950
        }
2951
        spin_unlock(&lru_list_lock);
2952
        return 0;
2953
}
2954
 
2955
int block_sync_page(struct page *page)
2956
{
2957
        run_task_queue(&tq_disk);
2958
        return 0;
2959
}
2960
 
2961
/* This is the interface to bdflush.  As we get more sophisticated, we can
2962
 * pass tuning parameters to this "process", to adjust how it behaves.
2963
 * We would want to verify each parameter, however, to make sure that it
2964
 * is reasonable. */
2965
 
2966
asmlinkage long sys_bdflush(int func, long data)
2967
{
2968
        if (!capable(CAP_SYS_ADMIN))
2969
                return -EPERM;
2970
 
2971
        if (func == 1) {
2972
                /* do_exit directly and let kupdate to do its work alone. */
2973
                do_exit(0);
2974
#if 0 /* left here as it's the only example of lazy-mm-stuff used from
2975
         a syscall that doesn't care about the current mm context. */
2976
                int error;
2977
                struct mm_struct *user_mm;
2978
 
2979
                /*
2980
                 * bdflush will spend all of its time in kernel-space,
2981
                 * without touching user-space, so we can switch it into
2982
                 * 'lazy TLB mode' to reduce the cost of context-switches
2983
                 * to and from bdflush.
2984
                 */
2985
                user_mm = start_lazy_tlb();
2986
                error = sync_old_buffers();
2987
                end_lazy_tlb(user_mm);
2988
                return error;
2989
#endif
2990
        }
2991
 
2992
        /* For func >= 2: even values read a parameter, odd values write it; the param index is (func-2)>>1 */
2993
        if (func >= 2) {
2994
                int i = (func-2) >> 1;
2995
                if (i >= 0 && i < N_PARAM) {
2996
                        if ((func & 1) == 0)
2997
                                return put_user(bdf_prm.data[i], (int*)data);
2998
 
2999
                        if (data >= bdflush_min[i] && data <= bdflush_max[i]) {
3000
                                bdf_prm.data[i] = data;
3001
                                return 0;
3002
                        }
3003
                }
3004
                return -EINVAL;
3005
        }
3006
 
3007
        /* Calling with func 0 used to launch the actual bdflush and then never
3008
         * return (unless explicitly killed). We return zero here to
3009
         * remain semi-compatible with present update(8) programs.
3010
         */
3011
        return 0;
3012
}
3013
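/*
 * Sketch of the func encoding used above, from the caller's side:
 * parameter i (0-based) is read with func == 2*i + 2 and written with
 * func == 2*i + 3.  The helpers are hypothetical.
 */
static inline int bdflush_read_func(int param)
{
        return 2 * param + 2;   /* even: put_user() of bdf_prm.data[param] */
}

static inline int bdflush_write_func(int param)
{
        return 2 * param + 3;   /* odd: store 'data' into bdf_prm.data[param] */
}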
 
3014
/*
3015
 * This is the actual bdflush daemon itself. It used to be started from
3016
 * the syscall above, but now we launch it ourselves internally with
3017
 * kernel_thread(...)  directly after the first thread in init/main.c
3018
 */
3019
int bdflush(void *startup)
3020
{
3021
        struct task_struct *tsk = current;
3022
 
3023
        /*
3024
         *      We have a bare-bones task_struct, and really should fill
3025
         *      in a few more things so "top" and /proc/2/{exe,root,cwd}
3026
         *      display semi-sane things. Not real crucial though...
3027
         */
3028
 
3029
        tsk->session = 1;
3030
        tsk->pgrp = 1;
3031
        strcpy(tsk->comm, "bdflush");
3032
 
3033
        /* avoid getting signals */
3034
        spin_lock_irq(&tsk->sigmask_lock);
3035
        flush_signals(tsk);
3036
        sigfillset(&tsk->blocked);
3037
        recalc_sigpending(tsk);
3038
        spin_unlock_irq(&tsk->sigmask_lock);
3039
 
3040
        complete((struct completion *)startup);
3041
 
3042
        /*
3043
         * FIXME: The ndirty logic here is wrong.  It's supposed to
3044
         * send bdflush back to sleep after writing ndirty buffers.
3045
         * In fact, the test is wrong so bdflush will in fact
3046
         * sleep when bdflush_stop() returns true.
3047
         *
3048
         * FIXME: If it proves useful to implement ndirty properly,
3049
         * then perhaps the value of ndirty should be scaled by the
3050
         * amount of memory in the machine.
3051
         */
3052
        for (;;) {
3053
                int ndirty = bdf_prm.b_un.ndirty;
3054
 
3055
                CHECK_EMERGENCY_SYNC
3056
 
3057
                while (ndirty > 0) {
                        spin_lock(&lru_list_lock);
                        if (!write_some_buffers(NODEV))
                                break;
                        ndirty -= NRSYNC;
                }
                if (ndirty > 0 || bdflush_stop())
                        interruptible_sleep_on(&bdflush_wait);
        }
}

/*
 * This is the kernel update daemon. It used to live in userspace,
 * but since it needs to run safely we want it to be unkillable by
 * mistake. You don't need to change your userspace configuration,
 * since the userspace `update` will do_exit(0) at the first sys_bdflush().
 */
int kupdate(void *startup)
{
        struct task_struct * tsk = current;
        int interval;

        tsk->session = 1;
        tsk->pgrp = 1;
        strcpy(tsk->comm, "kupdated");

        /* SIGSTOP and SIGCONT will stop and wake up kupdate
         * (see the sketch after this function). */
        spin_lock_irq(&tsk->sigmask_lock);
        sigfillset(&tsk->blocked);
        siginitsetinv(&current->blocked, sigmask(SIGCONT) | sigmask(SIGSTOP));
        recalc_sigpending(tsk);
        spin_unlock_irq(&tsk->sigmask_lock);

        complete((struct completion *)startup);

        for (;;) {
                DECLARE_WAITQUEUE(wait, tsk);

                add_wait_queue(&kupdate_wait, &wait);

                /* update interval */
                interval = bdf_prm.b_un.interval;
                if (interval) {
                        tsk->state = TASK_INTERRUPTIBLE;
                        schedule_timeout(interval);
                } else {
                        tsk->state = TASK_STOPPED;
                        schedule(); /* wait for SIGCONT */
                }
                remove_wait_queue(&kupdate_wait, &wait);
                /* check for sigstop */
                if (signal_pending(tsk)) {
                        int sig, stopped = 0;
                        struct siginfo info;

                        spin_lock_irq(&tsk->sigmask_lock);
                        sig = dequeue_signal(&current->blocked, &info);
                        if (sig == SIGSTOP)
                                stopped = 1;
                        spin_unlock_irq(&tsk->sigmask_lock);
                        if (stopped) {
                                tsk->state = TASK_STOPPED;
                                schedule(); /* wait for SIGCONT */
                        }
                }
#ifdef DEBUG
                printk(KERN_DEBUG "kupdate() activated...\n");
#endif
                sync_old_buffers();
                if (laptop_mode)
                        fsync_dev(NODEV);
                run_task_queue(&tq_disk);
        }
}
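
/*
 * Illustrative sketch (added commentary, not in the original source):
 * because only SIGSTOP and SIGCONT are left unblocked above, kupdated can
 * be paused and resumed from another (privileged) process.  Assuming its
 * pid has been looked up into a hypothetical kupdated_pid:
 *
 *      #include <signal.h>
 *
 *      kill(kupdated_pid, SIGSTOP);    // dequeued above, parks in TASK_STOPPED
 *      kill(kupdated_pid, SIGCONT);    // schedule() returns, the flush loop resumes
 *
 * Setting bdf_prm.b_un.interval to 0 through the sys_bdflush() parameter
 * interface has the same parking effect, as handled in the else branch of
 * the loop above.
 */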

static int __init bdflush_init(void)
{
        static struct completion startup __initdata = COMPLETION_INITIALIZER(startup);

        kernel_thread(bdflush, &startup, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
        wait_for_completion(&startup);
        kernel_thread(kupdate, &startup, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
        wait_for_completion(&startup);
        return 0;
}
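
/*
 * Added note (not in the original source): the single static __initdata
 * completion is safely reused for both threads because each
 * wait_for_completion() above only returns after the matching complete()
 * in the daemon has finished with it, leaving the completion count back
 * at zero; by the time init memory is discarded, neither daemon
 * references it any more.
 */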

module_init(bdflush_init)
