/*
   md.c : Multiple Devices driver for Linux
          Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   - persistent bitmap code
     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   You should have received a copy of the GNU General Public License
   (for example /usr/src/linux/COPYING); if not, write to the Free
   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/linkage.h>
#include <linux/raid/md.h>
#include <linux/raid/bitmap.h>
#include <linux/sysctl.h>
#include <linux/buffer_head.h> /* for invalidate_bdev */
#include <linux/poll.h>
#include <linux/mutex.h>
#include <linux/ctype.h>
#include <linux/freezer.h>

#include <linux/init.h>

#include <linux/file.h>

#ifdef CONFIG_KMOD
#include <linux/kmod.h>
#endif

#include <asm/unaligned.h>

#define MAJOR_NR MD_MAJOR
#define MD_DRIVER

/* 63 partitions with the alternate major number (mdp) */
#define MdpMinorShift 6

#define DEBUG 0
#define dprintk(x...) ((void)(DEBUG && printk(x)))


#ifndef MODULE
static void autostart_arrays (int part);
#endif

static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);

static void md_print_devices(void);

#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }

/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 1000 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
 * the RAID driver will use the maximum available bandwidth if the IO
 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 * speed limit - in case reconstruction slows down your system despite
 * idle IO detection.
 *
 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
 * or /sys/block/mdX/md/sync_speed_{min,max}
 */

static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
static inline int speed_min(mddev_t *mddev)
{
        return mddev->sync_speed_min ?
                mddev->sync_speed_min : sysctl_speed_limit_min;
}

static inline int speed_max(mddev_t *mddev)
{
        return mddev->sync_speed_max ?
                mddev->sync_speed_max : sysctl_speed_limit_max;
}
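
/*
 * Editor's note (illustrative, not part of the original source): the
 * limits above are runtime tunables.  A typical adjustment from a
 * shell might look like:
 *
 *     echo 5000   > /proc/sys/dev/raid/speed_limit_min
 *     echo 100000 > /sys/block/md0/md/sync_speed_max
 *
 * speed_min()/speed_max() prefer the per-array sysfs value and fall
 * back to the global sysctl default when the per-array value is 0.
 */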
 
static struct ctl_table_header *raid_table_header;

static ctl_table raid_table[] = {
        {
                .ctl_name       = DEV_RAID_SPEED_LIMIT_MIN,
                .procname       = "speed_limit_min",
                .data           = &sysctl_speed_limit_min,
                .maxlen         = sizeof(int),
                .mode           = S_IRUGO|S_IWUSR,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = DEV_RAID_SPEED_LIMIT_MAX,
                .procname       = "speed_limit_max",
                .data           = &sysctl_speed_limit_max,
                .maxlen         = sizeof(int),
                .mode           = S_IRUGO|S_IWUSR,
                .proc_handler   = &proc_dointvec,
        },
        { .ctl_name = 0 }
};

static ctl_table raid_dir_table[] = {
        {
                .ctl_name       = DEV_RAID,
                .procname       = "raid",
                .maxlen         = 0,
                .mode           = S_IRUGO|S_IXUGO,
                .child          = raid_table,
        },
        { .ctl_name = 0 }
};

static ctl_table raid_root_table[] = {
        {
                .ctl_name       = CTL_DEV,
                .procname       = "dev",
                .maxlen         = 0,
                .mode           = 0555,
                .child          = raid_dir_table,
        },
        { .ctl_name = 0 }
};

static struct block_device_operations md_fops;

static int start_readonly;

/*
 * We have a system wide 'event count' that is incremented
 * on any 'interesting' event, and readers of /proc/mdstat
 * can use 'poll' or 'select' to find out when the event
 * count increases.
 *
 * Events are:
 *  start array, stop array, error, add device, remove device,
 *  start build, activate spare
 */
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
static atomic_t md_event_count;
void md_new_event(mddev_t *mddev)
{
        atomic_inc(&md_event_count);
        wake_up(&md_event_waiters);
        sysfs_notify(&mddev->kobj, NULL, "sync_action");
}
EXPORT_SYMBOL_GPL(md_new_event);
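
/*
 * Editor's note (illustrative sketch, not part of the original
 * source): a userspace monitor can consume the event count by
 * sleeping on /proc/mdstat, e.g.
 *
 *     int fd = open("/proc/mdstat", O_RDONLY);
 *     struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLPRI };
 *     while (poll(&pfd, 1, -1) > 0) {
 *             lseek(fd, 0, SEEK_SET);
 *             // re-read and parse /proc/mdstat here
 *     }
 *
 * The exact revents bits depend on the mdstat poll implementation.
 */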

/* Alternate version that can be called from interrupts
 * when calling sysfs_notify isn't needed.
 */
static void md_new_event_inintr(mddev_t *mddev)
{
        atomic_inc(&md_event_count);
        wake_up(&md_event_waiters);
}

/*
 * Enables iteration over all existing md arrays.
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);


/*
 * iterates through all used mddevs in the system.
 * We take care to grab the all_mddevs_lock whenever navigating
 * the list, and to always hold a refcount when unlocked.
 * Any code which breaks out of this loop while still holding
 * a reference to the current mddev must mddev_put it.
 */
#define ITERATE_MDDEV(mddev,tmp)                                        \
                                                                        \
        for (({ spin_lock(&all_mddevs_lock);                            \
                tmp = all_mddevs.next;                                  \
                mddev = NULL;});                                        \
             ({ if (tmp != &all_mddevs)                                 \
                        mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
                spin_unlock(&all_mddevs_lock);                          \
                if (mddev) mddev_put(mddev);                            \
                mddev = list_entry(tmp, mddev_t, all_mddevs);           \
                tmp != &all_mddevs;});                                  \
             ({ spin_lock(&all_mddevs_lock);                            \
                tmp = tmp->next;})                                      \
                )
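
/*
 * Editor's note (illustrative, not part of the original source):
 * typical use of the iterator above; the current mddev is pinned by a
 * refcount for each iteration and released on the next pass:
 *
 *     mddev_t *mddev;
 *     struct list_head *tmp;
 *
 *     ITERATE_MDDEV(mddev, tmp) {
 *             // mddev may be used here without all_mddevs_lock held
 *     }
 */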


static int md_fail_request (struct request_queue *q, struct bio *bio)
{
        bio_io_error(bio);
        return 0;
}

static inline mddev_t *mddev_get(mddev_t *mddev)
{
        atomic_inc(&mddev->active);
        return mddev;
}

static void mddev_put(mddev_t *mddev)
{
        if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
                return;
        if (!mddev->raid_disks && list_empty(&mddev->disks)) {
                list_del(&mddev->all_mddevs);
                spin_unlock(&all_mddevs_lock);
                blk_cleanup_queue(mddev->queue);
                kobject_unregister(&mddev->kobj);
        } else
                spin_unlock(&all_mddevs_lock);
}
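
/*
 * Editor's note (not part of the original source): atomic_dec_and_lock()
 * above takes all_mddevs_lock only when the refcount is about to reach
 * zero, so the common-case put stays lock-free while the final put can
 * unlink and tear down the device under the lock.
 */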

static mddev_t * mddev_find(dev_t unit)
{
        mddev_t *mddev, *new = NULL;

 retry:
        spin_lock(&all_mddevs_lock);
        list_for_each_entry(mddev, &all_mddevs, all_mddevs)
                if (mddev->unit == unit) {
                        mddev_get(mddev);
                        spin_unlock(&all_mddevs_lock);
                        kfree(new);
                        return mddev;
                }

        if (new) {
                list_add(&new->all_mddevs, &all_mddevs);
                spin_unlock(&all_mddevs_lock);
                return new;
        }
        spin_unlock(&all_mddevs_lock);

        new = kzalloc(sizeof(*new), GFP_KERNEL);
        if (!new)
                return NULL;

        new->unit = unit;
        if (MAJOR(unit) == MD_MAJOR)
                new->md_minor = MINOR(unit);
        else
                new->md_minor = MINOR(unit) >> MdpMinorShift;

        mutex_init(&new->reconfig_mutex);
        INIT_LIST_HEAD(&new->disks);
        INIT_LIST_HEAD(&new->all_mddevs);
        init_timer(&new->safemode_timer);
        atomic_set(&new->active, 1);
        spin_lock_init(&new->write_lock);
        init_waitqueue_head(&new->sb_wait);
        new->reshape_position = MaxSector;

        new->queue = blk_alloc_queue(GFP_KERNEL);
        if (!new->queue) {
                kfree(new);
                return NULL;
        }
        set_bit(QUEUE_FLAG_CLUSTER, &new->queue->queue_flags);

        blk_queue_make_request(new->queue, md_fail_request);

        goto retry;
}
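
/*
 * Editor's note (not part of the original source): mddev_find() uses an
 * optimistic allocate-then-retry pattern: search under the lock; if the
 * unit is absent, drop the lock, kzalloc() a candidate, and "goto retry"
 * so the list is rechecked with the candidate either inserted or freed
 * if another thread raced in first.  The fresh queue is pointed at
 * md_fail_request() so I/O fails cleanly until a personality attaches.
 */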

static inline int mddev_lock(mddev_t * mddev)
{
        return mutex_lock_interruptible(&mddev->reconfig_mutex);
}

static inline int mddev_trylock(mddev_t * mddev)
{
        return mutex_trylock(&mddev->reconfig_mutex);
}

static inline void mddev_unlock(mddev_t * mddev)
{
        mutex_unlock(&mddev->reconfig_mutex);

        md_wakeup_thread(mddev->thread);
}

static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
{
        mdk_rdev_t * rdev;
        struct list_head *tmp;

        ITERATE_RDEV(mddev,rdev,tmp) {
                if (rdev->desc_nr == nr)
                        return rdev;
        }
        return NULL;
}

static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
{
        struct list_head *tmp;
        mdk_rdev_t *rdev;

        ITERATE_RDEV(mddev,rdev,tmp) {
                if (rdev->bdev->bd_dev == dev)
                        return rdev;
        }
        return NULL;
}

static struct mdk_personality *find_pers(int level, char *clevel)
{
        struct mdk_personality *pers;
        list_for_each_entry(pers, &pers_list, list) {
                if (level != LEVEL_NONE && pers->level == level)
                        return pers;
                if (strcmp(pers->name, clevel)==0)
                        return pers;
        }
        return NULL;
}

static inline sector_t calc_dev_sboffset(struct block_device *bdev)
{
        sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
        return MD_NEW_SIZE_BLOCKS(size);
}

static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size)
{
        sector_t size;

        size = rdev->sb_offset;

        if (chunk_size)
                size &= ~((sector_t)chunk_size/1024 - 1);
        return size;
}

static int alloc_disk_sb(mdk_rdev_t * rdev)
{
        if (rdev->sb_page)
                MD_BUG();

        rdev->sb_page = alloc_page(GFP_KERNEL);
        if (!rdev->sb_page) {
                printk(KERN_ALERT "md: out of memory.\n");
                return -EINVAL;
        }

        return 0;
}

static void free_disk_sb(mdk_rdev_t * rdev)
{
        if (rdev->sb_page) {
                put_page(rdev->sb_page);
                rdev->sb_loaded = 0;
                rdev->sb_page = NULL;
                rdev->sb_offset = 0;
                rdev->size = 0;
        }
}


static void super_written(struct bio *bio, int error)
{
        mdk_rdev_t *rdev = bio->bi_private;
        mddev_t *mddev = rdev->mddev;

        if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
                printk("md: super_written gets error=%d, uptodate=%d\n",
                       error, test_bit(BIO_UPTODATE, &bio->bi_flags));
                WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags));
                md_error(mddev, rdev);
        }

        if (atomic_dec_and_test(&mddev->pending_writes))
                wake_up(&mddev->sb_wait);
        bio_put(bio);
}

static void super_written_barrier(struct bio *bio, int error)
{
        struct bio *bio2 = bio->bi_private;
        mdk_rdev_t *rdev = bio2->bi_private;
        mddev_t *mddev = rdev->mddev;

        if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
            error == -EOPNOTSUPP) {
                unsigned long flags;
                /* barriers don't appear to be supported :-( */
                set_bit(BarriersNotsupp, &rdev->flags);
                mddev->barriers_work = 0;
                spin_lock_irqsave(&mddev->write_lock, flags);
                bio2->bi_next = mddev->biolist;
                mddev->biolist = bio2;
                spin_unlock_irqrestore(&mddev->write_lock, flags);
                wake_up(&mddev->sb_wait);
                bio_put(bio);
        } else {
                bio_put(bio2);
                bio->bi_private = rdev;
                super_written(bio, error);
        }
}

void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
                   sector_t sector, int size, struct page *page)
{
        /* write first size bytes of page to sector of rdev
         * Increment mddev->pending_writes before returning
         * and decrement it on completion, waking up sb_wait
         * if zero is reached.
         * If an error occurred, call md_error
         *
         * As we might need to resubmit the request if BIO_RW_BARRIER
         * causes ENOTSUPP, we allocate a spare bio...
         */
        struct bio *bio = bio_alloc(GFP_NOIO, 1);
        int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNC);

        bio->bi_bdev = rdev->bdev;
        bio->bi_sector = sector;
        bio_add_page(bio, page, size, 0);
        bio->bi_private = rdev;
        bio->bi_end_io = super_written;
        bio->bi_rw = rw;

        atomic_inc(&mddev->pending_writes);
        if (!test_bit(BarriersNotsupp, &rdev->flags)) {
                struct bio *rbio;
                rw |= (1<<BIO_RW_BARRIER);
                rbio = bio_clone(bio, GFP_NOIO);
                rbio->bi_private = bio;
                rbio->bi_end_io = super_written_barrier;
                submit_bio(rw, rbio);
        } else
                submit_bio(rw, bio);
}
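
/*
 * Editor's note (not part of the original source): barrier handling is
 * two-stage.  md_super_write() submits a clone with BIO_RW_BARRIER set;
 * if the device fails it with -EOPNOTSUPP, super_written_barrier()
 * parks the original bio on mddev->biolist, and md_super_wait() below
 * resubmits it without the barrier flag.
 */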

void md_super_wait(mddev_t *mddev)
{
        /* wait for all superblock writes that were scheduled to complete.
         * if any had to be retried (due to BARRIER problems), retry them
         */
        DEFINE_WAIT(wq);
        for(;;) {
                prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
                if (atomic_read(&mddev->pending_writes)==0)
                        break;
                while (mddev->biolist) {
                        struct bio *bio;
                        spin_lock_irq(&mddev->write_lock);
                        bio = mddev->biolist;
                        mddev->biolist = bio->bi_next;
                        bio->bi_next = NULL;
                        spin_unlock_irq(&mddev->write_lock);
                        submit_bio(bio->bi_rw, bio);
                }
                schedule();
        }
        finish_wait(&mddev->sb_wait, &wq);
}

static void bi_complete(struct bio *bio, int error)
{
        complete((struct completion*)bio->bi_private);
}

int sync_page_io(struct block_device *bdev, sector_t sector, int size,
                   struct page *page, int rw)
{
        struct bio *bio = bio_alloc(GFP_NOIO, 1);
        struct completion event;
        int ret;

        rw |= (1 << BIO_RW_SYNC);

        bio->bi_bdev = bdev;
        bio->bi_sector = sector;
        bio_add_page(bio, page, size, 0);
        init_completion(&event);
        bio->bi_private = &event;
        bio->bi_end_io = bi_complete;
        submit_bio(rw, bio);
        wait_for_completion(&event);

        ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
        bio_put(bio);
        return ret;
}
EXPORT_SYMBOL_GPL(sync_page_io);
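
/*
 * Editor's note (not part of the original source): sync_page_io() turns
 * an asynchronous bio into a blocking transfer by waiting on a
 * completion that bi_complete() fires from the bio end_io hook.
 * read_disk_sb() below is a typical caller.
 */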

static int read_disk_sb(mdk_rdev_t * rdev, int size)
{
        char b[BDEVNAME_SIZE];
        if (!rdev->sb_page) {
                MD_BUG();
                return -EINVAL;
        }
        if (rdev->sb_loaded)
                return 0;


        if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ))
                goto fail;
        rdev->sb_loaded = 1;
        return 0;

fail:
        printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
                bdevname(rdev->bdev,b));
        return -EINVAL;
}

static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
        if (    (sb1->set_uuid0 == sb2->set_uuid0) &&
                (sb1->set_uuid1 == sb2->set_uuid1) &&
                (sb1->set_uuid2 == sb2->set_uuid2) &&
                (sb1->set_uuid3 == sb2->set_uuid3))

                return 1;

        return 0;
}


static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
        int ret;
        mdp_super_t *tmp1, *tmp2;

        tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
        tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);

        if (!tmp1 || !tmp2) {
                ret = 0;
                printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
                goto abort;
        }

        *tmp1 = *sb1;
        *tmp2 = *sb2;

        /*
         * nr_disks is not constant
         */
        tmp1->nr_disks = 0;
        tmp2->nr_disks = 0;

        if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
                ret = 0;
        else
                ret = 1;

abort:
        kfree(tmp1);
        kfree(tmp2);
        return ret;
}


static u32 md_csum_fold(u32 csum)
{
        csum = (csum & 0xffff) + (csum >> 16);
        return (csum & 0xffff) + (csum >> 16);
}
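
/*
 * Editor's note (not part of the original source): md_csum_fold() adds
 * the two 16-bit halves twice so any carry from the first add is
 * absorbed.  Worked example: 0x1234abcd -> 0xabcd + 0x1234 = 0xbe01;
 * the second fold leaves 0xbe01 unchanged as there is no carry.
 */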

static unsigned int calc_sb_csum(mdp_super_t * sb)
{
        u64 newcsum = 0;
        u32 *sb32 = (u32*)sb;
        int i;
        unsigned int disk_csum, csum;

        disk_csum = sb->sb_csum;
        sb->sb_csum = 0;

        for (i = 0; i < MD_SB_BYTES/4 ; i++)
                newcsum += sb32[i];
        csum = (newcsum & 0xffffffff) + (newcsum>>32);


#ifdef CONFIG_ALPHA
        /* This used to use csum_partial, which was wrong for several
         * reasons including that different results are returned on
         * different architectures.  It isn't critical that we get exactly
         * the same return value as before (we always csum_fold before
         * testing, and that removes any differences).  However as we
         * know that csum_partial always returned a 16bit value on
         * alphas, do a fold to maximise conformity to previous behaviour.
         */
        sb->sb_csum = md_csum_fold(disk_csum);
#else
        sb->sb_csum = disk_csum;
#endif
        return csum;
}


/*
 * Handle superblock details.
 * We want to be able to handle multiple superblock formats
 * so we have a common interface to them all, and an array of
 * different handlers.
 * We rely on user-space to write the initial superblock, and support
 * reading and updating of superblocks.
 * Interface methods are:
 *   int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version)
 *      loads and validates a superblock on dev.
 *      if refdev != NULL, compare superblocks on both devices
 *    Return:
 *      0 - dev has a superblock that is compatible with refdev
 *      1 - dev has a superblock that is compatible and newer than refdev
 *          so dev should be used as the refdev in future
 *     -EINVAL superblock incompatible or invalid
 *     -othererror e.g. -EIO
 *
 *   int validate_super(mddev_t *mddev, mdk_rdev_t *dev)
 *      Verify that dev is acceptable into mddev.
 *       The first time, mddev->raid_disks will be 0, and data from
 *       dev should be merged in.  Subsequent calls check that dev
 *       is new enough.  Return 0 or -EINVAL
 *
 *   void sync_super(mddev_t *mddev, mdk_rdev_t *dev)
 *     Update the superblock for rdev with data in mddev
 *     This does not write to disc.
 *
 */

struct super_type  {
        char            *name;
        struct module   *owner;
        int             (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version);
        int             (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
        void            (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
};
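
/*
 * Editor's note (illustrative, not part of the original source):
 * struct super_type is a small ops table.  The super_types[] array
 * defined later in this file maps a major superblock version to its
 * handlers, so callers dispatch along the lines of
 *
 *     err = super_types[mddev->major_version].
 *             load_super(rdev, refdev, mddev->minor_version);
 */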

/*
 * load_super for 0.90.0
 */
static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
{
        char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
        mdp_super_t *sb;
        int ret;
        sector_t sb_offset;

        /*
         * Calculate the position of the superblock,
         * it's at the end of the disk.
         *
         * It also happens to be a multiple of 4Kb.
         */
        sb_offset = calc_dev_sboffset(rdev->bdev);
        rdev->sb_offset = sb_offset;

        ret = read_disk_sb(rdev, MD_SB_BYTES);
        if (ret) return ret;

        ret = -EINVAL;

        bdevname(rdev->bdev, b);
        sb = (mdp_super_t*)page_address(rdev->sb_page);

        if (sb->md_magic != MD_SB_MAGIC) {
                printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
                       b);
                goto abort;
        }

        if (sb->major_version != 0 ||
            sb->minor_version < 90 ||
            sb->minor_version > 91) {
                printk(KERN_WARNING "Bad version number %d.%d on %s\n",
                        sb->major_version, sb->minor_version,
                        b);
                goto abort;
        }

        if (sb->raid_disks <= 0)
                goto abort;

        if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
                printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
                        b);
                goto abort;
        }

        rdev->preferred_minor = sb->md_minor;
        rdev->data_offset = 0;
        rdev->sb_size = MD_SB_BYTES;

        if (sb->state & (1<<MD_SB_BITMAP_PRESENT)) {
                if (sb->level != 1 && sb->level != 4
                    && sb->level != 5 && sb->level != 6
                    && sb->level != 10) {
                        /* FIXME use a better test */
                        printk(KERN_WARNING
                               "md: bitmaps not supported for this level.\n");
                        goto abort;
                }
        }

        if (sb->level == LEVEL_MULTIPATH)
                rdev->desc_nr = -1;
        else
                rdev->desc_nr = sb->this_disk.number;

        if (refdev == 0)
                ret = 1;
        else {
                __u64 ev1, ev2;
                mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page);
                if (!uuid_equal(refsb, sb)) {
                        printk(KERN_WARNING "md: %s has different UUID to %s\n",
                                b, bdevname(refdev->bdev,b2));
                        goto abort;
                }
                if (!sb_equal(refsb, sb)) {
                        printk(KERN_WARNING "md: %s has same UUID"
                               " but different superblock to %s\n",
                               b, bdevname(refdev->bdev, b2));
                        goto abort;
                }
                ev1 = md_event(sb);
                ev2 = md_event(refsb);
                if (ev1 > ev2)
                        ret = 1;
                else
                        ret = 0;
        }
        rdev->size = calc_dev_size(rdev, sb->chunk_size);

        if (rdev->size < sb->size && sb->level > 1)
                /* "this cannot possibly happen" ... */
                ret = -EINVAL;

 abort:
        return ret;
}

/*
 * validate_super for 0.90.0
 */
static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
{
        mdp_disk_t *desc;
        mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
        __u64 ev1 = md_event(sb);

        rdev->raid_disk = -1;
        rdev->flags = 0;
        if (mddev->raid_disks == 0) {
                mddev->major_version = 0;
                mddev->minor_version = sb->minor_version;
                mddev->patch_version = sb->patch_version;
                mddev->persistent = ! sb->not_persistent;
                mddev->chunk_size = sb->chunk_size;
                mddev->ctime = sb->ctime;
                mddev->utime = sb->utime;
                mddev->level = sb->level;
                mddev->clevel[0] = 0;
                mddev->layout = sb->layout;
                mddev->raid_disks = sb->raid_disks;
                mddev->size = sb->size;
                mddev->events = ev1;
                mddev->bitmap_offset = 0;
                mddev->default_bitmap_offset = MD_SB_BYTES >> 9;

                if (mddev->minor_version >= 91) {
                        mddev->reshape_position = sb->reshape_position;
                        mddev->delta_disks = sb->delta_disks;
                        mddev->new_level = sb->new_level;
                        mddev->new_layout = sb->new_layout;
                        mddev->new_chunk = sb->new_chunk;
                } else {
                        mddev->reshape_position = MaxSector;
                        mddev->delta_disks = 0;
                        mddev->new_level = mddev->level;
                        mddev->new_layout = mddev->layout;
                        mddev->new_chunk = mddev->chunk_size;
                }

                if (sb->state & (1<<MD_SB_CLEAN))
                        mddev->recovery_cp = MaxSector;
                else {
                        if (sb->events_hi == sb->cp_events_hi &&
                                sb->events_lo == sb->cp_events_lo) {
                                mddev->recovery_cp = sb->recovery_cp;
                        } else
                                mddev->recovery_cp = 0;
                }

                memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
                memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
                memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
                memcpy(mddev->uuid+12,&sb->set_uuid3, 4);

                mddev->max_disks = MD_SB_DISKS;

                if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
                    mddev->bitmap_file == NULL)
                        mddev->bitmap_offset = mddev->default_bitmap_offset;

        } else if (mddev->pers == NULL) {
                /* Insist on good event counter while assembling */
                ++ev1;
                if (ev1 < mddev->events)
                        return -EINVAL;
        } else if (mddev->bitmap) {
                /* if adding to array with a bitmap, then we can accept an
                 * older device ... but not too old.
                 */
                if (ev1 < mddev->bitmap->events_cleared)
                        return 0;
        } else {
                if (ev1 < mddev->events)
                        /* just a hot-add of a new device, leave raid_disk at -1 */
                        return 0;
        }

        if (mddev->level != LEVEL_MULTIPATH) {
                desc = sb->disks + rdev->desc_nr;

                if (desc->state & (1<<MD_DISK_FAULTY))
                        set_bit(Faulty, &rdev->flags);
                else if (desc->state & (1<<MD_DISK_SYNC) /* &&
                            desc->raid_disk < mddev->raid_disks */) {
                        set_bit(In_sync, &rdev->flags);
                        rdev->raid_disk = desc->raid_disk;
                }
                if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
                        set_bit(WriteMostly, &rdev->flags);
        } else /* MULTIPATH are always insync */
                set_bit(In_sync, &rdev->flags);
        return 0;
}

/*
 * sync_super for 0.90.0
 */
static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
{
        mdp_super_t *sb;
        struct list_head *tmp;
        mdk_rdev_t *rdev2;
        int next_spare = mddev->raid_disks;


        /* make rdev->sb match mddev data..
         *
         * 1/ zero out disks
         * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
         * 3/ any empty disks < next_spare become removed
         *
         * disks[0] gets initialised to REMOVED because
         * we cannot be sure from other fields if it has
         * been initialised or not.
         */
        int i;
        int active=0, working=0,failed=0,spare=0,nr_disks=0;

        rdev->sb_size = MD_SB_BYTES;

        sb = (mdp_super_t*)page_address(rdev->sb_page);

        memset(sb, 0, sizeof(*sb));

        sb->md_magic = MD_SB_MAGIC;
        sb->major_version = mddev->major_version;
        sb->patch_version = mddev->patch_version;
        sb->gvalid_words  = 0; /* ignored */
        memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
        memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
        memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
        memcpy(&sb->set_uuid3, mddev->uuid+12,4);

        sb->ctime = mddev->ctime;
        sb->level = mddev->level;
        sb->size  = mddev->size;
        sb->raid_disks = mddev->raid_disks;
        sb->md_minor = mddev->md_minor;
        sb->not_persistent = !mddev->persistent;
        sb->utime = mddev->utime;
        sb->state = 0;
        sb->events_hi = (mddev->events>>32);
        sb->events_lo = (u32)mddev->events;

        if (mddev->reshape_position == MaxSector)
                sb->minor_version = 90;
        else {
                sb->minor_version = 91;
                sb->reshape_position = mddev->reshape_position;
                sb->new_level = mddev->new_level;
                sb->delta_disks = mddev->delta_disks;
                sb->new_layout = mddev->new_layout;
                sb->new_chunk = mddev->new_chunk;
        }
        mddev->minor_version = sb->minor_version;
        if (mddev->in_sync)
        {
                sb->recovery_cp = mddev->recovery_cp;
                sb->cp_events_hi = (mddev->events>>32);
                sb->cp_events_lo = (u32)mddev->events;
                if (mddev->recovery_cp == MaxSector)
                        sb->state = (1<< MD_SB_CLEAN);
        } else
                sb->recovery_cp = 0;

        sb->layout = mddev->layout;
        sb->chunk_size = mddev->chunk_size;

        if (mddev->bitmap && mddev->bitmap_file == NULL)
                sb->state |= (1<<MD_SB_BITMAP_PRESENT);

        sb->disks[0].state = (1<<MD_DISK_REMOVED);
        ITERATE_RDEV(mddev,rdev2,tmp) {
                mdp_disk_t *d;
                int desc_nr;
                if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
                    && !test_bit(Faulty, &rdev2->flags))
                        desc_nr = rdev2->raid_disk;
                else
                        desc_nr = next_spare++;
                rdev2->desc_nr = desc_nr;
                d = &sb->disks[rdev2->desc_nr];
                nr_disks++;
                d->number = rdev2->desc_nr;
                d->major = MAJOR(rdev2->bdev->bd_dev);
                d->minor = MINOR(rdev2->bdev->bd_dev);
                if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
                    && !test_bit(Faulty, &rdev2->flags))
                        d->raid_disk = rdev2->raid_disk;
                else
                        d->raid_disk = rdev2->desc_nr; /* compatibility */
                if (test_bit(Faulty, &rdev2->flags))
                        d->state = (1<<MD_DISK_FAULTY);
                else if (test_bit(In_sync, &rdev2->flags)) {
                        d->state = (1<<MD_DISK_ACTIVE);
                        d->state |= (1<<MD_DISK_SYNC);
                        active++;
                        working++;
                } else {
                        d->state = 0;
                        spare++;
                        working++;
                }
                if (test_bit(WriteMostly, &rdev2->flags))
                        d->state |= (1<<MD_DISK_WRITEMOSTLY);
        }
        /* now set the "removed" and "faulty" bits on any missing devices */
        for (i=0 ; i < mddev->raid_disks ; i++) {
                mdp_disk_t *d = &sb->disks[i];
                if (d->state == 0 && d->number == 0) {
                        d->number = i;
                        d->raid_disk = i;
                        d->state = (1<<MD_DISK_REMOVED);
                        d->state |= (1<<MD_DISK_FAULTY);
                        failed++;
                }
        }
        sb->nr_disks = nr_disks;
        sb->active_disks = active;
        sb->working_disks = working;
        sb->failed_disks = failed;
        sb->spare_disks = spare;

        sb->this_disk = sb->disks[rdev->desc_nr];
        sb->sb_csum = calc_sb_csum(sb);
}

/*
 * version 1 superblock
 */

static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb)
{
        __le32 disk_csum;
        u32 csum;
        unsigned long long newcsum;
        int size = 256 + le32_to_cpu(sb->max_dev)*2;
        __le32 *isuper = (__le32*)sb;
        int i;

        disk_csum = sb->sb_csum;
        sb->sb_csum = 0;
        newcsum = 0;
        for (i=0; size>=4; size -= 4 )
                newcsum += le32_to_cpu(*isuper++);

        if (size == 2)
                newcsum += le16_to_cpu(*(__le16*) isuper);

        csum = (newcsum & 0xffffffff) + (newcsum >> 32);
        sb->sb_csum = disk_csum;
        return cpu_to_le32(csum);
}
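
/*
 * Editor's note (not part of the original source): a version-1
 * superblock is 256 bytes of fixed header plus two bytes per entry in
 * dev_roles[], hence size = 256 + max_dev*2 above.  The "size == 2"
 * check folds in a final 16-bit role when the total is not a multiple
 * of four.
 */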

static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
{
        struct mdp_superblock_1 *sb;
        int ret;
        sector_t sb_offset;
        char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
        int bmask;

        /*
         * Calculate the position of the superblock.
         * It is always aligned to a 4K boundary and
         * depending on minor_version, it can be:
         * 0: At least 8K, but less than 12K, from end of device
         * 1: At start of device
         * 2: 4K from start of device.
         */
        switch(minor_version) {
        case 0:
                sb_offset = rdev->bdev->bd_inode->i_size >> 9;
                sb_offset -= 8*2;
                sb_offset &= ~(sector_t)(4*2-1);
                /* convert from sectors to K */
                sb_offset /= 2;
                break;
        case 1:
                sb_offset = 0;
                break;
        case 2:
                sb_offset = 4;
                break;
        default:
                return -EINVAL;
        }
        rdev->sb_offset = sb_offset;

        /* superblock is rarely larger than 1K, but it can be larger,
         * and it is safe to read 4k, so we do that
         */
        ret = read_disk_sb(rdev, 4096);
        if (ret) return ret;


        sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);

        if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
            sb->major_version != cpu_to_le32(1) ||
            le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
            le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) ||
            (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
                return -EINVAL;

        if (calc_sb_1_csum(sb) != sb->sb_csum) {
                printk("md: invalid superblock checksum on %s\n",
                        bdevname(rdev->bdev,b));
                return -EINVAL;
        }
        if (le64_to_cpu(sb->data_size) < 10) {
                printk("md: data_size too small on %s\n",
                       bdevname(rdev->bdev,b));
                return -EINVAL;
        }
        if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET)) {
                if (sb->level != cpu_to_le32(1) &&
                    sb->level != cpu_to_le32(4) &&
                    sb->level != cpu_to_le32(5) &&
                    sb->level != cpu_to_le32(6) &&
                    sb->level != cpu_to_le32(10)) {
                        printk(KERN_WARNING
                               "md: bitmaps not supported for this level.\n");
                        return -EINVAL;
                }
        }

        rdev->preferred_minor = 0xffff;
        rdev->data_offset = le64_to_cpu(sb->data_offset);
        atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));

        rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
        bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1;
        if (rdev->sb_size & bmask)
                rdev->sb_size = (rdev->sb_size | bmask)+1;

        if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
                rdev->desc_nr = -1;
        else
                rdev->desc_nr = le32_to_cpu(sb->dev_number);

        if (refdev == 0)
                ret = 1;
        else {
                __u64 ev1, ev2;
                struct mdp_superblock_1 *refsb =
                        (struct mdp_superblock_1*)page_address(refdev->sb_page);

                if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
                    sb->level != refsb->level ||
                    sb->layout != refsb->layout ||
                    sb->chunksize != refsb->chunksize) {
                        printk(KERN_WARNING "md: %s has strangely different"
                                " superblock to %s\n",
                                bdevname(rdev->bdev,b),
                                bdevname(refdev->bdev,b2));
                        return -EINVAL;
                }
                ev1 = le64_to_cpu(sb->events);
                ev2 = le64_to_cpu(refsb->events);

                if (ev1 > ev2)
                        ret = 1;
                else
                        ret = 0;
        }
        if (minor_version)
                rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
        else
                rdev->size = rdev->sb_offset;
        if (rdev->size < le64_to_cpu(sb->data_size)/2)
                return -EINVAL;
        rdev->size = le64_to_cpu(sb->data_size)/2;
        if (le32_to_cpu(sb->chunksize))
                rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);

        if (le64_to_cpu(sb->size) > rdev->size*2)
                return -EINVAL;
        return ret;
}

static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
{
        struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
        __u64 ev1 = le64_to_cpu(sb->events);

        rdev->raid_disk = -1;
        rdev->flags = 0;
        if (mddev->raid_disks == 0) {
                mddev->major_version = 1;
                mddev->patch_version = 0;
                mddev->persistent = 1;
                mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9;
                mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
                mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
                mddev->level = le32_to_cpu(sb->level);
                mddev->clevel[0] = 0;
                mddev->layout = le32_to_cpu(sb->layout);
                mddev->raid_disks = le32_to_cpu(sb->raid_disks);
                mddev->size = le64_to_cpu(sb->size)/2;
                mddev->events = ev1;
                mddev->bitmap_offset = 0;
                mddev->default_bitmap_offset = 1024 >> 9;

                mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
                memcpy(mddev->uuid, sb->set_uuid, 16);

                mddev->max_disks =  (4096-256)/2;

                if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
                    mddev->bitmap_file == NULL )
                        mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);

                if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
                        mddev->reshape_position = le64_to_cpu(sb->reshape_position);
                        mddev->delta_disks = le32_to_cpu(sb->delta_disks);
                        mddev->new_level = le32_to_cpu(sb->new_level);
                        mddev->new_layout = le32_to_cpu(sb->new_layout);
                        mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9;
                } else {
                        mddev->reshape_position = MaxSector;
                        mddev->delta_disks = 0;
                        mddev->new_level = mddev->level;
                        mddev->new_layout = mddev->layout;
                        mddev->new_chunk = mddev->chunk_size;
                }

        } else if (mddev->pers == NULL) {
                /* Insist on good event counter while assembling */
                ++ev1;
                if (ev1 < mddev->events)
                        return -EINVAL;
        } else if (mddev->bitmap) {
                /* If adding to array with a bitmap, then we can accept an
                 * older device, but not too old.
                 */
                if (ev1 < mddev->bitmap->events_cleared)
                        return 0;
        } else {
                if (ev1 < mddev->events)
                        /* just a hot-add of a new device, leave raid_disk at -1 */
                        return 0;
        }
        if (mddev->level != LEVEL_MULTIPATH) {
                int role;
                role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
                switch(role) {
                case 0xffff: /* spare */
                        break;
                case 0xfffe: /* faulty */
                        set_bit(Faulty, &rdev->flags);
                        break;
                default:
                        if ((le32_to_cpu(sb->feature_map) &
                             MD_FEATURE_RECOVERY_OFFSET))
                                rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
                        else
                                set_bit(In_sync, &rdev->flags);
                        rdev->raid_disk = role;
                        break;
                }
                if (sb->devflags & WriteMostly1)
                        set_bit(WriteMostly, &rdev->flags);
        } else /* MULTIPATH are always insync */
                set_bit(In_sync, &rdev->flags);

        return 0;
}

static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
{
        struct mdp_superblock_1 *sb;
        struct list_head *tmp;
        mdk_rdev_t *rdev2;
        int max_dev, i;
        /* make rdev->sb match mddev and rdev data. */

        sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);

        sb->feature_map = 0;
        sb->pad0 = 0;
        sb->recovery_offset = cpu_to_le64(0);
        memset(sb->pad1, 0, sizeof(sb->pad1));
        memset(sb->pad2, 0, sizeof(sb->pad2));
        memset(sb->pad3, 0, sizeof(sb->pad3));

        sb->utime = cpu_to_le64((__u64)mddev->utime);
        sb->events = cpu_to_le64(mddev->events);
        if (mddev->in_sync)
                sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
        else
                sb->resync_offset = cpu_to_le64(0);

        sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));

        sb->raid_disks = cpu_to_le32(mddev->raid_disks);
        sb->size = cpu_to_le64(mddev->size<<1);

        if (mddev->bitmap && mddev->bitmap_file == NULL) {
                sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
                sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
        }

        if (rdev->raid_disk >= 0 &&
            !test_bit(In_sync, &rdev->flags) &&
            rdev->recovery_offset > 0) {
                sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
                sb->recovery_offset = cpu_to_le64(rdev->recovery_offset);
        }

        if (mddev->reshape_position != MaxSector) {
                sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
                sb->reshape_position = cpu_to_le64(mddev->reshape_position);
                sb->new_layout = cpu_to_le32(mddev->new_layout);
                sb->delta_disks = cpu_to_le32(mddev->delta_disks);
                sb->new_level = cpu_to_le32(mddev->new_level);
                sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9);
        }

        max_dev = 0;
        ITERATE_RDEV(mddev,rdev2,tmp)
                if (rdev2->desc_nr+1 > max_dev)
                        max_dev = rdev2->desc_nr+1;

        if (max_dev > le32_to_cpu(sb->max_dev))
                sb->max_dev = cpu_to_le32(max_dev);
        for (i=0; i<max_dev;i++)
                sb->dev_roles[i] = cpu_to_le16(0xfffe);

        ITERATE_RDEV(mddev,rdev2,tmp) {
                i = rdev2->desc_nr;
                if (test_bit(Faulty, &rdev2->flags))
                        sb->dev_roles[i] = cpu_to_le16(0xfffe);
                else if (test_bit(In_sync, &rdev2->flags))
                        sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
                else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0)
                        sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
                else
                        sb->dev_roles[i] = cpu_to_le16(0xffff);
        }

        sb->sb_csum = calc_sb_1_csum(sb);
}


static struct super_type super_types[] = {
        [0] = {
                .name   = "0.90.0",
                .owner  = THIS_MODULE,
                .load_super     = super_90_load,
                .validate_super = super_90_validate,
                .sync_super     = super_90_sync,
        },
        [1] = {
                .name   = "md-1",
                .owner  = THIS_MODULE,
                .load_super     = super_1_load,
                .validate_super = super_1_validate,
                .sync_super     = super_1_sync,
        },
};

static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
{
        struct list_head *tmp, *tmp2;
        mdk_rdev_t *rdev, *rdev2;

        ITERATE_RDEV(mddev1,rdev,tmp)
                ITERATE_RDEV(mddev2, rdev2, tmp2)
                        if (rdev->bdev->bd_contains ==
                            rdev2->bdev->bd_contains)
                                return 1;

        return 0;
}

static LIST_HEAD(pending_raid_disks);

static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
{
        char b[BDEVNAME_SIZE];
        struct kobject *ko;
        char *s;
        int err;

        if (rdev->mddev) {
                MD_BUG();
                return -EINVAL;
        }
        /* make sure rdev->size exceeds mddev->size */
        if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) {
                if (mddev->pers) {
                        /* Cannot change size, so fail
                         * If mddev->level <= 0, then we don't care
                         * about aligning sizes (e.g. linear)
                         */
                        if (mddev->level > 0)
                                return -ENOSPC;
                } else
                        mddev->size = rdev->size;
        }

        /* Verify rdev->desc_nr is unique.
         * If it is -1, assign a free number, else
         * check number is not in use
         */
        if (rdev->desc_nr < 0) {
                int choice = 0;
                if (mddev->pers) choice = mddev->raid_disks;
                while (find_rdev_nr(mddev, choice))
                        choice++;
                rdev->desc_nr = choice;
        } else {
                if (find_rdev_nr(mddev, rdev->desc_nr))
                        return -EBUSY;
        }
        bdevname(rdev->bdev,b);
        if (kobject_set_name(&rdev->kobj, "dev-%s", b) < 0)
                return -ENOMEM;
        while ( (s=strchr(rdev->kobj.k_name, '/')) != NULL)
                *s = '!';

        rdev->mddev = mddev;
        printk(KERN_INFO "md: bind<%s>\n", b);

        rdev->kobj.parent = &mddev->kobj;
        if ((err = kobject_add(&rdev->kobj)))
                goto fail;

        if (rdev->bdev->bd_part)
                ko = &rdev->bdev->bd_part->kobj;
        else
                ko = &rdev->bdev->bd_disk->kobj;
        if ((err = sysfs_create_link(&rdev->kobj, ko, "block"))) {
                kobject_del(&rdev->kobj);
                goto fail;
        }
        list_add(&rdev->same_set, &mddev->disks);
        bd_claim_by_disk(rdev->bdev, rdev, mddev->gendisk);
        return 0;

 fail:
        printk(KERN_WARNING "md: failed to register dev-%s for %s\n",
               b, mdname(mddev));
        return err;
}

static void delayed_delete(struct work_struct *ws)
{
        mdk_rdev_t *rdev = container_of(ws, mdk_rdev_t, del_work);
        kobject_del(&rdev->kobj);
}

static void unbind_rdev_from_array(mdk_rdev_t * rdev)
{
        char b[BDEVNAME_SIZE];
        if (!rdev->mddev) {
                MD_BUG();
                return;
        }
        bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
        list_del_init(&rdev->same_set);
        printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
        rdev->mddev = NULL;
        sysfs_remove_link(&rdev->kobj, "block");

        /* We need to delay this, otherwise we can deadlock when
         * writing to 'remove' to "dev/state"
         */
        INIT_WORK(&rdev->del_work, delayed_delete);
        schedule_work(&rdev->del_work);
}

/*
 * prevent the device from being mounted, repartitioned or
 * otherwise reused by a RAID array (or any other kernel
 * subsystem), by bd_claiming the device.
 */
static int lock_rdev(mdk_rdev_t *rdev, dev_t dev)
{
        int err = 0;
        struct block_device *bdev;
        char b[BDEVNAME_SIZE];

        bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
        if (IS_ERR(bdev)) {
                printk(KERN_ERR "md: could not open %s.\n",
                        __bdevname(dev, b));
                return PTR_ERR(bdev);
        }
        err = bd_claim(bdev, rdev);
        if (err) {
                printk(KERN_ERR "md: could not bd_claim %s.\n",
                        bdevname(bdev, b));
                blkdev_put(bdev);
                return err;
        }
        rdev->bdev = bdev;
        return err;
}

static void unlock_rdev(mdk_rdev_t *rdev)
{
        struct block_device *bdev = rdev->bdev;
        rdev->bdev = NULL;
        if (!bdev)
                MD_BUG();
        bd_release(bdev);
        blkdev_put(bdev);
}

void md_autodetect_dev(dev_t dev);

static void export_rdev(mdk_rdev_t * rdev)
{
        char b[BDEVNAME_SIZE];
        printk(KERN_INFO "md: export_rdev(%s)\n",
                bdevname(rdev->bdev,b));
        if (rdev->mddev)
                MD_BUG();
        free_disk_sb(rdev);
        list_del_init(&rdev->same_set);
#ifndef MODULE
        md_autodetect_dev(rdev->bdev->bd_dev);
#endif
        unlock_rdev(rdev);
        kobject_put(&rdev->kobj);
}

static void kick_rdev_from_array(mdk_rdev_t * rdev)
{
        unbind_rdev_from_array(rdev);
        export_rdev(rdev);
}

static void export_array(mddev_t *mddev)
{
        struct list_head *tmp;
        mdk_rdev_t *rdev;

        ITERATE_RDEV(mddev,rdev,tmp) {
                if (!rdev->mddev) {
                        MD_BUG();
                        continue;
                }
                kick_rdev_from_array(rdev);
        }
        if (!list_empty(&mddev->disks))
                MD_BUG();
        mddev->raid_disks = 0;
        mddev->major_version = 0;
}

static void print_desc(mdp_disk_t *desc)
{
        printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number,
                desc->major,desc->minor,desc->raid_disk,desc->state);
}

static void print_sb(mdp_super_t *sb)
{
        int i;

        printk(KERN_INFO
                "md:  SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
                sb->major_version, sb->minor_version, sb->patch_version,
                sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
                sb->ctime);
        printk(KERN_INFO "md:     L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n",
                sb->level, sb->size, sb->nr_disks, sb->raid_disks,
                sb->md_minor, sb->layout, sb->chunk_size);
        printk(KERN_INFO "md:     UT:%08x ST:%d AD:%d WD:%d"
                " FD:%d SD:%d CSUM:%08x E:%08lx\n",
                sb->utime, sb->state, sb->active_disks, sb->working_disks,
                sb->failed_disks, sb->spare_disks,
                sb->sb_csum, (unsigned long)sb->events_lo);

        printk(KERN_INFO);
        for (i = 0; i < MD_SB_DISKS; i++) {
                mdp_disk_t *desc;

                desc = sb->disks + i;
                if (desc->number || desc->major || desc->minor ||
                    desc->raid_disk || (desc->state && (desc->state != 4))) {
                        printk("     D %2d: ", i);
                        print_desc(desc);
                }
        }
        printk(KERN_INFO "md:     THIS: ");
        print_desc(&sb->this_disk);

}

static void print_rdev(mdk_rdev_t *rdev)
{
        char b[BDEVNAME_SIZE];
        printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n",
                bdevname(rdev->bdev,b), (unsigned long long)rdev->size,
                test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags),
                rdev->desc_nr);
        if (rdev->sb_loaded) {
                printk(KERN_INFO "md: rdev superblock:\n");
                print_sb((mdp_super_t*)page_address(rdev->sb_page));
        } else
                printk(KERN_INFO "md: no rdev superblock!\n");
}

static void md_print_devices(void)
{
        struct list_head *tmp, *tmp2;
        mdk_rdev_t *rdev;
1580
        mddev_t *mddev;
1581
        char b[BDEVNAME_SIZE];
1582
 
1583
        printk("\n");
1584
        printk("md:     **********************************\n");
1585
        printk("md:     * <COMPLETE RAID STATE PRINTOUT> *\n");
1586
        printk("md:     **********************************\n");
1587
        ITERATE_MDDEV(mddev,tmp) {
1588
 
1589
                if (mddev->bitmap)
1590
                        bitmap_print_sb(mddev->bitmap);
1591
                else
1592
                        printk("%s: ", mdname(mddev));
1593
                ITERATE_RDEV(mddev,rdev,tmp2)
1594
                        printk("<%s>", bdevname(rdev->bdev,b));
1595
                printk("\n");
1596
 
1597
                ITERATE_RDEV(mddev,rdev,tmp2)
1598
                        print_rdev(rdev);
1599
        }
1600
        printk("md:     **********************************\n");
1601
        printk("\n");
1602
}
1603
 
1604
 
1605
static void sync_sbs(mddev_t * mddev, int nospares)
1606
{
1607
        /* Update each superblock (in-memory image), but
1608
         * if we are allowed to, skip spares which already
1609
         * have the right event counter, or have one earlier
1610
         * (which would mean they aren't being marked as dirty
1611
         * with the rest of the array)
1612
         */
1613
        mdk_rdev_t *rdev;
1614
        struct list_head *tmp;
1615
 
1616
        ITERATE_RDEV(mddev,rdev,tmp) {
1617
                if (rdev->sb_events == mddev->events ||
1618
                    (nospares &&
1619
                     rdev->raid_disk < 0 &&
1620
                     (rdev->sb_events&1)==0 &&
1621
                     rdev->sb_events+1 == mddev->events)) {
1622
                        /* Don't update this superblock */
1623
                        rdev->sb_loaded = 2;
1624
                } else {
1625
                        super_types[mddev->major_version].
1626
                                sync_super(mddev, rdev);
1627
                        rdev->sb_loaded = 1;
1628
                }
1629
        }
1630
}
1631
 
1632
static void md_update_sb(mddev_t * mddev, int force_change)
1633
{
1634
        struct list_head *tmp;
1635
        mdk_rdev_t *rdev;
1636
        int sync_req;
1637
        int nospares = 0;
1638
 
1639
repeat:
1640
        spin_lock_irq(&mddev->write_lock);
1641
 
1642
        set_bit(MD_CHANGE_PENDING, &mddev->flags);
1643
        if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
1644
                force_change = 1;
1645
        if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags))
1646
                /* just a clean <-> dirty transition; possibly leave spares
1647
                 * alone, though if 'events' doesn't have the right even/odd
1648
                 * parity, we will have to update the spares after all
1649
                 */
1650
                nospares = 1;
1651
        if (force_change)
1652
                nospares = 0;
1653
        if (mddev->degraded)
1654
                /* If the array is degraded, then skipping spares is both
1655
                 * dangerous and fairly pointless.
1656
                 * Dangerous because a device that was removed from the array
1657
                 * might have an event_count that still looks up-to-date,
1658
                 * so it can be re-added without a resync.
1659
                 * Pointless because if there are any spares to skip,
1660
                 * then a recovery will happen and soon that array won't
1661
                 * be degraded any more and the spare can go back to sleep then.
1662
                 */
1663
                nospares = 0;
1664
 
1665
        sync_req = mddev->in_sync;
1666
        mddev->utime = get_seconds();
1667
 
1668
        /* If this is just a dirty<->clean transition, and the array is clean
1669
         * and 'events' is odd, we can roll back to the previous clean state */
1670
        if (nospares
1671
            && (mddev->in_sync && mddev->recovery_cp == MaxSector)
1672
            && (mddev->events & 1)
1673
            && mddev->events != 1)
1674
                mddev->events--;
1675
        else {
1676
                /* otherwise we have to go forward and ... */
1677
                mddev->events ++;
1678
                if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */
1679
                        /* .. if the array isn't clean, insist on an odd 'events' */
1680
                        if ((mddev->events&1)==0) {
1681
                                mddev->events++;
1682
                                nospares = 0;
1683
                        }
1684
                } else {
1685
                        /* otherwise insist on an even 'events' (for clean states) */
1686
                        if ((mddev->events&1)) {
1687
                                mddev->events++;
1688
                                nospares = 0;
1689
                        }
1690
                }
1691
        }
1692
 
1693
        if (!mddev->events) {
1694
                /*
1695
                 * oops, this 64-bit counter should never wrap.
1696
                 * Either we are in around ~1 trillion A.C., assuming
1697
                 * 1 reboot per second, or we have a bug:
1698
                 */
1699
                MD_BUG();
1700
                mddev->events --;
1701
        }
1702
        sync_sbs(mddev, nospares);
1703
 
1704
        /*
1705
         * do not write anything to disk if using
1706
         * nonpersistent superblocks
1707
         */
1708
        if (!mddev->persistent) {
1709
                clear_bit(MD_CHANGE_PENDING, &mddev->flags);
1710
                spin_unlock_irq(&mddev->write_lock);
1711
                wake_up(&mddev->sb_wait);
1712
                return;
1713
        }
1714
        spin_unlock_irq(&mddev->write_lock);
1715
 
1716
        dprintk(KERN_INFO
1717
                "md: updating %s RAID superblock on device (in sync %d)\n",
1718
                mdname(mddev),mddev->in_sync);
1719
 
1720
        bitmap_update_sb(mddev->bitmap);
1721
        ITERATE_RDEV(mddev,rdev,tmp) {
1722
                char b[BDEVNAME_SIZE];
1723
                dprintk(KERN_INFO "md: ");
1724
                if (rdev->sb_loaded != 1)
1725
                        continue; /* no noise on spare devices */
1726
                if (test_bit(Faulty, &rdev->flags))
1727
                        dprintk("(skipping faulty ");
1728
 
1729
                dprintk("%s ", bdevname(rdev->bdev,b));
1730
                if (!test_bit(Faulty, &rdev->flags)) {
1731
                        md_super_write(mddev,rdev,
1732
                                       rdev->sb_offset<<1, rdev->sb_size,
1733
                                       rdev->sb_page);
1734
                        dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
1735
                                bdevname(rdev->bdev,b),
1736
                                (unsigned long long)rdev->sb_offset);
1737
                        rdev->sb_events = mddev->events;
1738
 
1739
                } else
1740
                        dprintk(")\n");
1741
                if (mddev->level == LEVEL_MULTIPATH)
1742
                        /* only need to write one superblock... */
1743
                        break;
1744
        }
1745
        md_super_wait(mddev);
1746
        /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */
1747
 
1748
        spin_lock_irq(&mddev->write_lock);
1749
        if (mddev->in_sync != sync_req ||
1750
            test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
1751
                /* have to write it out again */
1752
                spin_unlock_irq(&mddev->write_lock);
1753
                goto repeat;
1754
        }
1755
        clear_bit(MD_CHANGE_PENDING, &mddev->flags);
1756
        spin_unlock_irq(&mddev->write_lock);
1757
        wake_up(&mddev->sb_wait);
1758
 
1759
}
1760
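
/*
 * Illustration only (not built): the even/odd 'events' bookkeeping
 * above, condensed into one userspace function.  Clean states carry
 * an even event count and dirty states an odd one, so a pure
 * clean<->dirty flip can step the counter by one, or roll one back,
 * without forcing an update of the spares' superblocks.
 */
#if 0
#include <stdio.h>

static unsigned long long next_events(unsigned long long events,
                                      int clean, int can_rollback)
{
        /* clean, odd and rollback allowed: return to previous clean state */
        if (can_rollback && clean && (events & 1) && events != 1)
                return events - 1;
        events++;
        /* otherwise force the parity to match the state */
        if ((clean && (events & 1)) || (!clean && !(events & 1)))
                events++;
        return events;
}

int main(void)
{
        printf("%llu\n", next_events(10, 0, 0));        /* dirty -> 11 */
        printf("%llu\n", next_events(11, 1, 1));        /* clean -> 10 */
        return 0;
}
#endif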
 
1761
/* words written to sysfs files may, or may not, be \n terminated.
1762
 * We want to accept either case. For this we use cmd_match.
1763
 */
1764
static int cmd_match(const char *cmd, const char *str)
1765
{
1766
        /* See if cmd, written into a sysfs file, matches
1767
         * str.  They must either be the same, or cmd can
1768
         * have a trailing newline
1769
         */
1770
        while (*cmd && *str && *cmd == *str) {
1771
                cmd++;
1772
                str++;
1773
        }
1774
        if (*cmd == '\n')
1775
                cmd++;
1776
        if (*str || *cmd)
1777
                return 0;
1778
        return 1;
1779
}
1780
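
/*
 * Illustration only (not built): a userspace copy of the matching
 * rule above.  "check\n" and "check" both match "check", while
 * "checks" does not, because only a single trailing newline in cmd
 * is forgiven.
 */
#if 0
#include <stdio.h>

static int demo_cmd_match(const char *cmd, const char *str)
{
        while (*cmd && *str && *cmd == *str) {
                cmd++;
                str++;
        }
        if (*cmd == '\n')
                cmd++;
        return !(*str || *cmd);
}

int main(void)
{
        printf("%d %d %d\n",
               demo_cmd_match("check\n", "check"),      /* 1 */
               demo_cmd_match("check", "check"),        /* 1 */
               demo_cmd_match("checks", "check"));      /* 0 */
        return 0;
}
#endif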
 
1781
struct rdev_sysfs_entry {
1782
        struct attribute attr;
1783
        ssize_t (*show)(mdk_rdev_t *, char *);
1784
        ssize_t (*store)(mdk_rdev_t *, const char *, size_t);
1785
};
1786
 
1787
static ssize_t
1788
state_show(mdk_rdev_t *rdev, char *page)
1789
{
1790
        char *sep = "";
1791
        int len=0;
1792
 
1793
        if (test_bit(Faulty, &rdev->flags)) {
1794
                len+= sprintf(page+len, "%sfaulty",sep);
1795
                sep = ",";
1796
        }
1797
        if (test_bit(In_sync, &rdev->flags)) {
1798
                len += sprintf(page+len, "%sin_sync",sep);
1799
                sep = ",";
1800
        }
1801
        if (test_bit(WriteMostly, &rdev->flags)) {
1802
                len += sprintf(page+len, "%swrite_mostly",sep);
1803
                sep = ",";
1804
        }
1805
        if (!test_bit(Faulty, &rdev->flags) &&
1806
            !test_bit(In_sync, &rdev->flags)) {
1807
                len += sprintf(page+len, "%sspare", sep);
1808
                sep = ",";
1809
        }
1810
        return len+sprintf(page+len, "\n");
1811
}
1812
 
1813
static ssize_t
1814
state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1815
{
1816
        /* can write
1817
         *  faulty  - simulates an error
1818
         *  remove  - disconnects the device
1819
         *  writemostly - sets write_mostly
1820
         *  -writemostly - clears write_mostly
1821
         */
1822
        int err = -EINVAL;
1823
        if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
1824
                md_error(rdev->mddev, rdev);
1825
                err = 0;
1826
        } else if (cmd_match(buf, "remove")) {
1827
                if (rdev->raid_disk >= 0)
1828
                        err = -EBUSY;
1829
                else {
1830
                        mddev_t *mddev = rdev->mddev;
1831
                        kick_rdev_from_array(rdev);
1832
                        if (mddev->pers)
1833
                                md_update_sb(mddev, 1);
1834
                        md_new_event(mddev);
1835
                        err = 0;
1836
                }
1837
        } else if (cmd_match(buf, "writemostly")) {
1838
                set_bit(WriteMostly, &rdev->flags);
1839
                err = 0;
1840
        } else if (cmd_match(buf, "-writemostly")) {
1841
                clear_bit(WriteMostly, &rdev->flags);
1842
                err = 0;
1843
        }
1844
        return err ? err : len;
1845
}
1846
static struct rdev_sysfs_entry rdev_state =
1847
__ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store);
1848
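
/*
 * Illustration only (not built, device names are hypothetical):
 * driving the per-rdev 'state' attribute from userspace.  Writing
 * "faulty" simulates a failure on that member; "remove" detaches a
 * member that holds no raid slot.
 */
#if 0
#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/sys/block/md0/md/dev-sda1/state", "w");

        if (!f) {
                perror("state");
                return 1;
        }
        fputs("faulty\n", f);           /* trailing \n is accepted */
        return fclose(f) ? 1 : 0;
}
#endif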
 
1849
static ssize_t
1850
super_show(mdk_rdev_t *rdev, char *page)
1851
{
1852
        if (rdev->sb_loaded && rdev->sb_size) {
1853
                memcpy(page, page_address(rdev->sb_page), rdev->sb_size);
1854
                return rdev->sb_size;
1855
        } else
1856
                return 0;
1857
}
1858
static struct rdev_sysfs_entry rdev_super = __ATTR_RO(super);
1859
 
1860
static ssize_t
1861
errors_show(mdk_rdev_t *rdev, char *page)
1862
{
1863
        return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
1864
}
1865
 
1866
static ssize_t
1867
errors_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1868
{
1869
        char *e;
1870
        unsigned long n = simple_strtoul(buf, &e, 10);
1871
        if (*buf && (*e == 0 || *e == '\n')) {
1872
                atomic_set(&rdev->corrected_errors, n);
1873
                return len;
1874
        }
1875
        return -EINVAL;
1876
}
1877
static struct rdev_sysfs_entry rdev_errors =
1878
__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
1879
 
1880
static ssize_t
1881
slot_show(mdk_rdev_t *rdev, char *page)
1882
{
1883
        if (rdev->raid_disk < 0)
1884
                return sprintf(page, "none\n");
1885
        else
1886
                return sprintf(page, "%d\n", rdev->raid_disk);
1887
}
1888
 
1889
static ssize_t
1890
slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1891
{
1892
        char *e;
1893
        int slot = simple_strtoul(buf, &e, 10);
1894
        if (strncmp(buf, "none", 4)==0)
1895
                slot = -1;
1896
        else if (e==buf || (*e && *e!= '\n'))
1897
                return -EINVAL;
1898
        if (rdev->mddev->pers)
1899
                /* Cannot set slot in active array (yet) */
1900
                return -EBUSY;
1901
        if (slot >= rdev->mddev->raid_disks)
1902
                return -ENOSPC;
1903
        rdev->raid_disk = slot;
1904
        /* assume it is working */
1905
        rdev->flags = 0;
1906
        set_bit(In_sync, &rdev->flags);
1907
        return len;
1908
}
1909
 
1910
 
1911
static struct rdev_sysfs_entry rdev_slot =
1912
__ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
1913
 
1914
static ssize_t
1915
offset_show(mdk_rdev_t *rdev, char *page)
1916
{
1917
        return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
1918
}
1919
 
1920
static ssize_t
1921
offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1922
{
1923
        char *e;
1924
        unsigned long long offset = simple_strtoull(buf, &e, 10);
1925
        if (e==buf || (*e && *e != '\n'))
1926
                return -EINVAL;
1927
        if (rdev->mddev->pers)
1928
                return -EBUSY;
1929
        rdev->data_offset = offset;
1930
        return len;
1931
}
1932
 
1933
static struct rdev_sysfs_entry rdev_offset =
1934
__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
1935
 
1936
static ssize_t
1937
rdev_size_show(mdk_rdev_t *rdev, char *page)
1938
{
1939
        return sprintf(page, "%llu\n", (unsigned long long)rdev->size);
1940
}
1941
 
1942
static ssize_t
1943
rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1944
{
1945
        char *e;
1946
        unsigned long long size = simple_strtoull(buf, &e, 10);
1947
        if (e==buf || (*e && *e != '\n'))
1948
                return -EINVAL;
1949
        if (rdev->mddev->pers)
1950
                return -EBUSY;
1951
        rdev->size = size;
1952
        if (size < rdev->mddev->size || rdev->mddev->size == 0)
1953
                rdev->mddev->size = size;
1954
        return len;
1955
}
1956
 
1957
static struct rdev_sysfs_entry rdev_size =
1958
__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
1959
 
1960
static struct attribute *rdev_default_attrs[] = {
1961
        &rdev_state.attr,
1962
        &rdev_super.attr,
1963
        &rdev_errors.attr,
1964
        &rdev_slot.attr,
1965
        &rdev_offset.attr,
1966
        &rdev_size.attr,
1967
        NULL,
1968
};
1969
static ssize_t
1970
rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
1971
{
1972
        struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
1973
        mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
1974
 
1975
        if (!entry->show)
1976
                return -EIO;
1977
        return entry->show(rdev, page);
1978
}
1979
 
1980
static ssize_t
1981
rdev_attr_store(struct kobject *kobj, struct attribute *attr,
1982
              const char *page, size_t length)
1983
{
1984
        struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
1985
        mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
1986
 
1987
        if (!entry->store)
1988
                return -EIO;
1989
        if (!capable(CAP_SYS_ADMIN))
1990
                return -EACCES;
1991
        return entry->store(rdev, page, length);
1992
}
1993
 
1994
static void rdev_free(struct kobject *ko)
1995
{
1996
        mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj);
1997
        kfree(rdev);
1998
}
1999
static struct sysfs_ops rdev_sysfs_ops = {
2000
        .show           = rdev_attr_show,
2001
        .store          = rdev_attr_store,
2002
};
2003
static struct kobj_type rdev_ktype = {
2004
        .release        = rdev_free,
2005
        .sysfs_ops      = &rdev_sysfs_ops,
2006
        .default_attrs  = rdev_default_attrs,
2007
};
2008
 
2009
/*
2010
 * Import a device. If 'super_format' >= 0, then sanity check the superblock
2011
 *
2012
 * mark the device faulty if:
2013
 *
2014
 *   - the device is nonexistent (zero size)
2015
 *   - the device has no valid superblock
2016
 *
2017
 * a faulty rdev _never_ has rdev->sb set.
2018
 */
2019
static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor)
2020
{
2021
        char b[BDEVNAME_SIZE];
2022
        int err;
2023
        mdk_rdev_t *rdev;
2024
        sector_t size;
2025
 
2026
        rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
2027
        if (!rdev) {
2028
                printk(KERN_ERR "md: could not alloc mem for new device!\n");
2029
                return ERR_PTR(-ENOMEM);
2030
        }
2031
 
2032
        if ((err = alloc_disk_sb(rdev)))
2033
                goto abort_free;
2034
 
2035
        err = lock_rdev(rdev, newdev);
2036
        if (err)
2037
                goto abort_free;
2038
 
2039
        rdev->kobj.parent = NULL;
2040
        rdev->kobj.ktype = &rdev_ktype;
2041
        kobject_init(&rdev->kobj);
2042
 
2043
        rdev->desc_nr = -1;
2044
        rdev->saved_raid_disk = -1;
2045
        rdev->raid_disk = -1;
2046
        rdev->flags = 0;
2047
        rdev->data_offset = 0;
2048
        rdev->sb_events = 0;
2049
        atomic_set(&rdev->nr_pending, 0);
2050
        atomic_set(&rdev->read_errors, 0);
2051
        atomic_set(&rdev->corrected_errors, 0);
2052
 
2053
        size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
2054
        if (!size) {
2055
                printk(KERN_WARNING
2056
                        "md: %s has zero or unknown size, marking faulty!\n",
2057
                        bdevname(rdev->bdev,b));
2058
                err = -EINVAL;
2059
                goto abort_free;
2060
        }
2061
 
2062
        if (super_format >= 0) {
2063
                err = super_types[super_format].
2064
                        load_super(rdev, NULL, super_minor);
2065
                if (err == -EINVAL) {
2066
                        printk(KERN_WARNING
2067
                                "md: %s does not have a valid v%d.%d "
2068
                               "superblock, not importing!\n",
2069
                                bdevname(rdev->bdev,b),
2070
                               super_format, super_minor);
2071
                        goto abort_free;
2072
                }
2073
                if (err < 0) {
2074
                        printk(KERN_WARNING
2075
                                "md: could not read %s's sb, not importing!\n",
2076
                                bdevname(rdev->bdev,b));
2077
                        goto abort_free;
2078
                }
2079
        }
2080
        INIT_LIST_HEAD(&rdev->same_set);
2081
 
2082
        return rdev;
2083
 
2084
abort_free:
2085
        if (rdev->sb_page) {
2086
                if (rdev->bdev)
2087
                        unlock_rdev(rdev);
2088
                free_disk_sb(rdev);
2089
        }
2090
        kfree(rdev);
2091
        return ERR_PTR(err);
2092
}
2093
 
2094
/*
2095
 * Check a full RAID array for plausibility
2096
 */
2097
 
2098
 
2099
static void analyze_sbs(mddev_t * mddev)
2100
{
2101
        int i;
2102
        struct list_head *tmp;
2103
        mdk_rdev_t *rdev, *freshest;
2104
        char b[BDEVNAME_SIZE];
2105
 
2106
        freshest = NULL;
2107
        ITERATE_RDEV(mddev,rdev,tmp)
2108
                switch (super_types[mddev->major_version].
2109
                        load_super(rdev, freshest, mddev->minor_version)) {
2110
                case 1:
2111
                        freshest = rdev;
2112
                        break;
2113
                case 0:
2114
                        break;
2115
                default:
2116
                        printk( KERN_ERR \
2117
                                "md: fatal superblock inconsistency in %s"
2118
                                " -- removing from array\n",
2119
                                bdevname(rdev->bdev,b));
2120
                        kick_rdev_from_array(rdev);
2121
                }
2122
 
2123
 
2124
        super_types[mddev->major_version].
2125
                validate_super(mddev, freshest);
2126
 
2127
        i = 0;
2128
        ITERATE_RDEV(mddev,rdev,tmp) {
2129
                if (rdev != freshest)
2130
                        if (super_types[mddev->major_version].
2131
                            validate_super(mddev, rdev)) {
2132
                                printk(KERN_WARNING "md: kicking non-fresh %s"
2133
                                        " from array!\n",
2134
                                        bdevname(rdev->bdev,b));
2135
                                kick_rdev_from_array(rdev);
2136
                                continue;
2137
                        }
2138
                if (mddev->level == LEVEL_MULTIPATH) {
2139
                        rdev->desc_nr = i++;
2140
                        rdev->raid_disk = rdev->desc_nr;
2141
                        set_bit(In_sync, &rdev->flags);
2142
                } else if (rdev->raid_disk >= mddev->raid_disks) {
2143
                        rdev->raid_disk = -1;
2144
                        clear_bit(In_sync, &rdev->flags);
2145
                }
2146
        }
2147
 
2148
 
2149
 
2150
        if (mddev->recovery_cp != MaxSector &&
2151
            mddev->level >= 1)
2152
                printk(KERN_ERR "md: %s: raid array is not clean"
2153
                       " -- starting background reconstruction\n",
2154
                       mdname(mddev));
2155
 
2156
}
2157
 
2158
static ssize_t
2159
safe_delay_show(mddev_t *mddev, char *page)
2160
{
2161
        int msec = (mddev->safemode_delay*1000)/HZ;
2162
        return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
2163
}
2164
static ssize_t
2165
safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len)
2166
{
2167
        int scale=1;
2168
        int dot=0;
2169
        int i;
2170
        unsigned long msec;
2171
        char buf[30];
2172
        char *e;
2173
        /* remove a period, and count digits after it */
2174
        if (len >= sizeof(buf))
2175
                return -EINVAL;
2176
        strlcpy(buf, cbuf, len);
2177
        buf[len] = 0;
2178
        for (i=0; i<len; i++) {
2179
                if (dot) {
2180
                        if (isdigit(buf[i])) {
2181
                                buf[i-1] = buf[i];
2182
                                scale *= 10;
2183
                        }
2184
                        buf[i] = 0;
2185
                } else if (buf[i] == '.') {
2186
                        dot=1;
2187
                        buf[i] = 0;
2188
                }
2189
        }
2190
        msec = simple_strtoul(buf, &e, 10);
2191
        if (e == buf || (*e && *e != '\n'))
2192
                return -EINVAL;
2193
        msec = (msec * 1000) / scale;
2194
        if (msec == 0)
2195
                mddev->safemode_delay = 0;
2196
        else {
2197
                mddev->safemode_delay = (msec*HZ)/1000;
2198
                if (mddev->safemode_delay == 0)
2199
                        mddev->safemode_delay = 1;
2200
        }
2201
        return len;
2202
}
2203
static struct md_sysfs_entry md_safe_delay =
2204
__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR, safe_delay_show, safe_delay_store);
2205
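
/*
 * Illustration only (not built): what the decimal parse above does
 * with "0.200" - 200 msec, then jiffies, with any nonzero delay
 * rounded up so it cannot collapse to 0.  HZ is assumed to be 250
 * for the example.
 */
#if 0
#include <stdio.h>

#define DEMO_HZ 250

int main(void)
{
        unsigned long msec = 200;       /* parsed from "0.200" */
        unsigned long delay = (msec * DEMO_HZ) / 1000;

        if (msec && !delay)
                delay = 1;              /* nonzero request => >= 1 jiffy */
        printf("%lu jiffies\n", delay); /* 50 */
        return 0;
}
#endif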
 
2206
static ssize_t
2207
level_show(mddev_t *mddev, char *page)
2208
{
2209
        struct mdk_personality *p = mddev->pers;
2210
        if (p)
2211
                return sprintf(page, "%s\n", p->name);
2212
        else if (mddev->clevel[0])
2213
                return sprintf(page, "%s\n", mddev->clevel);
2214
        else if (mddev->level != LEVEL_NONE)
2215
                return sprintf(page, "%d\n", mddev->level);
2216
        else
2217
                return 0;
2218
}
2219
 
2220
static ssize_t
2221
level_store(mddev_t *mddev, const char *buf, size_t len)
2222
{
2223
        int rv = len;
2224
        if (mddev->pers)
2225
                return -EBUSY;
2226
        if (len == 0)
2227
                return 0;
2228
        if (len >= sizeof(mddev->clevel))
2229
                return -ENOSPC;
2230
        strncpy(mddev->clevel, buf, len);
2231
        if (mddev->clevel[len-1] == '\n')
2232
                len--;
2233
        mddev->clevel[len] = 0;
2234
        mddev->level = LEVEL_NONE;
2235
        return rv;
2236
}
2237
 
2238
static struct md_sysfs_entry md_level =
2239
__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
2240
 
2241
 
2242
static ssize_t
2243
layout_show(mddev_t *mddev, char *page)
2244
{
2245
        /* just a number, not meaningful for all levels */
2246
        if (mddev->reshape_position != MaxSector &&
2247
            mddev->layout != mddev->new_layout)
2248
                return sprintf(page, "%d (%d)\n",
2249
                               mddev->new_layout, mddev->layout);
2250
        return sprintf(page, "%d\n", mddev->layout);
2251
}
2252
 
2253
static ssize_t
2254
layout_store(mddev_t *mddev, const char *buf, size_t len)
2255
{
2256
        char *e;
2257
        unsigned long n = simple_strtoul(buf, &e, 10);
2258
 
2259
        if (!*buf || (*e && *e != '\n'))
2260
                return -EINVAL;
2261
 
2262
        if (mddev->pers)
2263
                return -EBUSY;
2264
        if (mddev->reshape_position != MaxSector)
2265
                mddev->new_layout = n;
2266
        else
2267
                mddev->layout = n;
2268
        return len;
2269
}
2270
static struct md_sysfs_entry md_layout =
2271
__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
2272
 
2273
 
2274
static ssize_t
2275
raid_disks_show(mddev_t *mddev, char *page)
2276
{
2277
        if (mddev->raid_disks == 0)
2278
                return 0;
2279
        if (mddev->reshape_position != MaxSector &&
2280
            mddev->delta_disks != 0)
2281
                return sprintf(page, "%d (%d)\n", mddev->raid_disks,
2282
                               mddev->raid_disks - mddev->delta_disks);
2283
        return sprintf(page, "%d\n", mddev->raid_disks);
2284
}
2285
 
2286
static int update_raid_disks(mddev_t *mddev, int raid_disks);
2287
 
2288
static ssize_t
2289
raid_disks_store(mddev_t *mddev, const char *buf, size_t len)
2290
{
2291
        char *e;
2292
        int rv = 0;
2293
        unsigned long n = simple_strtoul(buf, &e, 10);
2294
 
2295
        if (!*buf || (*e && *e != '\n'))
2296
                return -EINVAL;
2297
 
2298
        if (mddev->pers)
2299
                rv = update_raid_disks(mddev, n);
2300
        else if (mddev->reshape_position != MaxSector) {
2301
                int olddisks = mddev->raid_disks - mddev->delta_disks;
2302
                mddev->delta_disks = n - olddisks;
2303
                mddev->raid_disks = n;
2304
        } else
2305
                mddev->raid_disks = n;
2306
        return rv ? rv : len;
2307
}
2308
static struct md_sysfs_entry md_raid_disks =
2309
__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
2310
 
2311
static ssize_t
2312
chunk_size_show(mddev_t *mddev, char *page)
2313
{
2314
        if (mddev->reshape_position != MaxSector &&
2315
            mddev->chunk_size != mddev->new_chunk)
2316
                return sprintf(page, "%d (%d)\n", mddev->new_chunk,
2317
                               mddev->chunk_size);
2318
        return sprintf(page, "%d\n", mddev->chunk_size);
2319
}
2320
 
2321
static ssize_t
2322
chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
2323
{
2324
        /* can only set chunk_size if array is not yet active */
2325
        char *e;
2326
        unsigned long n = simple_strtoul(buf, &e, 10);
2327
 
2328
        if (!*buf || (*e && *e != '\n'))
2329
                return -EINVAL;
2330
 
2331
        if (mddev->pers)
2332
                return -EBUSY;
2333
        else if (mddev->reshape_position != MaxSector)
2334
                mddev->new_chunk = n;
2335
        else
2336
                mddev->chunk_size = n;
2337
        return len;
2338
}
2339
static struct md_sysfs_entry md_chunk_size =
2340
__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
2341
 
2342
static ssize_t
2343
resync_start_show(mddev_t *mddev, char *page)
2344
{
2345
        return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
2346
}
2347
 
2348
static ssize_t
2349
resync_start_store(mddev_t *mddev, const char *buf, size_t len)
2350
{
2351
        /* can only set resync_start if array is not yet active */
2352
        char *e;
2353
        unsigned long long n = simple_strtoull(buf, &e, 10);
2354
 
2355
        if (mddev->pers)
2356
                return -EBUSY;
2357
        if (!*buf || (*e && *e != '\n'))
2358
                return -EINVAL;
2359
 
2360
        mddev->recovery_cp = n;
2361
        return len;
2362
}
2363
static struct md_sysfs_entry md_resync_start =
2364
__ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
2365
 
2366
/*
2367
 * The array state can be:
2368
 *
2369
 * clear
2370
 *     No devices, no size, no level
2371
 *     Equivalent to STOP_ARRAY ioctl
2372
 * inactive
2373
 *     May have some settings, but array is not active
2374
 *        all IO results in error
2375
 *     When written, doesn't tear down array, but just stops it
2376
 * suspended (not supported yet)
2377
 *     All IO requests will block. The array can be reconfigured.
2378
 *     Writing this, if accepted, will block until array is quiescent
2379
 * readonly
2380
 *     no resync can happen.  no superblocks get written.
2381
 *     write requests fail
2382
 * read-auto
2383
 *     like readonly, but behaves like 'clean' on a write request.
2384
 *
2385
 * clean
 *     no pending writes, but otherwise active.
2386
 *     When written to inactive array, starts without resync
2387
 *     If a write request arrives then
2388
 *       if metadata is known, mark 'dirty' and switch to 'active'.
2389
 *       if not known, block and switch to write-pending
2390
 *     If written to an active array that has pending writes, then fails.
2391
 * active
2392
 *     fully active: IO and resync can be happening.
2393
 *     When written to inactive array, starts with resync
2394
 *
2395
 * write-pending
2396
 *     clean, but writes are blocked waiting for 'active' to be written.
2397
 *
2398
 * active-idle
2399
 *     like active, but no writes have been seen for a while (100msec).
2400
 *
2401
 */
2402
enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
2403
                   write_pending, active_idle, bad_word};
2404
static char *array_states[] = {
2405
        "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
2406
        "write-pending", "active-idle", NULL };
2407
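
/*
 * Illustration only (not built, md0 is a hypothetical array):
 * polling the array state from userspace.  The string read back is
 * always one of the array_states[] entries above.
 */
#if 0
#include <stdio.h>

int main(void)
{
        char state[32];
        FILE *f = fopen("/sys/block/md0/md/array_state", "r");

        if (!f) {
                perror("array_state");
                return 1;
        }
        if (!fgets(state, sizeof(state), f)) {
                fclose(f);
                return 1;
        }
        printf("md0 is %s", state);     /* e.g. "clean\n" */
        fclose(f);
        return 0;
}
#endif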
 
2408
static int match_word(const char *word, char **list)
2409
{
2410
        int n;
2411
        for (n=0; list[n]; n++)
2412
                if (cmd_match(word, list[n]))
2413
                        break;
2414
        return n;
2415
}
2416
 
2417
static ssize_t
2418
array_state_show(mddev_t *mddev, char *page)
2419
{
2420
        enum array_state st = inactive;
2421
 
2422
        if (mddev->pers)
2423
                switch(mddev->ro) {
2424
                case 1:
2425
                        st = readonly;
2426
                        break;
2427
                case 2:
2428
                        st = read_auto;
2429
                        break;
2430
                case 0:
2431
                        if (mddev->in_sync)
2432
                                st = clean;
2433
                        else if (mddev->safemode)
2434
                                st = active_idle;
2435
                        else
2436
                                st = active;
2437
                }
2438
        else {
2439
                if (list_empty(&mddev->disks) &&
2440
                    mddev->raid_disks == 0 &&
2441
                    mddev->size == 0)
2442
                        st = clear;
2443
                else
2444
                        st = inactive;
2445
        }
2446
        return sprintf(page, "%s\n", array_states[st]);
2447
}
2448
 
2449
static int do_md_stop(mddev_t * mddev, int ro);
2450
static int do_md_run(mddev_t * mddev);
2451
static int restart_array(mddev_t *mddev);
2452
 
2453
static ssize_t
2454
array_state_store(mddev_t *mddev, const char *buf, size_t len)
2455
{
2456
        int err = -EINVAL;
2457
        enum array_state st = match_word(buf, array_states);
2458
        switch(st) {
2459
        case bad_word:
2460
                break;
2461
        case clear:
2462
                /* stopping an active array */
2463
                if (mddev->pers) {
2464
                        if (atomic_read(&mddev->active) > 1)
2465
                                return -EBUSY;
2466
                        err = do_md_stop(mddev, 0);
2467
                }
2468
                break;
2469
        case inactive:
2470
                /* stopping an active array */
2471
                if (mddev->pers) {
2472
                        if (atomic_read(&mddev->active) > 1)
2473
                                return -EBUSY;
2474
                        err = do_md_stop(mddev, 2);
2475
                }
2476
                break;
2477
        case suspended:
2478
                break; /* not supported yet */
2479
        case readonly:
2480
                if (mddev->pers)
2481
                        err = do_md_stop(mddev, 1);
2482
                else {
2483
                        mddev->ro = 1;
2484
                        err = do_md_run(mddev);
2485
                }
2486
                break;
2487
        case read_auto:
2488
                /* stopping an active array */
2489
                if (mddev->pers) {
2490
                        err = do_md_stop(mddev, 1);
2491
                        if (err == 0)
2492
                                mddev->ro = 2; /* FIXME mark devices writable */
2493
                } else {
2494
                        mddev->ro = 2;
2495
                        err = do_md_run(mddev);
2496
                }
2497
                break;
2498
        case clean:
2499
                if (mddev->pers) {
2500
                        restart_array(mddev);
2501
                        spin_lock_irq(&mddev->write_lock);
2502
                        if (atomic_read(&mddev->writes_pending) == 0) {
2503
                                mddev->in_sync = 1;
2504
                                set_bit(MD_CHANGE_CLEAN, &mddev->flags);
2505
                        }
2506
                        spin_unlock_irq(&mddev->write_lock);
2507
                } else {
2508
                        mddev->ro = 0;
2509
                        mddev->recovery_cp = MaxSector;
2510
                        err = do_md_run(mddev);
2511
                }
2512
                break;
2513
        case active:
2514
                if (mddev->pers) {
2515
                        restart_array(mddev);
2516
                        clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
2517
                        wake_up(&mddev->sb_wait);
2518
                        err = 0;
2519
                } else {
2520
                        mddev->ro = 0;
2521
                        err = do_md_run(mddev);
2522
                }
2523
                break;
2524
        case write_pending:
2525
        case active_idle:
2526
                /* these cannot be set */
2527
                break;
2528
        }
2529
        if (err)
2530
                return err;
2531
        else
2532
                return len;
2533
}
2534
static struct md_sysfs_entry md_array_state =
2535
__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
2536
 
2537
static ssize_t
2538
null_show(mddev_t *mddev, char *page)
2539
{
2540
        return -EINVAL;
2541
}
2542
 
2543
static ssize_t
2544
new_dev_store(mddev_t *mddev, const char *buf, size_t len)
2545
{
2546
        /* buf must be "%d:%d" (with optional trailing \n), giving major and minor numbers */
2547
        /* The new device is added to the array.
2548
         * If the array has a persistent superblock, we read the
2549
         * superblock to initialise info and check validity.
2550
         * Otherwise, the only checking done is in bind_rdev_to_array,
2551
         * which mainly checks size.
2552
         */
2553
        char *e;
2554
        int major = simple_strtoul(buf, &e, 10);
2555
        int minor;
2556
        dev_t dev;
2557
        mdk_rdev_t *rdev;
2558
        int err;
2559
 
2560
        if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
2561
                return -EINVAL;
2562
        minor = simple_strtoul(e+1, &e, 10);
2563
        if (*e && *e != '\n')
2564
                return -EINVAL;
2565
        dev = MKDEV(major, minor);
2566
        if (major != MAJOR(dev) ||
2567
            minor != MINOR(dev))
2568
                return -EOVERFLOW;
2569
 
2570
 
2571
        if (mddev->persistent) {
2572
                rdev = md_import_device(dev, mddev->major_version,
2573
                                        mddev->minor_version);
2574
                if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
2575
                        mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
2576
                                                       mdk_rdev_t, same_set);
2577
                        err = super_types[mddev->major_version]
2578
                                .load_super(rdev, rdev0, mddev->minor_version);
2579
                        if (err < 0)
2580
                                goto out;
2581
                }
2582
        } else
2583
                rdev = md_import_device(dev, -1, -1);
2584
 
2585
        if (IS_ERR(rdev))
2586
                return PTR_ERR(rdev);
2587
        err = bind_rdev_to_array(rdev, mddev);
2588
 out:
2589
        if (err)
2590
                export_rdev(rdev);
2591
        return err ? err : len;
2592
}
2593
 
2594
static struct md_sysfs_entry md_new_device =
2595
__ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
2596
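
/*
 * Illustration only (not built, device numbers are hypothetical):
 * hot-adding a component by writing "major:minor" to 'new_dev'.
 * 8:17 would be /dev/sdb1 on a typical system.
 */
#if 0
#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/sys/block/md0/md/new_dev", "w");

        if (!f) {
                perror("new_dev");
                return 1;
        }
        fprintf(f, "%d:%d\n", 8, 17);   /* trailing \n is optional */
        return fclose(f) ? 1 : 0;
}
#endif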
 
2597
static ssize_t
2598
bitmap_store(mddev_t *mddev, const char *buf, size_t len)
2599
{
2600
        char *end;
2601
        unsigned long chunk, end_chunk;
2602
 
2603
        if (!mddev->bitmap)
2604
                goto out;
2605
        /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
2606
        while (*buf) {
2607
                chunk = end_chunk = simple_strtoul(buf, &end, 0);
2608
                if (buf == end) break;
2609
                if (*end == '-') { /* range */
2610
                        buf = end + 1;
2611
                        end_chunk = simple_strtoul(buf, &end, 0);
2612
                        if (buf == end) break;
2613
                }
2614
                if (*end && !isspace(*end)) break;
2615
                bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
2616
                buf = end;
2617
                while (isspace(*buf)) buf++;
2618
        }
2619
        bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
2620
out:
2621
        return len;
2622
}
2623
 
2624
static struct md_sysfs_entry md_bitmap =
2625
__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
2626
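
/*
 * Illustration only (not built): a userspace re-enactment of the
 * chunk/range parse above, fed the example input "0 5 10-15\n".
 */
#if 0
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>

int main(void)
{
        const char *buf = "0 5 10-15\n";
        char *end;

        while (*buf) {
                unsigned long chunk, end_chunk;

                chunk = end_chunk = strtoul(buf, &end, 0);
                if (buf == end)
                        break;
                if (*end == '-') {              /* range */
                        buf = end + 1;
                        end_chunk = strtoul(buf, &end, 0);
                        if (buf == end)
                                break;
                }
                if (*end && !isspace((unsigned char)*end))
                        break;
                printf("dirty chunks %lu..%lu\n", chunk, end_chunk);
                buf = end;
                while (isspace((unsigned char)*buf))
                        buf++;
        }
        return 0;
}
#endif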
 
2627
static ssize_t
2628
size_show(mddev_t *mddev, char *page)
2629
{
2630
        return sprintf(page, "%llu\n", (unsigned long long)mddev->size);
2631
}
2632
 
2633
static int update_size(mddev_t *mddev, unsigned long size);
2634
 
2635
static ssize_t
2636
size_store(mddev_t *mddev, const char *buf, size_t len)
2637
{
2638
        /* If array is inactive, we can reduce the component size, but
2639
         * not increase it (except from 0).
2640
         * If array is active, we can try an on-line resize
2641
         */
2642
        char *e;
2643
        int err = 0;
2644
        unsigned long long size = simple_strtoull(buf, &e, 10);
2645
        if (!*buf || *buf == '\n' ||
2646
            (*e && *e != '\n'))
2647
                return -EINVAL;
2648
 
2649
        if (mddev->pers) {
2650
                err = update_size(mddev, size);
2651
                md_update_sb(mddev, 1);
2652
        } else {
2653
                if (mddev->size == 0 ||
2654
                    mddev->size > size)
2655
                        mddev->size = size;
2656
                else
2657
                        err = -ENOSPC;
2658
        }
2659
        return err ? err : len;
2660
}
2661
 
2662
static struct md_sysfs_entry md_size =
2663
__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
2664
 
2665
 
2666
/* Metadata version.
2667
 * This is either 'none' for arrays with externally managed metadata,
2668
 * or N.M for internally known formats
2669
 */
2670
static ssize_t
2671
metadata_show(mddev_t *mddev, char *page)
2672
{
2673
        if (mddev->persistent)
2674
                return sprintf(page, "%d.%d\n",
2675
                               mddev->major_version, mddev->minor_version);
2676
        else
2677
                return sprintf(page, "none\n");
2678
}
2679
 
2680
static ssize_t
2681
metadata_store(mddev_t *mddev, const char *buf, size_t len)
2682
{
2683
        int major, minor;
2684
        char *e;
2685
        if (!list_empty(&mddev->disks))
2686
                return -EBUSY;
2687
 
2688
        if (cmd_match(buf, "none")) {
2689
                mddev->persistent = 0;
2690
                mddev->major_version = 0;
2691
                mddev->minor_version = 90;
2692
                return len;
2693
        }
2694
        major = simple_strtoul(buf, &e, 10);
2695
        if (e==buf || *e != '.')
2696
                return -EINVAL;
2697
        buf = e+1;
2698
        minor = simple_strtoul(buf, &e, 10);
2699
        if (e==buf || (*e && *e != '\n') )
2700
                return -EINVAL;
2701
        if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
2702
                return -ENOENT;
2703
        mddev->major_version = major;
2704
        mddev->minor_version = minor;
2705
        mddev->persistent = 1;
2706
        return len;
2707
}
2708
 
2709
static struct md_sysfs_entry md_metadata =
2710
__ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
2711
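
/*
 * Illustration only (not built, md0 is hypothetical): selecting a
 * superblock format before any disks are added.  Accepted values
 * are "none" (externally managed metadata) or a known "major.minor"
 * pair such as "0.90".
 */
#if 0
#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/sys/block/md0/md/metadata_version", "w");

        if (!f) {
                perror("metadata_version");
                return 1;
        }
        fputs("0.90\n", f);     /* classic v0.90 superblock */
        return fclose(f) ? 1 : 0;
}
#endif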
 
2712
static ssize_t
2713
action_show(mddev_t *mddev, char *page)
2714
{
2715
        char *type = "idle";
2716
        if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
2717
            (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) {
2718
                if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2719
                        type = "reshape";
2720
                else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2721
                        if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
2722
                                type = "resync";
2723
                        else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
2724
                                type = "check";
2725
                        else
2726
                                type = "repair";
2727
                } else
2728
                        type = "recover";
2729
        }
2730
        return sprintf(page, "%s\n", type);
2731
}
2732
 
2733
static ssize_t
2734
action_store(mddev_t *mddev, const char *page, size_t len)
2735
{
2736
        if (!mddev->pers || !mddev->pers->sync_request)
2737
                return -EINVAL;
2738
 
2739
        if (cmd_match(page, "idle")) {
2740
                if (mddev->sync_thread) {
2741
                        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2742
                        md_unregister_thread(mddev->sync_thread);
2743
                        mddev->sync_thread = NULL;
2744
                        mddev->recovery = 0;
2745
                }
2746
        } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
2747
                   test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
2748
                return -EBUSY;
2749
        else if (cmd_match(page, "resync") || cmd_match(page, "recover"))
2750
                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2751
        else if (cmd_match(page, "reshape")) {
2752
                int err;
2753
                if (mddev->pers->start_reshape == NULL)
2754
                        return -EINVAL;
2755
                err = mddev->pers->start_reshape(mddev);
2756
                if (err)
2757
                        return err;
2758
        } else {
2759
                if (cmd_match(page, "check"))
2760
                        set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
2761
                else if (!cmd_match(page, "repair"))
2762
                        return -EINVAL;
2763
                set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
2764
                set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
2765
        }
2766
        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2767
        md_wakeup_thread(mddev->thread);
2768
        return len;
2769
}
2770
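
/*
 * Illustration only (not built, md0 is hypothetical): starting a
 * scrub.  "check" is a read-only consistency pass; "repair" also
 * rewrites mismatches.  mismatch_cnt (below) reports what was found.
 */
#if 0
#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/sys/block/md0/md/sync_action", "w");

        if (!f) {
                perror("sync_action");
                return 1;
        }
        fputs("check\n", f);
        return fclose(f) ? 1 : 0;
}
#endif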
 
2771
static ssize_t
2772
mismatch_cnt_show(mddev_t *mddev, char *page)
2773
{
2774
        return sprintf(page, "%llu\n",
2775
                       (unsigned long long) mddev->resync_mismatches);
2776
}
2777
 
2778
static struct md_sysfs_entry md_scan_mode =
2779
__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
2780
 
2781
 
2782
static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
2783
 
2784
static ssize_t
2785
sync_min_show(mddev_t *mddev, char *page)
2786
{
2787
        return sprintf(page, "%d (%s)\n", speed_min(mddev),
2788
                       mddev->sync_speed_min ? "local": "system");
2789
}
2790
 
2791
static ssize_t
2792
sync_min_store(mddev_t *mddev, const char *buf, size_t len)
2793
{
2794
        int min;
2795
        char *e;
2796
        if (strncmp(buf, "system", 6)==0) {
2797
                mddev->sync_speed_min = 0;
2798
                return len;
2799
        }
2800
        min = simple_strtoul(buf, &e, 10);
2801
        if (buf == e || (*e && *e != '\n') || min <= 0)
2802
                return -EINVAL;
2803
        mddev->sync_speed_min = min;
2804
        return len;
2805
}
2806
 
2807
static struct md_sysfs_entry md_sync_min =
2808
__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
2809
 
2810
static ssize_t
2811
sync_max_show(mddev_t *mddev, char *page)
2812
{
2813
        return sprintf(page, "%d (%s)\n", speed_max(mddev),
2814
                       mddev->sync_speed_max ? "local": "system");
2815
}
2816
 
2817
static ssize_t
2818
sync_max_store(mddev_t *mddev, const char *buf, size_t len)
2819
{
2820
        int max;
2821
        char *e;
2822
        if (strncmp(buf, "system", 6)==0) {
2823
                mddev->sync_speed_max = 0;
2824
                return len;
2825
        }
2826
        max = simple_strtoul(buf, &e, 10);
2827
        if (buf == e || (*e && *e != '\n') || max <= 0)
2828
                return -EINVAL;
2829
        mddev->sync_speed_max = max;
2830
        return len;
2831
}
2832
 
2833
static struct md_sysfs_entry md_sync_max =
2834
__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
2835
 
2836
static ssize_t
2837
degraded_show(mddev_t *mddev, char *page)
2838
{
2839
        return sprintf(page, "%d\n", mddev->degraded);
2840
}
2841
static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
2842
 
2843
static ssize_t
2844
sync_speed_show(mddev_t *mddev, char *page)
2845
{
2846
        unsigned long resync, dt, db;
2847
        resync = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active));
2848
        dt = ((jiffies - mddev->resync_mark) / HZ);
2849
        if (!dt) dt++;
2850
        db = resync - (mddev->resync_mark_cnt);
2851
        return sprintf(page, "%ld\n", db/dt/2); /* K/sec */
2852
}
2853
 
2854
static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
2855
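
/*
 * Illustration only (not built): the rate computation above in
 * plain numbers.  Counts are 512-byte sectors, so dividing by 2
 * yields KiB; e.g. 51200 sectors resynced over 10 seconds:
 */
#if 0
#include <stdio.h>

int main(void)
{
        unsigned long db = 51200;       /* sectors since resync_mark */
        unsigned long dt = 10;          /* seconds, clamped to >= 1 */

        printf("%lu K/sec\n", db / dt / 2);     /* 2560 */
        return 0;
}
#endif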
 
2856
static ssize_t
2857
sync_completed_show(mddev_t *mddev, char *page)
2858
{
2859
        unsigned long max_blocks, resync;
2860
 
2861
        if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2862
                max_blocks = mddev->resync_max_sectors;
2863
        else
2864
                max_blocks = mddev->size << 1;
2865
 
2866
        resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
2867
        return sprintf(page, "%lu / %lu\n", resync, max_blocks);
2868
}
2869
 
2870
static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
2871
 
2872
static ssize_t
2873
suspend_lo_show(mddev_t *mddev, char *page)
2874
{
2875
        return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
2876
}
2877
 
2878
static ssize_t
2879
suspend_lo_store(mddev_t *mddev, const char *buf, size_t len)
2880
{
2881
        char *e;
2882
        unsigned long long new = simple_strtoull(buf, &e, 10);
2883
 
2884
        if (mddev->pers->quiesce == NULL)
2885
                return -EINVAL;
2886
        if (buf == e || (*e && *e != '\n'))
2887
                return -EINVAL;
2888
        if (new >= mddev->suspend_hi ||
2889
            (new > mddev->suspend_lo && new < mddev->suspend_hi)) {
2890
                mddev->suspend_lo = new;
2891
                mddev->pers->quiesce(mddev, 2);
2892
                return len;
2893
        } else
2894
                return -EINVAL;
2895
}
2896
static struct md_sysfs_entry md_suspend_lo =
2897
__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
2898
 
2899
 
2900
static ssize_t
2901
suspend_hi_show(mddev_t *mddev, char *page)
2902
{
2903
        return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
2904
}
2905
 
2906
static ssize_t
2907
suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
2908
{
2909
        char *e;
2910
        unsigned long long new = simple_strtoull(buf, &e, 10);
2911
 
2912
        if (mddev->pers->quiesce == NULL)
2913
                return -EINVAL;
2914
        if (buf == e || (*e && *e != '\n'))
2915
                return -EINVAL;
2916
        if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) ||
2917
            (new > mddev->suspend_lo && new > mddev->suspend_hi)) {
2918
                mddev->suspend_hi = new;
2919
                mddev->pers->quiesce(mddev, 1);
2920
                mddev->pers->quiesce(mddev, 0);
2921
                return len;
2922
        } else
2923
                return -EINVAL;
2924
}
2925
static struct md_sysfs_entry md_suspend_hi =
2926
__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
2927
 
2928
static ssize_t
2929
reshape_position_show(mddev_t *mddev, char *page)
2930
{
2931
        if (mddev->reshape_position != MaxSector)
2932
                return sprintf(page, "%llu\n",
2933
                               (unsigned long long)mddev->reshape_position);
2934
        strcpy(page, "none\n");
2935
        return 5;
2936
}
2937
 
2938
static ssize_t
2939
reshape_position_store(mddev_t *mddev, const char *buf, size_t len)
2940
{
2941
        char *e;
2942
        unsigned long long new = simple_strtoull(buf, &e, 10);
2943
        if (mddev->pers)
2944
                return -EBUSY;
2945
        if (buf == e || (*e && *e != '\n'))
2946
                return -EINVAL;
2947
        mddev->reshape_position = new;
2948
        mddev->delta_disks = 0;
2949
        mddev->new_level = mddev->level;
2950
        mddev->new_layout = mddev->layout;
2951
        mddev->new_chunk = mddev->chunk_size;
2952
        return len;
2953
}
2954
 
2955
static struct md_sysfs_entry md_reshape_position =
2956
__ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
2957
       reshape_position_store);
2958
 
2959
 
2960
static struct attribute *md_default_attrs[] = {
2961
        &md_level.attr,
2962
        &md_layout.attr,
2963
        &md_raid_disks.attr,
2964
        &md_chunk_size.attr,
2965
        &md_size.attr,
2966
        &md_resync_start.attr,
2967
        &md_metadata.attr,
2968
        &md_new_device.attr,
2969
        &md_safe_delay.attr,
2970
        &md_array_state.attr,
2971
        &md_reshape_position.attr,
2972
        NULL,
2973
};
2974
 
2975
static struct attribute *md_redundancy_attrs[] = {
2976
        &md_scan_mode.attr,
2977
        &md_mismatches.attr,
2978
        &md_sync_min.attr,
2979
        &md_sync_max.attr,
2980
        &md_sync_speed.attr,
2981
        &md_sync_completed.attr,
2982
        &md_suspend_lo.attr,
2983
        &md_suspend_hi.attr,
2984
        &md_bitmap.attr,
2985
        &md_degraded.attr,
2986
        NULL,
2987
};
2988
static struct attribute_group md_redundancy_group = {
2989
        .name = NULL,
2990
        .attrs = md_redundancy_attrs,
2991
};

static ssize_t
md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
        struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
        mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
        ssize_t rv;

        if (!entry->show)
                return -EIO;
        rv = mddev_lock(mddev);
        if (!rv) {
                rv = entry->show(mddev, page);
                mddev_unlock(mddev);
        }
        return rv;
}

static ssize_t
md_attr_store(struct kobject *kobj, struct attribute *attr,
              const char *page, size_t length)
{
        struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
        mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
        ssize_t rv;

        if (!entry->store)
                return -EIO;
        if (!capable(CAP_SYS_ADMIN))
                return -EACCES;
        rv = mddev_lock(mddev);
        if (!rv) {
                rv = entry->store(mddev, page, length);
                mddev_unlock(mddev);
        }
        return rv;
}

static void md_free(struct kobject *ko)
{
        mddev_t *mddev = container_of(ko, mddev_t, kobj);
        kfree(mddev);
}

static struct sysfs_ops md_sysfs_ops = {
        .show   = md_attr_show,
        .store  = md_attr_store,
};
static struct kobj_type md_ktype = {
        .release        = md_free,
        .sysfs_ops      = &md_sysfs_ops,
        .default_attrs  = md_default_attrs,
};

int mdp_major = 0;
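
/*
 * First-open probe: allocate the gendisk and register the per-array
 * "md" kobject.  Arrays on MD_MAJOR are unpartitioned ("md%d");
 * arrays on the alternate mdp major reserve 1 << MdpMinorShift minors
 * each ("md_d%d") so they can carry partitions.
 */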

static struct kobject *md_probe(dev_t dev, int *part, void *data)
{
        static DEFINE_MUTEX(disks_mutex);
        mddev_t *mddev = mddev_find(dev);
        struct gendisk *disk;
        int partitioned = (MAJOR(dev) != MD_MAJOR);
        int shift = partitioned ? MdpMinorShift : 0;
        int unit = MINOR(dev) >> shift;

        if (!mddev)
                return NULL;

        mutex_lock(&disks_mutex);
        if (mddev->gendisk) {
                mutex_unlock(&disks_mutex);
                mddev_put(mddev);
                return NULL;
        }
        disk = alloc_disk(1 << shift);
        if (!disk) {
                mutex_unlock(&disks_mutex);
                mddev_put(mddev);
                return NULL;
        }
        disk->major = MAJOR(dev);
        disk->first_minor = unit << shift;
        if (partitioned)
                sprintf(disk->disk_name, "md_d%d", unit);
        else
                sprintf(disk->disk_name, "md%d", unit);
        disk->fops = &md_fops;
        disk->private_data = mddev;
        disk->queue = mddev->queue;
        add_disk(disk);
        mddev->gendisk = disk;
        mutex_unlock(&disks_mutex);
        mddev->kobj.parent = &disk->kobj;
        kobject_set_name(&mddev->kobj, "%s", "md");
        mddev->kobj.ktype = &md_ktype;
        if (kobject_register(&mddev->kobj))
                printk(KERN_WARNING "md: cannot register %s/md - name in use\n",
                       disk->disk_name);
        return NULL;
}

static void md_safemode_timeout(unsigned long data)
{
        mddev_t *mddev = (mddev_t *) data;

        mddev->safemode = 1;
        md_wakeup_thread(mddev->thread);
}

static int start_dirty_degraded;
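
/*
 * Start an assembled array: analyse superblocks if needed, validate
 * the chunk size against every member, load and bind the personality,
 * hand off to pers->run(), create the bitmap and the redundancy sysfs
 * group, then point the queue at the personality's make_request and
 * kick any interrupted recovery.
 */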

static int do_md_run(mddev_t * mddev)
{
        int err;
        int chunk_size;
        struct list_head *tmp;
        mdk_rdev_t *rdev;
        struct gendisk *disk;
        struct mdk_personality *pers;
        char b[BDEVNAME_SIZE];

        if (list_empty(&mddev->disks))
                /* cannot run an array with no devices.. */
                return -EINVAL;

        if (mddev->pers)
                return -EBUSY;

        /*
         * Analyze all RAID superblock(s)
         */
        if (!mddev->raid_disks)
                analyze_sbs(mddev);

        chunk_size = mddev->chunk_size;

        if (chunk_size) {
                if (chunk_size > MAX_CHUNK_SIZE) {
                        printk(KERN_ERR "too big chunk_size: %d > %d\n",
                                chunk_size, MAX_CHUNK_SIZE);
                        return -EINVAL;
                }
                /*
                 * chunk-size has to be a power of 2 and a multiple of PAGE_SIZE
                 */
                if ( (1 << ffz(~chunk_size)) != chunk_size) {
                        printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size);
                        return -EINVAL;
                }
                if (chunk_size < PAGE_SIZE) {
                        printk(KERN_ERR "too small chunk_size: %d < %ld\n",
                                chunk_size, PAGE_SIZE);
                        return -EINVAL;
                }

                /* devices must have minimum size of one chunk */
                ITERATE_RDEV(mddev,rdev,tmp) {
                        if (test_bit(Faulty, &rdev->flags))
                                continue;
                        if (rdev->size < chunk_size / 1024) {
                                printk(KERN_WARNING
                                        "md: Dev %s smaller than chunk_size:"
                                        " %lluk < %dk\n",
                                        bdevname(rdev->bdev,b),
                                        (unsigned long long)rdev->size,
                                        chunk_size / 1024);
                                return -EINVAL;
                        }
                }
        }

#ifdef CONFIG_KMOD
        if (mddev->level != LEVEL_NONE)
                request_module("md-level-%d", mddev->level);
        else if (mddev->clevel[0])
                request_module("md-%s", mddev->clevel);
#endif

        /*
         * Drop all container device buffers, from now on
         * the only valid external interface is through the md
         * device.
         */
        ITERATE_RDEV(mddev,rdev,tmp) {
                if (test_bit(Faulty, &rdev->flags))
                        continue;
                sync_blockdev(rdev->bdev);
                invalidate_bdev(rdev->bdev);

                /* perform some consistency tests on the device.
                 * We don't want the data to overlap the metadata;
                 * internal bitmap issues are handled elsewhere.
                 */
                if (rdev->data_offset < rdev->sb_offset) {
                        if (mddev->size &&
                            rdev->data_offset + mddev->size*2
                            > rdev->sb_offset*2) {
                                printk("md: %s: data overlaps metadata\n",
                                       mdname(mddev));
                                return -EINVAL;
                        }
                } else {
                        if (rdev->sb_offset*2 + rdev->sb_size/512
                            > rdev->data_offset) {
                                printk("md: %s: metadata overlaps data\n",
                                       mdname(mddev));
                                return -EINVAL;
                        }
                }
        }

        md_probe(mddev->unit, NULL, NULL);
        disk = mddev->gendisk;
        if (!disk)
                return -ENOMEM;

        spin_lock(&pers_lock);
        pers = find_pers(mddev->level, mddev->clevel);
        if (!pers || !try_module_get(pers->owner)) {
                spin_unlock(&pers_lock);
                if (mddev->level != LEVEL_NONE)
                        printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
                               mddev->level);
                else
                        printk(KERN_WARNING "md: personality for level %s is not loaded!\n",
                               mddev->clevel);
                return -EINVAL;
        }
        mddev->pers = pers;
        spin_unlock(&pers_lock);
        mddev->level = pers->level;
        strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));

        if (mddev->reshape_position != MaxSector &&
            pers->start_reshape == NULL) {
                /* This personality cannot handle reshaping... */
                mddev->pers = NULL;
                module_put(pers->owner);
                return -EINVAL;
        }

        if (pers->sync_request) {
                /* Warn if this is a potentially silly
                 * configuration.
                 */
                char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
                mdk_rdev_t *rdev2;
                struct list_head *tmp2;
                int warned = 0;
                ITERATE_RDEV(mddev, rdev, tmp) {
                        ITERATE_RDEV(mddev, rdev2, tmp2) {
                                if (rdev < rdev2 &&
                                    rdev->bdev->bd_contains ==
                                    rdev2->bdev->bd_contains) {
                                        printk(KERN_WARNING
                                               "%s: WARNING: %s appears to be"
                                               " on the same physical disk as"
                                               " %s.\n",
                                               mdname(mddev),
                                               bdevname(rdev->bdev,b),
                                               bdevname(rdev2->bdev,b2));
                                        warned = 1;
                                }
                        }
                }
                if (warned)
                        printk(KERN_WARNING
                               "True protection against single-disk"
                               " failure might be compromised.\n");
        }

        mddev->recovery = 0;
        mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
        mddev->barriers_work = 1;
        mddev->ok_start_degraded = start_dirty_degraded;

        if (start_readonly)
                mddev->ro = 2; /* read-only, but switch on first write */

        err = mddev->pers->run(mddev);
        if (!err && mddev->pers->sync_request) {
                err = bitmap_create(mddev);
                if (err) {
                        printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
                               mdname(mddev), err);
                        mddev->pers->stop(mddev);
                }
        }
        if (err) {
                printk(KERN_ERR "md: pers->run() failed ...\n");
                module_put(mddev->pers->owner);
                mddev->pers = NULL;
                bitmap_destroy(mddev);
                return err;
        }
        if (mddev->pers->sync_request) {
                if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
                        printk(KERN_WARNING
                               "md: cannot register extra attributes for %s\n",
                               mdname(mddev));
        } else if (mddev->ro == 2) /* auto-readonly not meaningful */
                mddev->ro = 0;

        atomic_set(&mddev->writes_pending,0);
        mddev->safemode = 0;
        mddev->safemode_timer.function = md_safemode_timeout;
        mddev->safemode_timer.data = (unsigned long) mddev;
        mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
        mddev->in_sync = 1;

        ITERATE_RDEV(mddev,rdev,tmp)
                if (rdev->raid_disk >= 0) {
                        char nm[20];
                        sprintf(nm, "rd%d", rdev->raid_disk);
                        if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm))
                                printk("md: cannot register %s for %s\n",
                                       nm, mdname(mddev));
                }

        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);

        if (mddev->flags)
                md_update_sb(mddev, 0);

        set_capacity(disk, mddev->array_size<<1);

        /* If we call blk_queue_make_request here, it will
         * re-initialise max_sectors etc which may have been
         * refined inside -> run.  So just set the bits we need to set.
         * Most initialisation happened when we called
         * blk_queue_make_request(..., md_fail_request)
         * earlier.
         */
        mddev->queue->queuedata = mddev;
        mddev->queue->make_request_fn = mddev->pers->make_request;

        /* If there is a partially-recovered drive we need to
         * start recovery here.  If we leave it to md_check_recovery,
         * it will remove the drives and not do the right thing
         */
        if (mddev->degraded && !mddev->sync_thread) {
                struct list_head *rtmp;
                int spares = 0;
                ITERATE_RDEV(mddev,rdev,rtmp)
                        if (rdev->raid_disk >= 0 &&
                            !test_bit(In_sync, &rdev->flags) &&
                            !test_bit(Faulty, &rdev->flags))
                                /* complete an interrupted recovery */
                                spares++;
                if (spares && mddev->pers->sync_request) {
                        mddev->recovery = 0;
                        set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
                        mddev->sync_thread = md_register_thread(md_do_sync,
                                                                mddev,
                                                                "%s_resync");
                        if (!mddev->sync_thread) {
                                printk(KERN_ERR "%s: could not start resync"
                                       " thread...\n",
                                       mdname(mddev));
                                /* leave the spares where they are, it shouldn't hurt */
                                mddev->recovery = 0;
                        }
                }
        }
        md_wakeup_thread(mddev->thread);
        md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */

        mddev->changed = 1;
        md_new_event(mddev);
        kobject_uevent(&mddev->gendisk->kobj, KOBJ_CHANGE);
        return 0;
}
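
/*
 * Flip a read-only, already-running array back to read-write and poke
 * the recovery threads.  This never (re)assembles anything; with no
 * personality loaded it fails with -EINVAL.
 */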

static int restart_array(mddev_t *mddev)
{
        struct gendisk *disk = mddev->gendisk;
        int err;

        /*
         * Complain if it has no devices
         */
        err = -ENXIO;
        if (list_empty(&mddev->disks))
                goto out;

        if (mddev->pers) {
                err = -EBUSY;
                if (!mddev->ro)
                        goto out;

                mddev->safemode = 0;
                mddev->ro = 0;
                set_disk_ro(disk, 0);

                printk(KERN_INFO "md: %s switched to read-write mode.\n",
                        mdname(mddev));
                /*
                 * Kick recovery or resync if necessary
                 */
                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
                md_wakeup_thread(mddev->thread);
                md_wakeup_thread(mddev->sync_thread);
                err = 0;
        } else
                err = -EINVAL;

out:
        return err;
}

/* similar to deny_write_access, but accounts for our holding a reference
 * to the file ourselves */
static int deny_bitmap_write_access(struct file * file)
{
        struct inode *inode = file->f_mapping->host;

        spin_lock(&inode->i_lock);
        if (atomic_read(&inode->i_writecount) > 1) {
                spin_unlock(&inode->i_lock);
                return -ETXTBSY;
        }
        atomic_set(&inode->i_writecount, -1);
        spin_unlock(&inode->i_lock);

        return 0;
}
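
/*
 * Undo deny_bitmap_write_access(): the negative i_writecount set above
 * excludes all other writers; restoring it to 1 accounts for our own
 * reference again.
 */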

static void restore_bitmap_write_access(struct file *file)
{
        struct inode *inode = file->f_mapping->host;

        spin_lock(&inode->i_lock);
        atomic_set(&inode->i_writecount, 1);
        spin_unlock(&inode->i_lock);
}

/* mode:
 *   0 - completely stop and dis-assemble array
 *   1 - switch to readonly
 *   2 - stop but do not disassemble array
 */
static int do_md_stop(mddev_t * mddev, int mode)
{
        int err = 0;
        struct gendisk *disk = mddev->gendisk;

        if (mddev->pers) {
                if (atomic_read(&mddev->active)>2) {
                        printk("md: %s still in use.\n",mdname(mddev));
                        return -EBUSY;
                }

                if (mddev->sync_thread) {
                        set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
                        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
                        md_unregister_thread(mddev->sync_thread);
                        mddev->sync_thread = NULL;
                }

                del_timer_sync(&mddev->safemode_timer);

                invalidate_partition(disk, 0);

                switch(mode) {
                case 1: /* readonly */
                        err  = -ENXIO;
                        if (mddev->ro==1)
                                goto out;
                        mddev->ro = 1;
                        break;
                case 0: /* disassemble */
                case 2: /* stop */
                        bitmap_flush(mddev);
                        md_super_wait(mddev);
                        if (mddev->ro)
                                set_disk_ro(disk, 0);
                        blk_queue_make_request(mddev->queue, md_fail_request);
                        mddev->pers->stop(mddev);
                        mddev->queue->merge_bvec_fn = NULL;
                        mddev->queue->unplug_fn = NULL;
                        mddev->queue->backing_dev_info.congested_fn = NULL;
                        if (mddev->pers->sync_request)
                                sysfs_remove_group(&mddev->kobj, &md_redundancy_group);

                        module_put(mddev->pers->owner);
                        mddev->pers = NULL;

                        set_capacity(disk, 0);
                        mddev->changed = 1;

                        if (mddev->ro)
                                mddev->ro = 0;
                }
                if (!mddev->in_sync || mddev->flags) {
                        /* mark array as shutdown cleanly */
                        mddev->in_sync = 1;
                        md_update_sb(mddev, 1);
                }
                if (mode == 1)
                        set_disk_ro(disk, 1);
                clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
        }

        /*
         * Free resources if final stop
         */
        if (mode == 0) {
                mdk_rdev_t *rdev;
                struct list_head *tmp;

                printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));

                bitmap_destroy(mddev);
                if (mddev->bitmap_file) {
                        restore_bitmap_write_access(mddev->bitmap_file);
                        fput(mddev->bitmap_file);
                        mddev->bitmap_file = NULL;
                }
                mddev->bitmap_offset = 0;

                ITERATE_RDEV(mddev,rdev,tmp)
                        if (rdev->raid_disk >= 0) {
                                char nm[20];
                                sprintf(nm, "rd%d", rdev->raid_disk);
                                sysfs_remove_link(&mddev->kobj, nm);
                        }

                /* make sure all delayed_delete calls have finished */
                flush_scheduled_work();

                export_array(mddev);

                mddev->array_size = 0;
                mddev->size = 0;
                mddev->raid_disks = 0;
                mddev->recovery_cp = 0;
                mddev->reshape_position = MaxSector;

        } else if (mddev->pers)
                printk(KERN_INFO "md: %s switched to read-only mode.\n",
                        mdname(mddev));
        err = 0;
        md_new_event(mddev);
out:
        return err;
}

#ifndef MODULE
static void autorun_array(mddev_t *mddev)
{
        mdk_rdev_t *rdev;
        struct list_head *tmp;
        int err;

        if (list_empty(&mddev->disks))
                return;

        printk(KERN_INFO "md: running: ");

        ITERATE_RDEV(mddev,rdev,tmp) {
                char b[BDEVNAME_SIZE];
                printk("<%s>", bdevname(rdev->bdev,b));
        }
        printk("\n");

        err = do_md_run (mddev);
        if (err) {
                printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
                do_md_stop (mddev, 0);
        }
}

/*
 * let's try to run arrays based on all disks that have arrived
 * until now. (those are in pending_raid_disks)
 *
 * the method: pick the first pending disk, collect all disks with
 * the same UUID, remove all from the pending list and put them into
 * the 'same_array' list. Then order this list based on superblock
 * update time (freshest comes first), kick out 'old' disks and
 * compare superblocks. If everything's fine then run it.
 *
 * If "unit" is allocated, then bump its reference count
 */
static void autorun_devices(int part)
{
        struct list_head *tmp;
        mdk_rdev_t *rdev0, *rdev;
        mddev_t *mddev;
        char b[BDEVNAME_SIZE];

        printk(KERN_INFO "md: autorun ...\n");
        while (!list_empty(&pending_raid_disks)) {
                int unit;
                dev_t dev;
                LIST_HEAD(candidates);
                rdev0 = list_entry(pending_raid_disks.next,
                                         mdk_rdev_t, same_set);

                printk(KERN_INFO "md: considering %s ...\n",
                        bdevname(rdev0->bdev,b));
                INIT_LIST_HEAD(&candidates);
                ITERATE_RDEV_PENDING(rdev,tmp)
                        if (super_90_load(rdev, rdev0, 0) >= 0) {
                                printk(KERN_INFO "md:  adding %s ...\n",
                                        bdevname(rdev->bdev,b));
                                list_move(&rdev->same_set, &candidates);
                        }
                /*
                 * now we have a set of devices, with all of them having
                 * mostly sane superblocks. It's time to allocate the
                 * mddev.
                 */
                if (part) {
                        dev = MKDEV(mdp_major,
                                    rdev0->preferred_minor << MdpMinorShift);
                        unit = MINOR(dev) >> MdpMinorShift;
                } else {
                        dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
                        unit = MINOR(dev);
                }
                if (rdev0->preferred_minor != unit) {
                        printk(KERN_INFO "md: unit number in %s is bad: %d\n",
                               bdevname(rdev0->bdev, b), rdev0->preferred_minor);
                        break;
                }

                md_probe(dev, NULL, NULL);
                mddev = mddev_find(dev);
                if (!mddev) {
                        printk(KERN_ERR
                                "md: cannot allocate memory for md drive.\n");
                        break;
                }
                if (mddev_lock(mddev))
                        printk(KERN_WARNING "md: %s locked, cannot run\n",
                               mdname(mddev));
                else if (mddev->raid_disks || mddev->major_version
                         || !list_empty(&mddev->disks)) {
                        printk(KERN_WARNING
                                "md: %s already running, cannot run %s\n",
                                mdname(mddev), bdevname(rdev0->bdev,b));
                        mddev_unlock(mddev);
                } else {
                        printk(KERN_INFO "md: created %s\n", mdname(mddev));
                        ITERATE_RDEV_GENERIC(candidates,rdev,tmp) {
                                list_del_init(&rdev->same_set);
                                if (bind_rdev_to_array(rdev, mddev))
                                        export_rdev(rdev);
                        }
                        autorun_array(mddev);
                        mddev_unlock(mddev);
                }
                /* on success, candidates will be empty, on error
                 * it won't...
                 */
                ITERATE_RDEV_GENERIC(candidates,rdev,tmp)
                        export_rdev(rdev);
                mddev_put(mddev);
        }
        printk(KERN_INFO "md: ... autorun DONE.\n");
}
#endif /* !MODULE */

static int get_version(void __user * arg)
{
        mdu_version_t ver;

        ver.major = MD_MAJOR_VERSION;
        ver.minor = MD_MINOR_VERSION;
        ver.patchlevel = MD_PATCHLEVEL_VERSION;

        if (copy_to_user(arg, &ver, sizeof(ver)))
                return -EFAULT;

        return 0;
}
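
/*
 * GET_ARRAY_INFO: walk the member devices, classify each as
 * failed/active/spare, and copy a snapshot of the array state to user
 * space.  info.size is in KB per device; it is set to -1 when the real
 * size does not fit the 32-bit field (see the overflow check below).
 */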

static int get_array_info(mddev_t * mddev, void __user * arg)
{
        mdu_array_info_t info;
        int nr,working,active,failed,spare;
        mdk_rdev_t *rdev;
        struct list_head *tmp;

        nr=working=active=failed=spare=0;
        ITERATE_RDEV(mddev,rdev,tmp) {
                nr++;
                if (test_bit(Faulty, &rdev->flags))
                        failed++;
                else {
                        working++;
                        if (test_bit(In_sync, &rdev->flags))
                                active++;
                        else
                                spare++;
                }
        }

        info.major_version = mddev->major_version;
        info.minor_version = mddev->minor_version;
        info.patch_version = MD_PATCHLEVEL_VERSION;
        info.ctime         = mddev->ctime;
        info.level         = mddev->level;
        info.size          = mddev->size;
        if (info.size != mddev->size) /* overflow */
                info.size = -1;
        info.nr_disks      = nr;
        info.raid_disks    = mddev->raid_disks;
        info.md_minor      = mddev->md_minor;
        info.not_persistent= !mddev->persistent;

        info.utime         = mddev->utime;
        info.state         = 0;
        if (mddev->in_sync)
                info.state = (1<<MD_SB_CLEAN);
        if (mddev->bitmap && mddev->bitmap_offset)
                info.state = (1<<MD_SB_BITMAP_PRESENT);
        info.active_disks  = active;
        info.working_disks = working;
        info.failed_disks  = failed;
        info.spare_disks   = spare;

        info.layout        = mddev->layout;
        info.chunk_size    = mddev->chunk_size;

        if (copy_to_user(arg, &info, sizeof(info)))
                return -EFAULT;

        return 0;
}

static int get_bitmap_file(mddev_t * mddev, void __user * arg)
{
        mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
        char *ptr, *buf = NULL;
        int err = -ENOMEM;

        md_allow_write(mddev);

        file = kmalloc(sizeof(*file), GFP_KERNEL);
        if (!file)
                goto out;

        /* bitmap disabled, zero the first byte and copy out */
        if (!mddev->bitmap || !mddev->bitmap->file) {
                file->pathname[0] = '\0';
                goto copy_out;
        }

        buf = kmalloc(sizeof(file->pathname), GFP_KERNEL);
        if (!buf)
                goto out;

        ptr = file_path(mddev->bitmap->file, buf, sizeof(file->pathname));
        if (!ptr)
                goto out;

        strcpy(file->pathname, ptr);

copy_out:
        err = 0;
        if (copy_to_user(arg, file, sizeof(*file)))
                err = -EFAULT;
out:
        kfree(buf);
        kfree(file);
        return err;
}

static int get_disk_info(mddev_t * mddev, void __user * arg)
{
        mdu_disk_info_t info;
        unsigned int nr;
        mdk_rdev_t *rdev;

        if (copy_from_user(&info, arg, sizeof(info)))
                return -EFAULT;

        nr = info.number;

        rdev = find_rdev_nr(mddev, nr);
        if (rdev) {
                info.major = MAJOR(rdev->bdev->bd_dev);
                info.minor = MINOR(rdev->bdev->bd_dev);
                info.raid_disk = rdev->raid_disk;
                info.state = 0;
                if (test_bit(Faulty, &rdev->flags))
                        info.state |= (1<<MD_DISK_FAULTY);
                else if (test_bit(In_sync, &rdev->flags)) {
                        info.state |= (1<<MD_DISK_ACTIVE);
                        info.state |= (1<<MD_DISK_SYNC);
                }
                if (test_bit(WriteMostly, &rdev->flags))
                        info.state |= (1<<MD_DISK_WRITEMOSTLY);
        } else {
                info.major = info.minor = 0;
                info.raid_disk = -1;
                info.state = (1<<MD_DISK_REMOVED);
        }

        if (copy_to_user(arg, &info, sizeof(info)))
                return -EFAULT;

        return 0;
}
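
/*
 * ADD_NEW_DISK serves three cases: assembling a not-yet-started array
 * from devices carrying superblocks, hot-adding a spare to a running
 * array, and building a new version-0.90 array device by device.
 */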

static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
{
        char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
        mdk_rdev_t *rdev;
        dev_t dev = MKDEV(info->major,info->minor);

        if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
                return -EOVERFLOW;

        if (!mddev->raid_disks) {
                int err;
                /* expecting a device which has a superblock */
                rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
                if (IS_ERR(rdev)) {
                        printk(KERN_WARNING
                                "md: md_import_device returned %ld\n",
                                PTR_ERR(rdev));
                        return PTR_ERR(rdev);
                }
                if (!list_empty(&mddev->disks)) {
                        mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
                                                        mdk_rdev_t, same_set);
                        int err = super_types[mddev->major_version]
                                .load_super(rdev, rdev0, mddev->minor_version);
                        if (err < 0) {
                                printk(KERN_WARNING
                                        "md: %s has different UUID to %s\n",
                                        bdevname(rdev->bdev,b),
                                        bdevname(rdev0->bdev,b2));
                                export_rdev(rdev);
                                return -EINVAL;
                        }
                }
                err = bind_rdev_to_array(rdev, mddev);
                if (err)
                        export_rdev(rdev);
                return err;
        }

        /*
         * add_new_disk can be used once the array is assembled
         * to add "hot spares".  They must already have a superblock
         * written
         */
        if (mddev->pers) {
                int err;
                if (!mddev->pers->hot_add_disk) {
                        printk(KERN_WARNING
                                "%s: personality does not support diskops!\n",
                               mdname(mddev));
                        return -EINVAL;
                }
                if (mddev->persistent)
                        rdev = md_import_device(dev, mddev->major_version,
                                                mddev->minor_version);
                else
                        rdev = md_import_device(dev, -1, -1);
                if (IS_ERR(rdev)) {
                        printk(KERN_WARNING
                                "md: md_import_device returned %ld\n",
                                PTR_ERR(rdev));
                        return PTR_ERR(rdev);
                }
                /* set save_raid_disk if appropriate */
                if (!mddev->persistent) {
                        if (info->state & (1<<MD_DISK_SYNC)  &&
                            info->raid_disk < mddev->raid_disks)
                                rdev->raid_disk = info->raid_disk;
                        else
                                rdev->raid_disk = -1;
                } else
                        super_types[mddev->major_version].
                                validate_super(mddev, rdev);
                rdev->saved_raid_disk = rdev->raid_disk;

                clear_bit(In_sync, &rdev->flags); /* just to be sure */
                if (info->state & (1<<MD_DISK_WRITEMOSTLY))
                        set_bit(WriteMostly, &rdev->flags);

                rdev->raid_disk = -1;
                err = bind_rdev_to_array(rdev, mddev);
                if (!err && !mddev->pers->hot_remove_disk) {
                        /* If there is hot_add_disk but no hot_remove_disk
                         * then added disks are for geometry changes,
                         * and should be added immediately.
                         */
                        super_types[mddev->major_version].
                                validate_super(mddev, rdev);
                        err = mddev->pers->hot_add_disk(mddev, rdev);
                        if (err)
                                unbind_rdev_from_array(rdev);
                }
                if (err)
                        export_rdev(rdev);

                md_update_sb(mddev, 1);
                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
                md_wakeup_thread(mddev->thread);
                return err;
        }

        /* otherwise, add_new_disk is only allowed
         * for major_version==0 superblocks
         */
        if (mddev->major_version != 0) {
                printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n",
                       mdname(mddev));
                return -EINVAL;
        }

        if (!(info->state & (1<<MD_DISK_FAULTY))) {
                int err;
                rdev = md_import_device (dev, -1, 0);
                if (IS_ERR(rdev)) {
                        printk(KERN_WARNING
                                "md: error, md_import_device() returned %ld\n",
                                PTR_ERR(rdev));
                        return PTR_ERR(rdev);
                }
                rdev->desc_nr = info->number;
                if (info->raid_disk < mddev->raid_disks)
                        rdev->raid_disk = info->raid_disk;
                else
                        rdev->raid_disk = -1;

                rdev->flags = 0;

                if (rdev->raid_disk < mddev->raid_disks)
                        if (info->state & (1<<MD_DISK_SYNC))
                                set_bit(In_sync, &rdev->flags);

                if (info->state & (1<<MD_DISK_WRITEMOSTLY))
                        set_bit(WriteMostly, &rdev->flags);

                if (!mddev->persistent) {
                        printk(KERN_INFO "md: nonpersistent superblock ...\n");
                        rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
                } else
                        rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
                rdev->size = calc_dev_size(rdev, mddev->chunk_size);

                err = bind_rdev_to_array(rdev, mddev);
                if (err) {
                        export_rdev(rdev);
                        return err;
                }
        }

        return 0;
}

static int hot_remove_disk(mddev_t * mddev, dev_t dev)
{
        char b[BDEVNAME_SIZE];
        mdk_rdev_t *rdev;

        if (!mddev->pers)
                return -ENODEV;

        rdev = find_rdev(mddev, dev);
        if (!rdev)
                return -ENXIO;

        if (rdev->raid_disk >= 0)
                goto busy;

        kick_rdev_from_array(rdev);
        md_update_sb(mddev, 1);
        md_new_event(mddev);

        return 0;
busy:
        printk(KERN_WARNING "md: cannot remove active disk %s from %s ... \n",
                bdevname(rdev->bdev,b), mdname(mddev));
        return -EBUSY;
}
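
/*
 * HOT_ADD_DISK: import a bare device as a spare on a running
 * version-0 array.  The superblock is written by md_update_sb() and
 * recovery is kicked so the spare can be pressed into service at once.
 */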

static int hot_add_disk(mddev_t * mddev, dev_t dev)
{
        char b[BDEVNAME_SIZE];
        int err;
        unsigned int size;
        mdk_rdev_t *rdev;

        if (!mddev->pers)
                return -ENODEV;

        if (mddev->major_version != 0) {
                printk(KERN_WARNING "%s: HOT_ADD may only be used with"
                        " version-0 superblocks.\n",
                        mdname(mddev));
                return -EINVAL;
        }
        if (!mddev->pers->hot_add_disk) {
                printk(KERN_WARNING
                        "%s: personality does not support diskops!\n",
                        mdname(mddev));
                return -EINVAL;
        }

        rdev = md_import_device (dev, -1, 0);
        if (IS_ERR(rdev)) {
                printk(KERN_WARNING
                        "md: error, md_import_device() returned %ld\n",
                        PTR_ERR(rdev));
                return -EINVAL;
        }

        if (mddev->persistent)
                rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
        else
                rdev->sb_offset =
                        rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;

        size = calc_dev_size(rdev, mddev->chunk_size);
        rdev->size = size;

        if (test_bit(Faulty, &rdev->flags)) {
                printk(KERN_WARNING
                        "md: can not hot-add faulty %s disk to %s!\n",
                        bdevname(rdev->bdev,b), mdname(mddev));
                err = -EINVAL;
                goto abort_export;
        }
        clear_bit(In_sync, &rdev->flags);
        rdev->desc_nr = -1;
        rdev->saved_raid_disk = -1;
        err = bind_rdev_to_array(rdev, mddev);
        if (err)
                goto abort_export;

        /*
         * The rest should better be atomic, we can have disk failures
         * noticed in interrupt contexts ...
         */

        if (rdev->desc_nr == mddev->max_disks) {
                printk(KERN_WARNING "%s: can not hot-add to full array!\n",
                        mdname(mddev));
                err = -EBUSY;
                goto abort_unbind_export;
        }

        rdev->raid_disk = -1;

        md_update_sb(mddev, 1);

        /*
         * Kick recovery, maybe this spare has to be added to the
         * array immediately.
         */
        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
        md_wakeup_thread(mddev->thread);
        md_new_event(mddev);
        return 0;

abort_unbind_export:
        unbind_rdev_from_array(rdev);

abort_export:
        export_rdev(rdev);
        return err;
}
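
/*
 * SET_BITMAP_FILE: fd >= 0 attaches an external bitmap file (taking a
 * reference and denying other writers), fd < 0 detaches the current
 * one.  A running personality is quiesced around bitmap_create()/
 * bitmap_destroy() so the switch is atomic with respect to I/O.
 */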

static int set_bitmap_file(mddev_t *mddev, int fd)
{
        int err;

        if (mddev->pers) {
                if (!mddev->pers->quiesce)
                        return -EBUSY;
                if (mddev->recovery || mddev->sync_thread)
                        return -EBUSY;
                /* we should be able to change the bitmap.. */
        }


        if (fd >= 0) {
                if (mddev->bitmap)
                        return -EEXIST; /* cannot add when bitmap is present */
                mddev->bitmap_file = fget(fd);

                if (mddev->bitmap_file == NULL) {
                        printk(KERN_ERR "%s: error: failed to get bitmap file\n",
                               mdname(mddev));
                        return -EBADF;
                }

                err = deny_bitmap_write_access(mddev->bitmap_file);
                if (err) {
                        printk(KERN_ERR "%s: error: bitmap file is already in use\n",
                               mdname(mddev));
                        fput(mddev->bitmap_file);
                        mddev->bitmap_file = NULL;
                        return err;
                }
                mddev->bitmap_offset = 0; /* file overrides offset */
        } else if (mddev->bitmap == NULL)
                return -ENOENT; /* cannot remove what isn't there */
        err = 0;
        if (mddev->pers) {
                mddev->pers->quiesce(mddev, 1);
                if (fd >= 0)
                        err = bitmap_create(mddev);
                if (fd < 0 || err) {
                        bitmap_destroy(mddev);
                        fd = -1; /* make sure to put the file */
                }
                mddev->pers->quiesce(mddev, 0);
        }
        if (fd < 0) {
                if (mddev->bitmap_file) {
                        restore_bitmap_write_access(mddev->bitmap_file);
                        fput(mddev->bitmap_file);
                }
                mddev->bitmap_file = NULL;
        }

        return err;
}

/*
 * set_array_info is used two different ways
 * The original usage is when creating a new array.
 * In this usage, raid_disks is > 0 and it together with
 *  level, size, not_persistent, layout, chunksize determine the
 *  shape of the array.
 *  This will always create an array with a type-0.90.0 superblock.
 * The newer usage is when assembling an array.
 *  In this case raid_disks will be 0, and the major_version field is
 *  used to determine which style super-blocks are to be found on the devices.
 *  The minor and patch _version numbers are also kept in case the
 *  super_block handler wishes to interpret them.
 */
static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
{

        if (info->raid_disks == 0) {
                /* just setting version number for superblock loading */
                if (info->major_version < 0 ||
                    info->major_version >= ARRAY_SIZE(super_types) ||
                    super_types[info->major_version].name == NULL) {
                        /* maybe try to auto-load a module? */
                        printk(KERN_INFO
                                "md: superblock version %d not known\n",
                                info->major_version);
                        return -EINVAL;
                }
                mddev->major_version = info->major_version;
                mddev->minor_version = info->minor_version;
                mddev->patch_version = info->patch_version;
                mddev->persistent = !info->not_persistent;
                return 0;
        }
        mddev->major_version = MD_MAJOR_VERSION;
        mddev->minor_version = MD_MINOR_VERSION;
        mddev->patch_version = MD_PATCHLEVEL_VERSION;
        mddev->ctime         = get_seconds();

        mddev->level         = info->level;
        mddev->clevel[0]     = 0;
        mddev->size          = info->size;
        mddev->raid_disks    = info->raid_disks;
        /* don't set md_minor, it is determined by which /dev/md* was
         * opened
         */
        if (info->state & (1<<MD_SB_CLEAN))
                mddev->recovery_cp = MaxSector;
        else
                mddev->recovery_cp = 0;
        mddev->persistent    = ! info->not_persistent;

        mddev->layout        = info->layout;
        mddev->chunk_size    = info->chunk_size;

        mddev->max_disks     = MD_SB_DISKS;

        mddev->flags         = 0;
        set_bit(MD_CHANGE_DEVS, &mddev->flags);

        mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
        mddev->bitmap_offset = 0;

        mddev->reshape_position = MaxSector;

        /*
         * Generate a 128 bit UUID
         */
        get_random_bytes(mddev->uuid, 16);

        mddev->new_level = mddev->level;
        mddev->new_chunk = mddev->chunk_size;
        mddev->new_layout = mddev->layout;
        mddev->delta_disks = 0;

        return 0;
}

static int update_size(mddev_t *mddev, unsigned long size)
{
        mdk_rdev_t * rdev;
        int rv;
        struct list_head *tmp;
        int fit = (size == 0);

        if (mddev->pers->resize == NULL)
                return -EINVAL;
        /* The "size" is the amount of each device that is used.
         * This can only make sense for arrays with redundancy;
         * linear and raid0 always use whatever space is available.
         * We can only consider changing the size if no resync
         * or reconstruction is happening, and if the new size
         * is acceptable. It must fit before the sb_offset or,
         * if that is <data_offset, it must fit before the
         * size of each device.
         * If size is zero, we find the largest size that fits.
         */
        if (mddev->sync_thread)
                return -EBUSY;
        ITERATE_RDEV(mddev,rdev,tmp) {
                sector_t avail;
                avail = rdev->size * 2;

                if (fit && (size == 0 || size > avail/2))
                        size = avail/2;
                if (avail < ((sector_t)size << 1))
                        return -ENOSPC;
        }
        rv = mddev->pers->resize(mddev, (sector_t)size *2);
        if (!rv) {
                struct block_device *bdev;

                bdev = bdget_disk(mddev->gendisk, 0);
                if (bdev) {
                        mutex_lock(&bdev->bd_inode->i_mutex);
                        i_size_write(bdev->bd_inode, (loff_t)mddev->array_size << 10);
                        mutex_unlock(&bdev->bd_inode->i_mutex);
                        bdput(bdev);
                }
        }
        return rv;
}

static int update_raid_disks(mddev_t *mddev, int raid_disks)
{
        int rv;
        /* change the number of raid disks */
        if (mddev->pers->check_reshape == NULL)
                return -EINVAL;
        if (raid_disks <= 0 ||
            raid_disks >= mddev->max_disks)
                return -EINVAL;
        if (mddev->sync_thread || mddev->reshape_position != MaxSector)
                return -EBUSY;
        mddev->delta_disks = raid_disks - mddev->raid_disks;

        rv = mddev->pers->check_reshape(mddev);
        return rv;
}


/*
 * update_array_info is used to change the configuration of an
 * on-line array.
 * The version, ctime, level, size, raid_disks, not_persistent, layout
 * and chunk_size fields in the info are checked against the array.
 * Any differences that cannot be handled will cause an error.
 * Normally, only one change can be managed at a time.
 */
static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
{
        int rv = 0;
        int cnt = 0;
        int state = 0;

        /* calculate expected state, ignoring low bits */
        if (mddev->bitmap && mddev->bitmap_offset)
                state |= (1 << MD_SB_BITMAP_PRESENT);

        if (mddev->major_version != info->major_version ||
            mddev->minor_version != info->minor_version ||
/*          mddev->patch_version != info->patch_version || */
            mddev->ctime         != info->ctime         ||
            mddev->level         != info->level         ||
/*          mddev->layout        != info->layout        || */
            !mddev->persistent   != info->not_persistent||
            mddev->chunk_size    != info->chunk_size    ||
            /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
            ((state^info->state) & 0xfffffe00)
                )
                return -EINVAL;
        /* Check there is only one change */
        if (info->size >= 0 && mddev->size != info->size) cnt++;
        if (mddev->raid_disks != info->raid_disks) cnt++;
        if (mddev->layout != info->layout) cnt++;
        if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++;
        if (cnt == 0) return 0;
        if (cnt > 1) return -EINVAL;

        if (mddev->layout != info->layout) {
                /* Change layout
                 * we don't need to do anything at the md level, the
                 * personality will take care of it all.
                 */
                if (mddev->pers->reconfig == NULL)
                        return -EINVAL;
                else
                        return mddev->pers->reconfig(mddev, info->layout, -1);
        }
        if (info->size >= 0 && mddev->size != info->size)
                rv = update_size(mddev, info->size);

        if (mddev->raid_disks    != info->raid_disks)
                rv = update_raid_disks(mddev, info->raid_disks);

        if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
                if (mddev->pers->quiesce == NULL)
                        return -EINVAL;
                if (mddev->recovery || mddev->sync_thread)
                        return -EBUSY;
                if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
                        /* add the bitmap */
                        if (mddev->bitmap)
                                return -EEXIST;
                        if (mddev->default_bitmap_offset == 0)
                                return -EINVAL;
                        mddev->bitmap_offset = mddev->default_bitmap_offset;
                        mddev->pers->quiesce(mddev, 1);
                        rv = bitmap_create(mddev);
                        if (rv)
                                bitmap_destroy(mddev);
                        mddev->pers->quiesce(mddev, 0);
                } else {
                        /* remove the bitmap */
                        if (!mddev->bitmap)
                                return -ENOENT;
                        if (mddev->bitmap->file)
                                return -EINVAL;
                        mddev->pers->quiesce(mddev, 1);
                        bitmap_destroy(mddev);
                        mddev->pers->quiesce(mddev, 0);
                        mddev->bitmap_offset = 0;
                }
        }
        md_update_sb(mddev, 1);
        return rv;
}

static int set_disk_faulty(mddev_t *mddev, dev_t dev)
{
        mdk_rdev_t *rdev;

        if (mddev->pers == NULL)
                return -ENODEV;

        rdev = find_rdev(mddev, dev);
        if (!rdev)
                return -ENODEV;

        md_error(mddev, rdev);
        return 0;
}
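
/*
 * Fake drive geometry for HDIO_GETGEO: with 2 heads and 4 sectors per
 * track a cylinder is 8 sectors, hence cylinders = capacity / 8.
 */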
4362
 
4363
static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
4364
{
4365
        mddev_t *mddev = bdev->bd_disk->private_data;
4366
 
4367
        geo->heads = 2;
4368
        geo->sectors = 4;
4369
        geo->cylinders = get_capacity(mddev->gendisk) / 8;
4370
        return 0;
4371
}

static int md_ioctl(struct inode *inode, struct file *file,
                        unsigned int cmd, unsigned long arg)
{
        int err = 0;
        void __user *argp = (void __user *)arg;
        mddev_t *mddev = NULL;

        if (!capable(CAP_SYS_ADMIN))
                return -EACCES;

        /*
         * Commands dealing with the RAID driver but not any
         * particular array:
         */
        switch (cmd)
        {
                case RAID_VERSION:
                        err = get_version(argp);
                        goto done;

                case PRINT_RAID_DEBUG:
                        err = 0;
                        md_print_devices();
                        goto done;

#ifndef MODULE
                case RAID_AUTORUN:
                        err = 0;
                        autostart_arrays(arg);
                        goto done;
#endif
                default:;
        }

        /*
         * Commands creating/starting a new array:
         */

        mddev = inode->i_bdev->bd_disk->private_data;

        if (!mddev) {
                BUG();
                goto abort;
        }

        err = mddev_lock(mddev);
        if (err) {
                printk(KERN_INFO
                        "md: ioctl lock interrupted, reason %d, cmd %d\n",
                        err, cmd);
                goto abort;
        }

        switch (cmd)
        {
                case SET_ARRAY_INFO:
                        {
                                mdu_array_info_t info;
                                if (!arg)
                                        memset(&info, 0, sizeof(info));
                                else if (copy_from_user(&info, argp, sizeof(info))) {
                                        err = -EFAULT;
                                        goto abort_unlock;
                                }
                                if (mddev->pers) {
                                        err = update_array_info(mddev, &info);
                                        if (err) {
                                                printk(KERN_WARNING "md: couldn't update"
                                                       " array info. %d\n", err);
                                                goto abort_unlock;
                                        }
                                        goto done_unlock;
                                }
                                if (!list_empty(&mddev->disks)) {
                                        printk(KERN_WARNING
                                               "md: array %s already has disks!\n",
                                               mdname(mddev));
                                        err = -EBUSY;
                                        goto abort_unlock;
                                }
                                if (mddev->raid_disks) {
                                        printk(KERN_WARNING
                                               "md: array %s already initialised!\n",
                                               mdname(mddev));
                                        err = -EBUSY;
                                        goto abort_unlock;
                                }
                                err = set_array_info(mddev, &info);
                                if (err) {
                                        printk(KERN_WARNING "md: couldn't set"
                                               " array info. %d\n", err);
                                        goto abort_unlock;
                                }
                        }
                        goto done_unlock;

                default:;
        }

        /*
         * Commands querying/configuring an existing array:
         */
        /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
         * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */
        if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
                        && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
                        && cmd != GET_BITMAP_FILE) {
                err = -ENODEV;
                goto abort_unlock;
        }

        /*
         * Commands even a read-only array can execute:
         */
        switch (cmd)
        {
                case GET_ARRAY_INFO:
                        err = get_array_info(mddev, argp);
                        goto done_unlock;

                case GET_BITMAP_FILE:
                        err = get_bitmap_file(mddev, argp);
                        goto done_unlock;

                case GET_DISK_INFO:
                        err = get_disk_info(mddev, argp);
                        goto done_unlock;

                case RESTART_ARRAY_RW:
                        err = restart_array(mddev);
                        goto done_unlock;

                case STOP_ARRAY:
                        err = do_md_stop (mddev, 0);
                        goto done_unlock;

                case STOP_ARRAY_RO:
                        err = do_md_stop (mddev, 1);
                        goto done_unlock;

        /*
         * We have a problem here: there is no easy way to give a CHS
         * virtual geometry. We currently pretend that we have a 2-head,
         * 4-sector geometry (with a BIG number of cylinders...). This
         * drives dosfs just mad... ;-)
         */
        }

        /*
         * The remaining ioctls are changing the state of the
         * superblock, so we do not allow them on read-only arrays.
         * However non-MD ioctls (e.g. get-size) will still come through
         * here and hit the 'default' below, so only disallow
         * 'md' ioctls, and switch to rw mode if started auto-readonly.
         */
        if (_IOC_TYPE(cmd) == MD_MAJOR &&
            mddev->ro && mddev->pers) {
                if (mddev->ro == 2) {
                        mddev->ro = 0;
                        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
                        md_wakeup_thread(mddev->thread);
                } else {
                        err = -EROFS;
                        goto abort_unlock;
                }
        }

        switch (cmd)
        {
                case ADD_NEW_DISK:
                {
                        mdu_disk_info_t info;
                        if (copy_from_user(&info, argp, sizeof(info)))
                                err = -EFAULT;
                        else
                                err = add_new_disk(mddev, &info);
                        goto done_unlock;
                }

                case HOT_REMOVE_DISK:
                        err = hot_remove_disk(mddev, new_decode_dev(arg));
                        goto done_unlock;

                case HOT_ADD_DISK:
                        err = hot_add_disk(mddev, new_decode_dev(arg));
                        goto done_unlock;

                case SET_DISK_FAULTY:
                        err = set_disk_faulty(mddev, new_decode_dev(arg));
                        goto done_unlock;

                case RUN_ARRAY:
                        err = do_md_run (mddev);
                        goto done_unlock;

                case SET_BITMAP_FILE:
                        err = set_bitmap_file(mddev, (int)arg);
                        goto done_unlock;

                default:
                        err = -EINVAL;
                        goto abort_unlock;
        }

done_unlock:
abort_unlock:
        mddev_unlock(mddev);

        return err;
done:
        if (err)
                MD_BUG();
abort:
        return err;
}
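
/* A minimal user-space sketch (illustrative only, not part of this driver)
 * of driving the ioctl path above: GET_ARRAY_INFO is one of the commands
 * handled in the read-only switch, so it works even on an array started
 * auto-read-only.  /dev/md0 is assumed purely for illustration.
 */
#if 0
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/raid/md_u.h>

int main(void)
{
        mdu_array_info_t info;
        int fd = open("/dev/md0", O_RDONLY);

        if (fd < 0)
                return 1;
        if (ioctl(fd, GET_ARRAY_INFO, &info) == 0)
                printf("level=%d raid_disks=%d active=%d failed=%d\n",
                       info.level, info.raid_disks,
                       info.active_disks, info.failed_disks);
        close(fd);
        return 0;
}
#endif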

static int md_open(struct inode *inode, struct file *file)
{
        /*
         * Succeed if we can lock the mddev, which confirms that
         * it isn't being stopped right now.
         */
        mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
        int err;

        if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1)))
                goto out;

        err = 0;
        mddev_get(mddev);
        mddev_unlock(mddev);

        check_disk_change(inode->i_bdev);
 out:
        return err;
}

static int md_release(struct inode *inode, struct file * file)
{
        mddev_t *mddev = inode->i_bdev->bd_disk->private_data;

        BUG_ON(!mddev);
        mddev_put(mddev);

        return 0;
}

static int md_media_changed(struct gendisk *disk)
{
        mddev_t *mddev = disk->private_data;

        return mddev->changed;
}

static int md_revalidate(struct gendisk *disk)
{
        mddev_t *mddev = disk->private_data;

        mddev->changed = 0;
        return 0;
}
static struct block_device_operations md_fops =
{
        .owner          = THIS_MODULE,
        .open           = md_open,
        .release        = md_release,
        .ioctl          = md_ioctl,
        .getgeo         = md_getgeo,
        .media_changed  = md_media_changed,
        .revalidate_disk= md_revalidate,
};

static int md_thread(void * arg)
{
        mdk_thread_t *thread = arg;

        /*
         * md_thread is a 'system thread': its priority should be very
         * high. We avoid resource deadlocks individually in each
         * raid personality. (RAID5 does preallocation) We also use RR and
         * the very same RT priority as kswapd, thus we will never get
         * into a priority inversion deadlock.
         *
         * we definitely have to have equal or higher priority than
         * bdflush, otherwise bdflush will deadlock if there are too
         * many dirty RAID5 blocks.
         */

        allow_signal(SIGKILL);
        while (!kthread_should_stop()) {

                /* We need to wait INTERRUPTIBLE so that
                 * we don't add to the load-average.
                 * That means we need to be sure no signals are
                 * pending.
                 */
                if (signal_pending(current))
                        flush_signals(current);

                wait_event_interruptible_timeout
                        (thread->wqueue,
                         test_bit(THREAD_WAKEUP, &thread->flags)
                         || kthread_should_stop(),
                         thread->timeout);

                clear_bit(THREAD_WAKEUP, &thread->flags);

                thread->run(thread->mddev);
        }

        return 0;
}

void md_wakeup_thread(mdk_thread_t *thread)
{
        if (thread) {
                dprintk("md: waking up MD thread %s.\n", thread->tsk->comm);
                set_bit(THREAD_WAKEUP, &thread->flags);
                wake_up(&thread->wqueue);
        }
}

mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
                                 const char *name)
{
        mdk_thread_t *thread;

        thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL);
        if (!thread)
                return NULL;

        init_waitqueue_head(&thread->wqueue);

        thread->run = run;
        thread->mddev = mddev;
        thread->timeout = MAX_SCHEDULE_TIMEOUT;
        thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev));
        if (IS_ERR(thread->tsk)) {
                kfree(thread);
                return NULL;
        }
        return thread;
}

void md_unregister_thread(mdk_thread_t *thread)
{
        dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));

        kthread_stop(thread->tsk);
        kfree(thread);
}
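
/* A sketch (illustrative only) of the intended life cycle of the three
 * thread helpers above, as a raid personality would use them; the
 * my_raid_* names are hypothetical.
 */
#if 0
static void my_raid_daemon(mddev_t *mddev)
{
        /* one pass of background work; re-run on each md_wakeup_thread() */
}

static int my_raid_run(mddev_t *mddev)
{
        mddev->thread = md_register_thread(my_raid_daemon, mddev,
                                           "%s_myraid");
        if (!mddev->thread)
                return -ENOMEM;
        md_wakeup_thread(mddev->thread);        /* kick a first pass */
        return 0;
}

static int my_raid_stop(mddev_t *mddev)
{
        md_unregister_thread(mddev->thread);    /* blocks until the thread exits */
        mddev->thread = NULL;
        return 0;
}
#endif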

void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
{
        if (!mddev) {
                MD_BUG();
                return;
        }

        if (!rdev || test_bit(Faulty, &rdev->flags))
                return;
/*
        dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
                mdname(mddev),
                MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
                __builtin_return_address(0),__builtin_return_address(1),
                __builtin_return_address(2),__builtin_return_address(3));
*/
        if (!mddev->pers)
                return;
        if (!mddev->pers->error_handler)
                return;
        mddev->pers->error_handler(mddev,rdev);
        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
        md_wakeup_thread(mddev->thread);
        md_new_event_inintr(mddev);
}

/* seq_file implementation /proc/mdstat */

static void status_unused(struct seq_file *seq)
{
        int i = 0;
        mdk_rdev_t *rdev;
        struct list_head *tmp;

        seq_printf(seq, "unused devices: ");

        ITERATE_RDEV_PENDING(rdev,tmp) {
                char b[BDEVNAME_SIZE];
                i++;
                seq_printf(seq, "%s ",
                              bdevname(rdev->bdev,b));
        }
        if (!i)
                seq_printf(seq, "<none>");

        seq_printf(seq, "\n");
}


static void status_resync(struct seq_file *seq, mddev_t * mddev)
{
        sector_t max_blocks, resync, res;
        unsigned long dt, db, rt;
        int scale;
        unsigned int per_milli;

        resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;

        if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
                max_blocks = mddev->resync_max_sectors >> 1;
        else
                max_blocks = mddev->size;

        /*
         * Should not happen.
         */
        if (!max_blocks) {
                MD_BUG();
                return;
        }
        /* Pick 'scale' such that (resync>>scale)*1000 will fit
         * in a sector_t, and (max_blocks>>scale) will fit in a
         * u32, as those are the requirements for sector_div.
         * Thus 'scale' must be at least 10.
         */
        scale = 10;
        if (sizeof(sector_t) > sizeof(unsigned long)) {
                while ( max_blocks/2 > (1ULL<<(scale+32)))
                        scale++;
        }
        res = (resync>>scale)*1000;
        sector_div(res, (u32)((max_blocks>>scale)+1));

        per_milli = res;
        {
                int i, x = per_milli/50, y = 20-x;
                seq_printf(seq, "[");
                for (i = 0; i < x; i++)
                        seq_printf(seq, "=");
                seq_printf(seq, ">");
                for (i = 0; i < y; i++)
                        seq_printf(seq, ".");
                seq_printf(seq, "] ");
        }
        seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
                   (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
                    "reshape" :
                    (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
                     "check" :
                     (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
                      "resync" : "recovery"))),
                   per_milli/10, per_milli % 10,
                   (unsigned long long) resync,
                   (unsigned long long) max_blocks);

        /*
         * We do not want to overflow, so the order of operands and
         * the * 100 / 100 trick are important. We do a +1 to be
         * safe against division by zero. We only estimate anyway.
         *
         * dt: time from mark until now
         * db: blocks written from mark until now
         * rt: remaining time
         */
        dt = ((jiffies - mddev->resync_mark) / HZ);
        if (!dt) dt++;
        db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
                - mddev->resync_mark_cnt;
        rt = (dt * ((unsigned long)(max_blocks-resync) / (db/2/100+1)))/100;

        seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);

        seq_printf(seq, " speed=%ldK/sec", db/2/dt);
}
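
/* A worked example (illustrative numbers only) of the fixed-point progress
 * arithmetic above: with resync = 1,000,000 blocks of max_blocks = 4,000,000
 * and scale = 10, res = ((1000000>>10)*1000) / ((4000000>>10)+1)
 * = 976000 / 3907 = 249 per-milli, which prints as "24.9%" with a bar of
 * 249/50 = 4 '=' characters out of 20.
 */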

static void *md_seq_start(struct seq_file *seq, loff_t *pos)
{
        struct list_head *tmp;
        loff_t l = *pos;
        mddev_t *mddev;

        if (l >= 0x10000)
                return NULL;
        if (!l--)
                /* header */
                return (void*)1;

        spin_lock(&all_mddevs_lock);
        list_for_each(tmp,&all_mddevs)
                if (!l--) {
                        mddev = list_entry(tmp, mddev_t, all_mddevs);
                        mddev_get(mddev);
                        spin_unlock(&all_mddevs_lock);
                        return mddev;
                }
        spin_unlock(&all_mddevs_lock);
        if (!l--)
                return (void*)2;/* tail */
        return NULL;
}

static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct list_head *tmp;
        mddev_t *next_mddev, *mddev = v;

        ++*pos;
        if (v == (void*)2)
                return NULL;

        spin_lock(&all_mddevs_lock);
        if (v == (void*)1)
                tmp = all_mddevs.next;
        else
                tmp = mddev->all_mddevs.next;
        if (tmp != &all_mddevs)
                next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs));
        else {
                next_mddev = (void*)2;
                *pos = 0x10000;
        }
        spin_unlock(&all_mddevs_lock);

        if (v != (void*)1)
                mddev_put(mddev);
        return next_mddev;

}

static void md_seq_stop(struct seq_file *seq, void *v)
{
        mddev_t *mddev = v;

        if (mddev && v != (void*)1 && v != (void*)2)
                mddev_put(mddev);
}

struct mdstat_info {
        int event;
};

static int md_seq_show(struct seq_file *seq, void *v)
{
        mddev_t *mddev = v;
        sector_t size;
        struct list_head *tmp2;
        mdk_rdev_t *rdev;
        struct mdstat_info *mi = seq->private;
        struct bitmap *bitmap;

        if (v == (void*)1) {
                struct mdk_personality *pers;
                seq_printf(seq, "Personalities : ");
                spin_lock(&pers_lock);
                list_for_each_entry(pers, &pers_list, list)
                        seq_printf(seq, "[%s] ", pers->name);

                spin_unlock(&pers_lock);
                seq_printf(seq, "\n");
                mi->event = atomic_read(&md_event_count);
                return 0;
        }
        if (v == (void*)2) {
                status_unused(seq);
                return 0;
        }

        if (mddev_lock(mddev) < 0)
                return -EINTR;

        if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
                seq_printf(seq, "%s : %sactive", mdname(mddev),
                                                mddev->pers ? "" : "in");
                if (mddev->pers) {
                        if (mddev->ro==1)
                                seq_printf(seq, " (read-only)");
                        if (mddev->ro==2)
                                seq_printf(seq, "(auto-read-only)");
                        seq_printf(seq, " %s", mddev->pers->name);
                }

                size = 0;
                ITERATE_RDEV(mddev,rdev,tmp2) {
                        char b[BDEVNAME_SIZE];
                        seq_printf(seq, " %s[%d]",
                                bdevname(rdev->bdev,b), rdev->desc_nr);
                        if (test_bit(WriteMostly, &rdev->flags))
                                seq_printf(seq, "(W)");
                        if (test_bit(Faulty, &rdev->flags)) {
                                seq_printf(seq, "(F)");
                                continue;
                        } else if (rdev->raid_disk < 0)
                                seq_printf(seq, "(S)"); /* spare */
                        size += rdev->size;
                }

                if (!list_empty(&mddev->disks)) {
                        if (mddev->pers)
                                seq_printf(seq, "\n      %llu blocks",
                                        (unsigned long long)mddev->array_size);
                        else
                                seq_printf(seq, "\n      %llu blocks",
                                        (unsigned long long)size);
                }
                if (mddev->persistent) {
                        if (mddev->major_version != 0 ||
                            mddev->minor_version != 90) {
                                seq_printf(seq," super %d.%d",
                                           mddev->major_version,
                                           mddev->minor_version);
                        }
                } else
                        seq_printf(seq, " super non-persistent");

                if (mddev->pers) {
                        mddev->pers->status (seq, mddev);
                        seq_printf(seq, "\n      ");
                        if (mddev->pers->sync_request) {
                                if (mddev->curr_resync > 2) {
                                        status_resync (seq, mddev);
                                        seq_printf(seq, "\n      ");
                                } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
                                        seq_printf(seq, "\tresync=DELAYED\n      ");
                                else if (mddev->recovery_cp < MaxSector)
                                        seq_printf(seq, "\tresync=PENDING\n      ");
                        }
                } else
                        seq_printf(seq, "\n       ");

                if ((bitmap = mddev->bitmap)) {
                        unsigned long chunk_kb;
                        unsigned long flags;
                        spin_lock_irqsave(&bitmap->lock, flags);
                        chunk_kb = bitmap->chunksize >> 10;
                        seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
                                "%lu%s chunk",
                                bitmap->pages - bitmap->missing_pages,
                                bitmap->pages,
                                (bitmap->pages - bitmap->missing_pages)
                                        << (PAGE_SHIFT - 10),
                                chunk_kb ? chunk_kb : bitmap->chunksize,
                                chunk_kb ? "KB" : "B");
                        if (bitmap->file) {
                                seq_printf(seq, ", file: ");
                                seq_path(seq, bitmap->file->f_path.mnt,
                                         bitmap->file->f_path.dentry," \t\n");
                        }

                        seq_printf(seq, "\n");
                        spin_unlock_irqrestore(&bitmap->lock, flags);
                }

                seq_printf(seq, "\n");
        }
        mddev_unlock(mddev);

        return 0;
}

static struct seq_operations md_seq_ops = {
        .start  = md_seq_start,
        .next   = md_seq_next,
        .stop   = md_seq_stop,
        .show   = md_seq_show,
};

static int md_seq_open(struct inode *inode, struct file *file)
{
        int error;
        struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL);
        if (mi == NULL)
                return -ENOMEM;

        error = seq_open(file, &md_seq_ops);
        if (error)
                kfree(mi);
        else {
                struct seq_file *p = file->private_data;
                p->private = mi;
                mi->event = atomic_read(&md_event_count);
        }
        return error;
}

static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
{
        struct seq_file *m = filp->private_data;
        struct mdstat_info *mi = m->private;
        int mask;

        poll_wait(filp, &md_event_waiters, wait);

        /* always allow read */
        mask = POLLIN | POLLRDNORM;

        if (mi->event != atomic_read(&md_event_count))
                mask |= POLLERR | POLLPRI;
        return mask;
}

static const struct file_operations md_seq_fops = {
        .owner          = THIS_MODULE,
        .open           = md_seq_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = seq_release_private,
        .poll           = mdstat_poll,
};
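
/* A minimal user-space sketch (illustrative only, not part of this driver)
 * of the poll support wired up above: after /proc/mdstat has been read
 * once, poll() reports POLLPRI as soon as md_event_count moves on, and
 * seeking back and re-reading re-arms the notification.
 */
#if 0
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <poll.h>

int main(void)
{
        char buf[4096];
        struct pollfd pfd;

        pfd.fd = open("/proc/mdstat", O_RDONLY);
        if (pfd.fd < 0)
                return 1;
        pfd.events = POLLPRI;
        read(pfd.fd, buf, sizeof(buf));         /* records the current event count */
        while (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLPRI)) {
                printf("md event\n");
                lseek(pfd.fd, 0, SEEK_SET);     /* re-read to re-arm */
                read(pfd.fd, buf, sizeof(buf));
        }
        return 0;
}
#endif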

int register_md_personality(struct mdk_personality *p)
{
        spin_lock(&pers_lock);
        list_add_tail(&p->list, &pers_list);
        printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level);
        spin_unlock(&pers_lock);
        return 0;
}

int unregister_md_personality(struct mdk_personality *p)
{
        printk(KERN_INFO "md: %s personality unregistered\n", p->name);
        spin_lock(&pers_lock);
        list_del_init(&p->list);
        spin_unlock(&pers_lock);
        return 0;
}

static int is_mddev_idle(mddev_t *mddev)
{
        mdk_rdev_t * rdev;
        struct list_head *tmp;
        int idle;
        long curr_events;

        idle = 1;
        ITERATE_RDEV(mddev,rdev,tmp) {
                struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
                curr_events = disk_stat_read(disk, sectors[0]) +
                                disk_stat_read(disk, sectors[1]) -
                                atomic_read(&disk->sync_io);
                /* sync IO will cause sync_io to increase before the disk_stats
                 * as sync_io is counted when a request starts, and
                 * disk_stats is counted when it completes.
                 * So resync activity will cause curr_events to be smaller than
                 * when there was no such activity.
                 * non-sync IO will cause disk_stat to increase without
                 * increasing sync_io so curr_events will (eventually)
                 * be larger than it was before.  Once it becomes
                 * substantially larger, the test below will cause
                 * the array to appear non-idle, and resync will slow
                 * down.
                 * If there is a lot of outstanding resync activity when
                 * we set last_event to curr_events, then all that activity
                 * completing might cause the array to appear non-idle
                 * and resync will be slowed down even though there might
                 * not have been non-resync activity.  This will only
                 * happen once though.  'last_events' will soon reflect
                 * the state where there is little or no outstanding
                 * resync requests, and further resync activity will
                 * always make curr_events less than last_events.
                 *
                 */
                if (curr_events - rdev->last_events > 4096) {
                        rdev->last_events = curr_events;
                        idle = 0;
                }
        }
        return idle;
}
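
/* A worked example (illustrative numbers only) of the heuristic above:
 * if a member disk accumulates 10,000 sectors of I/O between two checks
 * and 9,000 of them were issued by resync itself (and so counted in
 * sync_io), curr_events only rises by 1,000 and stays within the 4096
 * margin, so the array still looks idle; a burst of more than 4096
 * non-resync sectors tips the test and throttles resync.
 */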

void md_done_sync(mddev_t *mddev, int blocks, int ok)
{
        /* another "blocks" (512-byte) blocks have been synced */
        atomic_sub(blocks, &mddev->recovery_active);
        wake_up(&mddev->recovery_wait);
        if (!ok) {
                set_bit(MD_RECOVERY_ERR, &mddev->recovery);
                md_wakeup_thread(mddev->thread);
                /* stop recovery, signal do_sync ... */
        }
}


/* md_write_start(mddev, bi)
 * If we need to update some array metadata (e.g. 'active' flag
 * in superblock) before writing, schedule a superblock update
 * and wait for it to complete.
 */
void md_write_start(mddev_t *mddev, struct bio *bi)
{
        if (bio_data_dir(bi) != WRITE)
                return;

        BUG_ON(mddev->ro == 1);
        if (mddev->ro == 2) {
                /* need to switch to read/write */
                mddev->ro = 0;
                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
                md_wakeup_thread(mddev->thread);
        }
        atomic_inc(&mddev->writes_pending);
        if (mddev->in_sync) {
                spin_lock_irq(&mddev->write_lock);
                if (mddev->in_sync) {
                        mddev->in_sync = 0;
                        set_bit(MD_CHANGE_CLEAN, &mddev->flags);
                        md_wakeup_thread(mddev->thread);
                }
                spin_unlock_irq(&mddev->write_lock);
        }
        wait_event(mddev->sb_wait, mddev->flags==0);
}

void md_write_end(mddev_t *mddev)
{
        if (atomic_dec_and_test(&mddev->writes_pending)) {
                if (mddev->safemode == 2)
                        md_wakeup_thread(mddev->thread);
                else if (mddev->safemode_delay)
                        mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
        }
}
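
/* A sketch (illustrative only) of how a personality pairs the two helpers
 * above around a write; my_make_request and my_submit are hypothetical,
 * and md_write_end() would normally run from the write-completion path
 * rather than inline as shown here.
 */
#if 0
static int my_make_request(struct request_queue *q, struct bio *bio)
{
        mddev_t *mddev = q->queuedata;

        md_write_start(mddev, bio);     /* may block until sb marked active */
        my_submit(mddev, bio);          /* issue the actual I/O */
        md_write_end(mddev);            /* on completion: arms the safemode timer */
        return 0;
}
#endif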

/* md_allow_write(mddev)
 * Calling this ensures that the array is marked 'active' so that writes
 * may proceed without blocking.  It is important to call this before
 * attempting a GFP_KERNEL allocation while holding the mddev lock.
 * Must be called with mddev_lock held.
 */
void md_allow_write(mddev_t *mddev)
{
        if (!mddev->pers)
                return;
        if (mddev->ro)
                return;

        spin_lock_irq(&mddev->write_lock);
        if (mddev->in_sync) {
                mddev->in_sync = 0;
                set_bit(MD_CHANGE_CLEAN, &mddev->flags);
                if (mddev->safemode_delay &&
                    mddev->safemode == 0)
                        mddev->safemode = 1;
                spin_unlock_irq(&mddev->write_lock);
                md_update_sb(mddev, 0);
        } else
                spin_unlock_irq(&mddev->write_lock);
}
EXPORT_SYMBOL_GPL(md_allow_write);
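
/* A sketch (illustrative only) of the rule stated above: call
 * md_allow_write() before a GFP_KERNEL allocation made under the mddev
 * lock, so that marking the array active cannot deadlock against the
 * allocation; my_reconfigure is a hypothetical caller.
 */
#if 0
static int my_reconfigure(mddev_t *mddev, int disks)
{
        void *newconf;

        /* mddev_lock is held here; mark the array active first */
        md_allow_write(mddev);
        newconf = kzalloc(disks * sizeof(void *), GFP_KERNEL);
        if (!newconf)
                return -ENOMEM;
        /* ... install the new configuration ... */
        kfree(newconf);
        return 0;
}
#endif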

static DECLARE_WAIT_QUEUE_HEAD(resync_wait);

#define SYNC_MARKS      10
#define SYNC_MARK_STEP  (3*HZ)
void md_do_sync(mddev_t *mddev)
{
        mddev_t *mddev2;
        unsigned int currspeed = 0,
                 window;
        sector_t max_sectors,j, io_sectors;
        unsigned long mark[SYNC_MARKS];
        sector_t mark_cnt[SYNC_MARKS];
        int last_mark,m;
        struct list_head *tmp;
        sector_t last_check;
        int skipped = 0;
        struct list_head *rtmp;
        mdk_rdev_t *rdev;
        char *desc;

        /* just in case the thread restarts... */
        if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
                return;
        if (mddev->ro) /* never try to sync a read-only array */
                return;

        if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
                if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
                        desc = "data-check";
                else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
                        desc = "requested-resync";
                else
                        desc = "resync";
        } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
                desc = "reshape";
        else
                desc = "recovery";

        /* we overload curr_resync somewhat here.
         * 0 == not engaged in resync at all
         * 2 == checking that there is no conflict with another sync
         * 1 == like 2, but have yielded to allow conflicting resync to
         *              commence
         * other == active in resync - this many blocks
         *
         * Before starting a resync we must have set curr_resync to
         * 2, and then checked that every "conflicting" array has curr_resync
         * less than ours.  When we find one that is the same or higher
         * we wait on resync_wait.  To avoid deadlock, we reduce curr_resync
         * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
         * This will mean we have to start checking from the beginning again.
         *
         */

        do {
                mddev->curr_resync = 2;

        try_again:
                if (kthread_should_stop()) {
                        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
                        goto skip;
                }
                ITERATE_MDDEV(mddev2,tmp) {
                        if (mddev2 == mddev)
                                continue;
                        if (mddev2->curr_resync &&
                            match_mddev_units(mddev,mddev2)) {
                                DEFINE_WAIT(wq);
                                if (mddev < mddev2 && mddev->curr_resync == 2) {
                                        /* arbitrarily yield */
                                        mddev->curr_resync = 1;
                                        wake_up(&resync_wait);
                                }
                                if (mddev > mddev2 && mddev->curr_resync == 1)
                                        /* no need to wait here, we can wait the next
                                         * time 'round when curr_resync == 2
                                         */
                                        continue;
                                prepare_to_wait(&resync_wait, &wq, TASK_UNINTERRUPTIBLE);
                                if (!kthread_should_stop() &&
                                    mddev2->curr_resync >= mddev->curr_resync) {
                                        printk(KERN_INFO "md: delaying %s of %s"
                                               " until %s has finished (they"
                                               " share one or more physical units)\n",
                                               desc, mdname(mddev), mdname(mddev2));
                                        mddev_put(mddev2);
                                        schedule();
                                        finish_wait(&resync_wait, &wq);
                                        goto try_again;
                                }
                                finish_wait(&resync_wait, &wq);
                        }
                }
        } while (mddev->curr_resync < 2);

        j = 0;
        if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
                /* resync follows the size requested by the personality,
                 * which defaults to physical size, but can be virtual size
                 */
                max_sectors = mddev->resync_max_sectors;
                mddev->resync_mismatches = 0;
                /* we don't use the checkpoint if there's a bitmap */
                if (!mddev->bitmap &&
                    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
                        j = mddev->recovery_cp;
        } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
                max_sectors = mddev->size << 1;
        else {
                /* recovery follows the physical size of devices */
                max_sectors = mddev->size << 1;
                j = MaxSector;
                ITERATE_RDEV(mddev,rdev,rtmp)
                        if (rdev->raid_disk >= 0 &&
                            !test_bit(Faulty, &rdev->flags) &&
                            !test_bit(In_sync, &rdev->flags) &&
                            rdev->recovery_offset < j)
                                j = rdev->recovery_offset;
        }

        printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev));
        printk(KERN_INFO "md: minimum _guaranteed_  speed:"
                " %d KB/sec/disk.\n", speed_min(mddev));
        printk(KERN_INFO "md: using maximum available idle IO bandwidth "
               "(but not more than %d KB/sec) for %s.\n",
               speed_max(mddev), desc);

        is_mddev_idle(mddev); /* this also initializes IO event counters */

        io_sectors = 0;
        for (m = 0; m < SYNC_MARKS; m++) {
                mark[m] = jiffies;
                mark_cnt[m] = io_sectors;
        }
        last_mark = 0;
        mddev->resync_mark = mark[last_mark];
        mddev->resync_mark_cnt = mark_cnt[last_mark];

        /*
         * Tune reconstruction:
         */
        window = 32*(PAGE_SIZE/512);
        printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n",
                window/2,(unsigned long long) max_sectors/2);

        atomic_set(&mddev->recovery_active, 0);
        init_waitqueue_head(&mddev->recovery_wait);
        last_check = 0;

        if (j>2) {
                printk(KERN_INFO
                       "md: resuming %s of %s from checkpoint.\n",
                       desc, mdname(mddev));
                mddev->curr_resync = j;
        }

        while (j < max_sectors) {
                sector_t sectors;

                skipped = 0;
                sectors = mddev->pers->sync_request(mddev, j, &skipped,
                                            currspeed < speed_min(mddev));
                if (sectors == 0) {
                        set_bit(MD_RECOVERY_ERR, &mddev->recovery);
                        goto out;
                }

                if (!skipped) { /* actual IO requested */
                        io_sectors += sectors;
                        atomic_add(sectors, &mddev->recovery_active);
                }

                j += sectors;
                if (j>1) mddev->curr_resync = j;
                mddev->curr_mark_cnt = io_sectors;
                if (last_check == 0)
                        /* this is the earliest that rebuild will be
                         * visible in /proc/mdstat
                         */
                        md_new_event(mddev);

                if (last_check + window > io_sectors || j == max_sectors)
                        continue;

                last_check = io_sectors;

                if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) ||
                    test_bit(MD_RECOVERY_ERR, &mddev->recovery))
                        break;

        repeat:
                if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
                        /* step marks */
                        int next = (last_mark+1) % SYNC_MARKS;

                        mddev->resync_mark = mark[next];
                        mddev->resync_mark_cnt = mark_cnt[next];
                        mark[next] = jiffies;
                        mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
                        last_mark = next;
                }


                if (kthread_should_stop()) {
                        /*
                         * got a signal, exit.
                         */
                        printk(KERN_INFO
                                "md: md_do_sync() got signal ... exiting\n");
                        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
                        goto out;
                }

                /*
                 * this loop exits only when we are slower than
                 * the 'hard' speed limit, or the system was IO-idle for
                 * a jiffy.
                 * the system might be non-idle CPU-wise, but we only care
                 * about not overloading the IO subsystem. (things like an
                 * e2fsck being done on the RAID array should execute fast)
                 */
                blk_unplug(mddev->queue);
                cond_resched();

                currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
                        /((jiffies-mddev->resync_mark)/HZ +1) +1;

                if (currspeed > speed_min(mddev)) {
                        if ((currspeed > speed_max(mddev)) ||
                                        !is_mddev_idle(mddev)) {
                                msleep(500);
                                goto repeat;
                        }
                }
        }
        printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc);
        /*
         * this also signals 'finished resyncing' to md_stop
         */
 out:
        blk_unplug(mddev->queue);

        wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));

        /* tell personality that we are finished */
        mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);

        if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
            !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
            mddev->curr_resync > 2) {
                if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
                        if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
                                if (mddev->curr_resync >= mddev->recovery_cp) {
                                        printk(KERN_INFO
                                               "md: checkpointing %s of %s.\n",
                                               desc, mdname(mddev));
                                        mddev->recovery_cp = mddev->curr_resync;
                                }
                        } else
                                mddev->recovery_cp = MaxSector;
                } else {
                        if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
                                mddev->curr_resync = MaxSector;
                        ITERATE_RDEV(mddev,rdev,rtmp)
                                if (rdev->raid_disk >= 0 &&
                                    !test_bit(Faulty, &rdev->flags) &&
                                    !test_bit(In_sync, &rdev->flags) &&
                                    rdev->recovery_offset < mddev->curr_resync)
                                        rdev->recovery_offset = mddev->curr_resync;
                }
        }
        set_bit(MD_CHANGE_DEVS, &mddev->flags);

 skip:
        mddev->curr_resync = 0;
        wake_up(&resync_wait);
        set_bit(MD_RECOVERY_DONE, &mddev->recovery);
        md_wakeup_thread(mddev->thread);
}
EXPORT_SYMBOL_GPL(md_do_sync);
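
/* A worked example (illustrative numbers only) of the throttle arithmetic
 * in the main loop above: if 20,480 sectors have completed since
 * resync_mark and 4 seconds have elapsed, currspeed = (20480/2)/(4+1)+1
 * = 2049 KB/sec; that exceeds the default speed_limit_min of 1000, so on
 * a non-idle array the loop sleeps 500 ms and re-checks before issuing
 * more resync I/O.
 */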


static int remove_and_add_spares(mddev_t *mddev)
{
        mdk_rdev_t *rdev;
        struct list_head *rtmp;
        int spares = 0;

        ITERATE_RDEV(mddev,rdev,rtmp)
                if (rdev->raid_disk >= 0 &&
                    (test_bit(Faulty, &rdev->flags) ||
                     ! test_bit(In_sync, &rdev->flags)) &&
                    atomic_read(&rdev->nr_pending)==0) {
                        if (mddev->pers->hot_remove_disk(
                                    mddev, rdev->raid_disk)==0) {
                                char nm[20];
                                sprintf(nm,"rd%d", rdev->raid_disk);
                                sysfs_remove_link(&mddev->kobj, nm);
                                rdev->raid_disk = -1;
                        }
                }

        if (mddev->degraded) {
                ITERATE_RDEV(mddev,rdev,rtmp)
                        if (rdev->raid_disk < 0
                            && !test_bit(Faulty, &rdev->flags)) {
                                rdev->recovery_offset = 0;
                                if (mddev->pers->hot_add_disk(mddev,rdev)) {
                                        char nm[20];
                                        sprintf(nm, "rd%d", rdev->raid_disk);
                                        if (sysfs_create_link(&mddev->kobj,
                                                              &rdev->kobj, nm))
                                                printk(KERN_WARNING
                                                       "md: cannot register "
                                                       "%s for %s\n",
                                                       nm, mdname(mddev));
                                        spares++;
                                        md_new_event(mddev);
                                } else
                                        break;
                        }
        }
        return spares;
}
5551
/*
5552
 * This routine is regularly called by all per-raid-array threads to
5553
 * deal with generic issues like resync and super-block update.
5554
 * Raid personalities that don't have a thread (linear/raid0) do not
5555
 * need this as they never do any recovery or update the superblock.
5556
 *
5557
 * It does not do any resync itself, but rather "forks" off other threads
5558
 * to do that as needed.
5559
 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
5560
 * "->recovery" and create a thread at ->sync_thread.
5561
 * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR)
5562
 * and wakeups up this thread which will reap the thread and finish up.
5563
 * This thread also removes any faulty devices (with nr_pending == 0).
5564
 *
5565
 * The overall approach is:
5566
 *  1/ if the superblock needs updating, update it.
5567
 *  2/ If a recovery thread is running, don't do anything else.
5568
 *  3/ If recovery has finished, clean up, possibly marking spares active.
5569
 *  4/ If there are any faulty devices, remove them.
5570
 *  5/ If array is degraded, try to add spares devices
5571
 *  6/ If array has spares or is not in-sync, start a resync thread.
5572
 */
5573
void md_check_recovery(mddev_t *mddev)
5574
{
5575
        mdk_rdev_t *rdev;
5576
        struct list_head *rtmp;
5577
 
5578
 
5579
        if (mddev->bitmap)
5580
                bitmap_daemon_work(mddev->bitmap);
5581
 
5582
        if (mddev->ro)
5583
                return;
5584
 
5585
        if (signal_pending(current)) {
5586
                if (mddev->pers->sync_request) {
5587
                        printk(KERN_INFO "md: %s in immediate safe mode\n",
5588
                               mdname(mddev));
5589
                        mddev->safemode = 2;
5590
                }
5591
                flush_signals(current);
5592
        }
5593
 
5594
        if ( ! (
5595
                mddev->flags ||
5596
                test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
5597
                test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
5598
                (mddev->safemode == 1) ||
5599
                (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
5600
                 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
5601
                ))
5602
                return;
5603
 
5604
        if (mddev_trylock(mddev)) {
5605
                int spares = 0;
5606
 
5607
                spin_lock_irq(&mddev->write_lock);
5608
                if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
5609
                    !mddev->in_sync && mddev->recovery_cp == MaxSector) {
5610
                        mddev->in_sync = 1;
5611
                        set_bit(MD_CHANGE_CLEAN, &mddev->flags);
5612
                }
5613
                if (mddev->safemode == 1)
5614
                        mddev->safemode = 0;
5615
                spin_unlock_irq(&mddev->write_lock);
5616
 
5617
                if (mddev->flags)
5618
                        md_update_sb(mddev, 0);
5619
 
5620
 
5621
                if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
5622
                    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
5623
                        /* resync/recovery still happening */
5624
                        clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5625
                        goto unlock;
5626
                }
5627
                if (mddev->sync_thread) {
5628
                        /* resync has finished, collect result */
5629
                        md_unregister_thread(mddev->sync_thread);
5630
                        mddev->sync_thread = NULL;
5631
                        if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
5632
                            !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
5633
                                /* success...*/
5634
                                /* activate any spares */
5635
                                mddev->pers->spare_active(mddev);
5636
                        }
5637
                        md_update_sb(mddev, 1);
5638
 
5639
                        /* if array is no-longer degraded, then any saved_raid_disk
5640
                         * information must be scrapped
5641
                         */
5642
                        if (!mddev->degraded)
5643
                                ITERATE_RDEV(mddev,rdev,rtmp)
5644
                                        rdev->saved_raid_disk = -1;
5645
 
5646
                        mddev->recovery = 0;
5647
                        /* flag recovery needed just to double check */
5648
                        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5649
                        md_new_event(mddev);
5650
                        goto unlock;
5651
                }
5652
                /* Clear some bits that don't mean anything, but
5653
                 * might be left set
5654
                 */
5655
                clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5656
                clear_bit(MD_RECOVERY_ERR, &mddev->recovery);
5657
                clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
5658
                clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
5659
 
5660
                if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
5661
                        goto unlock;
5662
                /* no recovery is running.
5663
                 * remove any failed drives, then
5664
                 * add spares if possible.
5665
                 * Spare are also removed and re-added, to allow
5666
                 * the personality to fail the re-add.
5667
                 */
5668
 
5669
                if (mddev->reshape_position != MaxSector) {
5670
                        if (mddev->pers->check_reshape(mddev) != 0)
5671
                                /* Cannot proceed */
5672
                                goto unlock;
5673
                        set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
5674
                } else if ((spares = remove_and_add_spares(mddev))) {
5675
                        clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
5676
                        clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
5677
                } else if (mddev->recovery_cp < MaxSector) {
5678
                        set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
5679
                } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
5680
                        /* nothing to be done ... */
5681
                        goto unlock;
5682
 
5683
                if (mddev->pers->sync_request) {
5684
                        set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
5685
                        if (spares && mddev->bitmap && ! mddev->bitmap->file) {
5686
                                /* We are adding a device or devices to an array
5687
                                 * which has the bitmap stored on all devices.
5688
                                 * So make sure all bitmap pages get written
5689
                                 */
5690
                                bitmap_write_all(mddev->bitmap);
5691
                        }
5692
                        mddev->sync_thread = md_register_thread(md_do_sync,
5693
                                                                mddev,
5694
                                                                "%s_resync");
5695
                        if (!mddev->sync_thread) {
5696
                                printk(KERN_ERR "%s: could not start resync"
5697
                                        " thread...\n",
5698
                                        mdname(mddev));
5699
                                /* leave the spares where they are, it shouldn't hurt */
5700
                                mddev->recovery = 0;
5701
                        } else
5702
                                md_wakeup_thread(mddev->sync_thread);
5703
                        md_new_event(mddev);
5704
                }
5705
        unlock:
5706
                mddev_unlock(mddev);
5707
        }
5708
}

static int md_notify_reboot(struct notifier_block *this,
                            unsigned long code, void *x)
{
        struct list_head *tmp;
        mddev_t *mddev;

        if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {

                printk(KERN_INFO "md: stopping all md devices.\n");

                ITERATE_MDDEV(mddev, tmp)
                        if (mddev_trylock(mddev)) {
                                do_md_stop(mddev, 1);
                                mddev_unlock(mddev);
                        }
                /*
                 * certain more exotic SCSI devices are known to be
                 * volatile wrt too early system reboots. While the
                 * right place to handle this issue is the given
                 * driver, we do want to have a safe RAID driver ...
                 */
                mdelay(1000*1);
        }
        return NOTIFY_DONE;
}

static struct notifier_block md_notifier = {
        .notifier_call  = md_notify_reboot,
        .next           = NULL,
        .priority       = INT_MAX, /* before any real devices */
};
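
/*
 * Illustrative note (not part of the original source): the notifier above
 * follows the usual kernel pattern of pairing registration at init with
 * unregistration at exit:
 *
 *      register_reboot_notifier(&md_notifier);     (done in md_init() below)
 *      unregister_reboot_notifier(&md_notifier);   (done in md_exit() below)
 *
 * Returning NOTIFY_DONE lets any remaining reboot notifiers still run.
 */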

static void md_geninit(void)
{
        struct proc_dir_entry *p;

        dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));

        p = create_proc_entry("mdstat", S_IRUGO, NULL);
        if (p)
                p->proc_fops = &md_seq_fops;
}
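
/*
 * Usage note (illustrative): once the entry above is created, array status
 * can be read from userspace through the seq_file interface behind
 * md_seq_fops, e.g.:
 *
 *      $ cat /proc/mdstat
 */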

static int __init md_init(void)
{
        if (register_blkdev(MAJOR_NR, "md"))
                return -1;
        if ((mdp_major = register_blkdev(0, "mdp")) <= 0) {
                unregister_blkdev(MAJOR_NR, "md");
                return -1;
        }
        blk_register_region(MKDEV(MAJOR_NR, 0), 1UL << MINORBITS, THIS_MODULE,
                            md_probe, NULL, NULL);
        blk_register_region(MKDEV(mdp_major, 0), 1UL << MINORBITS, THIS_MODULE,
                            md_probe, NULL, NULL);

        register_reboot_notifier(&md_notifier);
        raid_table_header = register_sysctl_table(raid_root_table);

        md_geninit();
        return 0;
}
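
/*
 * Illustrative sketch (not part of the original source): register_blkdev(0,
 * "mdp") above requests a dynamically allocated major number. With
 * MdpMinorShift == 6, each mdp unit then spans 64 minors, the whole-disk
 * node plus 63 partitions, so device numbers work out as, e.g.:
 *
 *      dev_t whole = MKDEV(mdp_major, unit << MdpMinorShift);
 *      dev_t part1 = MKDEV(mdp_major, (unit << MdpMinorShift) + 1);
 */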


#ifndef MODULE

/*
 * Searches all registered partitions for autorun RAID arrays
 * at boot time.
 */

static LIST_HEAD(all_detected_devices);
struct detected_devices_node {
        struct list_head list;
        dev_t dev;
};

void md_autodetect_dev(dev_t dev)
{
        struct detected_devices_node *node_detected_dev;

        node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
        if (node_detected_dev) {
                node_detected_dev->dev = dev;
                list_add_tail(&node_detected_dev->list, &all_detected_devices);
        } else {
                printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed"
                        ", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev));
        }
}


static void autostart_arrays(int part)
{
        mdk_rdev_t *rdev;
        struct detected_devices_node *node_detected_dev;
        dev_t dev;
        int i_scanned, i_passed;

        i_scanned = 0;
        i_passed = 0;

        printk(KERN_INFO "md: Autodetecting RAID arrays.\n");

        while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
                i_scanned++;
                node_detected_dev = list_entry(all_detected_devices.next,
                                        struct detected_devices_node, list);
                list_del(&node_detected_dev->list);
                dev = node_detected_dev->dev;
                kfree(node_detected_dev);
                rdev = md_import_device(dev, 0, 90);
                if (IS_ERR(rdev))
                        continue;

                if (test_bit(Faulty, &rdev->flags)) {
                        MD_BUG();
                        continue;
                }
                list_add(&rdev->same_set, &pending_raid_disks);
                i_passed++;
        }

        printk(KERN_INFO "md: Scanned %d and added %d devices.\n",
                                                i_scanned, i_passed);

        autorun_devices(part);
}
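
/*
 * Illustrative note (not part of the original source): the loop above
 * drains the list front-to-back with the list_entry()/list_del() pop
 * idiom; on kernels that provide it, the head element could equivalently
 * be fetched with
 *
 *      node_detected_dev = list_first_entry(&all_detected_devices,
 *                              struct detected_devices_node, list);
 */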

#endif /* !MODULE */

static __exit void md_exit(void)
{
        mddev_t *mddev;
        struct list_head *tmp;

        blk_unregister_region(MKDEV(MAJOR_NR, 0), 1U << MINORBITS);
        blk_unregister_region(MKDEV(mdp_major, 0), 1U << MINORBITS);

        unregister_blkdev(MAJOR_NR, "md");
        unregister_blkdev(mdp_major, "mdp");
        unregister_reboot_notifier(&md_notifier);
        unregister_sysctl_table(raid_table_header);
        remove_proc_entry("mdstat", NULL);
        ITERATE_MDDEV(mddev, tmp) {
                struct gendisk *disk = mddev->gendisk;
                if (!disk)
                        continue;
                export_array(mddev);
                del_gendisk(disk);
                put_disk(disk);
                mddev->gendisk = NULL;
                mddev_put(mddev);
        }
}

subsys_initcall(md_init);
module_exit(md_exit);

static int get_ro(char *buffer, struct kernel_param *kp)
{
        return sprintf(buffer, "%d", start_readonly);
}
static int set_ro(const char *val, struct kernel_param *kp)
{
        char *e;
        int num = simple_strtoul(val, &e, 10);
        if (*val && (*e == '\0' || *e == '\n')) {
                start_readonly = num;
                return 0;
        }
        return -EINVAL;
}
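
/*
 * Illustrative note (not part of the original source): set_ro() accepts a
 * trailing '\n' because values written through sysfs usually arrive as
 * "1\n"; simple_strtoul() stops at the newline, leaving *e == '\n'.
 */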
5883
 
5884
module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
5885
module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
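
/*
 * Usage note (illustrative, assuming the driver is built as the usual
 * md-mod module): both parameters can be given at load time or, since the
 * mode bits include S_IWUSR, changed by root at runtime via sysfs:
 *
 *      # modprobe md-mod start_ro=1 start_dirty_degraded=1
 *      # echo 0 > /sys/module/md_mod/parameters/start_ro
 */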
5886
 
5887
 
5888
EXPORT_SYMBOL(register_md_personality);
5889
EXPORT_SYMBOL(unregister_md_personality);
5890
EXPORT_SYMBOL(md_error);
5891
EXPORT_SYMBOL(md_done_sync);
5892
EXPORT_SYMBOL(md_write_start);
5893
EXPORT_SYMBOL(md_write_end);
5894
EXPORT_SYMBOL(md_register_thread);
5895
EXPORT_SYMBOL(md_unregister_thread);
5896
EXPORT_SYMBOL(md_wakeup_thread);
5897
EXPORT_SYMBOL(md_check_recovery);
5898
MODULE_LICENSE("GPL");
5899
MODULE_ALIAS("md");
5900
MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
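
/*
 * Illustrative sketch (not part of the original source): a RAID
 * personality module would use the exported registration pair above from
 * its own init/exit hooks, roughly:
 *
 *      static struct mdk_personality raid0_personality = {
 *              .name = "raid0",
 *              ...
 *      };
 *
 *      static int __init raid0_init(void)
 *      {
 *              return register_md_personality(&raid0_personality);
 *      }
 *
 *      static void __exit raid0_exit(void)
 *      {
 *              unregister_md_personality(&raid0_personality);
 *      }
 */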
