OpenCores
URL https://opencores.org/ocsvn/or1k/or1k/trunk

Subversion Repositories or1k

or1k/trunk/linux/linux-2.4/drivers/md/multipath.c (rev 1765)
/*
 * multipath.c : Multiple Devices driver for Linux
 *
 * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
 *
 * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *
 * MULTIPATH management functions.
 *
 * derived from raid1.c.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/raid/multipath.h>
#include <asm/atomic.h>

#define MAJOR_NR MD_MAJOR
#define MD_DRIVER
#define MD_PERSONALITY

#define MAX_WORK_PER_DISK 128

#define NR_RESERVED_BUFS        32


/*
 * The following can be used to debug the driver
 */
#define MULTIPATH_DEBUG 0

#if MULTIPATH_DEBUG
#define PRINTK(x...)   printk(x)
#define inline
#define __inline__
#else
#define PRINTK(x...)  do { } while (0)
#endif


static mdk_personality_t multipath_personality;
static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED;
struct multipath_bh *multipath_retry_list = NULL, **multipath_retry_tail;

static int multipath_diskop(mddev_t *mddev, mdp_disk_t **d, int state);

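/*
 * Allocate a multipath_bh.  The pre-allocated free list (conf->freer1)
 * is tried first under conf->device_lock; failing that we kmalloc()
 * with GFP_NOIO, and as a last resort sleep on conf->wait_buffer until
 * enough reserved buffers have been returned to the pool.
 */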
static struct multipath_bh *multipath_alloc_mpbh(multipath_conf_t *conf)
{
        struct multipath_bh *mp_bh = NULL;

        do {
                md_spin_lock_irq(&conf->device_lock);
                if (!conf->freer1_blocked && conf->freer1) {
                        mp_bh = conf->freer1;
                        conf->freer1 = mp_bh->next_mp;
                        conf->freer1_cnt--;
                        mp_bh->next_mp = NULL;
                        mp_bh->state = (1 << MPBH_PreAlloc);
                        mp_bh->bh_req.b_state = 0;
                }
                md_spin_unlock_irq(&conf->device_lock);
                if (mp_bh)
                        return mp_bh;
                mp_bh = (struct multipath_bh *) kmalloc(sizeof(struct multipath_bh),
                                        GFP_NOIO);
                if (mp_bh) {
                        memset(mp_bh, 0, sizeof(*mp_bh));
                        return mp_bh;
                }
                conf->freer1_blocked = 1;
                wait_disk_event(conf->wait_buffer,
                                !conf->freer1_blocked ||
                                conf->freer1_cnt > NR_RESERVED_BUFS/2
                    );
                conf->freer1_blocked = 0;
        } while (1);
}

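/*
 * Return a multipath_bh to the pool.  Buffers that came from the
 * reserved pool (MPBH_PreAlloc set) go back onto conf->freer1 and any
 * allocator sleeping in multipath_alloc_mpbh() is woken; plain
 * kmalloc()ed buffers are simply kfree()d.
 */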
static inline void multipath_free_mpbh(struct multipath_bh *mp_bh)
{
        multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev);

        if (test_bit(MPBH_PreAlloc, &mp_bh->state)) {
                unsigned long flags;
                spin_lock_irqsave(&conf->device_lock, flags);
                mp_bh->next_mp = conf->freer1;
                conf->freer1 = mp_bh;
                conf->freer1_cnt++;
                spin_unlock_irqrestore(&conf->device_lock, flags);
                wake_up(&conf->wait_buffer);
        } else {
                kfree(mp_bh);
        }
}

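/*
 * Reserved-pool maintenance: multipath_grow_mpbh() pre-allocates up to
 * 'cnt' buffers, marks them MPBH_PreAlloc and releases them through
 * multipath_free_mpbh() so they end up on the free list;
 * multipath_shrink_mpbh() frees the whole list again at stop time.
 */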
static int multipath_grow_mpbh (multipath_conf_t *conf, int cnt)
{
        int i = 0;

        while (i < cnt) {
                struct multipath_bh *mp_bh;
                mp_bh = (struct multipath_bh*)kmalloc(sizeof(*mp_bh), GFP_KERNEL);
                if (!mp_bh)
                        break;
                memset(mp_bh, 0, sizeof(*mp_bh));
                set_bit(MPBH_PreAlloc, &mp_bh->state);
                mp_bh->mddev = conf->mddev;

                multipath_free_mpbh(mp_bh);
                i++;
        }
        return i;
}

static void multipath_shrink_mpbh(multipath_conf_t *conf)
{
        md_spin_lock_irq(&conf->device_lock);
        while (conf->freer1) {
                struct multipath_bh *mp_bh = conf->freer1;
                conf->freer1 = mp_bh->next_mp;
                conf->freer1_cnt--;
                kfree(mp_bh);
        }
        md_spin_unlock_irq(&conf->device_lock);
}

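/*
 * Map a request to a working path: scan the multipath table and hand
 * back the device of the first operational path, or -1 if none is
 * left.
 */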
static int multipath_map (mddev_t *mddev, kdev_t *rdev)
{
        multipath_conf_t *conf = mddev_to_conf(mddev);
        int i, disks = MD_SB_DISKS;

        /*
         * Later we will do read balancing on the read side;
         * for now we use the first available disk.
         */

        for (i = 0; i < disks; i++) {
                if (conf->multipaths[i].operational) {
                        *rdev = conf->multipaths[i].dev;
                        return (0);
                }
        }

        printk (KERN_ERR "multipath_map(): no more operational IO paths?\n");
        return (-1);
}

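/*
 * Queue a failed request for the multipathd thread: append the
 * multipath_bh to the tail of the global retry list (a singly linked
 * list with a tail pointer, protected by retry_list_lock) and wake
 * the thread.
 */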
static void multipath_reschedule_retry (struct multipath_bh *mp_bh)
{
        unsigned long flags;
        mddev_t *mddev = mp_bh->mddev;
        multipath_conf_t *conf = mddev_to_conf(mddev);

        md_spin_lock_irqsave(&retry_list_lock, flags);
        if (multipath_retry_list == NULL)
                multipath_retry_tail = &multipath_retry_list;
        *multipath_retry_tail = mp_bh;
        multipath_retry_tail = &mp_bh->next_mp;
        mp_bh->next_mp = NULL;
        md_spin_unlock_irqrestore(&retry_list_lock, flags);
        md_wakeup_thread(conf->thread);
}


/*
 * multipath_end_bh_io() is called when we have finished servicing a multipathed
 * operation and are ready to return a success/failure code to the buffer
 * cache layer.
 */
static void multipath_end_bh_io (struct multipath_bh *mp_bh, int uptodate)
{
        struct buffer_head *bh = mp_bh->master_bh;

        bh->b_end_io(bh, uptodate);
        multipath_free_mpbh(mp_bh);
}

void multipath_end_request (struct buffer_head *bh, int uptodate)
{
        struct multipath_bh * mp_bh = (struct multipath_bh *)(bh->b_private);

        /*
         * this branch is our 'one multipath IO has finished' event handler:
         */
        if (!uptodate)
                md_error (mp_bh->mddev, bh->b_dev);
        else
                /*
                 * Set MPBH_Uptodate in our master buffer_head, so that
                 * we will return a good error code to the higher
                 * levels even if IO on some other multipathed buffer fails.
                 *
                 * The 'master' represents the complex operation to
                 * user-side. So if something waits for IO, then it will
                 * wait for the 'master' buffer_head.
                 */
                set_bit (MPBH_Uptodate, &mp_bh->state);


        if (uptodate) {
                multipath_end_bh_io(mp_bh, uptodate);
                return;
        }
        /*
         * oops, IO error:
         */
        printk(KERN_ERR "multipath: %s: rescheduling block %lu\n",
                 partition_name(bh->b_dev), bh->b_blocknr);
        multipath_reschedule_retry(mp_bh);
        return;
}

/*
 * This routine returns the disk from which the requested read should
 * be done.
 */

static int multipath_read_balance (multipath_conf_t *conf)
{
        int disk;

        for (disk = 0; disk < conf->raid_disks; disk++)
                if (conf->multipaths[disk].operational)
                        return disk;
        BUG();
        return 0;
}

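/*
 * Entry point for requests from the md layer: pick a path via
 * multipath_read_balance(), clone the master buffer_head into
 * mp_bh->bh_req, point it at the chosen device and feed it to
 * generic_make_request().  Completion comes back through
 * multipath_end_request() above.
 */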
static int multipath_make_request (mddev_t *mddev, int rw,
                               struct buffer_head * bh)
{
        multipath_conf_t *conf = mddev_to_conf(mddev);
        struct buffer_head *bh_req;
        struct multipath_bh * mp_bh;
        struct multipath_info *multipath;

        if (!buffer_locked(bh))
                BUG();

/*
 * make_request() can abort the operation when READA is being
 * used and no empty request is available.
 *
 * Currently, just replace the command with READ/WRITE.
 */
        if (rw == READA)
                rw = READ;

        mp_bh = multipath_alloc_mpbh (conf);

        mp_bh->master_bh = bh;
        mp_bh->mddev = mddev;
        mp_bh->cmd = rw;

        /*
         * read balancing logic:
         */
        multipath = conf->multipaths + multipath_read_balance(conf);

        bh_req = &mp_bh->bh_req;
        memcpy(bh_req, bh, sizeof(*bh));
        bh_req->b_blocknr = bh->b_rsector;
        bh_req->b_dev = multipath->dev;
        bh_req->b_rdev = multipath->dev;
/*      bh_req->b_rsector = bh->n_rsector; */
        bh_req->b_end_io = multipath_end_request;
        bh_req->b_private = mp_bh;
        generic_make_request (rw, bh_req);
        return 0;
}

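/*
 * Emit the personality-specific part of the /proc/mdstat line:
 * " [<raid_disks>/<working_disks>] [" followed by 'U' for each
 * operational path and '_' for each failed one, e.g. " [1/1] [U]"
 * for a healthy array (raid_disks is forced to 1 in multipath_run()
 * below).
 */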
static void multipath_status (struct seq_file *seq, mddev_t *mddev)
{
        multipath_conf_t *conf = mddev_to_conf(mddev);
        int i;

        seq_printf (seq, " [%d/%d] [", conf->raid_disks,
                                                 conf->working_disks);
        for (i = 0; i < conf->raid_disks; i++)
                seq_printf (seq, "%s",
                        conf->multipaths[i].operational ? "U" : "_");
        seq_printf (seq, "]");
}

#define LAST_DISK KERN_ALERT \
"multipath: only one IO path left and IO error.\n"

#define NO_SPARE_DISK KERN_ALERT \
"multipath: no spare IO path left!\n"

#define DISK_FAILED KERN_ALERT \
"multipath: IO failure on %s, disabling IO path. \n" \
"       Operation continuing on %d IO paths.\n"

static void mark_disk_bad (mddev_t *mddev, int failed)
{
        multipath_conf_t *conf = mddev_to_conf(mddev);
        struct multipath_info *multipath = conf->multipaths+failed;
        mdp_super_t *sb = mddev->sb;

        multipath->operational = 0;
        mark_disk_faulty(sb->disks+multipath->number);
        mark_disk_nonsync(sb->disks+multipath->number);
        mark_disk_inactive(sb->disks+multipath->number);
        sb->active_disks--;
        sb->working_disks--;
        sb->failed_disks++;
        mddev->sb_dirty = 1;
        md_wakeup_thread(conf->thread);
        conf->working_disks--;
        printk (DISK_FAILED, partition_name (multipath->dev),
                                 conf->working_disks);
}

/*
 * Careful, this can execute in IRQ contexts as well!
 */
static int multipath_error (mddev_t *mddev, kdev_t dev)
{
        multipath_conf_t *conf = mddev_to_conf(mddev);
        struct multipath_info * multipaths = conf->multipaths;
        int disks = MD_SB_DISKS;
        int other_paths = 1;
        int i;

        if (conf->working_disks == 1) {
                other_paths = 0;
                for (i = 0; i < disks; i++) {
                        if (multipaths[i].spare) {
                                other_paths = 1;
                                break;
                        }
                }
        }

        if (!other_paths) {
                /*
                 * Uh oh, we can do nothing if this is our last path, but
                 * first check if this is a queued request for a device
                 * which has just failed.
                 */
                for (i = 0; i < disks; i++) {
                        if (multipaths[i].dev==dev && !multipaths[i].operational)
                                return 0;
                }
                printk (LAST_DISK);
        } else {
                /*
                 * Mark disk as unusable
                 */
                for (i = 0; i < disks; i++) {
                        if (multipaths[i].dev==dev && multipaths[i].operational) {
                                mark_disk_bad(mddev, i);
                                break;
                        }
                }
                if (!conf->working_disks) {
                        int err = 1;
                        mdp_disk_t *spare;
                        mdp_super_t *sb = mddev->sb;

                        spare = get_spare(mddev);
                        if (spare) {
                                err = multipath_diskop(mddev, &spare, DISKOP_SPARE_WRITE);
                                printk("got DISKOP_SPARE_WRITE err: %d. (spare_faulty(): %d)\n", err, disk_faulty(spare));
                        }
                        if (!err && !disk_faulty(spare)) {
                                multipath_diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
                                mark_disk_sync(spare);
                                mark_disk_active(spare);
                                sb->active_disks++;
                                sb->spare_disks--;
                        }
                }
        }
        return 0;
}

#undef LAST_DISK
#undef NO_SPARE_DISK
#undef DISK_FAILED


static void print_multipath_conf (multipath_conf_t *conf)
{
        int i;
        struct multipath_info *tmp;

        printk("MULTIPATH conf printout:\n");
        if (!conf) {
                printk("(conf==NULL)\n");
                return;
        }
        printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks,
                         conf->raid_disks, conf->nr_disks);

        for (i = 0; i < MD_SB_DISKS; i++) {
                tmp = conf->multipaths + i;
                if (tmp->spare || tmp->operational || tmp->number ||
                                tmp->raid_disk || tmp->used_slot)
                        printk(" disk%d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
                                i, tmp->spare,tmp->operational,
                                tmp->number,tmp->raid_disk,tmp->used_slot,
                                partition_name(tmp->dev));
        }
}

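/*
 * The diskop state machine, called from the md core (and from
 * multipath_error() above) under conf->device_lock:
 *
 *      DISKOP_SPARE_WRITE    - mark a spare path operational
 *      DISKOP_SPARE_INACTIVE - take a spare path back offline
 *      DISKOP_SPARE_ACTIVE   - swap a synced spare into the slot of a
 *                              failed path (descriptors, rdevs and
 *                              raid_disk/number values are exchanged)
 *      DISKOP_HOT_REMOVE_DISK / DISKOP_HOT_ADD_DISK - hot-plug paths
 */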
static int multipath_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
{
        int err = 0;
        int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
        multipath_conf_t *conf = mddev->private;
        struct multipath_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
        mdp_super_t *sb = mddev->sb;
        mdp_disk_t *failed_desc, *spare_desc, *added_desc;
        mdk_rdev_t *spare_rdev, *failed_rdev;

        print_multipath_conf(conf);
        md_spin_lock_irq(&conf->device_lock);
        /*
         * find the disk ...
         */
        switch (state) {

        case DISKOP_SPARE_ACTIVE:

                /*
                 * Find the failed disk within the MULTIPATH configuration ...
                 * (this can only be in the first conf->working_disks part)
                 */
                for (i = 0; i < conf->raid_disks; i++) {
                        tmp = conf->multipaths + i;
                        if ((!tmp->operational && !tmp->spare) ||
                                        !tmp->used_slot) {
                                failed_disk = i;
                                break;
                        }
                }
                /*
                 * When we activate a spare disk we _must_ have a disk in
                 * the lower (active) part of the array to replace.
                 */
                if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }
                /* fall through */

        case DISKOP_SPARE_WRITE:
        case DISKOP_SPARE_INACTIVE:

                /*
                 * Find the spare disk ... (can only be in the 'high'
                 * area of the array)
                 */
                for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
                        tmp = conf->multipaths + i;
                        if (tmp->spare && tmp->number == (*d)->number) {
                                spare_disk = i;
                                break;
                        }
                }
                if (spare_disk == -1) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }
                break;

        case DISKOP_HOT_REMOVE_DISK:

                for (i = 0; i < MD_SB_DISKS; i++) {
                        tmp = conf->multipaths + i;
                        if (tmp->used_slot && (tmp->number == (*d)->number)) {
                                if (tmp->operational) {
                                        printk(KERN_ERR "hot-remove-disk, slot %d is identified to be the requested disk (number %d), but is still operational!\n", i, (*d)->number);
                                        err = -EBUSY;
                                        goto abort;
                                }
                                removed_disk = i;
                                break;
                        }
                }
                if (removed_disk == -1) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }
                break;

        case DISKOP_HOT_ADD_DISK:

                for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
                        tmp = conf->multipaths + i;
                        if (!tmp->used_slot) {
                                added_disk = i;
                                break;
                        }
                }
                if (added_disk == -1) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }
                break;
        }

        switch (state) {
        /*
         * Switch the spare disk to write-only mode:
         */
        case DISKOP_SPARE_WRITE:
                sdisk = conf->multipaths + spare_disk;
                sdisk->operational = 1;
                break;
        /*
         * Deactivate a spare disk:
         */
        case DISKOP_SPARE_INACTIVE:
                sdisk = conf->multipaths + spare_disk;
                sdisk->operational = 0;
                break;
        /*
         * Activate (mark read-write) the (now sync) spare disk,
         * which means we switch its 'raid position' (->raid_disk)
         * with the failed disk. (only the first 'conf->nr_disks'
         * slots are used for 'real' disks and we must preserve this
         * property)
         */
        case DISKOP_SPARE_ACTIVE:
                sdisk = conf->multipaths + spare_disk;
                fdisk = conf->multipaths + failed_disk;

                spare_desc = &sb->disks[sdisk->number];
                failed_desc = &sb->disks[fdisk->number];

                if (spare_desc != *d) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }

                if (spare_desc->raid_disk != sdisk->raid_disk) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }

                if (sdisk->raid_disk != spare_disk) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }

                if (failed_desc->raid_disk != fdisk->raid_disk) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }

                if (fdisk->raid_disk != failed_disk) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }

                /*
                 * do the switch finally
                 */
                spare_rdev = find_rdev_nr(mddev, spare_desc->number);
                failed_rdev = find_rdev_nr(mddev, failed_desc->number);
                xchg_values(spare_rdev->desc_nr, failed_rdev->desc_nr);
                spare_rdev->alias_device = 0;
                failed_rdev->alias_device = 1;

                xchg_values(*spare_desc, *failed_desc);
                xchg_values(*fdisk, *sdisk);

                /*
                 * (careful, 'failed' and 'spare' are switched from now on)
                 *
                 * we want to preserve linear numbering and we want to
                 * give the proper raid_disk number to the now activated
                 * disk. (this means we switch back these values)
                 */

                xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
                xchg_values(sdisk->raid_disk, fdisk->raid_disk);
                xchg_values(spare_desc->number, failed_desc->number);
                xchg_values(sdisk->number, fdisk->number);

                *d = failed_desc;

                if (sdisk->dev == MKDEV(0,0))
                        sdisk->used_slot = 0;
                /*
                 * this really activates the spare.
                 */
                fdisk->spare = 0;

                /*
                 * if we activate a spare, we definitely replace a
                 * non-operational disk slot in the 'low' area of
                 * the disk array.
                 */

                conf->working_disks++;

                break;

        case DISKOP_HOT_REMOVE_DISK:
                rdisk = conf->multipaths + removed_disk;

                if (rdisk->spare && (removed_disk < conf->raid_disks)) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }
                rdisk->dev = MKDEV(0,0);
                rdisk->used_slot = 0;
                conf->nr_disks--;
                break;

        case DISKOP_HOT_ADD_DISK:
                adisk = conf->multipaths + added_disk;
                added_desc = *d;

                if (added_disk != added_desc->number) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }

                adisk->number = added_desc->number;
                adisk->raid_disk = added_desc->raid_disk;
                adisk->dev = MKDEV(added_desc->major,added_desc->minor);

                adisk->operational = 0;
                adisk->spare = 1;
                adisk->used_slot = 1;
                conf->nr_disks++;

                break;

        default:
                MD_BUG();
                err = 1;
                goto abort;
        }
abort:
        md_spin_unlock_irq(&conf->device_lock);

        print_multipath_conf(conf);
        return err;
}


#define IO_ERROR KERN_ALERT \
"multipath: %s: unrecoverable IO read error for block %lu\n"

#define REDIRECT_SECTOR KERN_ERR \
"multipath: %s: redirecting sector %lu to another IO path\n"

/*
 * This is a kernel thread which:
 *
 *      1.      Retries failed read operations on working multipaths.
 *      2.      Updates the raid superblock when problems are encountered.
 *      3.      Performs writes following reads for array synchronising.
 */

static void multipathd (void *data)
{
        struct multipath_bh *mp_bh;
        struct buffer_head *bh;
        unsigned long flags;
        mddev_t *mddev;
        kdev_t dev;


        for (;;) {
                md_spin_lock_irqsave(&retry_list_lock, flags);
                mp_bh = multipath_retry_list;
                if (!mp_bh)
                        break;
                multipath_retry_list = mp_bh->next_mp;
                md_spin_unlock_irqrestore(&retry_list_lock, flags);

                mddev = mp_bh->mddev;
                if (mddev->sb_dirty)
                        md_update_sb(mddev);
                bh = &mp_bh->bh_req;
                dev = bh->b_dev;

                multipath_map (mddev, &bh->b_dev);
                if (bh->b_dev == dev) {
                        printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
                        multipath_end_bh_io(mp_bh, 0);
                } else {
                        printk (REDIRECT_SECTOR,
                                partition_name(bh->b_dev), bh->b_blocknr);
                        bh->b_rdev = bh->b_dev;
                        bh->b_rsector = bh->b_blocknr;
                        generic_make_request (mp_bh->cmd, bh);
                }
        }
        md_spin_unlock_irqrestore(&retry_list_lock, flags);
}
#undef IO_ERROR
#undef REDIRECT_SECTOR

/*
 * This will catch the scenario in which one of the multipaths was
 * mounted as a normal device rather than as a part of a raid set.
 *
 * check_consistency is very personality-dependent; e.g. RAID5 cannot
 * do this check, it uses another method.
 */
static int __check_consistency (mddev_t *mddev, int row)
{
        multipath_conf_t *conf = mddev_to_conf(mddev);
        int disks = MD_SB_DISKS;
        kdev_t dev;
        struct buffer_head *bh = NULL;
        int i, rc = 0;
        char *buffer = NULL;

        for (i = 0; i < disks; i++) {
                if (!conf->multipaths[i].operational)
                        continue;
                printk("(checking disk %d)\n",i);
                dev = conf->multipaths[i].dev;
                set_blocksize(dev, 4096);
                if ((bh = bread(dev, row / 4, 4096)) == NULL)
                        break;
                if (!buffer) {
                        buffer = (char *) __get_free_page(GFP_KERNEL);
                        if (!buffer)
                                break;
                        memcpy(buffer, bh->b_data, 4096);
                } else if (memcmp(buffer, bh->b_data, 4096)) {
                        rc = 1;
                        break;
                }
                bforget(bh);
                fsync_dev(dev);
                invalidate_buffers(dev);
                bh = NULL;
        }
        if (buffer)
                free_page((unsigned long) buffer);
        if (bh) {
                dev = bh->b_dev;
                bforget(bh);
                fsync_dev(dev);
                invalidate_buffers(dev);
        }
        return rc;
}

static int check_consistency (mddev_t *mddev)
{
        if (__check_consistency(mddev, 0))
/*
 * we do not do this currently, as it's perfectly possible to
 * have an inconsistent array when it's freshly created. Only
 * newly written data has to be consistent.
 */
                return 0;

        return 0;
}

#define INVALID_LEVEL KERN_WARNING \
"multipath: md%d: raid level not set to multipath IO (%d)\n"

#define NO_SB KERN_ERR \
"multipath: disabled IO path %s (couldn't access raid superblock)\n"

#define ERRORS KERN_ERR \
"multipath: disabled IO path %s (errors detected)\n"

#define NOT_IN_SYNC KERN_ERR \
"multipath: making IO path %s a spare path (not in sync)\n"

#define INCONSISTENT KERN_ERR \
"multipath: disabled IO path %s (inconsistent descriptor)\n"

#define ALREADY_RUNNING KERN_ERR \
"multipath: disabled IO path %s (multipath %d already operational)\n"

#define OPERATIONAL KERN_INFO \
"multipath: device %s operational as IO path %d\n"

#define MEM_ERROR KERN_ERR \
"multipath: couldn't allocate memory for md%d\n"

#define SPARE KERN_INFO \
"multipath: spare IO path %s\n"

#define NONE_OPERATIONAL KERN_ERR \
"multipath: no operational IO paths for md%d\n"

#define SB_DIFFERENCES KERN_ERR \
"multipath: detected IO path differences!\n"

#define ARRAY_IS_ACTIVE KERN_INFO \
"multipath: array md%d active with %d out of %d IO paths (%d spare IO paths)\n"

#define THREAD_ERROR KERN_ERR \
"multipath: couldn't allocate thread for md%d\n"

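/*
 * Set up the personality at array start: copy the verified rdevs into
 * conf->multipaths, pick one active path (raid_disks is forced to 1
 * and every other path becomes a spare), move the active path to
 * descriptor slot 0, pre-allocate NR_RESERVED_BUFS buffers and start
 * the multipathd thread.
 */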
static int multipath_run (mddev_t *mddev)
{
        multipath_conf_t *conf;
        int i, j, disk_idx;
        struct multipath_info *disk, *disk2;
        mdp_super_t *sb = mddev->sb;
        mdp_disk_t *desc, *desc2;
        mdk_rdev_t *rdev, *def_rdev = NULL;
        struct md_list_head *tmp;
        int num_rdevs = 0;

        MOD_INC_USE_COUNT;

        if (sb->level != -4) {
                printk(INVALID_LEVEL, mdidx(mddev), sb->level);
                goto out;
        }
        /*
         * copy the already verified devices into our private MULTIPATH
         * bookkeeping area. [whatever we allocate in multipath_run(),
         * should be freed in multipath_stop()]
         */

        conf = kmalloc(sizeof(multipath_conf_t), GFP_KERNEL);
        mddev->private = conf;
        if (!conf) {
                printk(MEM_ERROR, mdidx(mddev));
                goto out;
        }
        memset(conf, 0, sizeof(*conf));

        ITERATE_RDEV(mddev,rdev,tmp) {
                if (rdev->faulty) {
                        /* this is a "should never happen" case and if it */
                        /* ever does happen, a continue; won't help */
                        printk(ERRORS, partition_name(rdev->dev));
                        continue;
                } else {
                        /* this is a "should never happen" case and if it */
                        /* ever does happen, a continue; won't help */
                        if (!rdev->sb) {
                                MD_BUG();
                                continue;
                        }
                }
                if (rdev->desc_nr == -1) {
                        MD_BUG();
                        continue;
                }

                desc = &sb->disks[rdev->desc_nr];
                disk_idx = desc->raid_disk;
                disk = conf->multipaths + disk_idx;

                if (!disk_sync(desc))
                        printk(NOT_IN_SYNC, partition_name(rdev->dev));

                /*
                 * Mark all disks as spare to start with, then pick our
                 * active disk.  If we have a disk that is marked active
                 * in the sb, then use it, else use the first rdev.
                 */
                disk->number = desc->number;
                disk->raid_disk = desc->raid_disk;
                disk->dev = rdev->dev;
                disk->operational = 0;
                disk->spare = 1;
                disk->used_slot = 1;
                mark_disk_sync(desc);

                if (disk_active(desc)) {
                        if(!conf->working_disks) {
                                printk(OPERATIONAL, partition_name(rdev->dev),
                                        desc->raid_disk);
                                disk->operational = 1;
                                disk->spare = 0;
                                conf->working_disks++;
                                def_rdev = rdev;
                        } else {
                                mark_disk_spare(desc);
                        }
                } else
                        mark_disk_spare(desc);

                if(!num_rdevs++) def_rdev = rdev;
        }
        if(!conf->working_disks && num_rdevs) {
                desc = &sb->disks[def_rdev->desc_nr];
                disk = conf->multipaths + desc->raid_disk;
                printk(OPERATIONAL, partition_name(def_rdev->dev),
                        disk->raid_disk);
                disk->operational = 1;
                disk->spare = 0;
                conf->working_disks++;
                mark_disk_active(desc);
        }
        /*
         * Make sure our active path is in desc spot 0
         */
        if(def_rdev->desc_nr != 0) {
                rdev = find_rdev_nr(mddev, 0);
                desc = &sb->disks[def_rdev->desc_nr];
                desc2 = sb->disks;
                disk = conf->multipaths + desc->raid_disk;
                disk2 = conf->multipaths + desc2->raid_disk;
                xchg_values(*desc2,*desc);
                xchg_values(*disk2,*disk);
                xchg_values(desc2->number, desc->number);
                xchg_values(disk2->number, disk->number);
                xchg_values(desc2->raid_disk, desc->raid_disk);
                xchg_values(disk2->raid_disk, disk->raid_disk);
                if(rdev) {
                        xchg_values(def_rdev->desc_nr,rdev->desc_nr);
                } else {
                        def_rdev->desc_nr = 0;
                }
        }
        conf->raid_disks = sb->raid_disks = sb->active_disks = 1;
        conf->nr_disks = sb->nr_disks = sb->working_disks = num_rdevs;
        sb->failed_disks = 0;
        sb->spare_disks = num_rdevs - 1;
        mddev->sb_dirty = 1;
        conf->mddev = mddev;
        conf->device_lock = MD_SPIN_LOCK_UNLOCKED;

        init_waitqueue_head(&conf->wait_buffer);

        if (!conf->working_disks) {
                printk(NONE_OPERATIONAL, mdidx(mddev));
                goto out_free_conf;
        }


        /* pre-allocate some buffer_head structures.
         * As a minimum, 1 mpbh and raid_disks buffer_heads
         * would probably get us by in tight memory situations,
         * but a few more is probably a good idea.
         * For now, try NR_RESERVED_BUFS mpbh and
         * NR_RESERVED_BUFS*raid_disks bufferheads
         * This will allow at least NR_RESERVED_BUFS concurrent
         * reads or writes even if kmalloc starts failing
         */
        if (multipath_grow_mpbh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS) {
                printk(MEM_ERROR, mdidx(mddev));
                goto out_free_conf;
        }

        if ((sb->state & (1 << MD_SB_CLEAN))) {
                /*
                 * we do sanity checks even if the device says
                 * it's clean ...
                 */
                if (check_consistency(mddev)) {
                        printk(SB_DIFFERENCES);
                        sb->state &= ~(1 << MD_SB_CLEAN);
                }
        }

        {
                const char * name = "multipathd";

                conf->thread = md_register_thread(multipathd, conf, name);
                if (!conf->thread) {
                        printk(THREAD_ERROR, mdidx(mddev));
                        goto out_free_conf;
                }
        }

        /*
         * Regenerate the "device is in sync with the raid set" bit for
         * each device.
         */
        for (i = 0; i < MD_SB_DISKS; i++) {
                mark_disk_nonsync(sb->disks+i);
                for (j = 0; j < sb->raid_disks; j++) {
                        if (sb->disks[i].number == conf->multipaths[j].number)
                                mark_disk_sync(sb->disks+i);
                }
        }

        printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks,
                        sb->raid_disks, sb->spare_disks);
        /*
         * Ok, everything is just fine now
         */
        return 0;

out_free_conf:
        multipath_shrink_mpbh(conf);
        kfree(conf);
        mddev->private = NULL;
out:
        MOD_DEC_USE_COUNT;
        return -EIO;
}

#undef INVALID_LEVEL
#undef NO_SB
#undef ERRORS
#undef NOT_IN_SYNC
#undef INCONSISTENT
#undef ALREADY_RUNNING
#undef OPERATIONAL
#undef SPARE
#undef NONE_OPERATIONAL
#undef SB_DIFFERENCES
#undef ARRAY_IS_ACTIVE

static int multipath_stop (mddev_t *mddev)
{
        multipath_conf_t *conf = mddev_to_conf(mddev);

        md_unregister_thread(conf->thread);
        multipath_shrink_mpbh(conf);
        kfree(conf);
        mddev->private = NULL;
        MOD_DEC_USE_COUNT;
        return 0;
}

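/*
 * Hook the personality into the md core.  Registration happens in
 * multipath_init() below via register_md_personality(); the md layer
 * then calls make_request/run/stop/status/error_handler/diskop
 * through this table.
 */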
static mdk_personality_t multipath_personality=
{
        name:           "multipath",
        make_request:   multipath_make_request,
        run:            multipath_run,
        stop:           multipath_stop,
        status:         multipath_status,
        error_handler:  multipath_error,
        diskop:         multipath_diskop,
};

static int md__init multipath_init (void)
{
        return register_md_personality (MULTIPATH, &multipath_personality);
}

static void multipath_exit (void)
{
        unregister_md_personality (MULTIPATH);
}

module_init(multipath_init);
module_exit(multipath_exit);
MODULE_LICENSE("GPL");
