OpenCores
URL: https://opencores.org/ocsvn/or1k/or1k/trunk
Subversion repository: or1k
File: or1k/trunk/linux/linux-2.4/drivers/md/raid5.c (blame information for rev 1765)

/*
 * raid5.c : Multiple Devices driver for Linux
 *         Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *         Copyright (C) 1999, 2000 Ingo Molnar
 *
 * RAID-5 management functions.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */


#include <linux/config.h>
#include <linux/module.h>
#include <linux/locks.h>
#include <linux/slab.h>
#include <linux/raid/raid5.h>
#include <asm/bitops.h>
#include <asm/atomic.h>

static mdk_personality_t raid5_personality;

/*
 * Stripe cache
 */

#define NR_STRIPES              256
#define IO_THRESHOLD            1
#define HASH_PAGES              1
#define HASH_PAGES_ORDER        0
#define NR_HASH                 (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *))
#define HASH_MASK               (NR_HASH - 1)
#define stripe_hash(conf, sect) ((conf)->stripe_hashtbl[((sect) / ((conf)->buffer_size >> 9)) & HASH_MASK])
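/*
 * Example: with HASH_PAGES == 1, 4K pages and 4-byte pointers, NR_HASH
 * is 1024 and HASH_MASK is 1023.  For buffer_size == 4096 (8 sectors
 * per cache buffer), sector 16384 hashes to bucket
 * (16384 / 8) & 1023 == 2048 & 1023 == 0.
 */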
 
/*
 * The following can be used to debug the driver
 */
#define RAID5_DEBUG     0
#define RAID5_PARANOIA  1
#if RAID5_PARANOIA && CONFIG_SMP
# define CHECK_DEVLOCK() if (!spin_is_locked(&conf->device_lock)) BUG()
#else
# define CHECK_DEVLOCK()
#endif

#if RAID5_DEBUG
#define PRINTK(x...) printk(x)
#define inline
#define __inline__
#else
#define PRINTK(x...) do { } while (0)
#endif

static void print_raid5_conf (raid5_conf_t *conf);

static inline void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
{
        if (atomic_dec_and_test(&sh->count)) {
                if (!list_empty(&sh->lru))
                        BUG();
                if (atomic_read(&conf->active_stripes)==0)
                        BUG();
                if (test_bit(STRIPE_HANDLE, &sh->state)) {
                        if (test_bit(STRIPE_DELAYED, &sh->state))
                                list_add_tail(&sh->lru, &conf->delayed_list);
                        else
                                list_add_tail(&sh->lru, &conf->handle_list);
                        md_wakeup_thread(conf->thread);
                } else {
                        if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
                                atomic_dec(&conf->preread_active_stripes);
                                if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
                                        md_wakeup_thread(conf->thread);
                        }
                        list_add_tail(&sh->lru, &conf->inactive_list);
                        atomic_dec(&conf->active_stripes);
                        if (!conf->inactive_blocked ||
                            atomic_read(&conf->active_stripes) < (NR_STRIPES*3/4))
                                wake_up(&conf->wait_for_stripe);
                }
        }
}
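/*
 * A stripe_head lives on exactly one list at a time, all managed under
 * conf->device_lock: inactive_list (idle, refcount zero), handle_list
 * (queued for raid5d) or delayed_list (write waiting for pre-reads).
 * __release_stripe drops the last reference and requeues the stripe
 * accordingly; waiters on wait_for_stripe are woken once the cache is
 * not blocked or usage drops below 3/4 of NR_STRIPES.
 */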
static void release_stripe(struct stripe_head *sh)
{
        raid5_conf_t *conf = sh->raid_conf;
        unsigned long flags;

        spin_lock_irqsave(&conf->device_lock, flags);
        __release_stripe(conf, sh);
        spin_unlock_irqrestore(&conf->device_lock, flags);
}

static void remove_hash(struct stripe_head *sh)
{
        PRINTK("remove_hash(), stripe %lu\n", sh->sector);

        if (sh->hash_pprev) {
                if (sh->hash_next)
                        sh->hash_next->hash_pprev = sh->hash_pprev;
                *sh->hash_pprev = sh->hash_next;
                sh->hash_pprev = NULL;
        }
}

static __inline__ void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
{
        struct stripe_head **shp = &stripe_hash(conf, sh->sector);

        PRINTK("insert_hash(), stripe %lu\n",sh->sector);

        CHECK_DEVLOCK();
        if ((sh->hash_next = *shp) != NULL)
                (*shp)->hash_pprev = &sh->hash_next;
        *shp = sh;
        sh->hash_pprev = shp;
}
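/*
 * The hash chains use the classic pprev idiom: hash_pprev points back
 * at whatever pointer currently points at this stripe (the table slot
 * or the previous element's hash_next), so remove_hash can unlink in
 * O(1) without a doubly linked list or a chain walk.
 */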
 

/* find an idle stripe, make sure it is unhashed, and return it. */
static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
{
        struct stripe_head *sh = NULL;
        struct list_head *first;

        CHECK_DEVLOCK();
        if (list_empty(&conf->inactive_list))
                goto out;
        first = conf->inactive_list.next;
        sh = list_entry(first, struct stripe_head, lru);
        list_del_init(first);
        remove_hash(sh);
        atomic_inc(&conf->active_stripes);
out:
        return sh;
}

static void shrink_buffers(struct stripe_head *sh, int num)
{
        struct buffer_head *bh;
        int i;

        for (i=0; i<num ; i++) {
                bh = sh->bh_cache[i];
                if (!bh)
                        return;
                sh->bh_cache[i] = NULL;
                free_page((unsigned long) bh->b_data);
                kfree(bh);
        }
}

static int grow_buffers(struct stripe_head *sh, int num, int b_size, int priority)
{
        struct buffer_head *bh;
        int i;

        for (i=0; i<num; i++) {
                struct page *page;
                bh = kmalloc(sizeof(struct buffer_head), priority);
                if (!bh)
                        return 1;
                memset(bh, 0, sizeof (struct buffer_head));
                init_waitqueue_head(&bh->b_wait);
                if ((page = alloc_page(priority)))
                        bh->b_data = page_address(page);
                else {
                        kfree(bh);
                        return 1;
                }
                atomic_set(&bh->b_count, 0);
                bh->b_page = page;
                sh->bh_cache[i] = bh;

        }
        return 0;
}

static struct buffer_head *raid5_build_block (struct stripe_head *sh, int i);

static inline void init_stripe(struct stripe_head *sh, unsigned long sector)
{
        raid5_conf_t *conf = sh->raid_conf;
        int disks = conf->raid_disks, i;

        if (atomic_read(&sh->count) != 0)
                BUG();
        if (test_bit(STRIPE_HANDLE, &sh->state))
                BUG();

        CHECK_DEVLOCK();
        PRINTK("init_stripe called, stripe %lu\n", sh->sector);

        remove_hash(sh);

        sh->sector = sector;
        sh->size = conf->buffer_size;
        sh->state = 0;

        for (i=disks; i--; ) {
                if (sh->bh_read[i] || sh->bh_write[i] || sh->bh_written[i] ||
                    buffer_locked(sh->bh_cache[i])) {
                        printk("sector=%lx i=%d %p %p %p %d\n",
                               sh->sector, i, sh->bh_read[i],
                               sh->bh_write[i], sh->bh_written[i],
                               buffer_locked(sh->bh_cache[i]));
                        BUG();
                }
                clear_bit(BH_Uptodate, &sh->bh_cache[i]->b_state);
                raid5_build_block(sh, i);
        }
        insert_hash(conf, sh);
}

/* the buffer size has changed, so unhash all stripes
 * as active stripes complete, they will go onto inactive list
 */
static void shrink_stripe_cache(raid5_conf_t *conf)
{
        int i;
        CHECK_DEVLOCK();
        if (atomic_read(&conf->active_stripes))
                BUG();
        for (i=0; i < NR_HASH; i++) {
                struct stripe_head *sh;
                while ((sh = conf->stripe_hashtbl[i]))
                        remove_hash(sh);
        }
}

static struct stripe_head *__find_stripe(raid5_conf_t *conf, unsigned long sector)
{
        struct stripe_head *sh;

        CHECK_DEVLOCK();
        PRINTK("__find_stripe, sector %lu\n", sector);
        for (sh = stripe_hash(conf, sector); sh; sh = sh->hash_next)
                if (sh->sector == sector)
                        return sh;
        PRINTK("__stripe %lu not in cache\n", sector);
        return NULL;
}

static struct stripe_head *get_active_stripe(raid5_conf_t *conf, unsigned long sector, int size, int noblock)
{
        struct stripe_head *sh;

        PRINTK("get_stripe, sector %lu\n", sector);

        md_spin_lock_irq(&conf->device_lock);

        do {
                if (conf->buffer_size == 0 ||
                    (size && size != conf->buffer_size)) {
                        /* either the size is being changed (buffer_size==0) or
                         * we need to change it.
                         * If size==0, we can proceed as soon as buffer_size gets set.
                         * If size>0, we can proceed when active_stripes reaches 0, or
                         * when someone else sets the buffer_size to size.
                         * If someone sets the buffer size to something else, we will need to
                         * assert that we want to change it again
                         */
                        int oldsize = conf->buffer_size;
                        PRINTK("get_stripe %ld/%d buffer_size is %d, %d active\n", sector, size, conf->buffer_size, atomic_read(&conf->active_stripes));
                        if (size==0)
                                wait_event_lock_irq(conf->wait_for_stripe,
                                                    conf->buffer_size,
                                                    conf->device_lock);
                        else {
                                while (conf->buffer_size != size && atomic_read(&conf->active_stripes)) {
                                        conf->buffer_size = 0;
                                        wait_event_lock_irq(conf->wait_for_stripe,
                                                            atomic_read(&conf->active_stripes)==0 || conf->buffer_size,
                                                            conf->device_lock);
                                        PRINTK("waited and now  %ld/%d buffer_size is %d - %d active\n", sector, size,
                                               conf->buffer_size, atomic_read(&conf->active_stripes));
                                }

                                if (conf->buffer_size != size) {
                                        printk("raid5: switching cache buffer size, %d --> %d\n", oldsize, size);
                                        shrink_stripe_cache(conf);
                                        if (size==0) BUG();
                                        conf->buffer_size = size;
                                        PRINTK("size now %d\n", conf->buffer_size);
                                }
                        }
                }
                if (size == 0)
                        sector -= sector & ((conf->buffer_size>>9)-1);

                sh = __find_stripe(conf, sector);
                if (!sh) {
                        if (!conf->inactive_blocked)
                                sh = get_free_stripe(conf);
                        if (noblock && sh == NULL)
                                break;
                        if (!sh) {
                                conf->inactive_blocked = 1;
                                wait_event_lock_irq(conf->wait_for_stripe,
                                                    !list_empty(&conf->inactive_list) &&
                                                    (atomic_read(&conf->active_stripes) < (NR_STRIPES *3/4)
                                                     || !conf->inactive_blocked),
                                                    conf->device_lock);
                                conf->inactive_blocked = 0;
                        } else
                                init_stripe(sh, sector);
                } else {
                        if (atomic_read(&sh->count)) {
                                if (!list_empty(&sh->lru))
                                        BUG();
                        } else {
                                if (!test_bit(STRIPE_HANDLE, &sh->state))
                                        atomic_inc(&conf->active_stripes);
                                if (list_empty(&sh->lru))
                                        BUG();
                                list_del_init(&sh->lru);
                        }
                }
        } while (sh == NULL);

        if (sh)
                atomic_inc(&sh->count);

        md_spin_unlock_irq(&conf->device_lock);
        return sh;
}
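/*
 * Note that get_active_stripe can return NULL only in the noblock
 * case; otherwise it sleeps on wait_for_stripe until a free stripe
 * (or the requested buffer_size) becomes available.
 */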
 
static int grow_stripes(raid5_conf_t *conf, int num, int priority)
{
        struct stripe_head *sh;

        while (num--) {
                sh = kmalloc(sizeof(struct stripe_head), priority);
                if (!sh)
                        return 1;
                memset(sh, 0, sizeof(*sh));
                sh->raid_conf = conf;
                sh->lock = SPIN_LOCK_UNLOCKED;

                if (grow_buffers(sh, conf->raid_disks, PAGE_SIZE, priority)) {
                        shrink_buffers(sh, conf->raid_disks);
                        kfree(sh);
                        return 1;
                }
                /* we just created an active stripe so... */
                atomic_set(&sh->count, 1);
                atomic_inc(&conf->active_stripes);
                INIT_LIST_HEAD(&sh->lru);
                release_stripe(sh);
        }
        return 0;
}
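/*
 * Each stripe preallocates one page per disk, so the whole cache costs
 * roughly NR_STRIPES * raid_disks * PAGE_SIZE bytes; e.g. 256 stripes
 * over 8 disks with 4K pages is about 8MB.
 */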
 
static void shrink_stripes(raid5_conf_t *conf, int num)
{
        struct stripe_head *sh;

        while (num--) {
                spin_lock_irq(&conf->device_lock);
                sh = get_free_stripe(conf);
                spin_unlock_irq(&conf->device_lock);
                if (!sh)
                        break;
                if (atomic_read(&sh->count))
                        BUG();
                shrink_buffers(sh, conf->raid_disks);
                kfree(sh);
                atomic_dec(&conf->active_stripes);
        }
}


static void raid5_end_read_request (struct buffer_head * bh, int uptodate)
{
        struct stripe_head *sh = bh->b_private;
        raid5_conf_t *conf = sh->raid_conf;
        int disks = conf->raid_disks, i;
        unsigned long flags;

        for (i=0 ; i<disks; i++)
                if (bh == sh->bh_cache[i])
                        break;

        PRINTK("end_read_request %lu/%d, count: %d, uptodate %d.\n", sh->sector, i, atomic_read(&sh->count), uptodate);
        if (i == disks) {
                BUG();
                return;
        }

        if (uptodate) {
                struct buffer_head *buffer;
                spin_lock_irqsave(&conf->device_lock, flags);
                /* we can return a buffer if we bypassed the cache or
                 * if the top buffer is not in highmem.  If there are
                 * multiple buffers, leave the extra work to
                 * handle_stripe
                 */
                buffer = sh->bh_read[i];
                if (buffer &&
                    (!PageHighMem(buffer->b_page)
                     || buffer->b_page == bh->b_page )
                        ) {
                        sh->bh_read[i] = buffer->b_reqnext;
                        buffer->b_reqnext = NULL;
                } else
                        buffer = NULL;
                spin_unlock_irqrestore(&conf->device_lock, flags);
                if (sh->bh_page[i]==NULL)
                        set_bit(BH_Uptodate, &bh->b_state);
                if (buffer) {
                        if (buffer->b_page != bh->b_page)
                                memcpy(buffer->b_data, bh->b_data, bh->b_size);
                        buffer->b_end_io(buffer, 1);
                }
        } else {
                md_error(conf->mddev, bh->b_dev);
                clear_bit(BH_Uptodate, &bh->b_state);
        }
        /* must restore b_page before unlocking buffer... */
        if (sh->bh_page[i]) {
                bh->b_page = sh->bh_page[i];
                bh->b_data = page_address(bh->b_page);
                sh->bh_page[i] = NULL;
                clear_bit(BH_Uptodate, &bh->b_state);
        }
        clear_bit(BH_Lock, &bh->b_state);
        set_bit(STRIPE_HANDLE, &sh->state);
        release_stripe(sh);
}

static void raid5_end_write_request (struct buffer_head *bh, int uptodate)
{
        struct stripe_head *sh = bh->b_private;
        raid5_conf_t *conf = sh->raid_conf;
        int disks = conf->raid_disks, i;
        unsigned long flags;

        for (i=0 ; i<disks; i++)
                if (bh == sh->bh_cache[i])
                        break;

        PRINTK("end_write_request %lu/%d, count %d, uptodate: %d.\n", sh->sector, i, atomic_read(&sh->count), uptodate);
        if (i == disks) {
                BUG();
                return;
        }

        md_spin_lock_irqsave(&conf->device_lock, flags);
        if (!uptodate)
                md_error(conf->mddev, bh->b_dev);
        clear_bit(BH_Lock, &bh->b_state);
        set_bit(STRIPE_HANDLE, &sh->state);
        __release_stripe(conf, sh);
        md_spin_unlock_irqrestore(&conf->device_lock, flags);
}



static struct buffer_head *raid5_build_block (struct stripe_head *sh, int i)
{
        raid5_conf_t *conf = sh->raid_conf;
        struct buffer_head *bh = sh->bh_cache[i];
        unsigned long block = sh->sector / (sh->size >> 9);

        init_buffer(bh, raid5_end_read_request, sh);
        bh->b_dev       = conf->disks[i].dev;
        bh->b_blocknr   = block;

        bh->b_state     = (1 << BH_Req) | (1 << BH_Mapped);
        bh->b_size      = sh->size;
        bh->b_list      = BUF_LOCKED;
        return bh;
}

static int raid5_error (mddev_t *mddev, kdev_t dev)
{
        raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
        mdp_super_t *sb = mddev->sb;
        struct disk_info *disk;
        int i;

        PRINTK("raid5_error called\n");

        for (i = 0, disk = conf->disks; i < conf->raid_disks; i++, disk++) {
                if (disk->dev == dev) {
                        if (disk->operational) {
                                disk->operational = 0;
                                mark_disk_faulty(sb->disks+disk->number);
                                mark_disk_nonsync(sb->disks+disk->number);
                                mark_disk_inactive(sb->disks+disk->number);
                                sb->active_disks--;
                                sb->working_disks--;
                                sb->failed_disks++;
                                mddev->sb_dirty = 1;
                                conf->working_disks--;
                                conf->failed_disks++;
                                md_wakeup_thread(conf->thread);
                                printk (KERN_ALERT
                                        "raid5: Disk failure on %s, disabling device."
                                        " Operation continuing on %d devices\n",
                                        partition_name (dev), conf->working_disks);
                        }
                        return 0;
                }
        }
        /*
         * handle errors in spares (during reconstruction)
         */
        if (conf->spare) {
                disk = conf->spare;
                if (disk->dev == dev) {
                        printk (KERN_ALERT
                                "raid5: Disk failure on spare %s\n",
                                partition_name (dev));
                        if (!conf->spare->operational) {
                                /* probably a SET_DISK_FAULTY ioctl */
                                return -EIO;
                        }
                        disk->operational = 0;
                        disk->write_only = 0;
                        conf->spare = NULL;
                        mark_disk_faulty(sb->disks+disk->number);
                        mark_disk_nonsync(sb->disks+disk->number);
                        mark_disk_inactive(sb->disks+disk->number);
                        sb->spare_disks--;
                        sb->working_disks--;
                        sb->failed_disks++;

                        mddev->sb_dirty = 1;
                        md_wakeup_thread(conf->thread);

                        return 0;
                }
        }
        MD_BUG();
        return -EIO;
}

/*
 * Input: a 'big' sector number,
 * Output: index of the data and parity disk, and the sector # in them.
 */
static unsigned long raid5_compute_sector(unsigned long r_sector, unsigned int raid_disks,
                        unsigned int data_disks, unsigned int * dd_idx,
                        unsigned int * pd_idx, raid5_conf_t *conf)
{
        unsigned long stripe;
        unsigned long chunk_number;
        unsigned int chunk_offset;
        unsigned long new_sector;
        int sectors_per_chunk = conf->chunk_size >> 9;

        /* First compute the information on this sector */

        /*
         * Compute the chunk number and the sector offset inside the chunk
         */
        chunk_number = r_sector / sectors_per_chunk;
        chunk_offset = r_sector % sectors_per_chunk;

        /*
         * Compute the stripe number
         */
        stripe = chunk_number / data_disks;

        /*
         * Compute the data disk and parity disk indexes inside the stripe
         */
        *dd_idx = chunk_number % data_disks;

        /*
         * Select the parity disk based on the user selected algorithm.
         */
        if (conf->level == 4)
                *pd_idx = data_disks;
        else switch (conf->algorithm) {
                case ALGORITHM_LEFT_ASYMMETRIC:
                        *pd_idx = data_disks - stripe % raid_disks;
                        if (*dd_idx >= *pd_idx)
                                (*dd_idx)++;
                        break;
                case ALGORITHM_RIGHT_ASYMMETRIC:
                        *pd_idx = stripe % raid_disks;
                        if (*dd_idx >= *pd_idx)
                                (*dd_idx)++;
                        break;
                case ALGORITHM_LEFT_SYMMETRIC:
                        *pd_idx = data_disks - stripe % raid_disks;
                        *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
                        break;
                case ALGORITHM_RIGHT_SYMMETRIC:
                        *pd_idx = stripe % raid_disks;
                        *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
                        break;
                default:
                        printk ("raid5: unsupported algorithm %d\n", conf->algorithm);
        }

        /*
         * Finally, compute the new sector number
         */
        new_sector = stripe * sectors_per_chunk + chunk_offset;
        return new_sector;
}
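/*
 * Worked example for the left-symmetric layout with raid_disks == 5
 * (data_disks == 4): stripe 0 has parity on disk 4 and data chunks
 * 0..3 on disks 0..3; stripe 1 has parity on disk 3, and chunks 4..7
 * land on disks 4, 0, 1, 2.  The parity disk rotates one position
 * left per stripe and data continues round-robin just after it.
 */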
 
#if 0
static unsigned long compute_blocknr(struct stripe_head *sh, int i)
{
        raid5_conf_t *conf = sh->raid_conf;
        int raid_disks = conf->raid_disks, data_disks = raid_disks - 1;
        unsigned long new_sector = sh->sector, check;
        int sectors_per_chunk = conf->chunk_size >> 9;
        unsigned long stripe = new_sector / sectors_per_chunk;
        int chunk_offset = new_sector % sectors_per_chunk;
        int chunk_number, dummy1, dummy2, dd_idx = i;
        unsigned long r_sector, blocknr;

        switch (conf->algorithm) {
                case ALGORITHM_LEFT_ASYMMETRIC:
                case ALGORITHM_RIGHT_ASYMMETRIC:
                        if (i > sh->pd_idx)
                                i--;
                        break;
                case ALGORITHM_LEFT_SYMMETRIC:
                case ALGORITHM_RIGHT_SYMMETRIC:
                        if (i < sh->pd_idx)
                                i += raid_disks;
                        i -= (sh->pd_idx + 1);
                        break;
                default:
                        printk ("raid5: unsupported algorithm %d\n", conf->algorithm);
        }

        chunk_number = stripe * data_disks + i;
        r_sector = chunk_number * sectors_per_chunk + chunk_offset;
        blocknr = r_sector / (sh->size >> 9);

        check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
        if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
                printk("compute_blocknr: map not correct\n");
                return 0;
        }
        return blocknr;
}
#endif

#define check_xor()     do {                                    \
                           if (count == MAX_XOR_BLOCKS) {       \
                                xor_block(count, bh_ptr);       \
                                count = 1;                      \
                           }                                    \
                        } while(0)
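/*
 * check_xor() drains the accumulator whenever MAX_XOR_BLOCKS sources
 * are queued: xor_block() xors bh_ptr[1..count-1] into bh_ptr[0], and
 * count restarts at 1 so the destination stays in place for the next
 * batch.
 */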
 

static void compute_block(struct stripe_head *sh, int dd_idx)
{
        raid5_conf_t *conf = sh->raid_conf;
        int i, count, disks = conf->raid_disks;
        struct buffer_head *bh_ptr[MAX_XOR_BLOCKS], *bh;

        PRINTK("compute_block, stripe %lu, idx %d\n", sh->sector, dd_idx);


        memset(sh->bh_cache[dd_idx]->b_data, 0, sh->size);
        bh_ptr[0] = sh->bh_cache[dd_idx];
        count = 1;
        for (i = disks ; i--; ) {
                if (i == dd_idx)
                        continue;
                bh = sh->bh_cache[i];
                if (buffer_uptodate(bh))
                        bh_ptr[count++] = bh;
                else
                        printk("compute_block() %d, stripe %lu, %d not present\n", dd_idx, sh->sector, i);

                check_xor();
        }
        if (count != 1)
                xor_block(count, bh_ptr);
        set_bit(BH_Uptodate, &sh->bh_cache[dd_idx]->b_state);
}
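/*
 * This is straight RAID-5 reconstruction: since P = D0 ^ D1 ^ ... ^
 * Dn-1, any single missing block is the xor of all the others, e.g.
 * with three data disks D1 = P ^ D0 ^ D2.
 */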
 
static void compute_parity(struct stripe_head *sh, int method)
{
        raid5_conf_t *conf = sh->raid_conf;
        int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count;
        struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
        struct buffer_head *chosen[MD_SB_DISKS];

        PRINTK("compute_parity, stripe %lu, method %d\n", sh->sector, method);
        memset(chosen, 0, sizeof(chosen));

        count = 1;
        bh_ptr[0] = sh->bh_cache[pd_idx];
        switch(method) {
        case READ_MODIFY_WRITE:
                if (!buffer_uptodate(sh->bh_cache[pd_idx]))
                        BUG();
                for (i=disks ; i-- ;) {
                        if (i==pd_idx)
                                continue;
                        if (sh->bh_write[i] &&
                            buffer_uptodate(sh->bh_cache[i])) {
                                bh_ptr[count++] = sh->bh_cache[i];
                                chosen[i] = sh->bh_write[i];
                                sh->bh_write[i] = sh->bh_write[i]->b_reqnext;
                                chosen[i]->b_reqnext = sh->bh_written[i];
                                sh->bh_written[i] = chosen[i];
                                check_xor();
                        }
                }
                break;
        case RECONSTRUCT_WRITE:
                memset(sh->bh_cache[pd_idx]->b_data, 0, sh->size);
                for (i= disks; i-- ;)
                        if (i!=pd_idx && sh->bh_write[i]) {
                                chosen[i] = sh->bh_write[i];
                                sh->bh_write[i] = sh->bh_write[i]->b_reqnext;
                                chosen[i]->b_reqnext = sh->bh_written[i];
                                sh->bh_written[i] = chosen[i];
                        }
                break;
        case CHECK_PARITY:
                break;
        }
        if (count>1) {
                xor_block(count, bh_ptr);
                count = 1;
        }

        for (i = disks; i--;)
                if (chosen[i]) {
                        struct buffer_head *bh = sh->bh_cache[i];
                        char *bdata;
                        bdata = bh_kmap(chosen[i]);
                        memcpy(bh->b_data,
                               bdata,sh->size);
                        bh_kunmap(chosen[i]);
                        set_bit(BH_Lock, &bh->b_state);
                        mark_buffer_uptodate(bh, 1);
                }

        switch(method) {
        case RECONSTRUCT_WRITE:
        case CHECK_PARITY:
                for (i=disks; i--;)
                        if (i != pd_idx) {
                                bh_ptr[count++] = sh->bh_cache[i];
                                check_xor();
                        }
                break;
        case READ_MODIFY_WRITE:
                for (i = disks; i--;)
                        if (chosen[i]) {
                                bh_ptr[count++] = sh->bh_cache[i];
                                check_xor();
                        }
        }
        if (count != 1)
                xor_block(count, bh_ptr);

        if (method != CHECK_PARITY) {
                mark_buffer_uptodate(sh->bh_cache[pd_idx], 1);
                set_bit(BH_Lock, &sh->bh_cache[pd_idx]->b_state);
        } else
                mark_buffer_uptodate(sh->bh_cache[pd_idx], 0);
}
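/*
 * The two write methods arrive at the same parity by different
 * routes: RECONSTRUCT_WRITE starts from zero and xors in every data
 * block (P = D0 ^ ... ^ Dn-1), while READ_MODIFY_WRITE updates it
 * incrementally (Pnew = Pold ^ Dold ^ Dnew), which is why the old
 * parity and old data blocks must be uptodate first.
 */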
 
static void add_stripe_bh (struct stripe_head *sh, struct buffer_head *bh, int dd_idx, int rw)
{
        struct buffer_head **bhp;
        raid5_conf_t *conf = sh->raid_conf;

        PRINTK("adding bh b#%lu to stripe s#%lu\n", bh->b_blocknr, sh->sector);


        spin_lock(&sh->lock);
        spin_lock_irq(&conf->device_lock);
        bh->b_reqnext = NULL;
        if (rw == READ)
                bhp = &sh->bh_read[dd_idx];
        else
                bhp = &sh->bh_write[dd_idx];
        while (*bhp) {
                printk(KERN_NOTICE "raid5: multiple %d requests for sector %ld\n", rw, sh->sector);
                bhp = & (*bhp)->b_reqnext;
        }
        *bhp = bh;
        spin_unlock_irq(&conf->device_lock);
        spin_unlock(&sh->lock);

        PRINTK("added bh b#%lu to stripe s#%lu, disk %d.\n", bh->b_blocknr, sh->sector, dd_idx);
}





/*
 * handle_stripe - do things to a stripe.
 *
 * We lock the stripe and then examine the state of various bits
 * to see what needs to be done.
 * Possible results:
 *    return some read requests which now have data
 *    return some write requests which are safely on disc
 *    schedule a read on some buffers
 *    schedule a write of some buffers
 *    return confirmation of parity correctness
 *
 * Parity calculations are done inside the stripe lock
 * buffers are taken off read_list or write_list, and bh_cache buffers
 * get BH_Lock set before the stripe lock is released.
 *
 */

static void handle_stripe(struct stripe_head *sh)
{
        raid5_conf_t *conf = sh->raid_conf;
        int disks = conf->raid_disks;
        struct buffer_head *return_ok= NULL, *return_fail = NULL;
        int action[MD_SB_DISKS];
        int i;
        int syncing;
        int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
        int failed_num=0;
        struct buffer_head *bh;

        PRINTK("handling stripe %ld, cnt=%d, pd_idx=%d\n", sh->sector, atomic_read(&sh->count), sh->pd_idx);
        memset(action, 0, sizeof(action));

        spin_lock(&sh->lock);
        clear_bit(STRIPE_HANDLE, &sh->state);
        clear_bit(STRIPE_DELAYED, &sh->state);

        syncing = test_bit(STRIPE_SYNCING, &sh->state);
        /* Now to look around and see what can be done */

        for (i=disks; i--; ) {
                bh = sh->bh_cache[i];
                PRINTK("check %d: state 0x%lx read %p write %p written %p\n", i, bh->b_state, sh->bh_read[i], sh->bh_write[i], sh->bh_written[i]);
                /* maybe we can reply to a read */
                if (buffer_uptodate(bh) && sh->bh_read[i]) {
                        struct buffer_head *rbh, *rbh2;
                        PRINTK("Return read for disc %d\n", i);
                        spin_lock_irq(&conf->device_lock);
                        rbh = sh->bh_read[i];
                        sh->bh_read[i] = NULL;
                        spin_unlock_irq(&conf->device_lock);
                        while (rbh) {
                                char *bdata;
                                bdata = bh_kmap(rbh);
                                memcpy(bdata, bh->b_data, bh->b_size);
                                bh_kunmap(rbh);
                                rbh2 = rbh->b_reqnext;
                                rbh->b_reqnext = return_ok;
                                return_ok = rbh;
                                rbh = rbh2;
                        }
                }

                /* now count some things */
                if (buffer_locked(bh)) locked++;
                if (buffer_uptodate(bh)) uptodate++;


                if (sh->bh_read[i]) to_read++;
                if (sh->bh_write[i]) to_write++;
                if (sh->bh_written[i]) written++;
                if (!conf->disks[i].operational) {
                        failed++;
                        failed_num = i;
                }
        }
        PRINTK("locked=%d uptodate=%d to_read=%d to_write=%d failed=%d failed_num=%d\n",
               locked, uptodate, to_read, to_write, failed, failed_num);
        /* check if the array has lost two devices and, if so, some requests might
         * need to be failed
         */
        if (failed > 1 && to_read+to_write+written) {
                for (i=disks; i--; ) {
                        /* fail all writes first */
                        if (sh->bh_write[i]) to_write--;
                        while ((bh = sh->bh_write[i])) {
                                sh->bh_write[i] = bh->b_reqnext;
                                bh->b_reqnext = return_fail;
                                return_fail = bh;
                        }
                        /* and fail all 'written' */
                        if (sh->bh_written[i]) written--;
                        while ((bh = sh->bh_written[i])) {
                                sh->bh_written[i] = bh->b_reqnext;
                                bh->b_reqnext = return_fail;
                                return_fail = bh;
                        }

                        /* fail any reads if this device is non-operational */
                        if (!conf->disks[i].operational) {
                                spin_lock_irq(&conf->device_lock);
                                if (sh->bh_read[i]) to_read--;
                                while ((bh = sh->bh_read[i])) {
                                        sh->bh_read[i] = bh->b_reqnext;
                                        bh->b_reqnext = return_fail;
                                        return_fail = bh;
                                }
                                spin_unlock_irq(&conf->device_lock);
                        }
                }
        }
        if (failed > 1 && syncing) {
                md_done_sync(conf->mddev, (sh->size>>9) - sh->sync_redone,0);
                clear_bit(STRIPE_SYNCING, &sh->state);
                syncing = 0;
        }

        /* might be able to return some write requests if the parity block
         * is safe, or on a failed drive
         */
        bh = sh->bh_cache[sh->pd_idx];
        if ( written &&
             ( (conf->disks[sh->pd_idx].operational && !buffer_locked(bh) && buffer_uptodate(bh))
               || (failed == 1 && failed_num == sh->pd_idx))
            ) {
            /* any written block on an uptodate or failed drive can be returned */
            for (i=disks; i--; )
                if (sh->bh_written[i]) {
                    bh = sh->bh_cache[i];
                    if (!conf->disks[sh->pd_idx].operational ||
                        (!buffer_locked(bh) && buffer_uptodate(bh)) ) {
                        /* maybe we can return some write requests */
                        struct buffer_head *wbh, *wbh2;
                        PRINTK("Return write for disc %d\n", i);
                        wbh = sh->bh_written[i];
                        sh->bh_written[i] = NULL;
                        while (wbh) {
                            wbh2 = wbh->b_reqnext;
                            wbh->b_reqnext = return_ok;
                            return_ok = wbh;
                            wbh = wbh2;
                        }
                    }
                }
        }

        /* Now we might consider reading some blocks, either to check/generate
         * parity, or to satisfy requests
         */
        if (to_read || (syncing && (uptodate+failed < disks))) {
                for (i=disks; i--;) {
                        bh = sh->bh_cache[i];
                        if (!buffer_locked(bh) && !buffer_uptodate(bh) &&
                            (sh->bh_read[i] || syncing || (failed && sh->bh_read[failed_num]))) {
                                /* we would like to get this block, possibly
                                 * by computing it, but we might not be able to
                                 */
                                if (uptodate == disks-1) {
                                        PRINTK("Computing block %d\n", i);
                                        compute_block(sh, i);
                                        uptodate++;
                                } else if (conf->disks[i].operational) {
                                        set_bit(BH_Lock, &bh->b_state);
                                        action[i] = READ+1;
                                        /* if I am just reading this block and we don't have
                                           a failed drive, or any pending writes then sidestep the cache */
                                        if (sh->bh_page[i]) BUG();
                                        if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
                                            ! syncing && !failed && !to_write) {
                                                sh->bh_page[i] = sh->bh_cache[i]->b_page;
                                                sh->bh_cache[i]->b_page =  sh->bh_read[i]->b_page;
                                                sh->bh_cache[i]->b_data =  sh->bh_read[i]->b_data;
                                        }
                                        locked++;
                                        PRINTK("Reading block %d (sync=%d)\n", i, syncing);
                                        if (syncing)
                                                md_sync_acct(conf->disks[i].dev, bh->b_size>>9);
                                }
                        }
                }
                set_bit(STRIPE_HANDLE, &sh->state);
        }

        /* now to consider writing and what else, if anything should be read */
        if (to_write) {
                int rmw=0, rcw=0;
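                /*
                 * rmw counts the reads needed for read-modify-write
                 * (the old contents of each block being written, plus
                 * the old parity); rcw counts those for
                 * reconstruct-write (every data block not being
                 * written).  E.g. on 7 disks, writing one block gives
                 * rmw=2 vs rcw=5, so r-m-w wins; writing five blocks
                 * gives rmw=6 vs rcw=1.
                 */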
                for (i=disks ; i--;) {
                        /* would I have to read this buffer for read_modify_write */
                        bh = sh->bh_cache[i];
                        if ((sh->bh_write[i] || i == sh->pd_idx) &&
                            (!buffer_locked(bh) || sh->bh_page[i]) &&
                            !buffer_uptodate(bh)) {
                                if (conf->disks[i].operational
/*                                  && !(conf->resync_parity && i == sh->pd_idx) */
                                        )
                                        rmw++;
                                else rmw += 2*disks;  /* cannot read it */
                        }
                        /* Would I have to read this buffer for reconstruct_write */
                        if (!sh->bh_write[i] && i != sh->pd_idx &&
                            (!buffer_locked(bh) || sh->bh_page[i]) &&
                            !buffer_uptodate(bh)) {
                                if (conf->disks[i].operational) rcw++;
                                else rcw += 2*disks;
                        }
                }
                PRINTK("for sector %ld, rmw=%d rcw=%d\n", sh->sector, rmw, rcw);
                set_bit(STRIPE_HANDLE, &sh->state);
                if (rmw < rcw && rmw > 0)
                        /* prefer read-modify-write, but need to get some data */
                        for (i=disks; i--;) {
                                bh = sh->bh_cache[i];
                                if ((sh->bh_write[i] || i == sh->pd_idx) &&
                                    !buffer_locked(bh) && !buffer_uptodate(bh) &&
                                    conf->disks[i].operational) {
                                        if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
                                        {
                                                PRINTK("Read_old block %d for r-m-w\n", i);
                                                set_bit(BH_Lock, &bh->b_state);
                                                action[i] = READ+1;
                                                locked++;
                                        } else {
                                                set_bit(STRIPE_DELAYED, &sh->state);
                                                set_bit(STRIPE_HANDLE, &sh->state);
                                        }
                                }
                        }
                if (rcw <= rmw && rcw > 0)
                        /* want reconstruct write, but need to get some data */
                        for (i=disks; i--;) {
                                bh = sh->bh_cache[i];
                                if (!sh->bh_write[i]  && i != sh->pd_idx &&
                                    !buffer_locked(bh) && !buffer_uptodate(bh) &&
                                    conf->disks[i].operational) {
                                        if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
                                        {
                                                PRINTK("Read_old block %d for Reconstruct\n", i);
                                                set_bit(BH_Lock, &bh->b_state);
                                                action[i] = READ+1;
                                                locked++;
                                        } else {
                                                set_bit(STRIPE_DELAYED, &sh->state);
                                                set_bit(STRIPE_HANDLE, &sh->state);
                                        }
                                }
                        }
                /* now if nothing is locked, and if we have enough data, we can start a write request */
                if (locked == 0 && (rcw == 0 ||rmw == 0)) {
                        PRINTK("Computing parity...\n");
                        compute_parity(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
                        /* now every locked buffer is ready to be written */
                        for (i=disks; i--;)
                                if (buffer_locked(sh->bh_cache[i])) {
                                        PRINTK("Writing block %d\n", i);
                                        locked++;
                                        action[i] = WRITE+1;
                                        if (!conf->disks[i].operational
                                            || (i==sh->pd_idx && failed == 0))
                                                set_bit(STRIPE_INSYNC, &sh->state);
                                }
                        if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
                                atomic_dec(&conf->preread_active_stripes);
                                if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
                                        md_wakeup_thread(conf->thread);
                        }
                }
        }

        /* maybe we need to check and possibly fix the parity for this stripe
         * Any reads will already have been scheduled, so we just see if enough data
         * is available
         */
        if (syncing && locked == 0 &&
            !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 1) {
                set_bit(STRIPE_HANDLE, &sh->state);
                if (failed == 0) {
                        if (uptodate != disks)
                                BUG();
                        compute_parity(sh, CHECK_PARITY);
                        uptodate--;
                        bh = sh->bh_cache[sh->pd_idx];
                        if ((*(u32*)bh->b_data) == 0 &&
                            !memcmp(bh->b_data, bh->b_data+4, bh->b_size-4)) {
                                /* parity is correct (on disc, not in buffer any more) */
                                set_bit(STRIPE_INSYNC, &sh->state);
                        }
                }
                if (!test_bit(STRIPE_INSYNC, &sh->state)) {
                        struct disk_info *spare;
                        if (failed==0)
                                failed_num = sh->pd_idx;
                        /* should be able to compute the missing block and write it to spare */
                        if (!buffer_uptodate(sh->bh_cache[failed_num])) {
                                if (uptodate+1 != disks)
                                        BUG();
                                compute_block(sh, failed_num);
                                uptodate++;
                        }
                        if (uptodate != disks)
                                BUG();
                        bh = sh->bh_cache[failed_num];
                        set_bit(BH_Lock, &bh->b_state);
                        action[failed_num] = WRITE+1;
                        locked++;
                        set_bit(STRIPE_INSYNC, &sh->state);
                        if (conf->disks[failed_num].operational)
                                md_sync_acct(conf->disks[failed_num].dev, bh->b_size>>9);
                        else if ((spare=conf->spare))
                                md_sync_acct(spare->dev, bh->b_size>>9);

                }
        }
        if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
                md_done_sync(conf->mddev, (sh->size>>9) - sh->sync_redone,1);
                clear_bit(STRIPE_SYNCING, &sh->state);
        }


        spin_unlock(&sh->lock);

        while ((bh=return_ok)) {
                return_ok = bh->b_reqnext;
                bh->b_reqnext = NULL;
                bh->b_end_io(bh, 1);
        }
        while ((bh=return_fail)) {
                return_fail = bh->b_reqnext;
                bh->b_reqnext = NULL;
                bh->b_end_io(bh, 0);
        }
        for (i=disks; i-- ;)
                if (action[i]) {
                        struct buffer_head *bh = sh->bh_cache[i];
                        struct disk_info *spare = conf->spare;
                        int skip = 0;
                        if (action[i] == READ+1)
                                bh->b_end_io = raid5_end_read_request;
                        else
                                bh->b_end_io = raid5_end_write_request;
                        if (conf->disks[i].operational)
                                bh->b_dev = conf->disks[i].dev;
                        else if (spare && action[i] == WRITE+1)
                                bh->b_dev = spare->dev;
                        else skip=1;
                        if (!skip) {
                                PRINTK("for %ld schedule op %d on disc %d\n", sh->sector, action[i]-1, i);
                                atomic_inc(&sh->count);
                                bh->b_rdev = bh->b_dev;
                                bh->b_rsector = bh->b_blocknr * (bh->b_size>>9);
                                generic_make_request(action[i]-1, bh);
                        } else {
                                PRINTK("skip op %d on disc %d for sector %ld\n", action[i]-1, i, sh->sector);
                                clear_bit(BH_Lock, &bh->b_state);
                                set_bit(STRIPE_HANDLE, &sh->state);
                        }
                }
}

static inline void raid5_activate_delayed(raid5_conf_t *conf)
{
        if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
                while (!list_empty(&conf->delayed_list)) {
                        struct list_head *l = conf->delayed_list.next;
                        struct stripe_head *sh;
                        sh = list_entry(l, struct stripe_head, lru);
                        list_del_init(l);
                        clear_bit(STRIPE_DELAYED, &sh->state);
                        if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
                                atomic_inc(&conf->preread_active_stripes);
                        list_add_tail(&sh->lru, &conf->handle_list);
                }
        }
}
static void raid5_unplug_device(void *data)
{
        raid5_conf_t *conf = (raid5_conf_t *)data;
        unsigned long flags;

        spin_lock_irqsave(&conf->device_lock, flags);

        raid5_activate_delayed(conf);

        conf->plugged = 0;
        md_wakeup_thread(conf->thread);

        spin_unlock_irqrestore(&conf->device_lock, flags);
}

static inline void raid5_plug_device(raid5_conf_t *conf)
{
        spin_lock_irq(&conf->device_lock);
        if (list_empty(&conf->delayed_list))
                if (!conf->plugged) {
                        conf->plugged = 1;
                        queue_task(&conf->plug_tq, &tq_disk);
                }
        spin_unlock_irq(&conf->device_lock);
}
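/*
 * Plugging follows the 2.4 block layer convention: raid5_plug_device
 * queues plug_tq on tq_disk, and when that task queue runs (typically
 * when someone waits on I/O) raid5_unplug_device releases all delayed
 * stripes in one batch, giving their pre-reads a chance to merge.
 */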
 
1203
static int raid5_make_request (mddev_t *mddev, int rw, struct buffer_head * bh)
1204
{
1205
        raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1206
        const unsigned int raid_disks = conf->raid_disks;
1207
        const unsigned int data_disks = raid_disks - 1;
1208
        unsigned int dd_idx, pd_idx;
1209
        unsigned long new_sector;
1210
        int read_ahead = 0;
1211
 
1212
        struct stripe_head *sh;
1213
 
1214
        if (rw == READA) {
1215
                rw = READ;
1216
                read_ahead=1;
1217
        }
1218
 
1219
        new_sector = raid5_compute_sector(bh->b_rsector,
1220
                        raid_disks, data_disks, &dd_idx, &pd_idx, conf);
1221
 
1222
        PRINTK("raid5_make_request, sector %lu\n", new_sector);
1223
        sh = get_active_stripe(conf, new_sector, bh->b_size, read_ahead);
1224
        if (sh) {
1225
                sh->pd_idx = pd_idx;
1226
 
1227
                add_stripe_bh(sh, bh, dd_idx, rw);
1228
 
1229
                raid5_plug_device(conf);
1230
                handle_stripe(sh);
1231
                release_stripe(sh);
1232
        } else
1233
                bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
1234
        return 0;
1235
}
1236
 
1237
/*
1238
 * Determine correct block size for this device.
1239
 */
1240
unsigned int device_bsize (kdev_t dev)
1241
{
1242
        unsigned int i, correct_size;
1243
 
1244
        correct_size = BLOCK_SIZE;
1245
        if (blksize_size[MAJOR(dev)]) {
1246
                i = blksize_size[MAJOR(dev)][MINOR(dev)];
1247
                if (i)
1248
                        correct_size = i;
1249
        }
1250
 
1251
        return correct_size;
1252
}

static int raid5_sync_request (mddev_t *mddev, unsigned long sector_nr)
{
        raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
        struct stripe_head *sh;
        int sectors_per_chunk = conf->chunk_size >> 9;
        unsigned long stripe = sector_nr/sectors_per_chunk;
        int chunk_offset = sector_nr % sectors_per_chunk;
        int dd_idx, pd_idx;
        unsigned long first_sector;
        int raid_disks = conf->raid_disks;
        int data_disks = raid_disks-1;
        int redone = 0;
        int bufsize;

        sh = get_active_stripe(conf, sector_nr, 0, 0);
        bufsize = sh->size;
        redone = sector_nr - sh->sector;
        first_sector = raid5_compute_sector(stripe*data_disks*sectors_per_chunk
                + chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf);
        sh->pd_idx = pd_idx;
        spin_lock(&sh->lock);
        set_bit(STRIPE_SYNCING, &sh->state);
        clear_bit(STRIPE_INSYNC, &sh->state);
        sh->sync_redone = redone;
        spin_unlock(&sh->lock);

        handle_stripe(sh);
        release_stripe(sh);

        return (bufsize >> 9) - redone;
}
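
/*
 * Worked example (illustrative): with sh->size == 4096 a stripe spans
 * (4096>>9) == 8 sectors.  If a call for sector_nr 13 attaches to an
 * active stripe at sector 8, then redone == 5 and the function reports
 * 8 - 5 == 3 sectors of new resync progress.
 */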

/*
 * This is our raid5 kernel thread.
 *
 * We scan the hash table for stripes which can be handled now.
 * During the scan, completed stripes are saved for us by the interrupt
 * handler, so that they will not have to wait for our next wakeup.
 */
static void raid5d (void *data)
{
        struct stripe_head *sh;
        raid5_conf_t *conf = data;
        mddev_t *mddev = conf->mddev;
        int handled;

        PRINTK("+++ raid5d active\n");

        handled = 0;

        if (mddev->sb_dirty)
                md_update_sb(mddev);
        md_spin_lock_irq(&conf->device_lock);
        while (1) {
                struct list_head *first;

                if (list_empty(&conf->handle_list) &&
                    atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
                    !conf->plugged &&
                    !list_empty(&conf->delayed_list))
                        raid5_activate_delayed(conf);

                if (list_empty(&conf->handle_list))
                        break;

                first = conf->handle_list.next;
                sh = list_entry(first, struct stripe_head, lru);

                list_del_init(first);
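                /*
                 * A stripe parked on handle_list holds no references
                 * (see __release_stripe), so the reference taken here
                 * must be the only one.
                 */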
                atomic_inc(&sh->count);
                if (atomic_read(&sh->count) != 1)
                        BUG();
                md_spin_unlock_irq(&conf->device_lock);

                handled++;
                handle_stripe(sh);
                release_stripe(sh);

                md_spin_lock_irq(&conf->device_lock);
        }
        PRINTK("%d stripes handled\n", handled);

        md_spin_unlock_irq(&conf->device_lock);

        PRINTK("--- raid5d inactive\n");
}

/*
 * Private kernel thread for parity reconstruction after an unclean
 * shutdown. Reconstruction on spare drives in case of a failed drive
 * is done by the generic mdsyncd.
 */
static void raid5syncd (void *data)
{
        raid5_conf_t *conf = data;
        mddev_t *mddev = conf->mddev;

        if (!conf->resync_parity)
                return;
        if (conf->resync_parity == 2)
                return;
        down(&mddev->recovery_sem);
        if (md_do_sync(mddev, NULL)) {
                up(&mddev->recovery_sem);
                printk("raid5: resync aborted!\n");
                return;
        }
        conf->resync_parity = 0;
        up(&mddev->recovery_sem);
        printk("raid5: resync finished.\n");
}

static int raid5_run (mddev_t *mddev)
{
        raid5_conf_t *conf;
        int i, j, raid_disk, memory;
        mdp_super_t *sb = mddev->sb;
        mdp_disk_t *desc;
        mdk_rdev_t *rdev;
        struct disk_info *disk;
        struct md_list_head *tmp;
        int start_recovery = 0;

        MOD_INC_USE_COUNT;

        if (sb->level != 5 && sb->level != 4) {
                printk("raid5: md%d: raid level not set to 4/5 (%d)\n", mdidx(mddev), sb->level);
                MOD_DEC_USE_COUNT;
                return -EIO;
        }

        mddev->private = kmalloc (sizeof (raid5_conf_t), GFP_KERNEL);
        if ((conf = mddev->private) == NULL)
                goto abort;
        memset (conf, 0, sizeof (*conf));
        conf->mddev = mddev;

        if ((conf->stripe_hashtbl = (struct stripe_head **) md__get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL)
                goto abort;
        memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);

        conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
        md_init_waitqueue_head(&conf->wait_for_stripe);
        INIT_LIST_HEAD(&conf->handle_list);
        INIT_LIST_HEAD(&conf->delayed_list);
        INIT_LIST_HEAD(&conf->inactive_list);
        atomic_set(&conf->active_stripes, 0);
        atomic_set(&conf->preread_active_stripes, 0);
        conf->buffer_size = PAGE_SIZE; /* good default for rebuild */

        conf->plugged = 0;
        conf->plug_tq.sync = 0;
        conf->plug_tq.routine = &raid5_unplug_device;
        conf->plug_tq.data = conf;

        PRINTK("raid5_run(md%d) called.\n", mdidx(mddev));

        ITERATE_RDEV(mddev,rdev,tmp) {
                /*
                 * This is important -- we are using the descriptor on
                 * the disk only to get a pointer to the descriptor on
                 * the main superblock, which might be more recent.
                 */
                desc = sb->disks + rdev->desc_nr;
                raid_disk = desc->raid_disk;
                disk = conf->disks + raid_disk;

                if (disk_faulty(desc)) {
                        printk(KERN_ERR "raid5: disabled device %s (errors detected)\n", partition_name(rdev->dev));
                        if (!rdev->faulty) {
                                MD_BUG();
                                goto abort;
                        }
                        disk->number = desc->number;
                        disk->raid_disk = raid_disk;
                        disk->dev = rdev->dev;

                        disk->operational = 0;
                        disk->write_only = 0;
                        disk->spare = 0;
                        disk->used_slot = 1;
                        continue;
                }
                if (disk_active(desc)) {
                        if (!disk_sync(desc)) {
                                printk(KERN_ERR "raid5: disabled device %s (not in sync)\n", partition_name(rdev->dev));
                                MD_BUG();
                                goto abort;
                        }
                        if (raid_disk > sb->raid_disks) {
                                printk(KERN_ERR "raid5: disabled device %s (inconsistent descriptor)\n", partition_name(rdev->dev));
                                continue;
                        }
                        if (disk->operational) {
                                printk(KERN_ERR "raid5: disabled device %s (device %d already operational)\n", partition_name(rdev->dev), raid_disk);
                                continue;
                        }
                        printk(KERN_INFO "raid5: device %s operational as raid disk %d\n", partition_name(rdev->dev), raid_disk);

                        disk->number = desc->number;
                        disk->raid_disk = raid_disk;
                        disk->dev = rdev->dev;
                        disk->operational = 1;
                        disk->used_slot = 1;

                        conf->working_disks++;
                } else {
                        /*
                         * Must be a spare disk.
                         */
                        printk(KERN_INFO "raid5: spare disk %s\n", partition_name(rdev->dev));
                        disk->number = desc->number;
                        disk->raid_disk = raid_disk;
                        disk->dev = rdev->dev;

                        disk->operational = 0;
                        disk->write_only = 0;
                        disk->spare = 1;
                        disk->used_slot = 1;
                }
        }

        for (i = 0; i < MD_SB_DISKS; i++) {
                desc = sb->disks + i;
                raid_disk = desc->raid_disk;
                disk = conf->disks + raid_disk;

                if (disk_faulty(desc) && (raid_disk < sb->raid_disks) &&
                        !conf->disks[raid_disk].used_slot) {

                        disk->number = desc->number;
                        disk->raid_disk = raid_disk;
                        disk->dev = MKDEV(0,0);

                        disk->operational = 0;
                        disk->write_only = 0;
                        disk->spare = 0;
                        disk->used_slot = 1;
                }
        }

        conf->raid_disks = sb->raid_disks;
        /*
         * 0 for a fully functional array, 1 for a degraded array.
         */
        conf->failed_disks = conf->raid_disks - conf->working_disks;
        conf->mddev = mddev;
        conf->chunk_size = sb->chunk_size;
        conf->level = sb->level;
        conf->algorithm = sb->layout;
        conf->max_nr_stripes = NR_STRIPES;

#if 0
        for (i = 0; i < conf->raid_disks; i++) {
                if (!conf->disks[i].used_slot) {
                        MD_BUG();
                        goto abort;
                }
        }
#endif
        if (!conf->chunk_size || conf->chunk_size % 4) {
                printk(KERN_ERR "raid5: invalid chunk size %d for md%d\n", conf->chunk_size, mdidx(mddev));
                goto abort;
        }
        if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
                printk(KERN_ERR "raid5: unsupported parity algorithm %d for md%d\n", conf->algorithm, mdidx(mddev));
                goto abort;
        }
        if (conf->failed_disks > 1) {
                printk(KERN_ERR "raid5: not enough operational devices for md%d (%d/%d failed)\n", mdidx(mddev), conf->failed_disks, conf->raid_disks);
                goto abort;
        }

        if (conf->working_disks != sb->raid_disks) {
                printk(KERN_ALERT "raid5: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
                start_recovery = 1;
        }

        {
                const char * name = "raid5d";

                conf->thread = md_register_thread(raid5d, conf, name);
                if (!conf->thread) {
                        printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev));
                        goto abort;
                }
        }

        memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
                 conf->raid_disks * ((sizeof(struct buffer_head) + PAGE_SIZE))) / 1024;
        if (grow_stripes(conf, conf->max_nr_stripes, GFP_KERNEL)) {
                printk(KERN_ERR "raid5: couldn't allocate %dkB for buffers\n", memory);
                shrink_stripes(conf, conf->max_nr_stripes);
                goto abort;
        } else
                printk(KERN_INFO "raid5: allocated %dkB for md%d\n", memory, mdidx(mddev));

        /*
         * Regenerate the "device is in sync with the raid set" bit for
         * each device.
         */
        for (i = 0; i < MD_SB_DISKS ; i++) {
                mark_disk_nonsync(sb->disks + i);
                for (j = 0; j < sb->raid_disks; j++) {
                        if (!conf->disks[j].operational)
                                continue;
                        if (sb->disks[i].number == conf->disks[j].number)
                                mark_disk_sync(sb->disks + i);
                }
        }
        sb->active_disks = conf->working_disks;

        if (sb->active_disks == sb->raid_disks)
                printk("raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
        else
                printk(KERN_ALERT "raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);

        if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) {
                const char * name = "raid5syncd";

                conf->resync_thread = md_register_thread(raid5syncd, conf, name);
                if (!conf->resync_thread) {
                        printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev));
                        goto abort;
                }

                printk("raid5: raid set md%d not clean; reconstructing parity\n", mdidx(mddev));
                conf->resync_parity = 1;
                md_wakeup_thread(conf->resync_thread);
        }

        print_raid5_conf(conf);
        if (start_recovery)
                md_recover_arrays();
        print_raid5_conf(conf);

        /* Ok, everything is just fine now */
        return (0);
abort:
        if (conf) {
                print_raid5_conf(conf);
                if (conf->stripe_hashtbl)
                        free_pages((unsigned long) conf->stripe_hashtbl,
                                                        HASH_PAGES_ORDER);
                kfree(conf);
        }
        mddev->private = NULL;
        printk(KERN_ALERT "raid5: failed to run raid set md%d\n", mdidx(mddev));
        MOD_DEC_USE_COUNT;
        return -EIO;
}

static int raid5_stop_resync (mddev_t *mddev)
{
        raid5_conf_t *conf = mddev_to_conf(mddev);
        mdk_thread_t *thread = conf->resync_thread;

        if (thread) {
                if (conf->resync_parity) {
                        conf->resync_parity = 2;
                        md_interrupt_thread(thread);
                        printk(KERN_INFO "raid5: parity resync was not fully finished, restarting next time.\n");
                        return 1;
                }
                return 0;
        }
        return 0;
}

static int raid5_restart_resync (mddev_t *mddev)
{
        raid5_conf_t *conf = mddev_to_conf(mddev);

        if (conf->resync_parity) {
                if (!conf->resync_thread) {
                        MD_BUG();
                        return 0;
                }
                printk("raid5: waking up raid5resync.\n");
                conf->resync_parity = 1;
                md_wakeup_thread(conf->resync_thread);
                return 1;
        } else
                printk("raid5: no restart-resync needed.\n");
        return 0;
}


static int raid5_stop (mddev_t *mddev)
{
        raid5_conf_t *conf = (raid5_conf_t *) mddev->private;

        if (conf->resync_thread)
                md_unregister_thread(conf->resync_thread);
        md_unregister_thread(conf->thread);
        shrink_stripes(conf, conf->max_nr_stripes);
        free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER);
        kfree(conf);
        mddev->private = NULL;
        MOD_DEC_USE_COUNT;
        return 0;
}

#if RAID5_DEBUG
static void print_sh (struct stripe_head *sh)
{
        int i;

        printk("sh %lu, size %d, pd_idx %d, state %ld.\n", sh->sector, sh->size, sh->pd_idx, sh->state);
        printk("sh %lu,  count %d.\n", sh->sector, atomic_read(&sh->count));
        printk("sh %lu, ", sh->sector);
        for (i = 0; i < MD_SB_DISKS; i++) {
                if (sh->bh_cache[i])
                        printk("(cache%d: %p %ld) ", i, sh->bh_cache[i], sh->bh_cache[i]->b_state);
        }
        printk("\n");
}

static void printall (raid5_conf_t *conf)
{
        struct stripe_head *sh;
        int i;

        md_spin_lock_irq(&conf->device_lock);
        for (i = 0; i < NR_HASH; i++) {
                sh = conf->stripe_hashtbl[i];
                for (; sh; sh = sh->hash_next) {
                        if (sh->raid_conf != conf)
                                continue;
                        print_sh(sh);
                }
        }
        md_spin_unlock_irq(&conf->device_lock);

        PRINTK("--- raid5d inactive\n");
}
#endif

static void raid5_status (struct seq_file *seq, mddev_t *mddev)
{
        raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
        mdp_super_t *sb = mddev->sb;
        int i;

        seq_printf (seq, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->layout);
        seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->working_disks);
        for (i = 0; i < conf->raid_disks; i++)
                seq_printf (seq, "%s", conf->disks[i].operational ? "U" : "_");
        seq_printf (seq, "]");
#if RAID5_DEBUG
#define D(x) \
        seq_printf (seq, "<"#x":%d>", atomic_read(&conf->x))
        printall(conf);
#endif
}
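
/*
 * Example output fragment (illustrative, for a hypothetical healthy
 * 4-disk set with 64k chunks and layout 2):
 *
 *      level 5, 64k chunk, algorithm 2 [4/4] [UUUU]
 */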

static void print_raid5_conf (raid5_conf_t *conf)
{
        int i;
        struct disk_info *tmp;

        printk("RAID5 conf printout:\n");
        if (!conf) {
                printk("(conf==NULL)\n");
                return;
        }
        printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
                 conf->working_disks, conf->failed_disks);

#if RAID5_DEBUG
        for (i = 0; i < MD_SB_DISKS; i++) {
#else
        for (i = 0; i < conf->working_disks+conf->failed_disks; i++) {
#endif
                tmp = conf->disks + i;
                printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
                        i, tmp->spare, tmp->operational,
                        tmp->number, tmp->raid_disk, tmp->used_slot,
                        partition_name(tmp->dev));
        }
}

static int raid5_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
{
        int err = 0;
        int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
        raid5_conf_t *conf = mddev->private;
        struct disk_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
        mdp_super_t *sb = mddev->sb;
        mdp_disk_t *failed_desc, *spare_desc, *added_desc;
        mdk_rdev_t *spare_rdev, *failed_rdev;

        print_raid5_conf(conf);
        md_spin_lock_irq(&conf->device_lock);
        /*
         * find the disk ...
         */
        switch (state) {

        case DISKOP_SPARE_ACTIVE:

                /*
                 * Find the failed disk within the RAID5 configuration ...
                 * (this can only be in the first conf->raid_disks part)
                 */
                for (i = 0; i < conf->raid_disks; i++) {
                        tmp = conf->disks + i;
                        if ((!tmp->operational && !tmp->spare) ||
                                        !tmp->used_slot) {
                                failed_disk = i;
                                break;
                        }
                }
                /*
                 * When we activate a spare disk we _must_ have a disk in
                 * the lower (active) part of the array to replace.
                 */
                if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }
                /* fall through */

        case DISKOP_SPARE_WRITE:
        case DISKOP_SPARE_INACTIVE:

                /*
                 * Find the spare disk ... (can only be in the 'high'
                 * area of the array)
                 */
                for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
                        tmp = conf->disks + i;
                        if (tmp->spare && tmp->number == (*d)->number) {
                                spare_disk = i;
                                break;
                        }
                }
                if (spare_disk == -1) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }
                break;

        case DISKOP_HOT_REMOVE_DISK:

                for (i = 0; i < MD_SB_DISKS; i++) {
                        tmp = conf->disks + i;
                        if (tmp->used_slot && (tmp->number == (*d)->number)) {
                                if (tmp->operational) {
                                        err = -EBUSY;
                                        goto abort;
                                }
                                removed_disk = i;
                                break;
                        }
                }
                if (removed_disk == -1) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }
                break;

        case DISKOP_HOT_ADD_DISK:

                for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
                        tmp = conf->disks + i;
                        if (!tmp->used_slot) {
                                added_disk = i;
                                break;
                        }
                }
                if (added_disk == -1) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }
                break;
        }

        switch (state) {
        /*
         * Switch the spare disk to write-only mode:
         */
        case DISKOP_SPARE_WRITE:
                if (conf->spare) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }
                sdisk = conf->disks + spare_disk;
                sdisk->operational = 1;
                sdisk->write_only = 1;
                conf->spare = sdisk;
                break;
        /*
         * Deactivate a spare disk:
         */
        case DISKOP_SPARE_INACTIVE:
                sdisk = conf->disks + spare_disk;
                sdisk->operational = 0;
                sdisk->write_only = 0;
                /*
                 * Was the spare being resynced?
                 */
                if (conf->spare == sdisk)
                        conf->spare = NULL;
                break;
        /*
         * Activate (mark read-write) the (now sync) spare disk,
         * which means we switch its 'raid position' (->raid_disk)
         * with the failed disk. (only the first 'conf->raid_disks'
         * slots are used for 'real' disks and we must preserve this
         * property)
         */
        case DISKOP_SPARE_ACTIVE:
                if (!conf->spare) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }
                sdisk = conf->disks + spare_disk;
                fdisk = conf->disks + failed_disk;

                spare_desc = &sb->disks[sdisk->number];
                failed_desc = &sb->disks[fdisk->number];

                if (spare_desc != *d) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }

                if (spare_desc->raid_disk != sdisk->raid_disk) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }

                if (sdisk->raid_disk != spare_disk) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }

                if (failed_desc->raid_disk != fdisk->raid_disk) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }

                if (fdisk->raid_disk != failed_disk) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }

                /*
                 * do the switch finally
                 */
                spare_rdev = find_rdev_nr(mddev, spare_desc->number);
                failed_rdev = find_rdev_nr(mddev, failed_desc->number);

                /* There must be a spare_rdev, but there may not be a
                 * failed_rdev.  That slot might be empty...
                 */
                spare_rdev->desc_nr = failed_desc->number;
                if (failed_rdev)
                        failed_rdev->desc_nr = spare_desc->number;

                xchg_values(*spare_desc, *failed_desc);
                xchg_values(*fdisk, *sdisk);

                /*
                 * (careful, 'failed' and 'spare' are switched from now on)
                 *
                 * we want to preserve linear numbering and we want to
                 * give the proper raid_disk number to the now activated
                 * disk. (this means we switch back these values)
                 */

                xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
                xchg_values(sdisk->raid_disk, fdisk->raid_disk);
                xchg_values(spare_desc->number, failed_desc->number);
                xchg_values(sdisk->number, fdisk->number);
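                /*
                 * Illustrative before/after with hypothetical slots:
                 * if slot 1 held the failed member and slot 4 the
                 * synced spare, the spare's device now sits in slot 1
                 * while the failed device (or an empty slot) takes
                 * slot 4; the raid_disk and number fields were swapped
                 * back above, so each slot keeps its original
                 * numbering.
                 */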

                *d = failed_desc;

                if (sdisk->dev == MKDEV(0,0))
                        sdisk->used_slot = 0;

                /*
                 * this really activates the spare.
                 */
                fdisk->spare = 0;
                fdisk->write_only = 0;

                /*
                 * if we activate a spare, we definitely replace a
                 * non-operational disk slot in the 'low' area of
                 * the disk array.
                 */
                conf->failed_disks--;
                conf->working_disks++;
                conf->spare = NULL;

                break;

        case DISKOP_HOT_REMOVE_DISK:
                rdisk = conf->disks + removed_disk;

                if (rdisk->spare && (removed_disk < conf->raid_disks)) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }
                rdisk->dev = MKDEV(0,0);
                rdisk->used_slot = 0;

                break;

        case DISKOP_HOT_ADD_DISK:
                adisk = conf->disks + added_disk;
                added_desc = *d;

                if (added_disk != added_desc->number) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }

                adisk->number = added_desc->number;
                adisk->raid_disk = added_desc->raid_disk;
                adisk->dev = MKDEV(added_desc->major, added_desc->minor);

                adisk->operational = 0;
                adisk->write_only = 0;
                adisk->spare = 1;
                adisk->used_slot = 1;

                break;

        default:
                MD_BUG();
                err = 1;
                goto abort;
        }
abort:
        md_spin_unlock_irq(&conf->device_lock);
        print_raid5_conf(conf);
        return err;
}

static mdk_personality_t raid5_personality =
{
        name:           "raid5",
        make_request:   raid5_make_request,
        run:            raid5_run,
        stop:           raid5_stop,
        status:         raid5_status,
        error_handler:  raid5_error,
        diskop:         raid5_diskop,
        stop_resync:    raid5_stop_resync,
        restart_resync: raid5_restart_resync,
        sync_request:   raid5_sync_request
};

static int md__init raid5_init (void)
{
        return register_md_personality (RAID5, &raid5_personality);
}

static void raid5_exit (void)
{
        unregister_md_personality (RAID5);
}

module_init(raid5_init);
module_exit(raid5_exit);
MODULE_LICENSE("GPL");
