OpenCores Subversion repository: or1k
URL https://opencores.org/ocsvn/or1k/or1k/trunk
File: or1k/tags/before_ORP/uclinux/uClinux-2.0.x/drivers/block/raid5.c (rev 901)
/*****************************************************************************
 * raid5.c : Multiple Devices driver for Linux
 *           Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *
 * RAID-5 management functions.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include <linux/module.h>
#include <linux/locks.h>
#include <linux/malloc.h>
#include <linux/md.h>
#include <linux/raid5.h>
#include <asm/bitops.h>
#include <asm/atomic.h>

struct buffer_head *efind_buffer(kdev_t dev, int block, int size);

#define MAJOR_NR MD_MAJOR
#define MD_DRIVER
#define MD_PERSONALITY

static struct md_personality raid5_personality;

struct stripe_head {
        struct stripe_head      *hash_next, **hash_pprev; /* hash pointers */
        struct stripe_head      *handle_next;           /* completed during hash scan pointers */
        struct raid5_data       *raid_conf;
        struct buffer_head      *bh_old[MD_SB_DISKS];   /* disk image */
        struct buffer_head      *bh_new[MD_SB_DISKS];   /* buffers of the MD device (present in buffer cache) */
        struct buffer_head      *bh_copy[MD_SB_DISKS];  /* copy on write of bh_new (bh_new can change from under us) */
        int                     cmd_new[MD_SB_DISKS];   /* READ/WRITE for new */
        int                     new[MD_SB_DISKS];       /* buffer added since the last handle_stripe() */
        unsigned long           sector;                 /* sector of this row */
        int                     size;                   /* buffers size */
        int                     pd_idx;                 /* parity disk index */
        int                     nr_pending;             /* nr of pending cmds */
        __u32                   state;                  /* state flags */
        int                     cmd;                    /* stripe cmd */
        int                     count;                  /* nr of waiters */
        int                     write_method;           /* reconstruct-write / read-modify-write */
        int                     phase;                  /* PHASE_BEGIN, ..., PHASE_COMPLETE */
        struct wait_queue       *wait;                  /* processes waiting for this stripe */
};

/*
 * Phase
 */
#define PHASE_BEGIN             0
#define PHASE_READ_OLD          1
#define PHASE_WRITE             2
#define PHASE_READ              3
#define PHASE_COMPLETE          4
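
/*
 * A stripe moves through these phases in handle_stripe(): a write that
 * must fetch old data or parity first goes PHASE_BEGIN -> PHASE_READ_OLD
 * -> PHASE_WRITE -> PHASE_COMPLETE, while a write that already has every
 * block it needs goes straight to PHASE_WRITE.  A read served entirely
 * from cached or computed blocks completes immediately; otherwise it
 * passes through PHASE_READ (or PHASE_READ_OLD when a failed disk forces
 * reconstruction) before reaching PHASE_COMPLETE.
 */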

/*
 * Write method
 */
#define METHOD_NONE             0
#define RECONSTRUCT_WRITE       1
#define READ_MODIFY_WRITE       2
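
/*
 * The two ways of bringing parity up to date on a partial-stripe write:
 * reconstruct-write reads the data blocks that are not being written and
 * XORs them together with the new data to build fresh parity;
 * read-modify-write reads the old contents of the blocks being written
 * plus the old parity, XORs the old data out of the parity and the new
 * data in.  handle_stripe() counts the reads each method would need and
 * picks the cheaper one.
 */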

/*
 * Stripe state
 */
#define STRIPE_LOCKED           0
#define STRIPE_ERROR            1

/*
 * Stripe commands
 */
#define STRIPE_NONE             0
#define STRIPE_WRITE            1
#define STRIPE_READ             2

/*
 * Stripe cache
 */
#define RAID5_STRIPE_POOL_SIZE  128
#define HASH_PAGES              1
#define HASH_PAGES_ORDER        0
#define NR_HASH                 (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *))
#define HASH_MASK               (NR_HASH - 1)
#define stripe_hash(sect, size) (stripe_hashtbl[((sect) / (size >> 9)) & HASH_MASK])
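
/*
 * stripe_hash() buckets stripes by block number: (sect) / (size >> 9)
 * turns a 512-byte sector number into an index of size-sized blocks and
 * the low bits pick the bucket.  For example, assuming a 4 KB PAGE_SIZE
 * and 32-bit pointers, NR_HASH is 1024, so a 4 KB stripe starting at
 * sector 1024 hashes to (1024 / 8) & 1023 == bucket 128.
 */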

int nr_stripes = 0, nr_locked_stripes = 0, nr_pending_stripes = 0;
struct stripe_head **stripe_hashtbl;
static struct wait_queue *raid5_wait_for_stripe = NULL;
struct stripe_head *stripe_handle_list = NULL, *stripe_handle_tail = NULL;

/*
 * Free buffers pool
 */
#define RAID5_POOL_SIZE 3000
static int nr_free_buffers = 0, nr_used_buffers = 0, max_nr_used_buffers = 0;
static struct buffer_head *raid5_buffer_list = NULL;
static struct wait_queue *raid5_wait_for_bh = NULL;

/*
 * The following can be used to debug the driver
 */
#define RAID5_DEBUG     0

#if RAID5_DEBUG
/* no trailing semicolon: PRINTK() must expand to exactly one statement */
#define PRINTK(x)   do { printk x; } while (0)
static int nr_pending = 0, free_1024 = 0, free_4096 = 0, used_1024 = 0, used_4096 = 0;
#else
#define PRINTK(x)   do { ; } while (0)
#endif

static inline int stripe_locked(struct stripe_head *sh)
{
        return test_bit(STRIPE_LOCKED, &sh->state);
}

static inline int stripe_error(struct stripe_head *sh)
{
        return test_bit(STRIPE_ERROR, &sh->state);
}

/*
 * Stripes are locked whenever new buffers can't be added to them.
 */
static inline void lock_stripe(struct stripe_head *sh)
{
        if (!set_bit(STRIPE_LOCKED, &sh->state)) {
                PRINTK(("locking stripe %lu\n", sh->sector));
                nr_locked_stripes++;
        }
}

static inline void unlock_stripe(struct stripe_head *sh)
{
        if (clear_bit(STRIPE_LOCKED, &sh->state)) {
                PRINTK(("unlocking stripe %lu\n", sh->sector));
                nr_locked_stripes--;
                wake_up(&sh->wait);
        }
}

static inline void finish_stripe(struct stripe_head *sh)
{
        unlock_stripe(sh);
        sh->cmd = STRIPE_NONE;
        sh->phase = PHASE_COMPLETE;
        nr_pending_stripes--;
        wake_up(&raid5_wait_for_stripe);
}

static void unplug_devices(struct stripe_head *sh)
{
        struct raid5_data *raid_conf = sh->raid_conf;
        int i;

        for (i = 0; i < raid_conf->raid_disks; i++)
                unplug_device(blk_dev + MAJOR(raid_conf->disks[i].dev));
}

static void raid5d (void *data);

void __wait_on_stripe(struct stripe_head *sh)
{
        struct wait_queue wait = { current, NULL };

        PRINTK(("wait_on_stripe %lu\n", sh->sector));
        sh->count++;
        add_wait_queue(&sh->wait, &wait);
repeat:
        current->state = TASK_UNINTERRUPTIBLE;
        if (stripe_locked(sh)) {
                schedule();
                goto repeat;
        }
        PRINTK(("wait_on_stripe %lu done\n", sh->sector));
        remove_wait_queue(&sh->wait, &wait);
        sh->count--;
        current->state = TASK_RUNNING;
}

static inline void wait_on_stripe(struct stripe_head *sh)
{
        if (stripe_locked(sh))
                __wait_on_stripe(sh);
}

static inline void remove_hash(struct stripe_head *sh)
{
        PRINTK(("remove_hash(), stripe %lu\n", sh->sector));

        if (sh->hash_pprev) {
                if (sh->hash_next)
                        sh->hash_next->hash_pprev = sh->hash_pprev;
                *sh->hash_pprev = sh->hash_next;
                sh->hash_pprev = NULL;
                nr_stripes--;
        }
}

static inline void insert_hash(struct stripe_head *sh)
{
        struct stripe_head **shp = &stripe_hash(sh->sector, sh->size);

        PRINTK(("insert_hash(), stripe %lu, nr_stripes %d\n", sh->sector, nr_stripes));

        if ((sh->hash_next = *shp) != NULL)
                (*shp)->hash_pprev = &sh->hash_next;
        *shp = sh;
        sh->hash_pprev = shp;
        nr_stripes++;
}

static void add_bh (struct buffer_head *bh)
{
        unsigned long flags;

        save_flags(flags);
        cli();
        bh->b_next = raid5_buffer_list;
        raid5_buffer_list = bh;
        nr_free_buffers++;
#if RAID5_DEBUG
        if (bh->b_size == 1024)
                free_1024++;
        if (bh->b_size == 4096)
                free_4096++;
#endif
        restore_flags(flags);
}

static void raid5_kfree_bh (struct buffer_head *bh)
{
        unsigned long flags;

        save_flags(flags);
        cli();
        nr_used_buffers--;
#if RAID5_DEBUG
        if (bh->b_size == 1024)
                used_1024--;
        if (bh->b_size == 4096)
                used_4096--;
#endif
        if (nr_free_buffers < RAID5_POOL_SIZE) {
#if 0 /* This can magically catch races :-) */
                char *b_data = ((volatile struct buffer_head *) bh)->b_data;
                int b_size = ((volatile struct buffer_head *) bh)->b_size;
                memset (bh, 0, sizeof (struct buffer_head));
                ((volatile struct buffer_head *) bh)->b_data = b_data;
                ((volatile struct buffer_head *) bh)->b_size = b_size;
#endif
                add_bh (bh);
                wake_up (&raid5_wait_for_bh);
        } else {
                if (bh->b_size == PAGE_SIZE)
                        free_page ((unsigned long) bh->b_data);
                else
                        kfree (bh->b_data);
#if 0
                memset (bh, 0, sizeof (struct buffer_head));
#endif
                kfree (bh);
        }
#if RAID5_DEBUG
        printk ("kfree_bh: nr_free == %d, nr_used == %d, max_nr_used == %d\n", nr_free_buffers, nr_used_buffers, max_nr_used_buffers);
#endif
        restore_flags(flags);
}

static void raid5_kfree_old_bh(struct stripe_head *sh, int i)
{
        if (!sh->bh_old[i]) {
                printk("raid5_kfree_old_bh: bug: sector %lu, index %d not present\n", sh->sector, i);
                return;
        }
        raid5_kfree_bh(sh->bh_old[i]);
        sh->bh_old[i] = NULL;
}

static void raid5_update_old_bh(struct stripe_head *sh, int i)
{
        PRINTK(("stripe %lu, idx %d, updating cache copy\n", sh->sector, i));
        if (!sh->bh_copy[i]) {
                printk("raid5_update_old_bh: bug: sector %lu, index %d not present\n", sh->sector, i);
                return;
        }
        if (sh->bh_old[i])
                raid5_kfree_old_bh(sh, i);
        sh->bh_old[i] = sh->bh_copy[i];
        sh->bh_copy[i] = NULL;
}

static void kfree_stripe(struct stripe_head *sh)
{
        struct raid5_data *raid_conf = sh->raid_conf;
        int disks = raid_conf->raid_disks, j;

        PRINTK(("kfree_stripe called, stripe %lu\n", sh->sector));
        if (sh->phase != PHASE_COMPLETE || stripe_locked(sh) || sh->count) {
                printk("raid5: kfree_stripe(), sector %lu, phase %d, locked %d, count %d\n", sh->sector, sh->phase, stripe_locked(sh), sh->count);
                return;
        }
        for (j = 0; j < disks; j++) {
                if (sh->bh_old[j])
                        raid5_kfree_old_bh(sh, j);
                if (sh->bh_new[j] || sh->bh_copy[j])
                        printk("raid5: bug: sector %lu, new %p, copy %p\n", sh->sector, sh->bh_new[j], sh->bh_copy[j]);
        }
        remove_hash(sh);
        kfree(sh);
}

static int shrink_stripe_cache(int nr)
{
        struct stripe_head *sh;
        int i, count = 0;
        static int clock = 0;

        PRINTK(("shrink_stripe_cache called, %d/%d, clock %d\n", nr, nr_stripes, clock));
        for (i = 0; i < NR_HASH; i++) {
repeat:
                sh = stripe_hashtbl[(i + clock) & HASH_MASK];
                for (; sh; sh = sh->hash_next) {
                        if (sh->phase != PHASE_COMPLETE)
                                continue;
                        if (stripe_locked(sh))
                                continue;
                        if (sh->count)
                                continue;
                        kfree_stripe(sh);
                        if (++count == nr) {
                                PRINTK(("shrink completed, nr_stripes %d\n", nr_stripes));
                                clock = (i + clock) & HASH_MASK;
                                return nr;
                        }
                        goto repeat;
                }
        }
        PRINTK(("shrink completed, nr_stripes %d\n", nr_stripes));
        return count;
}

static struct stripe_head *find_stripe(struct raid5_data *raid_conf, unsigned long sector, int size)
{
        struct stripe_head *sh;

        if (raid_conf->buffer_size != size) {
                PRINTK(("switching size, %d --> %d\n", raid_conf->buffer_size, size));
                shrink_stripe_cache(RAID5_STRIPE_POOL_SIZE);
                raid_conf->buffer_size = size;
        }

        PRINTK(("find_stripe, sector %lu\n", sector));
        for (sh = stripe_hash(sector, size); sh; sh = sh->hash_next)
                if (sh->sector == sector && sh->raid_conf == raid_conf) {
                        if (sh->size == size) {
                                PRINTK(("found stripe %lu\n", sector));
                                return sh;
                        } else {
                                PRINTK(("switching size for %lu, %d --> %d\n", sector, sh->size, size));
                                kfree_stripe(sh);
                                break;
                        }
                }
        PRINTK(("stripe %lu not in cache\n", sector));
        return NULL;
}

static struct stripe_head *kmalloc_stripe(struct raid5_data *raid_conf, unsigned long sector, int size)
{
        struct stripe_head *sh = NULL, *tmp;

        PRINTK(("kmalloc_stripe called\n"));

        while (nr_stripes > RAID5_STRIPE_POOL_SIZE) {
                shrink_stripe_cache(RAID5_STRIPE_POOL_SIZE / 8);
                if (nr_stripes <= RAID5_STRIPE_POOL_SIZE)
                        break;
                md_wakeup_thread(raid_conf->thread);
                PRINTK(("waiting for some stripes to complete\n"));
                sleep_on(&raid5_wait_for_stripe);
        }
        md_wakeup_thread(raid_conf->thread);
        sh = kmalloc(sizeof(*sh), GFP_KERNEL);

        /*
         * The above might have slept, so perhaps another process
         * already created the stripe for us..
         */
        if ((tmp = find_stripe(raid_conf, sector, size)) != NULL) {
                kfree(sh);
                wait_on_stripe(tmp);
                return tmp;
        }
        if (sh) {
                memset(sh, 0, sizeof(*sh));
                sh->phase = PHASE_COMPLETE;
                sh->cmd = STRIPE_NONE;
                sh->raid_conf = raid_conf;
                sh->sector = sector;
                sh->size = size;
                insert_hash(sh);
        }
        return sh;
}

static struct stripe_head *get_stripe(struct raid5_data *raid_conf, unsigned long sector, int size)
{
        struct stripe_head *sh;

        PRINTK(("get_stripe, sector %lu\n", sector));
        sh = find_stripe(raid_conf, sector, size);
        if (sh)
                wait_on_stripe(sh);
        else
                sh = kmalloc_stripe(raid_conf, sector, size);
        return sh;
}

static struct buffer_head *remove_bh (int b_size)
{
        struct buffer_head *bh, *bhp = NULL;
        unsigned long flags;

        save_flags(flags);
        cli();
        if ((bh = raid5_buffer_list) == NULL) {
                /* re-enable interrupts before bailing out */
                restore_flags(flags);
                return NULL;
        }
        do {
                if (bh->b_size == b_size || b_size == -1)
                        break;
                bhp = bh;
                bh = bh->b_next;
        } while (bh);
        if (!bh) {
                restore_flags(flags);
                return NULL;
        }
        if (bhp)
                bhp->b_next = bh->b_next;
        else
                raid5_buffer_list = bh->b_next;
#if RAID5_DEBUG
        if (bh->b_size == 1024)
                free_1024--;
        if (bh->b_size == 4096)
                free_4096--;
#endif
        nr_free_buffers--;
        if (!nr_free_buffers && raid5_buffer_list)
                printk ("raid5: bug: buffer_list != NULL, nr_free_buffers == 0\n");
        restore_flags(flags);
        return bh;
}


static void shrink_buffers (int num)
{
        struct buffer_head *bh;

        while (num--) {
                if ((bh = remove_bh(-1)) == NULL)
                        return;
                if (bh->b_size == PAGE_SIZE)
                        free_page ((unsigned long) bh->b_data);
                else
                        kfree (bh->b_data);
                kfree (bh);
        }
}

static void grow_buffers (int num, int b_size, int priority)
{
        struct buffer_head *bh;

        while (num--) {
                bh = kmalloc (sizeof (struct buffer_head), priority);
                if (!bh)
                        break;
                memset (bh, 0, sizeof (struct buffer_head));
                if (b_size == PAGE_SIZE)
                        bh->b_data = (char *) __get_free_page (priority);
                else
                        bh->b_data = kmalloc (b_size, priority);
                if (!bh->b_data) {
                        kfree (bh);
                        break;
                }
                bh->b_size = b_size;
                add_bh (bh);
        }
}

static struct buffer_head *raid5_kmalloc_bh (struct stripe_head *sh, int b_size)
{
        struct buffer_head *bh;
        struct raid5_data *raid_conf = sh->raid_conf;
        unsigned long flags;

        bh = remove_bh(b_size);
        if (!bh && nr_free_buffers > RAID5_POOL_SIZE / 10)
                shrink_buffers (RAID5_POOL_SIZE / 10);
        if (!bh && nr_used_buffers < RAID5_POOL_SIZE) {
#if 0
                grow_buffers (200, b_size, GFP_BUFFER);
#else
                grow_buffers (200, b_size, GFP_KERNEL);
#endif
                bh = remove_bh(b_size);
        }
        if (bh == NULL && nr_used_buffers > RAID5_POOL_SIZE / 2) {
                shrink_stripe_cache(RAID5_STRIPE_POOL_SIZE / 2);
                bh = remove_bh(b_size);
        }

        while (bh == NULL && nr_used_buffers > 3 * RAID5_POOL_SIZE / 4) {
                md_wakeup_thread(raid_conf->thread);
                run_task_queue (&tq_disk);
                unplug_devices(sh);
                PRINTK(("waiting for bh\n"));
                sleep_on (&raid5_wait_for_bh);
                bh = remove_bh(b_size);
        }
        if (bh == NULL) {
                grow_buffers (200, b_size, GFP_KERNEL);
                bh = remove_bh(b_size);
        }
        if (bh) {
                save_flags(flags);
                cli();
                nr_used_buffers++;
                if (nr_used_buffers > max_nr_used_buffers)
                        max_nr_used_buffers = nr_used_buffers;
#if RAID5_DEBUG
                if (bh->b_size == 1024)
                        used_1024++;
                if (bh->b_size == 4096)
                        used_4096++;
                printk ("kmalloc_bh: free, used, pending, max = %d, %d, %d, %d\n", nr_free_buffers, nr_used_buffers, nr_pending, max_nr_used_buffers);
                printk ("kmalloc_bh: free1, used1, free4, used4 = %d, %d, %d, %d\n", free_1024, used_1024, free_4096, used_4096);
#endif
                restore_flags(flags);
        }
        return bh;
}

static inline void raid5_end_buffer_io (struct stripe_head *sh, int i, int uptodate)
{
        struct buffer_head *bh = sh->bh_new[i];

        sh->bh_new[i] = NULL;
        clear_bit (BH_MD, &bh->b_state);
        bh->private_bh = NULL;
        bh->personality = NULL;
        mark_buffer_uptodate(bh, uptodate);
        unlock_buffer(bh);
        if (!uptodate)
                printk(KERN_ALERT "raid5: %s: unrecoverable I/O error for "
                       "block %lu\n", kdevname(bh->b_dev), bh->b_blocknr);
}

static inline void raid5_mark_buffer_uptodate (struct buffer_head *bh, int uptodate)
{
        if (uptodate)
                set_bit(BH_Uptodate, &bh->b_state);
        else
                clear_bit(BH_Uptodate, &bh->b_state);
}

static void raid5_end_request (struct buffer_head * bh, int uptodate)
{
        struct stripe_head *sh = bh->private_bh;
        struct raid5_data *raid_conf = sh->raid_conf;
        int disks = raid_conf->raid_disks, i;
        unsigned long flags;

        PRINTK(("end_request %lu, nr_pending %d\n", sh->sector, sh->nr_pending));
        save_flags(flags);
        cli();
        raid5_mark_buffer_uptodate(bh, uptodate);
        --sh->nr_pending;
        if (!sh->nr_pending) {
                md_wakeup_thread(raid_conf->thread);
                atomic_inc(&raid_conf->nr_handle);
                if (!stripe_handle_tail)
                        stripe_handle_list = sh;
                else
                        stripe_handle_tail->handle_next = sh;
                sh->handle_next = NULL;
                stripe_handle_tail = sh;
        }
        if (!uptodate)
                md_error(bh->b_dev, bh->b_rdev);
        if (raid_conf->failed_disks) {
                for (i = 0; i < disks; i++) {
                        if (raid_conf->disks[i].operational)
                                continue;
                        if (bh != sh->bh_old[i] && bh != sh->bh_new[i] && bh != sh->bh_copy[i])
                                continue;
                        set_bit(STRIPE_ERROR, &sh->state);
                }
        }
        restore_flags(flags);
}

static int raid5_map (struct md_dev *mddev, kdev_t *rdev,
                      unsigned long *rsector, unsigned long size)
{
        /* No complex mapping used: the core of the work is done in the
         * request routine
         */
        return 0;
}

static void raid5_build_block (struct stripe_head *sh, struct buffer_head *bh, int i)
{
        struct raid5_data *raid_conf = sh->raid_conf;
        struct md_dev *mddev = raid_conf->mddev;
        int minor = (int) (mddev - md_dev);
        char *b_data;

        b_data = ((volatile struct buffer_head *) bh)->b_data;
        memset (bh, 0, sizeof (struct buffer_head));
        ((volatile struct buffer_head *) bh)->b_data = b_data;

        bh->personality = &raid5_personality;
        bh->private_bh  = (void *) sh;

        bh->b_rdev      = raid_conf->disks[i].dev;
        bh->b_dev       = MKDEV(MD_MAJOR, minor);
        bh->b_rsector   = sh->sector;
        bh->b_blocknr   = sh->sector / (sh->size >> 9);

        bh->b_state     = (1 << BH_MD) | (1 << BH_Req);
        bh->b_count     = 1;
        bh->b_size      = sh->size;
        bh->b_list      = BUF_LOCKED;
}

static int raid5_error (struct md_dev *mddev, kdev_t dev)
{
        struct raid5_data *raid_conf = (struct raid5_data *) mddev->private;
        md_superblock_t *sb = mddev->sb;
        struct disk_info *disk;
        int i;

        PRINTK(("raid5_error called\n"));
        for (i = 0, disk = raid_conf->disks; i < raid_conf->raid_disks; i++, disk++)
                if (disk->dev == dev && disk->operational) {
                        disk->operational = 0;
                        sb->disks[disk->number].state |= (1 << MD_FAULTY_DEVICE);
                        sb->disks[disk->number].state &= ~(1 << MD_SYNC_DEVICE);
                        sb->disks[disk->number].state &= ~(1 << MD_ACTIVE_DEVICE);
                        sb->active_disks--;
                        sb->working_disks--;
                        sb->failed_disks++;
                        mddev->sb_dirty = 1;
                        raid_conf->working_disks--;
                        raid_conf->failed_disks++;
                        md_wakeup_thread(raid_conf->thread);
                        printk (KERN_ALERT
                                "RAID5: Disk failure on %s, disabling device. "
                                "Operation continuing on %d devices\n",
                                kdevname (dev), raid_conf->working_disks);
                }
        return 0;
}

/*
 * Input: a 'big' sector number,
 * Output: index of the data and parity disk, and the sector # in them.
 */
static inline unsigned long
raid5_compute_sector (int r_sector, unsigned int raid_disks, unsigned int data_disks,
                        unsigned int * dd_idx, unsigned int * pd_idx,
                        struct raid5_data *raid_conf)
{
        unsigned int  stripe;
        int chunk_number, chunk_offset;
        unsigned long new_sector;
        int sectors_per_chunk = raid_conf->chunk_size >> 9;

        /* First compute the information on this sector */

        /*
         * Compute the chunk number and the sector offset inside the chunk
         */
        chunk_number = r_sector / sectors_per_chunk;
        chunk_offset = r_sector % sectors_per_chunk;

        /*
         * Compute the stripe number
         */
        stripe = chunk_number / data_disks;

        /*
         * Compute the data disk and parity disk indexes inside the stripe
         */
        *dd_idx = chunk_number % data_disks;

        /*
         * Select the parity disk based on the user selected algorithm.
         */
        if (raid_conf->level == 4)
                *pd_idx = data_disks;
        else switch (raid_conf->algorithm) {
                case ALGORITHM_LEFT_ASYMMETRIC:
                        *pd_idx = data_disks - stripe % raid_disks;
                        if (*dd_idx >= *pd_idx)
                                (*dd_idx)++;
                        break;
                case ALGORITHM_RIGHT_ASYMMETRIC:
                        *pd_idx = stripe % raid_disks;
                        if (*dd_idx >= *pd_idx)
                                (*dd_idx)++;
                        break;
                case ALGORITHM_LEFT_SYMMETRIC:
                        *pd_idx = data_disks - stripe % raid_disks;
                        *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
                        break;
                case ALGORITHM_RIGHT_SYMMETRIC:
                        *pd_idx = stripe % raid_disks;
                        *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
                        break;
                default:
                        printk ("raid5: unsupported algorithm %d\n", raid_conf->algorithm);
        }

        /*
         * Finally, compute the new sector number
         */
        new_sector = stripe * sectors_per_chunk + chunk_offset;

#if 0
        if (    *dd_idx > data_disks || *pd_idx > data_disks ||
                chunk_offset + bh->b_size / 512 > sectors_per_chunk     )

                printk ("raid5: bug: dd_idx == %d, pd_idx == %d, chunk_offset == %d\n",
                                *dd_idx, *pd_idx, chunk_offset);
#endif

        return new_sector;
}
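
/*
 * Worked example (parameters chosen purely for illustration): with 4 raid
 * disks, 3 data disks, a 64 KB chunk (sectors_per_chunk == 128) and
 * ALGORITHM_LEFT_ASYMMETRIC, r_sector 1000 gives chunk_number 7,
 * chunk_offset 104, stripe 2 and dd_idx 1.  The parity disk is
 * 3 - (2 % 4) == 1; since dd_idx >= pd_idx it shifts up to 2, and the
 * sector used on that disk is 2 * 128 + 104 == 360.
 */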

static unsigned long compute_blocknr(struct stripe_head *sh, int i)
{
        struct raid5_data *raid_conf = sh->raid_conf;
        int raid_disks = raid_conf->raid_disks, data_disks = raid_disks - 1;
        unsigned long new_sector = sh->sector, check;
        int sectors_per_chunk = raid_conf->chunk_size >> 9;
        unsigned long stripe = new_sector / sectors_per_chunk;
        int chunk_offset = new_sector % sectors_per_chunk;
        int chunk_number, dummy1, dummy2, dd_idx = i;
        unsigned long r_sector, blocknr;

        switch (raid_conf->algorithm) {
                case ALGORITHM_LEFT_ASYMMETRIC:
                case ALGORITHM_RIGHT_ASYMMETRIC:
                        if (i > sh->pd_idx)
                                i--;
                        break;
                case ALGORITHM_LEFT_SYMMETRIC:
                case ALGORITHM_RIGHT_SYMMETRIC:
                        if (i < sh->pd_idx)
                                i += raid_disks;
                        i -= (sh->pd_idx + 1);
                        break;
                default:
                        printk ("raid5: unsupported algorithm %d\n", raid_conf->algorithm);
        }

        chunk_number = stripe * data_disks + i;
        r_sector = chunk_number * sectors_per_chunk + chunk_offset;
        blocknr = r_sector / (sh->size >> 9);

        check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, raid_conf);
        if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
                printk("compute_blocknr: map not correct\n");
                return 0;
        }
        return blocknr;
}
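
/*
 * compute_blocknr() is the inverse of raid5_compute_sector(): given a
 * stripe and a disk index it recovers the block number on the MD device,
 * then re-runs the forward mapping as a self-check.  handle_stripe()
 * uses it to look up the sibling blocks of a partial write in the
 * buffer cache.
 */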

static void xor_block(struct buffer_head *dest, struct buffer_head *source)
{
        int lines = dest->b_size / (sizeof (int)) / 8, i;
        int *destp = (int *) dest->b_data, *sourcep = (int *) source->b_data;

        for (i = lines; i > 0; i--) {
                *(destp + 0) ^= *(sourcep + 0);
                *(destp + 1) ^= *(sourcep + 1);
                *(destp + 2) ^= *(sourcep + 2);
                *(destp + 3) ^= *(sourcep + 3);
                *(destp + 4) ^= *(sourcep + 4);
                *(destp + 5) ^= *(sourcep + 5);
                *(destp + 6) ^= *(sourcep + 6);
                *(destp + 7) ^= *(sourcep + 7);
                destp += 8;
                sourcep += 8;
        }
}
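
/*
 * Everything in this driver rests on the XOR identity: if parity is
 * P = D0 ^ D1 ^ ... ^ Dn, any single missing block can be rebuilt by
 * XOR-ing the parity with the surviving blocks, e.g.
 * D1 = P ^ D0 ^ D2 ^ ... ^ Dn.  The loop above is simply that XOR,
 * unrolled eight ints at a time.
 */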

static void compute_block(struct stripe_head *sh, int dd_idx)
{
        struct raid5_data *raid_conf = sh->raid_conf;
        int i, disks = raid_conf->raid_disks;

        PRINTK(("compute_block, stripe %lu, idx %d\n", sh->sector, dd_idx));

        if (sh->bh_old[dd_idx] == NULL)
                sh->bh_old[dd_idx] = raid5_kmalloc_bh(sh, sh->size);
        raid5_build_block(sh, sh->bh_old[dd_idx], dd_idx);

        memset(sh->bh_old[dd_idx]->b_data, 0, sh->size);
        for (i = 0; i < disks; i++) {
                if (i == dd_idx)
                        continue;
                if (sh->bh_old[i]) {
                        xor_block(sh->bh_old[dd_idx], sh->bh_old[i]);
                        continue;
                } else
                        printk("compute_block() %d, stripe %lu, %d not present\n", dd_idx, sh->sector, i);
        }
        raid5_mark_buffer_uptodate(sh->bh_old[dd_idx], 1);
}

static void compute_parity(struct stripe_head *sh, int method)
{
        struct raid5_data *raid_conf = sh->raid_conf;
        int i, pd_idx = sh->pd_idx, disks = raid_conf->raid_disks;

        PRINTK(("compute_parity, stripe %lu, method %d\n", sh->sector, method));
        for (i = 0; i < disks; i++) {
                if (i == pd_idx || !sh->bh_new[i])
                        continue;
                if (!sh->bh_copy[i])
                        sh->bh_copy[i] = raid5_kmalloc_bh(sh, sh->size);
                raid5_build_block(sh, sh->bh_copy[i], i);
                mark_buffer_clean(sh->bh_new[i]);
                memcpy(sh->bh_copy[i]->b_data, sh->bh_new[i]->b_data, sh->size);
        }
        if (sh->bh_copy[pd_idx] == NULL)
                sh->bh_copy[pd_idx] = raid5_kmalloc_bh(sh, sh->size);
        raid5_build_block(sh, sh->bh_copy[pd_idx], sh->pd_idx);

        if (method == RECONSTRUCT_WRITE) {
                memset(sh->bh_copy[pd_idx]->b_data, 0, sh->size);
                for (i = 0; i < disks; i++) {
                        if (i == sh->pd_idx)
                                continue;
                        if (sh->bh_new[i]) {
                                xor_block(sh->bh_copy[pd_idx], sh->bh_copy[i]);
                                continue;
                        }
                        if (sh->bh_old[i]) {
                                xor_block(sh->bh_copy[pd_idx], sh->bh_old[i]);
                                continue;
                        }
                }
        } else if (method == READ_MODIFY_WRITE) {
                memcpy(sh->bh_copy[pd_idx]->b_data, sh->bh_old[pd_idx]->b_data, sh->size);
                for (i = 0; i < disks; i++) {
                        if (i == sh->pd_idx)
                                continue;
                        if (sh->bh_new[i] && sh->bh_old[i]) {
                                xor_block(sh->bh_copy[pd_idx], sh->bh_copy[i]);
                                xor_block(sh->bh_copy[pd_idx], sh->bh_old[i]);
                                continue;
                        }
                }
        }
        raid5_mark_buffer_uptodate(sh->bh_copy[pd_idx], 1);
}

static void add_stripe_bh (struct stripe_head *sh, struct buffer_head *bh, int dd_idx, int rw)
{
        struct raid5_data *raid_conf = sh->raid_conf;

        if (sh->bh_new[dd_idx])
                printk("raid5: bug: stripe->bh_new[%d], sector %lu exists\n", dd_idx, sh->sector);

        set_bit(BH_MD, &bh->b_state);
        set_bit(BH_Lock, &bh->b_state);
        bh->personality  = &raid5_personality;
        bh->private_bh   = (void *) sh;
        bh->b_rdev    = raid_conf->disks[dd_idx].dev;
        bh->b_rsector = sh->sector;

        if (sh->phase == PHASE_COMPLETE && sh->cmd == STRIPE_NONE) {
                sh->phase = PHASE_BEGIN;
                sh->cmd = (rw == READ) ? STRIPE_READ : STRIPE_WRITE;
                nr_pending_stripes++;
                atomic_inc(&raid_conf->nr_handle);
        }
        sh->bh_new[dd_idx] = bh;
        sh->cmd_new[dd_idx] = rw;
        sh->new[dd_idx] = 1;
}
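
/*
 * add_stripe_bh() is called both from raid5_make_request(), for buffers
 * submitted by the upper layers, and opportunistically from
 * handle_stripe(), which uses efind_buffer() to pull dirty sibling
 * blocks of a partial write out of the buffer cache so that a whole
 * stripe can be written in one pass.
 */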

static void complete_stripe(struct stripe_head *sh)
{
        struct raid5_data *raid_conf = sh->raid_conf;
        int disks = raid_conf->raid_disks;
        int i, new = 0;

        PRINTK(("complete_stripe %lu\n", sh->sector));
        for (i = 0; i < disks; i++) {
                if (sh->cmd == STRIPE_WRITE && i == sh->pd_idx)
                        raid5_update_old_bh(sh, i);
                if (sh->bh_new[i]) {
                        if (!sh->new[i]) {
#if 0
                                if (sh->cmd == STRIPE_WRITE) {
                                        if (memcmp(sh->bh_new[i]->b_data, sh->bh_copy[i]->b_data, sh->size)) {
                                                printk("copy differs, %s, sector %lu ",
                                                        test_bit(BH_Dirty, &sh->bh_new[i]->b_state) ? "dirty" : "clean",
                                                        sh->sector);
                                        } else if (test_bit(BH_Dirty, &sh->bh_new[i]->b_state))
                                                printk("sector %lu dirty\n", sh->sector);
                                }
#endif
                                if (sh->cmd == STRIPE_WRITE)
                                        raid5_update_old_bh(sh, i);
                                raid5_end_buffer_io(sh, i, 1);
                                continue;
                        } else
                                new++;
                }
                if (new && sh->cmd == STRIPE_WRITE)
                        printk("raid5: bug, completed STRIPE_WRITE with new == %d\n", new);
        }
        if (!new)
                finish_stripe(sh);
        else {
                PRINTK(("stripe %lu, new == %d\n", sh->sector, new));
                sh->phase = PHASE_BEGIN;
        }
}

/*
 * handle_stripe() is our main logic routine. Note that:
 *
 * 1.   lock_stripe() should be used whenever we can't accept additional
 *      buffers, either during short sleeping in handle_stripe() or
 *      during io operations.
 *
 * 2.   We should be careful to set sh->nr_pending whenever we sleep,
 *      to prevent re-entry of handle_stripe() for the same sh.
 *
 * 3.   raid_conf->failed_disks and disk->operational can be changed
 *      from an interrupt. This complicates things a bit, but it allows
 *      us to stop issuing requests for a failed drive as soon as possible.
 */
static void handle_stripe(struct stripe_head *sh)
{
        struct raid5_data *raid_conf = sh->raid_conf;
        struct md_dev *mddev = raid_conf->mddev;
        int minor = (int) (mddev - md_dev);
        struct buffer_head *bh;
        int disks = raid_conf->raid_disks;
        int i, nr = 0, nr_read = 0, nr_write = 0;
        int nr_cache = 0, nr_cache_other = 0, nr_cache_overwrite = 0, parity = 0;
        int nr_failed_other = 0, nr_failed_overwrite = 0, parity_failed = 0;
        int reading = 0, nr_writing = 0;
        int method1 = INT_MAX, method2 = INT_MAX;
        int block;
        unsigned long flags;
        int operational[MD_SB_DISKS], failed_disks = raid_conf->failed_disks;

        PRINTK(("handle_stripe(), stripe %lu\n", sh->sector));
        if (sh->nr_pending) {
                printk("handle_stripe(), stripe %lu, io still pending\n", sh->sector);
                return;
        }
        if (sh->phase == PHASE_COMPLETE) {
                printk("handle_stripe(), stripe %lu, already complete\n", sh->sector);
                return;
        }

        atomic_dec(&raid_conf->nr_handle);

        if (clear_bit(STRIPE_ERROR, &sh->state)) {
                printk("raid5: restarting stripe %lu\n", sh->sector);
                sh->phase = PHASE_BEGIN;
        }

        if ((sh->cmd == STRIPE_WRITE && sh->phase == PHASE_WRITE) ||
            (sh->cmd == STRIPE_READ && sh->phase == PHASE_READ)) {
                /*
                 * Completed
                 */
                complete_stripe(sh);
                if (sh->phase == PHASE_COMPLETE)
                        return;
        }

        save_flags(flags);
        cli();
        for (i = 0; i < disks; i++)
                operational[i] = raid_conf->disks[i].operational;
        failed_disks = raid_conf->failed_disks;
        restore_flags(flags);

        if (failed_disks > 1) {
                for (i = 0; i < disks; i++) {
                        if (sh->bh_new[i]) {
                                raid5_end_buffer_io(sh, i, 0);
                                continue;
                        }
                }
                finish_stripe(sh);
                return;
        }

        for (i = 0; i < disks; i++) {
                if (sh->bh_old[i])
                        nr_cache++;
                if (i == sh->pd_idx) {
                        if (sh->bh_old[i])
                                parity = 1;
                        else if (!operational[i])
                                parity_failed = 1;
                        continue;
                }
                if (!sh->bh_new[i]) {
                        if (sh->bh_old[i])
                                nr_cache_other++;
                        else if (!operational[i])
                                nr_failed_other++;
                        continue;
                }
                sh->new[i] = 0;
                nr++;
                if (sh->cmd_new[i] == READ)
                        nr_read++;
                if (sh->cmd_new[i] == WRITE)
                        nr_write++;
                if (sh->bh_old[i])
                        nr_cache_overwrite++;
                else if (!operational[i])
                        nr_failed_overwrite++;
        }

        if (nr_write && nr_read)
                printk("raid5: bug, nr_write == %d, nr_read == %d, sh->cmd == %d\n", nr_write, nr_read, sh->cmd);

        if (nr_write) {
                /*
                 * Attempt to add entries :-)
                 */
                if (nr_write != disks - 1) {
                        for (i = 0; i < disks; i++) {
                                if (i == sh->pd_idx)
                                        continue;
                                if (sh->bh_new[i])
                                        continue;
                                block = (int) compute_blocknr(sh, i);
                                bh = efind_buffer(MKDEV(MD_MAJOR, minor), block, sh->size);
                                if (bh && bh->b_count == 0 && buffer_dirty(bh) && !buffer_locked(bh)) {
                                        PRINTK(("Whee.. sector %lu, index %d (%d) found in the buffer cache!\n", sh->sector, i, block));
                                        add_stripe_bh(sh, bh, i, WRITE);
                                        sh->new[i] = 0;
                                        nr++; nr_write++;
                                        if (sh->bh_old[i]) {
                                                nr_cache_overwrite++;
                                                nr_cache_other--;
                                        } else if (!operational[i]) {
                                                nr_failed_overwrite++;
                                                nr_failed_other--;
                                        }
                                }
                        }
                }
                PRINTK(("handle_stripe() -- begin writing, stripe %lu\n", sh->sector));
                /*
                 * Writing, need to update parity buffer.
                 *
                 * Compute the number of I/O requests in the "reconstruct
                 * write" and "read modify write" methods.
                 */
                if (!nr_failed_other)
                        method1 = (disks - 1) - (nr_write + nr_cache_other);
                if (!nr_failed_overwrite && !parity_failed)
                        method2 = nr_write - nr_cache_overwrite + (1 - parity);
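
                /*
                 * For example, on a 5-disk array with one fresh block to
                 * write and nothing cached: reconstruct-write needs
                 * method1 == 4 - 1 == 3 reads, while read-modify-write
                 * needs method2 == 1 - 0 + 1 == 2 reads (the old data
                 * plus the old parity), so the read-modify-write path
                 * wins.
                 */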

                if (method1 == INT_MAX && method2 == INT_MAX)
                        printk("raid5: bug: method1 == method2 == INT_MAX\n");
                PRINTK(("handle_stripe(), sector %lu, nr_write %d, method1 %d, method2 %d\n", sh->sector, nr_write, method1, method2));

                if (!method1 || !method2) {
                        lock_stripe(sh);
                        sh->nr_pending++;
                        sh->phase = PHASE_WRITE;
                        compute_parity(sh, method1 <= method2 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
                        for (i = 0; i < disks; i++) {
                                if (!operational[i])
                                        continue;
                                if (i == sh->pd_idx || sh->bh_new[i])
                                        nr_writing++;
                        }

                        sh->nr_pending = nr_writing;
                        PRINTK(("handle_stripe() %lu, writing back %d\n", sh->sector, sh->nr_pending));

                        for (i = 0; i < disks; i++) {
                                if (!operational[i])
                                        continue;
                                bh = sh->bh_copy[i];
                                if (i != sh->pd_idx && ((bh == NULL) ^ (sh->bh_new[i] == NULL)))
                                        printk("raid5: bug: bh == %p, bh_new[%d] == %p\n", bh, i, sh->bh_new[i]);
                                if (i == sh->pd_idx && !bh)
                                        printk("raid5: bug: bh == NULL, i == pd_idx == %d\n", i);
                                if (bh) {
                                        bh->b_state |= (1<<BH_Dirty);
                                        PRINTK(("making request for buffer %d\n", i));
                                        clear_bit(BH_Lock, &bh->b_state);
                                        make_request(MAJOR(raid_conf->disks[i].dev), WRITE, bh);
                                }
                        }
                        return;
                }

                lock_stripe(sh);
                sh->nr_pending++;
                if (method1 < method2) {
                        sh->write_method = RECONSTRUCT_WRITE;
                        for (i = 0; i < disks; i++) {
                                if (i == sh->pd_idx)
                                        continue;
                                if (sh->bh_new[i] || sh->bh_old[i])
                                        continue;
                                sh->bh_old[i] = raid5_kmalloc_bh(sh, sh->size);
                                raid5_build_block(sh, sh->bh_old[i], i);
                                reading++;
                        }
                } else {
                        sh->write_method = READ_MODIFY_WRITE;
                        for (i = 0; i < disks; i++) {
                                if (sh->bh_old[i])
                                        continue;
                                if (!sh->bh_new[i] && i != sh->pd_idx)
                                        continue;
                                sh->bh_old[i] = raid5_kmalloc_bh(sh, sh->size);
                                raid5_build_block(sh, sh->bh_old[i], i);
                                reading++;
                        }
                }
                sh->phase = PHASE_READ_OLD;
                sh->nr_pending = reading;
                PRINTK(("handle_stripe() %lu, reading %d old buffers\n", sh->sector, sh->nr_pending));
                for (i = 0; i < disks; i++) {
                        if (!sh->bh_old[i])
                                continue;
                        if (buffer_uptodate(sh->bh_old[i]))
                                continue;
                        clear_bit(BH_Lock, &sh->bh_old[i]->b_state);
                        make_request(MAJOR(raid_conf->disks[i].dev), READ, sh->bh_old[i]);
                }
        } else {
                /*
                 * Reading
                 */
                method1 = nr_read - nr_cache_overwrite;
                lock_stripe(sh);
                sh->nr_pending++;

                PRINTK(("handle_stripe(), sector %lu, nr_read %d, nr_cache %d, method1 %d\n", sh->sector, nr_read, nr_cache, method1));
                if (!method1 || (method1 == 1 && nr_cache == disks - 1)) {
                        PRINTK(("read %lu completed from cache\n", sh->sector));
                        for (i = 0; i < disks; i++) {
                                if (!sh->bh_new[i])
                                        continue;
                                if (!sh->bh_old[i])
                                        compute_block(sh, i);
                                memcpy(sh->bh_new[i]->b_data, sh->bh_old[i]->b_data, sh->size);
                        }
                        sh->nr_pending--;
                        complete_stripe(sh);
                        return;
                }
                if (nr_failed_overwrite) {
                        sh->phase = PHASE_READ_OLD;
                        sh->nr_pending = (disks - 1) - nr_cache;
                        PRINTK(("handle_stripe() %lu, phase READ_OLD, pending %d\n", sh->sector, sh->nr_pending));
                        for (i = 0; i < disks; i++) {
                                if (sh->bh_old[i])
                                        continue;
                                if (!operational[i])
                                        continue;
                                sh->bh_old[i] = raid5_kmalloc_bh(sh, sh->size);
                                raid5_build_block(sh, sh->bh_old[i], i);
                                clear_bit(BH_Lock, &sh->bh_old[i]->b_state);
                                make_request(MAJOR(raid_conf->disks[i].dev), READ, sh->bh_old[i]);
                        }
                } else {
                        sh->phase = PHASE_READ;
                        sh->nr_pending = nr_read - nr_cache_overwrite;
                        PRINTK(("handle_stripe() %lu, phase READ, pending %d\n", sh->sector, sh->nr_pending));
                        for (i = 0; i < disks; i++) {
                                if (!sh->bh_new[i])
                                        continue;
                                if (sh->bh_old[i]) {
                                        memcpy(sh->bh_new[i]->b_data, sh->bh_old[i]->b_data, sh->size);
                                        continue;
                                }
                                clear_bit(BH_Lock, &sh->bh_new[i]->b_state);
                                make_request(MAJOR(raid_conf->disks[i].dev), READ, sh->bh_new[i]);
                        }
                }
        }
}

static int raid5_make_request (struct md_dev *mddev, int rw, struct buffer_head * bh)
{
        struct raid5_data *raid_conf = (struct raid5_data *) mddev->private;
        const unsigned int raid_disks = raid_conf->raid_disks;
        const unsigned int data_disks = raid_disks - 1;
        unsigned int  dd_idx, pd_idx;
        unsigned long new_sector;

        struct stripe_head *sh;

        if (rw == READA) rw = READ;
        if (rw == WRITEA) rw = WRITE;

        new_sector = raid5_compute_sector(bh->b_rsector, raid_disks, data_disks,
                                                &dd_idx, &pd_idx, raid_conf);

        PRINTK(("raid5_make_request, sector %lu\n", new_sector));
        sh = get_stripe(raid_conf, new_sector, bh->b_size);
        if ((rw == READ && sh->cmd == STRIPE_WRITE) || (rw == WRITE && sh->cmd == STRIPE_READ)) {
                printk("raid5: lock contention, rw == %d, sh->cmd == %d\n", rw, sh->cmd);
                lock_stripe(sh);
                if (!sh->nr_pending)
                        handle_stripe(sh);
                wait_on_stripe(sh);
        }
        sh->pd_idx = pd_idx;
        if (sh->phase != PHASE_COMPLETE && sh->phase != PHASE_BEGIN)
                PRINTK(("stripe %lu catching the bus!\n", sh->sector));
        add_stripe_bh(sh, bh, dd_idx, rw);

        md_wakeup_thread(raid_conf->thread);
        return 0;
}

/*
 * This is our raid5 kernel thread.
 *
 * We scan the hash table for stripes which can be handled now.
 * During the scan, completed stripes are saved for us by the interrupt
 * handler, so that they will not have to wait for our next wakeup.
 */
static void raid5d (void *data)
{
        struct stripe_head *sh;
        struct raid5_data *raid_conf = data;
        struct md_dev *mddev = raid_conf->mddev;
        int i, handled = 0, unplug = 0;
        unsigned long flags;

        PRINTK(("+++ raid5d active\n"));

        if (mddev->sb_dirty) {
                mddev->sb_dirty = 0;
                md_update_sb((int) (mddev - md_dev));
        }
        save_flags(flags);
        cli();
        stripe_handle_list = stripe_handle_tail = NULL;
        restore_flags(flags);

        for (i = 0; i < NR_HASH; i++) {
repeat:
                sh = stripe_hashtbl[i];
                for (; sh; sh = sh->hash_next) {
                        if (sh->raid_conf != raid_conf)
                                continue;
                        if (sh->phase == PHASE_COMPLETE)
                                continue;
                        if (sh->nr_pending)
                                continue;
                        if (sh->sector == raid_conf->next_sector) {
                                raid_conf->sector_count += (sh->size >> 9);
                                if (raid_conf->sector_count >= 128)
                                        unplug = 1;
                        } else
                                unplug = 1;
                        if (unplug) {
                                PRINTK(("unplugging devices, sector == %lu, count == %d\n", sh->sector, raid_conf->sector_count));
                                unplug_devices(sh);
                                unplug = 0;
                                raid_conf->sector_count = 0;
                        }
                        raid_conf->next_sector = sh->sector + (sh->size >> 9);
                        handled++;
                        handle_stripe(sh);
                        goto repeat;
                }
        }
        if (raid_conf) {
                PRINTK(("%d stripes handled, nr_handle %d\n", handled, raid_conf->nr_handle));
                save_flags(flags);
                cli();
                if (!raid_conf->nr_handle)
                        clear_bit(THREAD_WAKEUP, &raid_conf->thread->flags);
                /* re-enable interrupts before the thread goes back to sleep */
                restore_flags(flags);
        }
        PRINTK(("--- raid5d inactive\n"));
}

static int raid5_run (int minor, struct md_dev *mddev)
{
        struct raid5_data *raid_conf;
        int i, j, raid_disk;
        md_superblock_t *sb = mddev->sb;
        md_descriptor_t *descriptor;
        struct real_dev *realdev;

        MOD_INC_USE_COUNT;

        if (sb->level != 5 && sb->level != 4) {
                printk("raid5: %s: raid level not set to 4/5 (%d)\n", kdevname(MKDEV(MD_MAJOR, minor)), sb->level);
                MOD_DEC_USE_COUNT;
                return -EIO;
        }

        mddev->private = kmalloc (sizeof (struct raid5_data), GFP_KERNEL);
        if ((raid_conf = mddev->private) == NULL)
                goto abort;     /* don't dereference a failed allocation */
        memset (raid_conf, 0, sizeof (*raid_conf));
        raid_conf->mddev = mddev;

        PRINTK(("raid5_run(%d) called.\n", minor));

        for (i = 0; i < mddev->nb_dev; i++) {
                realdev = &mddev->devices[i];
                if (!realdev->sb) {
                        printk(KERN_ERR "raid5: disabled device %s (couldn't access raid superblock)\n", kdevname(realdev->dev));
                        continue;
                }

                /*
                 * This is important -- we are using the descriptor on
                 * the disk only to get a pointer to the descriptor on
                 * the main superblock, which might be more recent.
                 */
                descriptor = &sb->disks[realdev->sb->descriptor.number];
                if (descriptor->state & (1 << MD_FAULTY_DEVICE)) {
                        printk(KERN_ERR "raid5: disabled device %s (errors detected)\n", kdevname(realdev->dev));
                        continue;
                }
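
                /*
                 * Illustration of the indirection above, not extra logic:
                 * the descriptor read from the member disk is trusted only
                 * for its index; every state check uses the (possibly more
                 * recent) copy in the master superblock:
                 *
                 *      n = realdev->sb->descriptor.number;  (index from member disk)
                 *      descriptor = &sb->disks[n];          (state from master sb)
                 */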
                if (descriptor->state & (1 << MD_ACTIVE_DEVICE)) {
                        if (!(descriptor->state & (1 << MD_SYNC_DEVICE))) {
                                printk(KERN_ERR "raid5: disabled device %s (not in sync)\n", kdevname(realdev->dev));
                                continue;
                        }
                        raid_disk = descriptor->raid_disk;
                        if (descriptor->number > sb->nr_disks || raid_disk > sb->raid_disks) {
                                printk(KERN_ERR "raid5: disabled device %s (inconsistent descriptor)\n", kdevname(realdev->dev));
                                continue;
                        }
                        if (raid_conf->disks[raid_disk].operational) {
                                printk(KERN_ERR "raid5: disabled device %s (device %d already operational)\n", kdevname(realdev->dev), raid_disk);
                                continue;
                        }
                        printk(KERN_INFO "raid5: device %s operational as raid disk %d\n", kdevname(realdev->dev), raid_disk);

                        raid_conf->disks[raid_disk].number = descriptor->number;
                        raid_conf->disks[raid_disk].raid_disk = raid_disk;
                        raid_conf->disks[raid_disk].dev = mddev->devices[i].dev;
                        raid_conf->disks[raid_disk].operational = 1;

                        raid_conf->working_disks++;
                }
        }
        raid_conf->raid_disks = sb->raid_disks;
        raid_conf->failed_disks = raid_conf->raid_disks - raid_conf->working_disks;
        raid_conf->mddev = mddev;
        raid_conf->chunk_size = sb->chunk_size;
        raid_conf->level = sb->level;
        raid_conf->algorithm = sb->parity_algorithm;

        if (!raid_conf->chunk_size || raid_conf->chunk_size % 4) {
                printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", raid_conf->chunk_size, kdevname(MKDEV(MD_MAJOR, minor)));
                goto abort;
        }
        if (raid_conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
                printk(KERN_ERR "raid5: unsupported parity algorithm %d for %s\n", raid_conf->algorithm, kdevname(MKDEV(MD_MAJOR, minor)));
                goto abort;
        }
        if (raid_conf->failed_disks > 1) {
                printk(KERN_ERR "raid5: not enough operational devices for %s (%d/%d failed)\n", kdevname(MKDEV(MD_MAJOR, minor)), raid_conf->failed_disks, raid_conf->raid_disks);
                goto abort;
        }
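
        /*
         * A single failed disk leaves the set degraded but usable: any
         * missing block is the xor of the surviving data and parity
         * blocks in its stripe, so reads can be served by reconstruction.
         * Two or more failures are unrecoverable, hence the check above.
         */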

#if 0
        if (check_consistenty(mddev)) {
                printk(KERN_ERR "raid5: detected raid-5 xor inconsistency -- run ckraid\n");
                sb->state |= 1 << MD_SB_ERRORS;
                goto abort;
        }
#endif

        if ((raid_conf->thread = md_register_thread(raid5d, raid_conf)) == NULL) {
                printk(KERN_ERR "raid5: couldn't allocate thread for %s\n", kdevname(MKDEV(MD_MAJOR, minor)));
                goto abort;
        }

        /*
         * Regenerate the "device is in sync with the raid set" bit for
         * each device.
         */
        for (i = 0; i < sb->nr_disks ; i++) {
                sb->disks[i].state &= ~(1 << MD_SYNC_DEVICE);
                for (j = 0; j < sb->raid_disks; j++) {
                        if (!raid_conf->disks[j].operational)
                                continue;
                        if (sb->disks[i].number == raid_conf->disks[j].number)
                                sb->disks[i].state |= 1 << MD_SYNC_DEVICE;
                }
        }
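
        /*
         * The nested loops above are an O(nr_disks * raid_disks) match:
         * every superblock slot first loses its SYNC bit, then gets it
         * back only if its device number belongs to a disk found
         * operational in the scan earlier in this function.
         */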
        sb->active_disks = raid_conf->working_disks;

        if (sb->active_disks == sb->raid_disks)
                printk("raid5: raid level %d set %s active with %d out of %d devices, algorithm %d\n", raid_conf->level, kdevname(MKDEV(MD_MAJOR, minor)), sb->active_disks, sb->raid_disks, raid_conf->algorithm);
        else
                printk(KERN_ALERT "raid5: raid level %d set %s active with %d out of %d devices, algorithm %d\n", raid_conf->level, kdevname(MKDEV(MD_MAJOR, minor)), sb->active_disks, sb->raid_disks, raid_conf->algorithm);

        /* Ok, everything is just fine now */
        return (0);
abort:
        if (raid_conf)
                kfree(raid_conf);
        mddev->private = NULL;
        printk(KERN_ALERT "raid5: failed to run raid set %s\n", kdevname(MKDEV(MD_MAJOR, minor)));
        MOD_DEC_USE_COUNT;
        return -EIO;
}
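
/*
 * Hypothetical sketch (not part of the driver) of the xor reconstruction
 * that degraded operation relies on: with equal-size buffers, the missing
 * block is the byte-wise xor of all surviving blocks of the stripe.
 */
#if 0
static void reconstruct_sketch (char *missing, char **surviving, int ndisks, int size)
{
        int i, j;

        memset(missing, 0, size);
        for (i = 0; i < ndisks; i++)
                for (j = 0; j < size; j++)
                        missing[j] ^= surviving[i][j];
}
#endif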

static int raid5_stop (int minor, struct md_dev *mddev)
{
        struct raid5_data *raid_conf = (struct raid5_data *) mddev->private;

        md_unregister_thread(raid_conf->thread);
        kfree (raid_conf);
        shrink_stripe_cache(RAID5_STRIPE_POOL_SIZE);
        shrink_buffers(RAID5_POOL_SIZE);
        MOD_DEC_USE_COUNT;
        return 0;
}

static int raid5_status (char *page, int minor, struct md_dev *mddev)
{
        struct raid5_data *raid_conf = (struct raid5_data *) mddev->private;
        md_superblock_t *sb = mddev->sb;
        int sz = 0, i;

        sz += sprintf (page+sz, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->parity_algorithm);
        sz += sprintf (page+sz, " [%d/%d] [", raid_conf->raid_disks, raid_conf->working_disks);
        for (i = 0; i < raid_conf->raid_disks; i++)
                sz += sprintf (page+sz, "%s", raid_conf->disks[i].operational ? "U" : "_");
        sz += sprintf (page+sz, "]");
        return sz;
}
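
/*
 * Example of the /proc/mdstat fragment produced above, for a hypothetical
 * three-disk raid5 set with 32k chunks and all disks up:
 *
 *       level 5, 32k chunk, algorithm 2 [3/3] [UUU]
 *
 * A failed second disk would show as [3/2] [U_U].
 */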

static struct md_personality raid5_personality=
{
        "raid5",
        raid5_map,
        raid5_make_request,
        raid5_end_request,
        raid5_run,
        raid5_stop,
        raid5_status,
        NULL,                   /* no ioctls */
        0,
        raid5_error
};
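
/*
 * The initializers above are positional (this is C89, no designated
 * initializers); they fill, in order, the md_personality slots: name,
 * map, make_request, end_request, run, stop, status, ioctl (none here),
 * max_invalid_dev and error_handler, per this kernel's linux/md.h.
 */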

int raid5_init (void)
{
        if ((stripe_hashtbl = (struct stripe_head **) __get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER, 0)) == NULL)
                return -ENOMEM;
        memset(stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
        return register_md_personality (RAID5, &raid5_personality);
}
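
/*
 * Sizing note, assuming the usual definitions earlier in this file
 * (HASH_PAGES pages of stripe_head pointers): the table has
 * NR_HASH = HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *)
 * buckets, e.g. 1024 buckets for a single 4k page on a 32-bit machine.
 */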

#ifdef MODULE
int init_module (void)
{
        return raid5_init();
}

void cleanup_module (void)
{
        free_pages((unsigned long) stripe_hashtbl, HASH_PAGES_ORDER);
        shrink_stripe_cache(RAID5_STRIPE_POOL_SIZE);
        shrink_buffers(RAID5_POOL_SIZE);
        unregister_md_personality (RAID5);
}
#endif
