linux/linux-2.4/drivers/md/raid1.c

/*
 * raid1.c : Multiple Devices driver for Linux
 *
 * Copyright (C) 1999, 2000 Ingo Molnar, Red Hat
 *
 * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *
 * RAID-1 management functions.
 *
 * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
 *
 * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
 * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include <linux/module.h>
#include <linux/config.h>
#include <linux/slab.h>
#include <linux/raid/raid1.h>
#include <asm/atomic.h>

#define MAJOR_NR MD_MAJOR
#define MD_DRIVER
#define MD_PERSONALITY

#define MAX_WORK_PER_DISK 128

#define NR_RESERVED_BUFS        32


/*
 * The following can be used to debug the driver
 */
#define RAID1_DEBUG     0

#if RAID1_DEBUG
#define PRINTK(x...)   printk(x)
#define inline
#define __inline__
#else
#define PRINTK(x...)  do { } while (0)
#endif


static mdk_personality_t raid1_personality;
static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED;
struct raid1_bh *raid1_retry_list = NULL, **raid1_retry_tail;

static struct buffer_head *raid1_alloc_bh(raid1_conf_t *conf, int cnt)
{
        /* return a linked list of "cnt" struct buffer_heads.
         * don't take any off the free list unless we know we can
         * get all we need, otherwise we could deadlock
         */
        struct buffer_head *bh=NULL;

        while(cnt) {
                struct buffer_head *t;
                md_spin_lock_irq(&conf->device_lock);
                if (!conf->freebh_blocked && conf->freebh_cnt >= cnt)
                        while (cnt) {
                                t = conf->freebh;
                                conf->freebh = t->b_next;
                                t->b_next = bh;
                                bh = t;
                                t->b_state = 0;
                                conf->freebh_cnt--;
                                cnt--;
                        }
                md_spin_unlock_irq(&conf->device_lock);
                if (cnt == 0)
                        break;
                t = kmem_cache_alloc(bh_cachep, SLAB_NOIO);
                if (t) {
                        t->b_next = bh;
                        bh = t;
                        cnt--;
                } else {
                        PRINTK("raid1: waiting for %d bh\n", cnt);
                        conf->freebh_blocked = 1;
                        wait_disk_event(conf->wait_buffer,
                                        !conf->freebh_blocked ||
                                        conf->freebh_cnt > conf->raid_disks * NR_RESERVED_BUFS/2);
                        conf->freebh_blocked = 0;
                }
        }
        return bh;
}
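/*
 * The all-or-nothing policy above is what keeps concurrent writers from
 * deadlocking on the shared buffer_head pool: the free list is only used
 * when it can satisfy the whole remaining request, otherwise the caller
 * falls back to the slab and, failing that, sleeps until the pool has
 * refilled past raid_disks * NR_RESERVED_BUFS / 2.
 *
 * A rough illustration (numbers are hypothetical, not from the source):
 * on a 3-disk array the reserved pool is 3 * 32 = 96 buffer_heads; a
 * write that still needs cnt == 3 heads while freebh_cnt == 2 skips the
 * pool entirely, tries kmem_cache_alloc(), and if that also fails it
 * waits until freebh_cnt > 48 before looking at the pool again.
 */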

static inline void raid1_free_bh(raid1_conf_t *conf, struct buffer_head *bh)
{
        unsigned long flags;
        spin_lock_irqsave(&conf->device_lock, flags);
        while (bh) {
                struct buffer_head *t = bh;
                bh=bh->b_next;
                if (t->b_pprev == NULL)
                        kmem_cache_free(bh_cachep, t);
                else {
                        t->b_next= conf->freebh;
                        conf->freebh = t;
                        conf->freebh_cnt++;
                }
        }
        spin_unlock_irqrestore(&conf->device_lock, flags);
        wake_up(&conf->wait_buffer);
}

static int raid1_grow_bh(raid1_conf_t *conf, int cnt)
{
        /* allocate cnt buffer_heads, possibly less if kmalloc fails */
        int i = 0;

        while (i < cnt) {
                struct buffer_head *bh;
                bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL);
                if (!bh) break;

                md_spin_lock_irq(&conf->device_lock);
                bh->b_pprev = &conf->freebh;
                bh->b_next = conf->freebh;
                conf->freebh = bh;
                conf->freebh_cnt++;
                md_spin_unlock_irq(&conf->device_lock);

                i++;
        }
        return i;
}

static void raid1_shrink_bh(raid1_conf_t *conf)
{
        /* discard all buffer_heads */

        md_spin_lock_irq(&conf->device_lock);
        while (conf->freebh) {
                struct buffer_head *bh = conf->freebh;
                conf->freebh = bh->b_next;
                kmem_cache_free(bh_cachep, bh);
                conf->freebh_cnt--;
        }
        md_spin_unlock_irq(&conf->device_lock);
}


static struct raid1_bh *raid1_alloc_r1bh(raid1_conf_t *conf)
{
        struct raid1_bh *r1_bh = NULL;

        do {
                md_spin_lock_irq(&conf->device_lock);
                if (!conf->freer1_blocked && conf->freer1) {
                        r1_bh = conf->freer1;
                        conf->freer1 = r1_bh->next_r1;
                        conf->freer1_cnt--;
                        r1_bh->next_r1 = NULL;
                        r1_bh->state = (1 << R1BH_PreAlloc);
                        r1_bh->bh_req.b_state = 0;
                }
                md_spin_unlock_irq(&conf->device_lock);
                if (r1_bh)
                        return r1_bh;
                r1_bh = (struct raid1_bh *) kmalloc(sizeof(struct raid1_bh), GFP_NOIO);
                if (r1_bh) {
                        memset(r1_bh, 0, sizeof(*r1_bh));
                        return r1_bh;
                }
                conf->freer1_blocked = 1;
                wait_disk_event(conf->wait_buffer,
                                !conf->freer1_blocked ||
                                conf->freer1_cnt > NR_RESERVED_BUFS/2
                        );
                conf->freer1_blocked = 0;
        } while (1);
}

static inline void raid1_free_r1bh(struct raid1_bh *r1_bh)
{
        struct buffer_head *bh = r1_bh->mirror_bh_list;
        raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);

        r1_bh->mirror_bh_list = NULL;

        if (test_bit(R1BH_PreAlloc, &r1_bh->state)) {
                unsigned long flags;
                spin_lock_irqsave(&conf->device_lock, flags);
                r1_bh->next_r1 = conf->freer1;
                conf->freer1 = r1_bh;
                conf->freer1_cnt++;
                spin_unlock_irqrestore(&conf->device_lock, flags);
                /* don't need to wakeup wait_buffer because
                 *  raid1_free_bh below will do that
                 */
        } else {
                kfree(r1_bh);
        }
        raid1_free_bh(conf, bh);
}

static int raid1_grow_r1bh (raid1_conf_t *conf, int cnt)
{
        int i = 0;

        while (i < cnt) {
                struct raid1_bh *r1_bh;
                r1_bh = (struct raid1_bh*)kmalloc(sizeof(*r1_bh), GFP_KERNEL);
                if (!r1_bh)
                        break;
                memset(r1_bh, 0, sizeof(*r1_bh));
                set_bit(R1BH_PreAlloc, &r1_bh->state);
                r1_bh->mddev = conf->mddev;

                raid1_free_r1bh(r1_bh);
                i++;
        }
        return i;
}

static void raid1_shrink_r1bh(raid1_conf_t *conf)
{
        md_spin_lock_irq(&conf->device_lock);
        while (conf->freer1) {
                struct raid1_bh *r1_bh = conf->freer1;
                conf->freer1 = r1_bh->next_r1;
                conf->freer1_cnt--;
                kfree(r1_bh);
        }
        md_spin_unlock_irq(&conf->device_lock);
}



static inline void raid1_free_buf(struct raid1_bh *r1_bh)
{
        unsigned long flags;
        struct buffer_head *bh = r1_bh->mirror_bh_list;
        raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
        r1_bh->mirror_bh_list = NULL;

        spin_lock_irqsave(&conf->device_lock, flags);
        r1_bh->next_r1 = conf->freebuf;
        conf->freebuf = r1_bh;
        spin_unlock_irqrestore(&conf->device_lock, flags);
        raid1_free_bh(conf, bh);
}

static struct raid1_bh *raid1_alloc_buf(raid1_conf_t *conf)
{
        struct raid1_bh *r1_bh;

        md_spin_lock_irq(&conf->device_lock);
        wait_event_lock_irq(conf->wait_buffer, conf->freebuf, conf->device_lock);
        r1_bh = conf->freebuf;
        conf->freebuf = r1_bh->next_r1;
        r1_bh->next_r1= NULL;
        md_spin_unlock_irq(&conf->device_lock);

        return r1_bh;
}

static int raid1_grow_buffers (raid1_conf_t *conf, int cnt)
{
        int i = 0;
        struct raid1_bh *head = NULL, **tail;
        tail = &head;

        while (i < cnt) {
                struct raid1_bh *r1_bh;
                struct page *page;

                page = alloc_page(GFP_KERNEL);
                if (!page)
                        break;

                r1_bh = (struct raid1_bh *) kmalloc(sizeof(*r1_bh), GFP_KERNEL);
                if (!r1_bh) {
                        __free_page(page);
                        break;
                }
                memset(r1_bh, 0, sizeof(*r1_bh));
                r1_bh->bh_req.b_page = page;
                r1_bh->bh_req.b_data = page_address(page);
                *tail = r1_bh;
                r1_bh->next_r1 = NULL;
                tail = & r1_bh->next_r1;
                i++;
        }
        /* this lock probably isn't needed, as at the time when
         * we are allocating buffers, nobody else will be touching the
         * freebuf list.  But it doesn't hurt....
         */
        md_spin_lock_irq(&conf->device_lock);
        *tail = conf->freebuf;
        conf->freebuf = head;
        md_spin_unlock_irq(&conf->device_lock);
        return i;
}

static void raid1_shrink_buffers (raid1_conf_t *conf)
{
        struct raid1_bh *head;
        md_spin_lock_irq(&conf->device_lock);
        head = conf->freebuf;
        conf->freebuf = NULL;
        md_spin_unlock_irq(&conf->device_lock);

        while (head) {
                struct raid1_bh *r1_bh = head;
                head = r1_bh->next_r1;
                __free_page(r1_bh->bh_req.b_page);
                kfree(r1_bh);
        }
}
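/*
 * The freebuf pool above differs from freebh/freer1 in that each entry
 * owns its own data page (bh_req.b_page / b_data): it backs resync reads,
 * which have no master buffer to borrow a page from.  The pool is grown
 * at the start of a resync and torn down when the resync finishes.
 *
 * As a rough sizing example (assuming 4 KiB pages, not stated in the
 * source): raid1_sync_request() asks for 128*2 / (PAGE_SIZE>>9) = 32
 * buffers, which yields conf->window = 32 * 8 / 2 = 128 sectors.
 */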

static int raid1_map (mddev_t *mddev, kdev_t *rdev)
{
        raid1_conf_t *conf = mddev_to_conf(mddev);
        int i, disks = MD_SB_DISKS;

        /*
         * Later we do read balancing on the read side
         * now we use the first available disk.
         */

        for (i = 0; i < disks; i++) {
                if (conf->mirrors[i].operational) {
                        *rdev = conf->mirrors[i].dev;
                        return (0);
                }
        }

        printk (KERN_ERR "raid1_map(): huh, no more operational devices?\n");
        return (-1);
}

static void raid1_reschedule_retry (struct raid1_bh *r1_bh)
{
        unsigned long flags;
        mddev_t *mddev = r1_bh->mddev;
        raid1_conf_t *conf = mddev_to_conf(mddev);

        md_spin_lock_irqsave(&retry_list_lock, flags);
        if (raid1_retry_list == NULL)
                raid1_retry_tail = &raid1_retry_list;
        *raid1_retry_tail = r1_bh;
        raid1_retry_tail = &r1_bh->next_r1;
        r1_bh->next_r1 = NULL;
        md_spin_unlock_irqrestore(&retry_list_lock, flags);
        md_wakeup_thread(conf->thread);
}


static void inline io_request_done(unsigned long sector, raid1_conf_t *conf, int phase)
{
        unsigned long flags;
        spin_lock_irqsave(&conf->segment_lock, flags);
        if (sector < conf->start_active)
                conf->cnt_done--;
        else if (sector >= conf->start_future && conf->phase == phase)
                conf->cnt_future--;
        else if (!--conf->cnt_pending)
                wake_up(&conf->wait_ready);

        spin_unlock_irqrestore(&conf->segment_lock, flags);
}

static void inline sync_request_done (unsigned long sector, raid1_conf_t *conf)
{
        unsigned long flags;
        spin_lock_irqsave(&conf->segment_lock, flags);
        if (sector >= conf->start_ready)
                --conf->cnt_ready;
        else if (sector >= conf->start_active) {
                if (!--conf->cnt_active) {
                        conf->start_active = conf->start_ready;
                        wake_up(&conf->wait_done);
                }
        }
        spin_unlock_irqrestore(&conf->segment_lock, flags);
}

/*
 * raid1_end_bh_io() is called when we have finished servicing a mirrored
 * operation and are ready to return a success/failure code to the buffer
 * cache layer.
 */
static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate)
{
        struct buffer_head *bh = r1_bh->master_bh;

        io_request_done(bh->b_rsector, mddev_to_conf(r1_bh->mddev),
                        test_bit(R1BH_SyncPhase, &r1_bh->state));

        bh->b_end_io(bh, uptodate);
        raid1_free_r1bh(r1_bh);
}
void raid1_end_request (struct buffer_head *bh, int uptodate)
{
        struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);

        /*
         * this branch is our 'one mirror IO has finished' event handler:
         */
        if (!uptodate)
                md_error (r1_bh->mddev, bh->b_dev);
        else
                /*
                 * Set R1BH_Uptodate in our master buffer_head, so that
                 * we will return a good error code to the higher
                 * levels even if IO on some other mirrored buffer fails.
                 *
                 * The 'master' represents the complex operation to
                 * user-side. So if something waits for IO, then it will
                 * wait for the 'master' buffer_head.
                 */
                set_bit (R1BH_Uptodate, &r1_bh->state);

        /*
         * We split up the read and write side, imho they are
         * conceptually different.
         */

        if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) {
                /*
                 * we have only one buffer_head on the read side
                 */

                if (uptodate) {
                        raid1_end_bh_io(r1_bh, uptodate);
                        return;
                }
                /*
                 * oops, read error:
                 */
                printk(KERN_ERR "raid1: %s: rescheduling block %lu\n",
                         partition_name(bh->b_dev), bh->b_blocknr);
                raid1_reschedule_retry(r1_bh);
                return;
        }

        /*
         * WRITE:
         *
         * Let's see if all mirrored write operations have finished
         * already.
         */

        if (atomic_dec_and_test(&r1_bh->remaining))
                raid1_end_bh_io(r1_bh, test_bit(R1BH_Uptodate, &r1_bh->state));
}
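/*
 * Write completion in a nutshell: r1_bh->remaining is set to the number
 * of mirror writes actually issued, every per-mirror completion
 * decrements it, and only the final atomic_dec_and_test() reports the
 * result for the master buffer_head.  A failed mirror only calls
 * md_error(); the master is still reported as successful as long as at
 * least one mirror write set R1BH_Uptodate.
 *
 * A sketch for a 2-way mirror (illustrative, not from the source):
 *
 *      remaining = 2
 *      mirror 0 completes OK   -> remaining = 1, R1BH_Uptodate set
 *      mirror 1 fails          -> md_error(), remaining = 0
 *                              -> raid1_end_bh_io(master, uptodate = 1)
 */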

/*
 * This routine returns the disk from which the requested read should
 * be done. It keeps track of the last read position for every disk
 * in the array, and when a new read request comes in, the disk whose
 * last position is nearest to the request is chosen.
 *
 * TODO: if there are 2 mirrors on the same 2 devices, performance
 * degrades dramatically because the position is tracked per mirror,
 * not per device. This should be changed to be device based. Also,
 * atomic sequential reads should be somehow balanced.
 */

static int raid1_read_balance (raid1_conf_t *conf, struct buffer_head *bh)
{
        int new_disk = conf->last_used;
        const int sectors = bh->b_size >> 9;
        const unsigned long this_sector = bh->b_rsector;
        int disk = new_disk;
        unsigned long new_distance;
        unsigned long current_distance;

        /*
         * Check if it is sane at all to balance
         */

        if (conf->resync_mirrors)
                goto rb_out;


#if defined(CONFIG_ALPHA) && ((__GNUC__ < 3) || \
                              ((__GNUC__ == 3) && (__GNUC_MINOR__ < 3)))
        /* Work around a compiler bug in older gcc */
        new_disk = *(volatile int *)&new_disk;
#endif

        /* make sure that disk is operational */
        while( !conf->mirrors[new_disk].operational) {
                if (new_disk <= 0) new_disk = conf->raid_disks;
                new_disk--;
                if (new_disk == disk) {
                        /*
                         * This means no working disk was found.
                         * Nothing much to do, let's not change anything
                         * and hope for the best...
                         */

                        new_disk = conf->last_used;

                        goto rb_out;
                }
        }
        disk = new_disk;
        /* now disk == new_disk == starting point for search */

        /*
         * Don't touch anything for sequential reads.
         */

        if (this_sector == conf->mirrors[new_disk].head_position)
                goto rb_out;

        /*
         * If reads have been done only on a single disk
         * for a while, let's give another disk a chance.
         * This is for kicking those idling disks so that
         * they would find work near some hotspot.
         */

        if (conf->sect_count >= conf->mirrors[new_disk].sect_limit) {
                conf->sect_count = 0;

#if defined(CONFIG_SPARC64) && (__GNUC__ == 2) && (__GNUC_MINOR__ == 92)
                /* Work around a compiler bug in egcs-2.92.11 19980921 */
                new_disk = *(volatile int *)&new_disk;
#endif
                do {
                        if (new_disk<=0)
                                new_disk = conf->raid_disks;
                        new_disk--;
                        if (new_disk == disk)
                                break;
                } while ((conf->mirrors[new_disk].write_only) ||
                         (!conf->mirrors[new_disk].operational));

                goto rb_out;
        }

        current_distance = abs(this_sector -
                                conf->mirrors[disk].head_position);

        /* Find the disk which is closest */

#if defined(CONFIG_ALPHA) && ((__GNUC__ < 3) || \
                              ((__GNUC__ == 3) && (__GNUC_MINOR__ < 3)))
        /* Work around a compiler bug in older gcc */
        disk = *(volatile int *)&disk;
#endif
        do {
                if (disk <= 0)
                        disk = conf->raid_disks;
                disk--;

                if ((conf->mirrors[disk].write_only) ||
                                (!conf->mirrors[disk].operational))
                        continue;

                new_distance = abs(this_sector -
                                        conf->mirrors[disk].head_position);

                if (new_distance < current_distance) {
                        conf->sect_count = 0;
                        current_distance = new_distance;
                        new_disk = disk;
                }
        } while (disk != conf->last_used);

rb_out:
        conf->mirrors[new_disk].head_position = this_sector + sectors;

        conf->last_used = new_disk;
        conf->sect_count += sectors;

        return new_disk;
}
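/*
 * A worked example of the balancing above (all numbers hypothetical):
 * two mirrors, last_used == 0, head_position[0] == 1000,
 * head_position[1] == 5000, and a 4 KiB read arrives at sector 4872.
 *
 *   - not sequential (4872 != 1000), sect_count below sect_limit
 *   - current_distance = |4872 - 1000| = 3872
 *   - disk 1: |4872 - 5000| = 128 < 3872  -> new_disk = 1
 *   - result: the read goes to mirror 1, head_position[1] becomes 4880,
 *     last_used = 1, sect_count += 8
 */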

static int raid1_make_request (mddev_t *mddev, int rw,
                               struct buffer_head * bh)
{
        raid1_conf_t *conf = mddev_to_conf(mddev);
        struct buffer_head *bh_req, *bhl;
        struct raid1_bh * r1_bh;
        int disks = MD_SB_DISKS;
        int i, sum_bhs = 0;
        struct mirror_info *mirror;

        if (!buffer_locked(bh))
                BUG();

/*
 * make_request() can abort the operation when READA is being
 * used and no empty request is available.
 *
 * Currently, just replace the command with READ/WRITE.
 */
        if (rw == READA)
                rw = READ;

        r1_bh = raid1_alloc_r1bh (conf);

        spin_lock_irq(&conf->segment_lock);
        wait_event_lock_irq(conf->wait_done,
                        bh->b_rsector < conf->start_active ||
                        bh->b_rsector >= conf->start_future,
                        conf->segment_lock);
        if (bh->b_rsector < conf->start_active)
                conf->cnt_done++;
        else {
                conf->cnt_future++;
                if (conf->phase)
                        set_bit(R1BH_SyncPhase, &r1_bh->state);
        }
        spin_unlock_irq(&conf->segment_lock);

        /*
         * i think the read and write branch should be separated completely,
         * since we want to do read balancing on the read side for example.
         * Alternative implementations? :) --mingo
         */

        r1_bh->master_bh = bh;
        r1_bh->mddev = mddev;
        r1_bh->cmd = rw;

        if (rw == READ) {
                /*
                 * read balancing logic:
                 */
                mirror = conf->mirrors + raid1_read_balance(conf, bh);

                bh_req = &r1_bh->bh_req;
                memcpy(bh_req, bh, sizeof(*bh));
                bh_req->b_blocknr = bh->b_rsector;
                bh_req->b_dev = mirror->dev;
                bh_req->b_rdev = mirror->dev;
        /*      bh_req->b_rsector = bh->n_rsector; */
                bh_req->b_end_io = raid1_end_request;
                bh_req->b_private = r1_bh;
                generic_make_request (rw, bh_req);
                return 0;
        }

        /*
         * WRITE:
         */

        bhl = raid1_alloc_bh(conf, conf->raid_disks);
        for (i = 0; i < disks; i++) {
                struct buffer_head *mbh;
                if (!conf->mirrors[i].operational)
                        continue;

        /*
         * We should use a private pool (size depending on NR_REQUEST),
         * to avoid writes filling up the memory with bhs
         *
         * Such pools are much faster than kmalloc anyways (so we waste
         * almost nothing by not using the master bh when writing and
         * win a lot of cleanness) but for now we are cool enough. --mingo
         *
         * It's safe to sleep here, buffer heads cannot be used in a shared
         * manner in the write branch. Look how we lock the buffer at the
         * beginning of this function to grok the difference ;)
         */
                mbh = bhl;
                if (mbh == NULL) {
                        MD_BUG();
                        break;
                }
                bhl = mbh->b_next;
                mbh->b_next = NULL;
                mbh->b_this_page = (struct buffer_head *)1;

        /*
         * prepare mirrored mbh (fields ordered for max mem throughput):
         */
                mbh->b_blocknr    = bh->b_rsector;
                mbh->b_dev        = conf->mirrors[i].dev;
                mbh->b_rdev       = conf->mirrors[i].dev;
                mbh->b_rsector    = bh->b_rsector;
                mbh->b_state      = (1<<BH_Req) | (1<<BH_Dirty) |
                                                (1<<BH_Mapped) | (1<<BH_Lock);

                atomic_set(&mbh->b_count, 1);
                mbh->b_size       = bh->b_size;
                mbh->b_page       = bh->b_page;
                mbh->b_data       = bh->b_data;
                mbh->b_list       = BUF_LOCKED;
                mbh->b_end_io     = raid1_end_request;
                mbh->b_private    = r1_bh;

                mbh->b_next = r1_bh->mirror_bh_list;
                r1_bh->mirror_bh_list = mbh;
                sum_bhs++;
        }
        if (bhl) raid1_free_bh(conf,bhl);
        if (!sum_bhs) {
                /* Gag - all mirrors non-operational.. */
                raid1_end_bh_io(r1_bh, 0);
                return 0;
        }
        md_atomic_set(&r1_bh->remaining, sum_bhs);

        /*
         * We have to be a bit careful about the semaphore above, that's
         * why we start the requests separately. Since kmalloc() could
         * fail, sleep and make_request() can sleep too, this is the
         * safer solution. Imagine, end_request decreasing the semaphore
         * before we could have set it up ... We could play tricks with
         * the semaphore (presetting it and correcting at the end if
         * sum_bhs is not 'n', but then we would have to do end_request by
         * hand if all requests finish before we had a chance to set up the
         * semaphore correctly ... lots of races).
         */
        bh = r1_bh->mirror_bh_list;
        while(bh) {
                struct buffer_head *bh2 = bh;
                bh = bh->b_next;
                generic_make_request(rw, bh2);
        }
        return (0);
}
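/*
 * To make the write path above concrete, here is roughly what a write to
 * a healthy 2-way mirror builds before the final submission loop (an
 * illustration, not a literal dump of the structures):
 *
 *      master bh (already locked by the caller)
 *        r1_bh->master_bh = bh
 *        r1_bh->remaining = 2
 *        r1_bh->mirror_bh_list:
 *          mbh for mirror 1: b_dev = mirrors[1].dev, b_end_io = raid1_end_request
 *          mbh for mirror 0: b_dev = mirrors[0].dev, b_end_io = raid1_end_request
 *
 * Both clones share the master's b_page/b_data, so the payload is written
 * once from memory to each mirror; only the completion handling differs
 * from a plain single-disk write.
 */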

static void raid1_status(struct seq_file *seq, mddev_t *mddev)
{
        raid1_conf_t *conf = mddev_to_conf(mddev);
        int i;

        seq_printf(seq, " [%d/%d] [", conf->raid_disks,
                                                 conf->working_disks);
        for (i = 0; i < conf->raid_disks; i++)
                seq_printf(seq, "%s",
                        conf->mirrors[i].operational ? "U" : "_");
        seq_printf(seq, "]");
}
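/*
 * raid1_status() supplies the tail of the personality line in
 * /proc/mdstat: for a two-disk mirror with one failed member it emits,
 * for example, " [2/1] [U_]".
 */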

#define LAST_DISK KERN_ALERT \
"raid1: only one disk left and IO error.\n"

#define NO_SPARE_DISK KERN_ALERT \
"raid1: no spare disk left, degrading mirror level by one.\n"

#define DISK_FAILED KERN_ALERT \
"raid1: Disk failure on %s, disabling device. \n" \
"       Operation continuing on %d devices\n"

#define START_SYNCING KERN_ALERT \
"raid1: start syncing spare disk.\n"

#define ALREADY_SYNCING KERN_INFO \
"raid1: syncing already in progress.\n"

static void mark_disk_bad (mddev_t *mddev, int failed)
{
        raid1_conf_t *conf = mddev_to_conf(mddev);
        struct mirror_info *mirror = conf->mirrors+failed;
        mdp_super_t *sb = mddev->sb;

        mirror->operational = 0;
        mark_disk_faulty(sb->disks+mirror->number);
        mark_disk_nonsync(sb->disks+mirror->number);
        mark_disk_inactive(sb->disks+mirror->number);
        if (!mirror->write_only)
                sb->active_disks--;
        sb->working_disks--;
        sb->failed_disks++;
        mddev->sb_dirty = 1;
        md_wakeup_thread(conf->thread);
        if (!mirror->write_only)
                conf->working_disks--;
        printk (DISK_FAILED, partition_name (mirror->dev),
                                 conf->working_disks);
}

static int raid1_error (mddev_t *mddev, kdev_t dev)
{
        raid1_conf_t *conf = mddev_to_conf(mddev);
        struct mirror_info * mirrors = conf->mirrors;
        int disks = MD_SB_DISKS;
        int i;

        /* Find the drive.
         * If it is not operational, then we have already marked it as dead;
         * else if it is the last working disk, ignore the error and let the
         * next level up know;
         * else mark the drive as failed.
         */

        for (i = 0; i < disks; i++)
                if (mirrors[i].dev==dev && mirrors[i].operational)
                        break;
        if (i == disks)
                return 0;

        if (i < conf->raid_disks && conf->working_disks == 1) {
                /* Don't fail the drive, act as though we were just a
                 * normal single drive
                 */

                return 1;
        }
        mark_disk_bad(mddev, i);
        return 0;
}

#undef LAST_DISK
#undef NO_SPARE_DISK
#undef DISK_FAILED
#undef START_SYNCING


static void print_raid1_conf (raid1_conf_t *conf)
{
        int i;
        struct mirror_info *tmp;

        printk("RAID1 conf printout:\n");
        if (!conf) {
                printk("(conf==NULL)\n");
                return;
        }
        printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks,
                         conf->raid_disks, conf->nr_disks);

        for (i = 0; i < MD_SB_DISKS; i++) {
                tmp = conf->mirrors + i;
                printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
                        i, tmp->spare,tmp->operational,
                        tmp->number,tmp->raid_disk,tmp->used_slot,
                        partition_name(tmp->dev));
        }
}

static void close_sync(raid1_conf_t *conf)
{
        mddev_t *mddev = conf->mddev;
        /* If reconstruction was interrupted, we need to close the "active"
         * and "pending" holes.
         * We know that there are no active rebuild requests,
         * so cnt_active == cnt_ready == 0.
         */
        /* this is really needed when recovery stops too... */
        spin_lock_irq(&conf->segment_lock);
        conf->start_active = conf->start_pending;
        conf->start_ready = conf->start_pending;
        wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
        conf->start_active = conf->start_ready = conf->start_pending = conf->start_future;
        conf->start_future = (mddev->sb->size<<1)+1;
        conf->cnt_pending = conf->cnt_future;
        conf->cnt_future = 0;
        conf->phase = conf->phase ^1;
        wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
        conf->start_active = conf->start_ready = conf->start_pending = conf->start_future = 0;
        conf->phase = 0;
        conf->cnt_future = conf->cnt_done;
        conf->cnt_done = 0;
        spin_unlock_irq(&conf->segment_lock);
        wake_up(&conf->wait_done);
}
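/*
 * close_sync() retires the resync window in two draining steps: it first
 * waits out requests counted against the old PENDING range, then moves
 * start_active/ready/pending up to the old start_future and start_future
 * past the end of the device, so requests that were counted as FUTURE
 * become PENDING and can be waited for as well.  Once those drain, all
 * pointers are reset to 0; with the window collapsed, requests still
 * counted as DONE now fall into FUTURE, which is why cnt_done is folded
 * into cnt_future at the end.
 */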

static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
{
        int err = 0;
        int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
        raid1_conf_t *conf = mddev->private;
        struct mirror_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
        mdp_super_t *sb = mddev->sb;
        mdp_disk_t *failed_desc, *spare_desc, *added_desc;
        mdk_rdev_t *spare_rdev, *failed_rdev;

        print_raid1_conf(conf);

        switch (state) {
        case DISKOP_SPARE_ACTIVE:
        case DISKOP_SPARE_INACTIVE:
                /* need to wait for pending sync io before locking device */
                close_sync(conf);
        }

        md_spin_lock_irq(&conf->device_lock);
        /*
         * find the disk ...
         */
        switch (state) {

        case DISKOP_SPARE_ACTIVE:

                /*
                 * Find the failed disk within the RAID1 configuration ...
                 * (this can only be in the first conf->working_disks part)
                 */
                for (i = 0; i < conf->raid_disks; i++) {
                        tmp = conf->mirrors + i;
                        if ((!tmp->operational && !tmp->spare) ||
                                        !tmp->used_slot) {
                                failed_disk = i;
                                break;
                        }
                }
                /*
                 * When we activate a spare disk we _must_ have a disk in
                 * the lower (active) part of the array to replace.
                 */
                if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }
                /* fall through */

        case DISKOP_SPARE_WRITE:
        case DISKOP_SPARE_INACTIVE:

                /*
                 * Find the spare disk ... (can only be in the 'high'
                 * area of the array)
                 */
                for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
                        tmp = conf->mirrors + i;
                        if (tmp->spare && tmp->number == (*d)->number) {
                                spare_disk = i;
                                break;
                        }
                }
                if (spare_disk == -1) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }
                break;

        case DISKOP_HOT_REMOVE_DISK:

                for (i = 0; i < MD_SB_DISKS; i++) {
                        tmp = conf->mirrors + i;
                        if (tmp->used_slot && (tmp->number == (*d)->number)) {
                                if (tmp->operational) {
                                        err = -EBUSY;
                                        goto abort;
                                }
                                removed_disk = i;
                                break;
                        }
                }
                if (removed_disk == -1) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }
                break;

        case DISKOP_HOT_ADD_DISK:

                for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
                        tmp = conf->mirrors + i;
                        if (!tmp->used_slot) {
                                added_disk = i;
                                break;
                        }
                }
                if (added_disk == -1) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }
                break;
        }

        switch (state) {
        /*
         * Switch the spare disk to write-only mode:
         */
        case DISKOP_SPARE_WRITE:
                sdisk = conf->mirrors + spare_disk;
                sdisk->operational = 1;
                sdisk->write_only = 1;
                break;
        /*
         * Deactivate a spare disk:
         */
        case DISKOP_SPARE_INACTIVE:
                if (conf->start_future > 0) {
                        MD_BUG();
                        err = -EBUSY;
                        break;
                }
                sdisk = conf->mirrors + spare_disk;
                sdisk->operational = 0;
                sdisk->write_only = 0;
                break;
        /*
         * Activate (mark read-write) the (now sync) spare disk,
         * which means we switch its 'raid position' (->raid_disk)
         * with the failed disk. (only the first 'conf->nr_disks'
         * slots are used for 'real' disks and we must preserve this
         * property)
         */
        case DISKOP_SPARE_ACTIVE:
                if (conf->start_future > 0) {
                        MD_BUG();
                        err = -EBUSY;
                        break;
                }
                sdisk = conf->mirrors + spare_disk;
                fdisk = conf->mirrors + failed_disk;

                spare_desc = &sb->disks[sdisk->number];
                failed_desc = &sb->disks[fdisk->number];

                if (spare_desc != *d) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }

                if (spare_desc->raid_disk != sdisk->raid_disk) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }

                if (sdisk->raid_disk != spare_disk) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }

                if (failed_desc->raid_disk != fdisk->raid_disk) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }

                if (fdisk->raid_disk != failed_disk) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }

                /*
                 * do the switch finally
                 */
                spare_rdev = find_rdev_nr(mddev, spare_desc->number);
                failed_rdev = find_rdev_nr(mddev, failed_desc->number);

                /* There must be a spare_rdev, but there may not be a
                 * failed_rdev.  That slot might be empty...
                 */
                spare_rdev->desc_nr = failed_desc->number;
                if (failed_rdev)
                        failed_rdev->desc_nr = spare_desc->number;

                xchg_values(*spare_desc, *failed_desc);
                xchg_values(*fdisk, *sdisk);

                /*
                 * (careful, 'failed' and 'spare' are switched from now on)
                 *
                 * we want to preserve linear numbering and we want to
                 * give the proper raid_disk number to the now activated
                 * disk. (this means we switch back these values)
                 */

                xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
                xchg_values(sdisk->raid_disk, fdisk->raid_disk);
                xchg_values(spare_desc->number, failed_desc->number);
                xchg_values(sdisk->number, fdisk->number);

                *d = failed_desc;

                if (sdisk->dev == MKDEV(0,0))
                        sdisk->used_slot = 0;
                /*
                 * this really activates the spare.
                 */
                fdisk->spare = 0;
                fdisk->write_only = 0;

                /*
                 * if we activate a spare, we definitely replace a
                 * non-operational disk slot in the 'low' area of
                 * the disk array.
                 */

                conf->working_disks++;

                break;

        case DISKOP_HOT_REMOVE_DISK:
                rdisk = conf->mirrors + removed_disk;

                if (rdisk->spare && (removed_disk < conf->raid_disks)) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }
                rdisk->dev = MKDEV(0,0);
                rdisk->used_slot = 0;
                conf->nr_disks--;
                break;

        case DISKOP_HOT_ADD_DISK:
                adisk = conf->mirrors + added_disk;
                added_desc = *d;

                if (added_disk != added_desc->number) {
                        MD_BUG();
                        err = 1;
                        goto abort;
                }

                adisk->number = added_desc->number;
                adisk->raid_disk = added_desc->raid_disk;
                adisk->dev = MKDEV(added_desc->major,added_desc->minor);

                adisk->operational = 0;
                adisk->write_only = 0;
                adisk->spare = 1;
                adisk->used_slot = 1;
                adisk->head_position = 0;
                conf->nr_disks++;

                break;

        default:
                MD_BUG();
                err = 1;
                goto abort;
        }
abort:
        md_spin_unlock_irq(&conf->device_lock);
        if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE)
                /* should move to "END_REBUILD" when such exists */
                raid1_shrink_buffers(conf);

        print_raid1_conf(conf);
        return err;
}
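/*
 * The DISKOP_SPARE_ACTIVE bookkeeping above is easiest to read as a slot
 * swap.  Roughly, for a 2-way mirror whose member in slot 1 has failed
 * while the spare sits in slot 2 (indices purely illustrative): the whole
 * superblock descriptors and mirror_info entries of slots 1 and 2 are
 * exchanged, then raid_disk and number are swapped back, so the activated
 * disk ends up owning raid_disk 1 inside the active area while the dead
 * slot keeps its original identity in the 'high' area.
 */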

#define IO_ERROR KERN_ALERT \
"raid1: %s: unrecoverable I/O read error for block %lu\n"

#define REDIRECT_SECTOR KERN_ERR \
"raid1: %s: redirecting sector %lu to another mirror\n"

/*
 * This is a kernel thread which:
 *
 *      1.      Retries failed read operations on working mirrors.
 *      2.      Updates the raid superblock when problems are encountered.
 *      3.      Performs writes following reads for array synchronising.
 */
static void end_sync_write(struct buffer_head *bh, int uptodate);
static void end_sync_read(struct buffer_head *bh, int uptodate);

static void raid1d (void *data)
{
        struct raid1_bh *r1_bh;
        struct buffer_head *bh;
        unsigned long flags;
        raid1_conf_t *conf = data;
        mddev_t *mddev = conf->mddev;
        kdev_t dev;

        if (mddev->sb_dirty)
                md_update_sb(mddev);

        for (;;) {
                md_spin_lock_irqsave(&retry_list_lock, flags);
                r1_bh = raid1_retry_list;
                if (!r1_bh)
                        break;
                raid1_retry_list = r1_bh->next_r1;
                md_spin_unlock_irqrestore(&retry_list_lock, flags);

                mddev = r1_bh->mddev;
                bh = &r1_bh->bh_req;
                switch(r1_bh->cmd) {
                case SPECIAL:
                        /* have to allocate lots of bh structures and
                         * schedule writes
                         */
                        if (test_bit(R1BH_Uptodate, &r1_bh->state)) {
                                int i, sum_bhs = 0;
                                int disks = MD_SB_DISKS;
                                struct buffer_head *bhl, *mbh;

                                conf = mddev_to_conf(mddev);
                                bhl = raid1_alloc_bh(conf, conf->raid_disks); /* don't really need this many */
                                for (i = 0; i < disks ; i++) {
                                        if (!conf->mirrors[i].operational)
                                                continue;
                                        if (i==conf->last_used)
                                                /* we read from here, no need to write */
                                                continue;
                                        if (i < conf->raid_disks
                                            && !conf->resync_mirrors)
                                                /* don't need to write this,
                                                 * we are just rebuilding */
                                                continue;
                                        mbh = bhl;
                                        if (!mbh) {
                                                MD_BUG();
                                                break;
                                        }
                                        bhl = mbh->b_next;
                                        mbh->b_this_page = (struct buffer_head *)1;


                                /*
                                 * prepare mirrored bh (fields ordered for max mem throughput):
                                 */
                                        mbh->b_blocknr    = bh->b_blocknr;
                                        mbh->b_dev        = conf->mirrors[i].dev;
                                        mbh->b_rdev       = conf->mirrors[i].dev;
                                        mbh->b_rsector    = bh->b_blocknr;
                                        mbh->b_state      = (1<<BH_Req) | (1<<BH_Dirty) |
                                                (1<<BH_Mapped) | (1<<BH_Lock);
                                        atomic_set(&mbh->b_count, 1);
                                        mbh->b_size       = bh->b_size;
                                        mbh->b_page       = bh->b_page;
                                        mbh->b_data       = bh->b_data;
                                        mbh->b_list       = BUF_LOCKED;
                                        mbh->b_end_io     = end_sync_write;
                                        mbh->b_private    = r1_bh;

                                        mbh->b_next = r1_bh->mirror_bh_list;
                                        r1_bh->mirror_bh_list = mbh;

                                        sum_bhs++;
                                }
                                md_atomic_set(&r1_bh->remaining, sum_bhs);
                                if (bhl) raid1_free_bh(conf, bhl);
                                mbh = r1_bh->mirror_bh_list;

                                if (!sum_bhs) {
                                        /* nowhere to write this to... I guess we
                                         * must be done
                                         */
                                        sync_request_done(bh->b_blocknr, conf);
                                        md_done_sync(mddev, bh->b_size>>9, 0);
                                        raid1_free_buf(r1_bh);
                                } else
                                while (mbh) {
                                        struct buffer_head *bh1 = mbh;
                                        mbh = mbh->b_next;
                                        generic_make_request(WRITE, bh1);
                                        md_sync_acct(bh1->b_dev, bh1->b_size/512);
                                }
                        } else {
                                /* There is no point trying a read-for-reconstruct
                                 * as reconstruct is about to be aborted
                                 */

                                printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
                                md_done_sync(mddev, bh->b_size>>9, 0);
                        }

                        break;
                case READ:
                case READA:
                        dev = bh->b_dev;
                        raid1_map (mddev, &bh->b_dev);
                        if (bh->b_dev == dev) {
                                printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
                                raid1_end_bh_io(r1_bh, 0);
                        } else {
                                printk (REDIRECT_SECTOR,
                                        partition_name(bh->b_dev), bh->b_blocknr);
                                bh->b_rdev = bh->b_dev;
                                bh->b_rsector = bh->b_blocknr;
                                generic_make_request (r1_bh->cmd, bh);
                        }
                        break;
                }
        }
        md_spin_unlock_irqrestore(&retry_list_lock, flags);
}
#undef IO_ERROR
#undef REDIRECT_SECTOR

/*
 * Private kernel thread to reconstruct mirrors after an unclean
 * shutdown.
 */
static void raid1syncd (void *data)
{
        raid1_conf_t *conf = data;
        mddev_t *mddev = conf->mddev;

        if (!conf->resync_mirrors)
                return;
        if (conf->resync_mirrors == 2)
                return;
        down(&mddev->recovery_sem);
        if (!md_do_sync(mddev, NULL)) {
                /*
                 * Only if everything went Ok.
                 */
                conf->resync_mirrors = 0;
        }

        close_sync(conf);

        up(&mddev->recovery_sem);
        raid1_shrink_buffers(conf);
}

/*
 * perform a "sync" on one "block"
 *
 * We need to make sure that no normal I/O request - particularly write
 * requests - conflict with active sync requests.
 * This is achieved by conceptually dividing the device space into a
 * number of sections:
 *  DONE: 0 .. a-1     These blocks are in-sync
 *  ACTIVE: a.. b-1    These blocks may have active sync requests, but
 *                     no normal IO requests
 *  READY: b .. c-1    These blocks have no normal IO requests - sync
 *                     request may be happening
 *  PENDING: c .. d-1  These blocks may have IO requests, but no new
 *                     ones will be added
 *  FUTURE:  d .. end  These blocks are not to be considered yet. IO may
 *                     be happening, but not sync
 *
 * We keep a
 *   phase    which flips (0 or 1) each time d moves and
 * a count of:
 *   z =  active io requests in FUTURE since d moved - marked with
 *        current phase
 *   y =  active io requests in FUTURE before d moved, or PENDING -
 *        marked with previous phase
 *   x =  active sync requests in READY
 *   w =  active sync requests in ACTIVE
 *   v =  active io requests in DONE
 *
 * Normally, a=b=c=d=0 and z= active io requests
 *   or a=b=c=d=END and v= active io requests
 * Allowed changes to a,b,c,d:
 * A:  c==d &&  y==0 -> d+=window, y=z, z=0, phase=!phase
 * B:  y==0 -> c=d
 * C:   b=c, w+=x, x=0
 * D:  w==0 -> a=b
 * E: a==b==c==d==end -> a=b=c=d=0, z=v, v=0
 *
 * At start of sync we apply A.
 * When y reaches 0, we apply B then A, then begin sync requests.
 * When the sync point reaches c-1, we wait for y==0 and w==0, and
 * then apply B then A then D then C.
 * Finally, we apply E.
 *
 * The sync request simply issues a "read" against a working drive.
 * This is marked so that on completion the raid1d thread is woken to
 * issue suitable write requests.
 */
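/*
 * A concrete (illustrative) walk-through with conf->window == 128 on a
 * freshly started resync: the first raid1_sync_request() call finds
 * a=b=c=d=0 and shifts the window twice, leaving
 *
 *      DONE/ACTIVE  : empty            (a = b = 0)
 *      READY        : sectors 0..127   (c = 128)
 *      PENDING      : sectors 128..255 (d = 256)
 *      FUTURE       : sector 256..end
 *
 * Sync reads are then issued inside READY, while raid1_make_request()
 * holds back any new write below sector 256 until the window advances
 * again (each advance waits for the ACTIVE and PENDING counts to drain
 * and flips the phase).
 */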
1365
 
1366
static int raid1_sync_request (mddev_t *mddev, unsigned long sector_nr)
1367
{
1368
        raid1_conf_t *conf = mddev_to_conf(mddev);
1369
        struct mirror_info *mirror;
1370
        struct raid1_bh *r1_bh;
1371
        struct buffer_head *bh;
1372
        int bsize;
1373
        int disk;
1374
        int block_nr;
1375
        int buffs;
1376
 
1377
        if (!sector_nr) {
1378
                /* we want enough buffers to hold twice the window of 128*/
1379
                buffs = 128 *2 / (PAGE_SIZE>>9);
1380
                buffs = raid1_grow_buffers(conf, buffs);
1381
                if (buffs < 2)
1382
                        goto nomem;
1383
                conf->window = buffs*(PAGE_SIZE>>9)/2;
1384
        }
1385
        spin_lock_irq(&conf->segment_lock);
1386
        if (!sector_nr) {
1387
                /* initialize ...*/
1388
                conf->start_active = 0;
1389
                conf->start_ready = 0;
1390
                conf->start_pending = 0;
1391
                conf->start_future = 0;
1392
                conf->phase = 0;
1393
 
1394
                conf->cnt_future += conf->cnt_done+conf->cnt_pending;
1395
                conf->cnt_done = conf->cnt_pending = 0;
1396
                if (conf->cnt_ready || conf->cnt_active)
1397
                        MD_BUG();
1398
        }
1399
        while (sector_nr >= conf->start_pending) {
1400
                PRINTK("wait .. sect=%lu start_active=%d ready=%d pending=%d future=%d, cnt_done=%d active=%d ready=%d pending=%d future=%d\n",
1401
                        sector_nr, conf->start_active, conf->start_ready, conf->start_pending, conf->start_future,
1402
                        conf->cnt_done, conf->cnt_active, conf->cnt_ready, conf->cnt_pending, conf->cnt_future);
1403
                wait_event_lock_irq(conf->wait_done,
1404
                                        !conf->cnt_active,
1405
                                        conf->segment_lock);
1406
                wait_event_lock_irq(conf->wait_ready,
1407
                                        !conf->cnt_pending,
1408
                                        conf->segment_lock);
1409
                conf->start_active = conf->start_ready;
1410
                conf->start_ready = conf->start_pending;
1411
                conf->start_pending = conf->start_future;
1412
                conf->start_future = conf->start_future+conf->window;
1413
                // Note: falling off the end is not a problem
1414
                conf->phase = conf->phase ^1;
1415
                conf->cnt_active = conf->cnt_ready;
1416
                conf->cnt_ready = 0;
1417
                conf->cnt_pending = conf->cnt_future;
1418
                conf->cnt_future = 0;
1419
                wake_up(&conf->wait_done);
1420
        }
        conf->cnt_ready++;
        spin_unlock_irq(&conf->segment_lock);


        /* If reconstructing and there is more than one working disk, we
         * could dedicate one disk to the rebuild and the others to
         * servicing read requests ...
         */
        disk = conf->last_used;
        /* make sure disk is operational */
        while (!conf->mirrors[disk].operational) {
                if (disk <= 0) disk = conf->raid_disks;
                disk--;
                if (disk == conf->last_used)
                        break;
        }
        conf->last_used = disk;

        mirror = conf->mirrors+conf->last_used;

        r1_bh = raid1_alloc_buf (conf);
        r1_bh->master_bh = NULL;
        r1_bh->mddev = mddev;
        r1_bh->cmd = SPECIAL;
        bh = &r1_bh->bh_req;

        block_nr = sector_nr;
        bsize = 512;
        while (!(block_nr & 1) && bsize < PAGE_SIZE
                        && (block_nr+2)*(bsize>>9) <= (mddev->sb->size *2)) {
                block_nr >>= 1;
                bsize <<= 1;
        }
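        /*
         * The loop above selects the largest power-of-two block size (at
         * most PAGE_SIZE) to which sector_nr is still aligned, while the
         * resulting block remains inside the device; mddev->sb->size is in
         * 1K units, hence the "*2" to convert it to sectors.
         */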
        bh->b_size = bsize;
        bh->b_list = BUF_LOCKED;
        bh->b_dev = mirror->dev;
        bh->b_rdev = mirror->dev;
        bh->b_state = (1<<BH_Req) | (1<<BH_Mapped) | (1<<BH_Lock);
        if (!bh->b_page)
                BUG();
        if (!bh->b_data)
                BUG();
        if (bh->b_data != page_address(bh->b_page))
                BUG();
        bh->b_end_io = end_sync_read;
        bh->b_private = r1_bh;
        bh->b_blocknr = sector_nr;
        bh->b_rsector = sector_nr;
        init_waitqueue_head(&bh->b_wait);

        generic_make_request(READ, bh);
        md_sync_acct(bh->b_dev, bh->b_size/512);

        return (bsize >> 9);

nomem:
        raid1_shrink_buffers(conf);
        return -ENOMEM;
}
 
static void end_sync_read(struct buffer_head *bh, int uptodate)
{
        struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);

        /* we have read a block, now it needs to be re-written,
         * or re-read if the read failed.
         * We don't do much here, just schedule handling by raid1d
         */
        if (!uptodate)
                md_error (r1_bh->mddev, bh->b_dev);
        else
                set_bit(R1BH_Uptodate, &r1_bh->state);
        raid1_reschedule_retry(r1_bh);
}

static void end_sync_write(struct buffer_head *bh, int uptodate)
{
        struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);

        if (!uptodate)
                md_error (r1_bh->mddev, bh->b_dev);
        if (atomic_dec_and_test(&r1_bh->remaining)) {
                mddev_t *mddev = r1_bh->mddev;
                unsigned long sect = bh->b_blocknr;
                int size = bh->b_size;
                raid1_free_buf(r1_bh);
                sync_request_done(sect, mddev_to_conf(mddev));
                md_done_sync(mddev, size>>9, uptodate);
        }
}
 
#define INVALID_LEVEL KERN_WARNING \
"raid1: md%d: raid level not set to mirroring (%d)\n"

#define NO_SB KERN_ERR \
"raid1: disabled mirror %s (couldn't access raid superblock)\n"

#define ERRORS KERN_ERR \
"raid1: disabled mirror %s (errors detected)\n"

#define NOT_IN_SYNC KERN_ERR \
"raid1: disabled mirror %s (not in sync)\n"

#define INCONSISTENT KERN_ERR \
"raid1: disabled mirror %s (inconsistent descriptor)\n"

#define ALREADY_RUNNING KERN_ERR \
"raid1: disabled mirror %s (mirror %d already operational)\n"

#define OPERATIONAL KERN_INFO \
"raid1: device %s operational as mirror %d\n"

#define MEM_ERROR KERN_ERR \
"raid1: couldn't allocate memory for md%d\n"

#define SPARE KERN_INFO \
"raid1: spare disk %s\n"

#define NONE_OPERATIONAL KERN_ERR \
"raid1: no operational mirrors for md%d\n"

#define ARRAY_IS_ACTIVE KERN_INFO \
"raid1: raid set md%d active with %d out of %d mirrors\n"

#define THREAD_ERROR KERN_ERR \
"raid1: couldn't allocate thread for md%d\n"

#define START_RESYNC KERN_WARNING \
"raid1: raid set md%d not clean; reconstructing mirrors\n"
static int raid1_run (mddev_t *mddev)
{
        raid1_conf_t *conf;
        int i, j, disk_idx;
        struct mirror_info *disk;
        mdp_super_t *sb = mddev->sb;
        mdp_disk_t *descriptor;
        mdk_rdev_t *rdev;
        struct md_list_head *tmp;
        int start_recovery = 0;

        MOD_INC_USE_COUNT;

        if (sb->level != 1) {
                printk(INVALID_LEVEL, mdidx(mddev), sb->level);
                goto out;
        }
        /*
         * copy the already verified devices into our private RAID1
         * bookkeeping area. [whatever we allocate in raid1_run(),
         * should be freed in raid1_stop()]
         */

        conf = kmalloc(sizeof(raid1_conf_t), GFP_KERNEL);
        mddev->private = conf;
        if (!conf) {
                printk(MEM_ERROR, mdidx(mddev));
                goto out;
        }
        memset(conf, 0, sizeof(*conf));

        ITERATE_RDEV(mddev,rdev,tmp) {
                if (rdev->faulty) {
                        printk(ERRORS, partition_name(rdev->dev));
                } else {
                        if (!rdev->sb) {
                                MD_BUG();
                                continue;
                        }
                }
                if (rdev->desc_nr == -1) {
                        MD_BUG();
                        continue;
                }
                descriptor = &sb->disks[rdev->desc_nr];
                disk_idx = descriptor->raid_disk;
                disk = conf->mirrors + disk_idx;
 
                if (disk_faulty(descriptor)) {
                        disk->number = descriptor->number;
                        disk->raid_disk = disk_idx;
                        disk->dev = rdev->dev;
                        disk->sect_limit = MAX_WORK_PER_DISK;
                        disk->operational = 0;
                        disk->write_only = 0;
                        disk->spare = 0;
                        disk->used_slot = 1;
                        disk->head_position = 0;
                        continue;
                }
                if (disk_active(descriptor)) {
                        if (!disk_sync(descriptor)) {
                                printk(NOT_IN_SYNC,
                                        partition_name(rdev->dev));
                                continue;
                        }
                        if ((descriptor->number > MD_SB_DISKS) ||
                                         (disk_idx > sb->raid_disks)) {

                                printk(INCONSISTENT,
                                        partition_name(rdev->dev));
                                continue;
                        }
                        if (disk->operational) {
                                printk(ALREADY_RUNNING,
                                        partition_name(rdev->dev),
                                        disk_idx);
                                continue;
                        }
                        printk(OPERATIONAL, partition_name(rdev->dev),
                                        disk_idx);
                        disk->number = descriptor->number;
                        disk->raid_disk = disk_idx;
                        disk->dev = rdev->dev;
                        disk->sect_limit = MAX_WORK_PER_DISK;
                        disk->operational = 1;
                        disk->write_only = 0;
                        disk->spare = 0;
                        disk->used_slot = 1;
                        disk->head_position = 0;
                        conf->working_disks++;
                } else {
                /*
                 * Must be a spare disk ..
                 */
                        printk(SPARE, partition_name(rdev->dev));
                        disk->number = descriptor->number;
                        disk->raid_disk = disk_idx;
                        disk->dev = rdev->dev;
                        disk->sect_limit = MAX_WORK_PER_DISK;
                        disk->operational = 0;
                        disk->write_only = 0;
                        disk->spare = 1;
                        disk->used_slot = 1;
                        disk->head_position = 0;
                }
        }
        conf->raid_disks = sb->raid_disks;
        conf->nr_disks = sb->nr_disks;
        conf->mddev = mddev;
        conf->device_lock = MD_SPIN_LOCK_UNLOCKED;

        conf->segment_lock = MD_SPIN_LOCK_UNLOCKED;
        init_waitqueue_head(&conf->wait_buffer);
        init_waitqueue_head(&conf->wait_done);
        init_waitqueue_head(&conf->wait_ready);

        if (!conf->working_disks) {
                printk(NONE_OPERATIONAL, mdidx(mddev));
                goto out_free_conf;
        }


        /* pre-allocate some buffer_head structures.
         * As a minimum, 1 r1bh and raid_disks buffer_heads
         * would probably get us by in tight memory situations,
         * but a few more is probably a good idea.
         * For now, try NR_RESERVED_BUFS r1bh and
         * NR_RESERVED_BUFS*raid_disks buffer_heads.
         * This will allow at least NR_RESERVED_BUFS concurrent
         * reads or writes even if kmalloc starts failing
         */
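        /*
         * For example, with NR_RESERVED_BUFS == 32 on a two-disk mirror
         * this reserves 32 r1bh structures and 32*2 == 64 buffer_heads,
         * i.e. enough for 32 concurrent requests even if kmalloc fails.
         */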
        if (raid1_grow_r1bh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS ||
            raid1_grow_bh(conf, NR_RESERVED_BUFS*conf->raid_disks)
                              < NR_RESERVED_BUFS*conf->raid_disks) {
                printk(MEM_ERROR, mdidx(mddev));
                goto out_free_conf;
        }

        for (i = 0; i < MD_SB_DISKS; i++) {

                descriptor = sb->disks+i;
                disk_idx = descriptor->raid_disk;
                disk = conf->mirrors + disk_idx;

                if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) &&
                                !disk->used_slot) {

                        disk->number = descriptor->number;
                        disk->raid_disk = disk_idx;
                        disk->dev = MKDEV(0,0);

                        disk->operational = 0;
                        disk->write_only = 0;
                        disk->spare = 0;
                        disk->used_slot = 1;
                        disk->head_position = 0;
                }
        }

        /*
         * find the first working disk and use it as a starting point
         * for read balancing.
         */
        for (j = 0; j < MD_SB_DISKS && !conf->mirrors[j].operational; j++)
                /* nothing */;
        conf->last_used = j;


        if (conf->working_disks != sb->raid_disks) {
                printk(KERN_ALERT "raid1: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
                start_recovery = 1;
        }
 
        {
                const char * name = "raid1d";

                conf->thread = md_register_thread(raid1d, conf, name);
                if (!conf->thread) {
                        printk(THREAD_ERROR, mdidx(mddev));
                        goto out_free_conf;
                }
        }

        if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN)) &&
            (conf->working_disks > 1)) {
                const char * name = "raid1syncd";

                conf->resync_thread = md_register_thread(raid1syncd, conf, name);
                if (!conf->resync_thread) {
                        printk(THREAD_ERROR, mdidx(mddev));
                        goto out_free_conf;
                }

                printk(START_RESYNC, mdidx(mddev));
                conf->resync_mirrors = 1;
                md_wakeup_thread(conf->resync_thread);
        }

        /*
         * Regenerate the "device is in sync with the raid set" bit for
         * each device.
         */
        for (i = 0; i < MD_SB_DISKS; i++) {
                mark_disk_nonsync(sb->disks+i);
                for (j = 0; j < sb->raid_disks; j++) {
                        if (!conf->mirrors[j].operational)
                                continue;
                        if (sb->disks[i].number == conf->mirrors[j].number)
                                mark_disk_sync(sb->disks+i);
                }
        }
        sb->active_disks = conf->working_disks;

        if (start_recovery)
                md_recover_arrays();


        printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks);
        /*
         * Ok, everything is just fine now
         */
        return 0;

out_free_conf:
        raid1_shrink_r1bh(conf);
        raid1_shrink_bh(conf);
        raid1_shrink_buffers(conf);
        kfree(conf);
        mddev->private = NULL;
out:
        MOD_DEC_USE_COUNT;
        return -EIO;
}
 
#undef INVALID_LEVEL
#undef NO_SB
#undef ERRORS
#undef NOT_IN_SYNC
#undef INCONSISTENT
#undef ALREADY_RUNNING
#undef OPERATIONAL
#undef SPARE
#undef NONE_OPERATIONAL
#undef ARRAY_IS_ACTIVE

static int raid1_stop_resync (mddev_t *mddev)
{
        raid1_conf_t *conf = mddev_to_conf(mddev);

        if (conf->resync_thread) {
                if (conf->resync_mirrors) {
                        conf->resync_mirrors = 2;
                        md_interrupt_thread(conf->resync_thread);

                        printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n");
                        return 1;
                }
                return 0;
        }
        return 0;
}

static int raid1_restart_resync (mddev_t *mddev)
{
        raid1_conf_t *conf = mddev_to_conf(mddev);

        if (conf->resync_mirrors) {
                if (!conf->resync_thread) {
                        MD_BUG();
                        return 0;
                }
                conf->resync_mirrors = 1;
                md_wakeup_thread(conf->resync_thread);
                return 1;
        }
        return 0;
}

static int raid1_stop (mddev_t *mddev)
{
        raid1_conf_t *conf = mddev_to_conf(mddev);

        md_unregister_thread(conf->thread);
        if (conf->resync_thread)
                md_unregister_thread(conf->resync_thread);
        raid1_shrink_r1bh(conf);
        raid1_shrink_bh(conf);
        raid1_shrink_buffers(conf);
        kfree(conf);
        mddev->private = NULL;
        MOD_DEC_USE_COUNT;
        return 0;
}

static mdk_personality_t raid1_personality =
{
        name:           "raid1",
        make_request:   raid1_make_request,
        run:            raid1_run,
        stop:           raid1_stop,
        status:         raid1_status,
        error_handler:  raid1_error,
        diskop:         raid1_diskop,
        stop_resync:    raid1_stop_resync,
        restart_resync: raid1_restart_resync,
        sync_request:   raid1_sync_request
};

static int md__init raid1_init (void)
{
        return register_md_personality (RAID1, &raid1_personality);
}

static void raid1_exit (void)
{
        unregister_md_personality (RAID1);
}

module_init(raid1_init);
module_exit(raid1_exit);
MODULE_LICENSE("GPL");
