OpenCores
URL https://opencores.org/ocsvn/test_project/test_project/trunk

Subversion Repositories test_project

test_project/trunk/linux_sd_driver/block/ll_rw_blk.c - Blame information for rev 81


Line No. Rev Author Line
1 62 marcus.erl
/*
2
 * Copyright (C) 1991, 1992 Linus Torvalds
3
 * Copyright (C) 1994,      Karl Keyte: Added support for disk statistics
4
 * Elevator latency, (C) 2000  Andrea Arcangeli <andrea@suse.de> SuSE
5
 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
6
 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> - July 2000
7
 * bio rewrite, highmem i/o, etc., Jens Axboe <axboe@suse.de> - May 2001
8
 */
9
 
10
/*
11
 * This handles all read/write requests to block devices
12
 */
13
#include <linux/kernel.h>
14
#include <linux/module.h>
15
#include <linux/backing-dev.h>
16
#include <linux/bio.h>
17
#include <linux/blkdev.h>
18
#include <linux/highmem.h>
19
#include <linux/mm.h>
20
#include <linux/kernel_stat.h>
21
#include <linux/string.h>
22
#include <linux/init.h>
23
#include <linux/bootmem.h>      /* for max_pfn/max_low_pfn */
24
#include <linux/completion.h>
25
#include <linux/slab.h>
26
#include <linux/swap.h>
27
#include <linux/writeback.h>
28
#include <linux/task_io_accounting_ops.h>
29
#include <linux/interrupt.h>
30
#include <linux/cpu.h>
31
#include <linux/blktrace_api.h>
32
#include <linux/fault-inject.h>
33
#include <linux/scatterlist.h>
34
 
35
/*
36
 * for max sense size
37
 */
38
#include <scsi/scsi_cmnd.h>
39
 
40
static void blk_unplug_work(struct work_struct *work);
41
static void blk_unplug_timeout(unsigned long data);
42
static void drive_stat_acct(struct request *rq, int new_io);
43
static void init_request_from_bio(struct request *req, struct bio *bio);
44
static int __make_request(struct request_queue *q, struct bio *bio);
45
static struct io_context *current_io_context(gfp_t gfp_flags, int node);
46
static void blk_recalc_rq_segments(struct request *rq);
47
static void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
48
                            struct bio *bio);
49
 
50
/*
51
 * For the allocated request tables
52
 */
53
static struct kmem_cache *request_cachep;
54
 
55
/*
56
 * For queue allocation
57
 */
58
static struct kmem_cache *requestq_cachep;
59
 
60
/*
61
 * For io context allocations
62
 */
63
static struct kmem_cache *iocontext_cachep;
64
 
65
/*
66
 * Controlling structure to kblockd
67
 */
68
static struct workqueue_struct *kblockd_workqueue;
69
 
70
unsigned long blk_max_low_pfn, blk_max_pfn;
71
 
72
EXPORT_SYMBOL(blk_max_low_pfn);
73
EXPORT_SYMBOL(blk_max_pfn);
74
 
75
static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
76
 
77
/* Amount of time in which a process may batch requests */
78
#define BLK_BATCH_TIME  (HZ/50UL)
79
 
80
/* Number of requests a "batching" process may submit */
81
#define BLK_BATCH_REQ   32
82
 
83
/*
84
 * Return the threshold (number of used requests) at which the queue is
85
 * considered to be congested.  It includes a little hysteresis to keep the
86
 * context switch rate down.
87
 */
88
static inline int queue_congestion_on_threshold(struct request_queue *q)
89
{
90
        return q->nr_congestion_on;
91
}
92
 
93
/*
94
 * The threshold at which a queue is considered to be uncongested
95
 */
96
static inline int queue_congestion_off_threshold(struct request_queue *q)
97
{
98
        return q->nr_congestion_off;
99
}
100
 
101
static void blk_queue_congestion_threshold(struct request_queue *q)
102
{
103
        int nr;
104 81 tac2
 
105 62 marcus.erl
        nr = q->nr_requests - (q->nr_requests / 8) + 1;
106
        if (nr > q->nr_requests)
107
                nr = q->nr_requests;
108
        q->nr_congestion_on = nr;
109
 
110
        nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1;
111
        if (nr < 1)
112
                nr = 1;
113
        q->nr_congestion_off = nr;
114
}
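/*
 * Worked example (not in the original source): with the default of
 * BLKDEV_MAX_RQ = 128 requests, the thresholds above come out as
 *
 *   nr_congestion_on  = 128 - 128/8 + 1          = 113
 *   nr_congestion_off = 128 - 128/8 - 128/16 - 1 = 103
 *
 * so the queue is marked congested once 113 requests are in use and
 * only cleared again when usage drops below 103, which is the
 * hysteresis mentioned in the comment above.
 */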
115
 
116
/**
117
 * blk_get_backing_dev_info - get the address of a queue's backing_dev_info
118
 * @bdev:       device
119
 *
120
 * Locates the passed device's request queue and returns the address of its
121
 * backing_dev_info
122
 *
123
 * Will return NULL if the request queue cannot be located.
124
 */
125
struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev)
126
{
127
        struct backing_dev_info *ret = NULL;
128
        struct request_queue *q = bdev_get_queue(bdev);
129
 
130
        if (q)
131
                ret = &q->backing_dev_info;
132
        return ret;
133
}
134
EXPORT_SYMBOL(blk_get_backing_dev_info);
135
 
136
/**
137
 * blk_queue_prep_rq - set a prepare_request function for queue
138
 * @q:          queue
139
 * @pfn:        prepare_request function
140
 *
141
 * It's possible for a queue to register a prepare_request callback which
142
 * is invoked before the request is handed to the request_fn. The goal of
143
 * the function is to prepare a request for I/O; it can be used to build a
144
 * cdb from the request data for instance.
145
 *
146
 */
147
void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn)
148
{
149
        q->prep_rq_fn = pfn;
150
}
151
 
152
EXPORT_SYMBOL(blk_queue_prep_rq);
153
 
154
/**
155
 * blk_queue_merge_bvec - set a merge_bvec function for queue
156
 * @q:          queue
157
 * @mbfn:       merge_bvec_fn
158
 *
159
 * Usually queues have static limitations on the max sectors or segments that
160
 * we can put in a request. Stacking drivers may have some settings that
161
 * are dynamic, and thus we have to query the queue whether it is ok to
162
 * add a new bio_vec to a bio at a given offset or not. If the block device
163
 * has such limitations, it needs to register a merge_bvec_fn to control
164
 * the size of bio's sent to it. Note that a block device *must* allow a
165
 * single page to be added to an empty bio. The block device driver may want
166
 * to use the bio_split() function to deal with these bio's. By default
167
 * no merge_bvec_fn is defined for a queue, and only the fixed limits are
168
 * honored.
169
 */
170
void blk_queue_merge_bvec(struct request_queue *q, merge_bvec_fn *mbfn)
171
{
172
        q->merge_bvec_fn = mbfn;
173
}
174
 
175
EXPORT_SYMBOL(blk_queue_merge_bvec);
176
 
177
void blk_queue_softirq_done(struct request_queue *q, softirq_done_fn *fn)
178
{
179
        q->softirq_done_fn = fn;
180
}
181
 
182
EXPORT_SYMBOL(blk_queue_softirq_done);
183
 
184
/**
185
 * blk_queue_make_request - define an alternate make_request function for a device
186
 * @q:  the request queue for the device to be affected
187
 * @mfn: the alternate make_request function
188
 *
189
 * Description:
190
 *    The normal way for &struct bios to be passed to a device
191
 *    driver is for them to be collected into requests on a request
192
 *    queue, and then to allow the device driver to select requests
193
 *    off that queue when it is ready.  This works well for many block
194
 *    devices. However some block devices (typically virtual devices
195
 *    such as md or lvm) do not benefit from the processing on the
196
 *    request queue, and are served best by having the requests passed
197
 *    directly to them.  This can be achieved by providing a function
198
 *    to blk_queue_make_request().
199
 *
200
 * Caveat:
201
 *    The driver that does this *must* be able to deal appropriately
202
 *    with buffers in "highmemory". This can be accomplished by either calling
203
 *    __bio_kmap_atomic() to get a temporary kernel mapping, or by calling
204
 *    blk_queue_bounce() to create a buffer in normal memory.
205
 **/
206
void blk_queue_make_request(struct request_queue * q, make_request_fn * mfn)
207
{
208
        /*
209
         * set defaults
210
         */
211
        q->nr_requests = BLKDEV_MAX_RQ;
212
        blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);
213
        blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);
214
        q->make_request_fn = mfn;
215
        q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
216
        q->backing_dev_info.state = 0;
217
        q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
218
        blk_queue_max_sectors(q, SAFE_MAX_SECTORS);
219
        blk_queue_hardsect_size(q, 512);
220
        blk_queue_dma_alignment(q, 511);
221
        blk_queue_congestion_threshold(q);
222
        q->nr_batching = BLK_BATCH_REQ;
223
 
224
        q->unplug_thresh = 4;           /* hmm */
225
        q->unplug_delay = (3 * HZ) / 1000;      /* 3 milliseconds */
226
        if (q->unplug_delay == 0)
227
                q->unplug_delay = 1;
228
 
229
        INIT_WORK(&q->unplug_work, blk_unplug_work);
230
 
231
        q->unplug_timer.function = blk_unplug_timeout;
232
        q->unplug_timer.data = (unsigned long)q;
233
 
234
        /*
235
         * by default assume old behaviour and bounce for any highmem page
236
         */
237
        blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
238
}
239
 
240
EXPORT_SYMBOL(blk_queue_make_request);
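/*
 * Illustrative sketch, not part of this file: how a virtual driver in
 * the md/lvm style might bypass the request queue by installing its
 * own make_request function.  "struct example_dev", the function
 * names and the immediate completion shown here are hypothetical.
 */
#ifdef EXAMPLE_MAKE_REQUEST_SKETCH
static int example_make_request(struct request_queue *q, struct bio *bio)
{
        /* remap or clone the bio and pass it to the underlying devices;
         * this sketch simply completes it successfully */
        bio_endio(bio, 0);
        return 0;
}

static int example_init_queue(struct example_dev *dev)
{
        dev->queue = blk_alloc_queue(GFP_KERNEL);
        if (!dev->queue)
                return -ENOMEM;

        blk_queue_make_request(dev->queue, example_make_request);
        /* per the caveat above: either handle highmem pages or bounce them */
        blk_queue_bounce_limit(dev->queue, BLK_BOUNCE_ANY);
        return 0;
}
#endif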
241
 
242
static void rq_init(struct request_queue *q, struct request *rq)
243
{
244
        INIT_LIST_HEAD(&rq->queuelist);
245
        INIT_LIST_HEAD(&rq->donelist);
246
 
247
        rq->errors = 0;
248
        rq->bio = rq->biotail = NULL;
249
        INIT_HLIST_NODE(&rq->hash);
250
        RB_CLEAR_NODE(&rq->rb_node);
251
        rq->ioprio = 0;
252
        rq->buffer = NULL;
253
        rq->ref_count = 1;
254
        rq->q = q;
255
        rq->special = NULL;
256
        rq->data_len = 0;
257
        rq->data = NULL;
258
        rq->nr_phys_segments = 0;
259
        rq->sense = NULL;
260
        rq->end_io = NULL;
261
        rq->end_io_data = NULL;
262
        rq->completion_data = NULL;
263
        rq->next_rq = NULL;
264
}
265
 
266
/**
267
 * blk_queue_ordered - does this queue support ordered writes
268
 * @q:        the request queue
269
 * @ordered:  one of QUEUE_ORDERED_*
270
 * @prepare_flush_fn: rq setup helper for cache flush ordered writes
271
 *
272
 * Description:
273
 *   For journalled file systems, doing ordered writes on a commit
274
 *   block instead of explicitly doing wait_on_buffer (which is bad
275
 *   for performance) can be a big win. Block drivers supporting this
276
 *   feature should call this function and indicate so.
277
 *
278
 **/
279
int blk_queue_ordered(struct request_queue *q, unsigned ordered,
280
                      prepare_flush_fn *prepare_flush_fn)
281
{
282
        if (ordered & (QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH) &&
283
            prepare_flush_fn == NULL) {
284
                printk(KERN_ERR "blk_queue_ordered: prepare_flush_fn required\n");
285
                return -EINVAL;
286
        }
287
 
288
        if (ordered != QUEUE_ORDERED_NONE &&
289
            ordered != QUEUE_ORDERED_DRAIN &&
290
            ordered != QUEUE_ORDERED_DRAIN_FLUSH &&
291
            ordered != QUEUE_ORDERED_DRAIN_FUA &&
292
            ordered != QUEUE_ORDERED_TAG &&
293
            ordered != QUEUE_ORDERED_TAG_FLUSH &&
294
            ordered != QUEUE_ORDERED_TAG_FUA) {
295
                printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered);
296
                return -EINVAL;
297
        }
298
 
299
        q->ordered = ordered;
300
        q->next_ordered = ordered;
301
        q->prepare_flush_fn = prepare_flush_fn;
302
 
303
        return 0;
304
}
305
 
306
EXPORT_SYMBOL(blk_queue_ordered);
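/*
 * Illustrative sketch, not part of this file: a driver with a volatile
 * write cache advertising drain + flush ordering.  The prepare_flush_fn
 * fills in the proxy request with the hardware's "flush cache" command;
 * the opcode and the function names below are hypothetical.
 */
#ifdef EXAMPLE_ORDERED_SKETCH
static void example_prepare_flush(struct request_queue *q, struct request *rq)
{
        rq->cmd_type = REQ_TYPE_BLOCK_PC;
        rq->timeout = 60 * HZ;
        rq->cmd_len = 10;
        rq->cmd[0] = EXAMPLE_FLUSH_CACHE_OPCODE;        /* hypothetical */
}

static void example_setup_ordered(struct request_queue *q)
{
        blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH, example_prepare_flush);
}
#endif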
307
 
308
/*
309
 * Cache flushing for ordered writes handling
310
 */
311
inline unsigned blk_ordered_cur_seq(struct request_queue *q)
312
{
313
        if (!q->ordseq)
314
                return 0;
315
        return 1 << ffz(q->ordseq);
316
}
317
 
318
unsigned blk_ordered_req_seq(struct request *rq)
319
{
320
        struct request_queue *q = rq->q;
321
 
322
        BUG_ON(q->ordseq == 0);
323
 
324
        if (rq == &q->pre_flush_rq)
325
                return QUEUE_ORDSEQ_PREFLUSH;
326
        if (rq == &q->bar_rq)
327
                return QUEUE_ORDSEQ_BAR;
328
        if (rq == &q->post_flush_rq)
329
                return QUEUE_ORDSEQ_POSTFLUSH;
330
 
331
        /*
332
         * !fs requests don't need to follow barrier ordering.  Always
333
         * put them at the front.  This fixes the following deadlock.
334
         *
335
         * http://thread.gmane.org/gmane.linux.kernel/537473
336
         */
337
        if (!blk_fs_request(rq))
338
                return QUEUE_ORDSEQ_DRAIN;
339
 
340
        if ((rq->cmd_flags & REQ_ORDERED_COLOR) ==
341
            (q->orig_bar_rq->cmd_flags & REQ_ORDERED_COLOR))
342
                return QUEUE_ORDSEQ_DRAIN;
343
        else
344
                return QUEUE_ORDSEQ_DONE;
345
}
346
 
347
void blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error)
348
{
349
        struct request *rq;
350
        int uptodate;
351
 
352
        if (error && !q->orderr)
353
                q->orderr = error;
354
 
355
        BUG_ON(q->ordseq & seq);
356
        q->ordseq |= seq;
357
 
358
        if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE)
359
                return;
360
 
361
        /*
362
         * Okay, sequence complete.
363
         */
364
        uptodate = 1;
365
        if (q->orderr)
366
                uptodate = q->orderr;
367
 
368
        q->ordseq = 0;
369
        rq = q->orig_bar_rq;
370
 
371
        end_that_request_first(rq, uptodate, rq->hard_nr_sectors);
372
        end_that_request_last(rq, uptodate);
373
}
374
 
375
static void pre_flush_end_io(struct request *rq, int error)
376
{
377
        elv_completed_request(rq->q, rq);
378
        blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error);
379
}
380
 
381
static void bar_end_io(struct request *rq, int error)
382
{
383
        elv_completed_request(rq->q, rq);
384
        blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error);
385
}
386
 
387
static void post_flush_end_io(struct request *rq, int error)
388
{
389
        elv_completed_request(rq->q, rq);
390
        blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error);
391
}
392
 
393
static void queue_flush(struct request_queue *q, unsigned which)
394
{
395
        struct request *rq;
396
        rq_end_io_fn *end_io;
397
 
398
        if (which == QUEUE_ORDERED_PREFLUSH) {
399
                rq = &q->pre_flush_rq;
400
                end_io = pre_flush_end_io;
401
        } else {
402
                rq = &q->post_flush_rq;
403
                end_io = post_flush_end_io;
404
        }
405
 
406
        rq->cmd_flags = REQ_HARDBARRIER;
407
        rq_init(q, rq);
408
        rq->elevator_private = NULL;
409
        rq->elevator_private2 = NULL;
410
        rq->rq_disk = q->bar_rq.rq_disk;
411
        rq->end_io = end_io;
412
        q->prepare_flush_fn(q, rq);
413
 
414
        elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
415
}
416
 
417
static inline struct request *start_ordered(struct request_queue *q,
418
                                            struct request *rq)
419
{
420
        q->orderr = 0;
421
        q->ordered = q->next_ordered;
422
        q->ordseq |= QUEUE_ORDSEQ_STARTED;
423
 
424
        /*
425
         * Prep proxy barrier request.
426
         */
427
        blkdev_dequeue_request(rq);
428
        q->orig_bar_rq = rq;
429
        rq = &q->bar_rq;
430
        rq->cmd_flags = 0;
431
        rq_init(q, rq);
432
        if (bio_data_dir(q->orig_bar_rq->bio) == WRITE)
433
                rq->cmd_flags |= REQ_RW;
434
        if (q->ordered & QUEUE_ORDERED_FUA)
435
                rq->cmd_flags |= REQ_FUA;
436
        rq->elevator_private = NULL;
437
        rq->elevator_private2 = NULL;
438
        init_request_from_bio(rq, q->orig_bar_rq->bio);
439
        rq->end_io = bar_end_io;
440
 
441
        /*
442
         * Queue ordered sequence.  As we stack them at the head, we
443
         * need to queue in reverse order.  Note that we rely on that
444
         * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs
445
         * request gets in between the ordered sequence. If this request is
446
         * an empty barrier, we don't need to do a postflush ever since
447
         * there will be no data written between the pre and post flush.
448
         * Hence a single flush will suffice.
449
         */
450
        if ((q->ordered & QUEUE_ORDERED_POSTFLUSH) && !blk_empty_barrier(rq))
451
                queue_flush(q, QUEUE_ORDERED_POSTFLUSH);
452
        else
453
                q->ordseq |= QUEUE_ORDSEQ_POSTFLUSH;
454
 
455
        elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
456
 
457
        if (q->ordered & QUEUE_ORDERED_PREFLUSH) {
458
                queue_flush(q, QUEUE_ORDERED_PREFLUSH);
459
                rq = &q->pre_flush_rq;
460
        } else
461
                q->ordseq |= QUEUE_ORDSEQ_PREFLUSH;
462
 
463
        if ((q->ordered & QUEUE_ORDERED_TAG) || q->in_flight == 0)
464
                q->ordseq |= QUEUE_ORDSEQ_DRAIN;
465
        else
466
                rq = NULL;
467
 
468
        return rq;
469
}
470
 
471
int blk_do_ordered(struct request_queue *q, struct request **rqp)
472
{
473
        struct request *rq = *rqp;
474
        const int is_barrier = blk_fs_request(rq) && blk_barrier_rq(rq);
475
 
476
        if (!q->ordseq) {
477
                if (!is_barrier)
478
                        return 1;
479
 
480
                if (q->next_ordered != QUEUE_ORDERED_NONE) {
481
                        *rqp = start_ordered(q, rq);
482
                        return 1;
483
                } else {
484
                        /*
485
                         * This can happen when the queue switches to
486
                         * ORDERED_NONE while this request is on it.
487
                         */
488
                        blkdev_dequeue_request(rq);
489
                        end_that_request_first(rq, -EOPNOTSUPP,
490
                                               rq->hard_nr_sectors);
491
                        end_that_request_last(rq, -EOPNOTSUPP);
492
                        *rqp = NULL;
493
                        return 0;
494
                }
495
        }
496
 
497
        /*
498
         * Ordered sequence in progress
499
         */
500
 
501
        /* Special requests are not subject to ordering rules. */
502
        if (!blk_fs_request(rq) &&
503
            rq != &q->pre_flush_rq && rq != &q->post_flush_rq)
504
                return 1;
505
 
506
        if (q->ordered & QUEUE_ORDERED_TAG) {
507
                /* Ordered by tag.  Blocking the next barrier is enough. */
508
                if (is_barrier && rq != &q->bar_rq)
509
                        *rqp = NULL;
510
        } else {
511
                /* Ordered by draining.  Wait for turn. */
512
                WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q));
513
                if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q))
514
                        *rqp = NULL;
515
        }
516
 
517
        return 1;
518
}
519
 
520
static void req_bio_endio(struct request *rq, struct bio *bio,
521
                          unsigned int nbytes, int error)
522
{
523
        struct request_queue *q = rq->q;
524
 
525
        if (&q->bar_rq != rq) {
526
                if (error)
527
                        clear_bit(BIO_UPTODATE, &bio->bi_flags);
528
                else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
529
                        error = -EIO;
530
 
531
                if (unlikely(nbytes > bio->bi_size)) {
532
                        printk("%s: want %u bytes done, only %u left\n",
533
                               __FUNCTION__, nbytes, bio->bi_size);
534
                        nbytes = bio->bi_size;
535
                }
536
 
537
                bio->bi_size -= nbytes;
538
                bio->bi_sector += (nbytes >> 9);
539
                if (bio->bi_size == 0)
540
                        bio_endio(bio, error);
541
        } else {
542
 
543
                /*
544
                 * Okay, this is the barrier request in progress, just
545
                 * record the error;
546
                 */
547
                if (error && !q->orderr)
548
                        q->orderr = error;
549
        }
550
}
551
 
552
/**
553
 * blk_queue_bounce_limit - set bounce buffer limit for queue
554
 * @q:  the request queue for the device
555
 * @dma_addr:   bus address limit
556
 *
557
 * Description:
558
 *    Different hardware can have different requirements as to what pages
559
 *    it can do I/O directly to. A low level driver can call
560
 *    blk_queue_bounce_limit to have lower memory pages allocated as bounce
561
 *    buffers for doing I/O to pages residing above @page.
562
 **/
563
void blk_queue_bounce_limit(struct request_queue *q, u64 dma_addr)
564
{
565
        unsigned long bounce_pfn = dma_addr >> PAGE_SHIFT;
566
        int dma = 0;
567
 
568
        q->bounce_gfp = GFP_NOIO;
569
#if BITS_PER_LONG == 64
570
        /* Assume anything <= 4GB can be handled by IOMMU.
571
           Actually some IOMMUs can handle everything, but I don't
572
           know of a way to test this here. */
573
        if (bounce_pfn < (min_t(u64,0xffffffff,BLK_BOUNCE_HIGH) >> PAGE_SHIFT))
574
                dma = 1;
575
        q->bounce_pfn = max_low_pfn;
576
#else
577
        if (bounce_pfn < blk_max_low_pfn)
578
                dma = 1;
579
        q->bounce_pfn = bounce_pfn;
580
#endif
581
        if (dma) {
582
                init_emergency_isa_pool();
583
                q->bounce_gfp = GFP_NOIO | GFP_DMA;
584
                q->bounce_pfn = bounce_pfn;
585
        }
586
}
587
 
588
EXPORT_SYMBOL(blk_queue_bounce_limit);
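/*
 * Illustrative sketch, not part of this file: a driver whose hardware
 * can only address 32 bits of DMA restricts the queue so that pages
 * above 4GB are bounced; hardware that can reach any page opts out of
 * bouncing entirely.  The helper name is hypothetical.
 */
#ifdef EXAMPLE_BOUNCE_SKETCH
static void example_set_dma_limit(struct request_queue *q, int full_64bit)
{
        if (full_64bit)
                blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);      /* no bouncing */
        else
                blk_queue_bounce_limit(q, 0xffffffffULL);       /* bounce >4GB */
}
#endif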
589
 
590
/**
591
 * blk_queue_max_sectors - set max sectors for a request for this queue
592
 * @q:  the request queue for the device
593
 * @max_sectors:  max sectors in the usual 512b unit
594
 *
595
 * Description:
596
 *    Enables a low level driver to set an upper limit on the size of
597
 *    received requests.
598
 **/
599
void blk_queue_max_sectors(struct request_queue *q, unsigned int max_sectors)
600
{
601
        if ((max_sectors << 9) < PAGE_CACHE_SIZE) {
602
                max_sectors = 1 << (PAGE_CACHE_SHIFT - 9);
603
                printk("%s: set to minimum %d\n", __FUNCTION__, max_sectors);
604
        }
605
 
606
        if (BLK_DEF_MAX_SECTORS > max_sectors)
607
                q->max_hw_sectors = q->max_sectors = max_sectors;
608
        else {
609
                q->max_sectors = BLK_DEF_MAX_SECTORS;
610
                q->max_hw_sectors = max_sectors;
611
        }
612
}
613
 
614
EXPORT_SYMBOL(blk_queue_max_sectors);
615
 
616
/**
617
 * blk_queue_max_phys_segments - set max phys segments for a request for this queue
618
 * @q:  the request queue for the device
619
 * @max_segments:  max number of segments
620
 *
621
 * Description:
622
 *    Enables a low level driver to set an upper limit on the number of
623
 *    physical data segments in a request.  This would be the largest sized
624
 *    scatter list the driver could handle.
625
 **/
626
void blk_queue_max_phys_segments(struct request_queue *q,
627
                                 unsigned short max_segments)
628 81 tac2
{
629
        //printk("%s: set to minimum %d\n", __FUNCTION__, max_segments);
630 62 marcus.erl
        if (!max_segments) {
631
                max_segments = 1;
632
                printk("%s: set to minimum %d\n", __FUNCTION__, max_segments);
633
        }
634
 
635
        q->max_phys_segments = max_segments;
636
}
637
 
638
EXPORT_SYMBOL(blk_queue_max_phys_segments);
639
 
640
/**
641
 * blk_queue_max_hw_segments - set max hw segments for a request for this queue
642
 * @q:  the request queue for the device
643
 * @max_segments:  max number of segments
644
 *
645
 * Description:
646
 *    Enables a low level driver to set an upper limit on the number of
647
 *    hw data segments in a request.  This would be the largest number of
648
 *    address/length pairs the host adapter can actually give at once
649
 *    to the device.
650
 **/
651
void blk_queue_max_hw_segments(struct request_queue *q,
652
                               unsigned short max_segments)
653
{
654
        if (!max_segments) {
655
                max_segments = 1;
656
                printk("%s: set to minimum %d\n", __FUNCTION__, max_segments);
657
        }
658
 
659
        q->max_hw_segments = max_segments;
660
}
661
 
662
EXPORT_SYMBOL(blk_queue_max_hw_segments);
663
 
664
/**
665
 * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg
666
 * @q:  the request queue for the device
667
 * @max_size:  max size of segment in bytes
668
 *
669
 * Description:
670
 *    Enables a low level driver to set an upper limit on the size of a
671
 *    coalesced segment
672
 **/
673
void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size)
674
{
675
        if (max_size < PAGE_CACHE_SIZE) {
676
                max_size = PAGE_CACHE_SIZE;
677
                printk("%s: set to minimum %d\n", __FUNCTION__, max_size);
678
        }
679
 
680
        q->max_segment_size = max_size;
681
}
682
 
683
EXPORT_SYMBOL(blk_queue_max_segment_size);
684
 
685
/**
686
 * blk_queue_hardsect_size - set hardware sector size for the queue
687
 * @q:  the request queue for the device
688
 * @size:  the hardware sector size, in bytes
689
 *
690
 * Description:
691
 *   This should typically be set to the lowest possible sector size
692
 *   that the hardware can operate on (possible without reverting to
693
 *   even internal read-modify-write operations). Usually the default
694
 *   of 512 covers most hardware.
695
 **/
696
void blk_queue_hardsect_size(struct request_queue *q, unsigned short size)
697
{
698
        q->hardsect_size = size;
699
}
700
 
701
EXPORT_SYMBOL(blk_queue_hardsect_size);
702
 
703
/*
704
 * Returns the minimum that is _not_ zero, unless both are zero.
705
 */
706
#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
707
 
708
/**
709
 * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers
710
 * @t:  the stacking driver (top)
711
 * @b:  the underlying device (bottom)
712
 **/
713
void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b)
714
{
715
        /* zero is "infinity" */
716
        t->max_sectors = min_not_zero(t->max_sectors,b->max_sectors);
717
        t->max_hw_sectors = min_not_zero(t->max_hw_sectors,b->max_hw_sectors);
718
 
719
        t->max_phys_segments = min(t->max_phys_segments,b->max_phys_segments);
720
        t->max_hw_segments = min(t->max_hw_segments,b->max_hw_segments);
721
        t->max_segment_size = min(t->max_segment_size,b->max_segment_size);
722
        t->hardsect_size = max(t->hardsect_size,b->hardsect_size);
723
        if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags))
724
                clear_bit(QUEUE_FLAG_CLUSTER, &t->queue_flags);
725
}
726
 
727
EXPORT_SYMBOL(blk_queue_stack_limits);
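/*
 * Illustrative sketch, not part of this file: a stacking driver
 * (md/dm style) folding each underlying device's limits into its own
 * queue as components are added.  The function name is hypothetical.
 */
#ifdef EXAMPLE_STACKING_SKETCH
static void example_add_component(struct request_queue *top,
                                  struct block_device *bdev)
{
        struct request_queue *bottom = bdev_get_queue(bdev);

        if (bottom)
                blk_queue_stack_limits(top, bottom);
}
#endif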
728
 
729
/**
730
 * blk_queue_segment_boundary - set boundary rules for segment merging
731
 * @q:  the request queue for the device
732
 * @mask:  the memory boundary mask
733
 **/
734
void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask)
735
{
736
        if (mask < PAGE_CACHE_SIZE - 1) {
737
                mask = PAGE_CACHE_SIZE - 1;
738
                printk("%s: set to minimum %lx\n", __FUNCTION__, mask);
739
        }
740
 
741
        q->seg_boundary_mask = mask;
742
}
743
 
744
EXPORT_SYMBOL(blk_queue_segment_boundary);
745
 
746
/**
747
 * blk_queue_dma_alignment - set dma length and memory alignment
748
 * @q:     the request queue for the device
749
 * @mask:  alignment mask
750
 *
751
 * Description:
752
 *    set required memory and length alignment for direct dma transactions.
753
 *    this is used when building direct io requests for the queue.
754
 *
755
 **/
756
void blk_queue_dma_alignment(struct request_queue *q, int mask)
757
{
758
        q->dma_alignment = mask;
759
}
760
 
761
EXPORT_SYMBOL(blk_queue_dma_alignment);
762
 
763
/**
764
 * blk_queue_find_tag - find a request by its tag and queue
765
 * @q:   The request queue for the device
766
 * @tag: The tag of the request
767
 *
768
 * Notes:
769
 *    Should be used when a device returns a tag and you want to match
770
 *    it with a request.
771
 *
772
 *    no locks need be held.
773
 **/
774
struct request *blk_queue_find_tag(struct request_queue *q, int tag)
775
{
776
        return blk_map_queue_find_tag(q->queue_tags, tag);
777
}
778
 
779
EXPORT_SYMBOL(blk_queue_find_tag);
780
 
781
/**
782
 * __blk_free_tags - release a given set of tag maintenance info
783
 * @bqt:        the tag map to free
784
 *
785
 * Tries to free the specified @bqt@.  Returns true if it was
786
 * actually freed and false if there are still references using it
787
 */
788
static int __blk_free_tags(struct blk_queue_tag *bqt)
789
{
790
        int retval;
791
 
792
        retval = atomic_dec_and_test(&bqt->refcnt);
793
        if (retval) {
794
                BUG_ON(bqt->busy);
795
 
796
                kfree(bqt->tag_index);
797
                bqt->tag_index = NULL;
798
 
799
                kfree(bqt->tag_map);
800
                bqt->tag_map = NULL;
801
 
802
                kfree(bqt);
803
 
804
        }
805
 
806
        return retval;
807
}
808
 
809
/**
810
 * __blk_queue_free_tags - release tag maintenance info
811
 * @q:  the request queue for the device
812
 *
813
 *  Notes:
814
 *    blk_cleanup_queue() will take care of calling this function, if tagging
815
 *    has been used. So there's no need to call this directly.
816
 **/
817
static void __blk_queue_free_tags(struct request_queue *q)
818
{
819
        struct blk_queue_tag *bqt = q->queue_tags;
820
 
821
        if (!bqt)
822
                return;
823
 
824
        __blk_free_tags(bqt);
825
 
826
        q->queue_tags = NULL;
827
        q->queue_flags &= ~(1 << QUEUE_FLAG_QUEUED);
828
}
829
 
830
 
831
/**
832
 * blk_free_tags - release a given set of tag maintenance info
833
 * @bqt:        the tag map to free
834
 *
835
 * For externally managed @bqt@ frees the map.  Callers of this
836
 * function must guarantee to have released all the queues that
837
 * might have been using this tag map.
838
 */
839
void blk_free_tags(struct blk_queue_tag *bqt)
840
{
841
        if (unlikely(!__blk_free_tags(bqt)))
842
                BUG();
843
}
844
EXPORT_SYMBOL(blk_free_tags);
845
 
846
/**
847
 * blk_queue_free_tags - release tag maintenance info
848
 * @q:  the request queue for the device
849
 *
850
 *  Notes:
851
 *      This is used to disable tagged queuing to a device, yet leave
852
 *      queue in function.
853
 **/
854
void blk_queue_free_tags(struct request_queue *q)
855
{
856
        clear_bit(QUEUE_FLAG_QUEUED, &q->queue_flags);
857
}
858
 
859
EXPORT_SYMBOL(blk_queue_free_tags);
860
 
861
static int
862
init_tag_map(struct request_queue *q, struct blk_queue_tag *tags, int depth)
863
{
864
        struct request **tag_index;
865
        unsigned long *tag_map;
866
        int nr_ulongs;
867
 
868
        if (q && depth > q->nr_requests * 2) {
869
                depth = q->nr_requests * 2;
870
                printk(KERN_ERR "%s: adjusted depth to %d\n",
871
                                __FUNCTION__, depth);
872
        }
873
 
874
        tag_index = kzalloc(depth * sizeof(struct request *), GFP_ATOMIC);
875
        if (!tag_index)
876
                goto fail;
877
 
878
        nr_ulongs = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG;
879
        tag_map = kzalloc(nr_ulongs * sizeof(unsigned long), GFP_ATOMIC);
880
        if (!tag_map)
881
                goto fail;
882
 
883
        tags->real_max_depth = depth;
884
        tags->max_depth = depth;
885
        tags->tag_index = tag_index;
886
        tags->tag_map = tag_map;
887
 
888
        return 0;
889
fail:
890
        kfree(tag_index);
891
        return -ENOMEM;
892
}
893
 
894
static struct blk_queue_tag *__blk_queue_init_tags(struct request_queue *q,
895
                                                   int depth)
896
{
897
        struct blk_queue_tag *tags;
898
 
899
        tags = kmalloc(sizeof(struct blk_queue_tag), GFP_ATOMIC);
900
        if (!tags)
901
                goto fail;
902
 
903
        if (init_tag_map(q, tags, depth))
904
                goto fail;
905
 
906
        tags->busy = 0;
907
        atomic_set(&tags->refcnt, 1);
908
        return tags;
909
fail:
910
        kfree(tags);
911
        return NULL;
912
}
913
 
914
/**
915
 * blk_init_tags - initialize the tag info for an external tag map
916
 * @depth:      the maximum queue depth supported
917
 * @tags: the tag to use
918
 **/
919
struct blk_queue_tag *blk_init_tags(int depth)
920
{
921
        return __blk_queue_init_tags(NULL, depth);
922
}
923
EXPORT_SYMBOL(blk_init_tags);
924
 
925
/**
926
 * blk_queue_init_tags - initialize the queue tag info
927
 * @q:  the request queue for the device
928
 * @depth:  the maximum queue depth supported
929
 * @tags: the tag to use
930
 **/
931
int blk_queue_init_tags(struct request_queue *q, int depth,
932
                        struct blk_queue_tag *tags)
933
{
934
        int rc;
935
 
936
        BUG_ON(tags && q->queue_tags && tags != q->queue_tags);
937
 
938
        if (!tags && !q->queue_tags) {
939
                tags = __blk_queue_init_tags(q, depth);
940
 
941
                if (!tags)
942
                        goto fail;
943
        } else if (q->queue_tags) {
944
                if ((rc = blk_queue_resize_tags(q, depth)))
945
                        return rc;
946
                set_bit(QUEUE_FLAG_QUEUED, &q->queue_flags);
947
                return 0;
948
        } else
949
                atomic_inc(&tags->refcnt);
950
 
951
        /*
952
         * assign it, all done
953
         */
954
        q->queue_tags = tags;
955
        q->queue_flags |= (1 << QUEUE_FLAG_QUEUED);
956
        INIT_LIST_HEAD(&q->tag_busy_list);
957
        return 0;
958
fail:
959
        kfree(tags);
960
        return -ENOMEM;
961
}
962
 
963
EXPORT_SYMBOL(blk_queue_init_tags);
964
 
965
/**
966
 * blk_queue_resize_tags - change the queueing depth
967
 * @q:  the request queue for the device
968
 * @new_depth: the new max command queueing depth
969
 *
970
 *  Notes:
971
 *    Must be called with the queue lock held.
972
 **/
973
int blk_queue_resize_tags(struct request_queue *q, int new_depth)
974
{
975
        struct blk_queue_tag *bqt = q->queue_tags;
976
        struct request **tag_index;
977
        unsigned long *tag_map;
978
        int max_depth, nr_ulongs;
979
 
980
        if (!bqt)
981
                return -ENXIO;
982
 
983
        /*
984
         * if we already have large enough real_max_depth.  just
985
         * adjust max_depth.  *NOTE* as requests with tag value
986
         * between new_depth and real_max_depth can be in-flight, tag
987
         * map can not be shrunk blindly here.
988
         */
989
        if (new_depth <= bqt->real_max_depth) {
990
                bqt->max_depth = new_depth;
991
                return 0;
992
        }
993
 
994
        /*
995
         * Currently cannot replace a shared tag map with a new
996
         * one, so error out if this is the case
997
         */
998
        if (atomic_read(&bqt->refcnt) != 1)
999
                return -EBUSY;
1000
 
1001
        /*
1002
         * save the old state info, so we can copy it back
1003
         */
1004
        tag_index = bqt->tag_index;
1005
        tag_map = bqt->tag_map;
1006
        max_depth = bqt->real_max_depth;
1007
 
1008
        if (init_tag_map(q, bqt, new_depth))
1009
                return -ENOMEM;
1010
 
1011
        memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *));
1012
        nr_ulongs = ALIGN(max_depth, BITS_PER_LONG) / BITS_PER_LONG;
1013
        memcpy(bqt->tag_map, tag_map, nr_ulongs * sizeof(unsigned long));
1014
 
1015
        kfree(tag_index);
1016
        kfree(tag_map);
1017
        return 0;
1018
}
1019
 
1020
EXPORT_SYMBOL(blk_queue_resize_tags);
1021
 
1022
/**
1023
 * blk_queue_end_tag - end tag operations for a request
1024
 * @q:  the request queue for the device
1025
 * @rq: the request that has completed
1026
 *
1027
 *  Description:
1028
 *    Typically called when end_that_request_first() returns 0, meaning
1029
 *    all transfers have been done for a request. It's important to call
1030
 *    this function before end_that_request_last(), as that will put the
1031
 *    request back on the free list thus corrupting the internal tag list.
1032
 *
1033
 *  Notes:
1034
 *   queue lock must be held.
1035
 **/
1036
void blk_queue_end_tag(struct request_queue *q, struct request *rq)
1037
{
1038
        struct blk_queue_tag *bqt = q->queue_tags;
1039
        int tag = rq->tag;
1040
 
1041
        BUG_ON(tag == -1);
1042
 
1043
        if (unlikely(tag >= bqt->real_max_depth))
1044
                /*
1045
                 * This can happen after tag depth has been reduced.
1046
                 * FIXME: how about a warning or info message here?
1047
                 */
1048
                return;
1049
 
1050
        list_del_init(&rq->queuelist);
1051
        rq->cmd_flags &= ~REQ_QUEUED;
1052
        rq->tag = -1;
1053
 
1054
        if (unlikely(bqt->tag_index[tag] == NULL))
1055
                printk(KERN_ERR "%s: tag %d is missing\n",
1056
                       __FUNCTION__, tag);
1057
 
1058
        bqt->tag_index[tag] = NULL;
1059
 
1060
        if (unlikely(!test_bit(tag, bqt->tag_map))) {
1061
                printk(KERN_ERR "%s: attempt to clear non-busy tag (%d)\n",
1062
                       __FUNCTION__, tag);
1063
                return;
1064
        }
1065
        /*
1066
         * The tag_map bit acts as a lock for tag_index[bit], so we need
1067
         * unlock memory barrier semantics.
1068
         */
1069
        clear_bit_unlock(tag, bqt->tag_map);
1070
        bqt->busy--;
1071
}
1072
 
1073
EXPORT_SYMBOL(blk_queue_end_tag);
1074
 
1075
/**
1076
 * blk_queue_start_tag - find a free tag and assign it
1077
 * @q:  the request queue for the device
1078
 * @rq:  the block request that needs tagging
1079
 *
1080
 *  Description:
1081
 *    This can either be used as a stand-alone helper, or possibly be
1082
 *    assigned as the queue &prep_rq_fn (in which case &struct request
1083
 *    automagically gets a tag assigned). Note that this function
1084
 *    assumes that any type of request can be queued! if this is not
1085
 *    true for your device, you must check the request type before
1086
 *    calling this function.  The request will also be removed from
1087
 *    the request queue, so it's the driver's responsibility to re-add
1088
 *    it if it should need to be restarted for some reason.
1089
 *
1090
 *  Notes:
1091
 *   queue lock must be held.
1092
 **/
1093
int blk_queue_start_tag(struct request_queue *q, struct request *rq)
1094
{
1095
        struct blk_queue_tag *bqt = q->queue_tags;
1096
        int tag;
1097
 
1098
        if (unlikely((rq->cmd_flags & REQ_QUEUED))) {
1099
                printk(KERN_ERR
1100
                       "%s: request %p for device [%s] already tagged %d",
1101
                       __FUNCTION__, rq,
1102
                       rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->tag);
1103
                BUG();
1104
        }
1105
 
1106
        /*
1107
         * Protect against shared tag maps, as we may not have exclusive
1108
         * access to the tag map.
1109
         */
1110
        do {
1111
                tag = find_first_zero_bit(bqt->tag_map, bqt->max_depth);
1112
                if (tag >= bqt->max_depth)
1113
                        return 1;
1114
 
1115
        } while (test_and_set_bit_lock(tag, bqt->tag_map));
1116
        /*
1117
         * We need lock ordering semantics given by test_and_set_bit_lock.
1118
         * See blk_queue_end_tag for details.
1119
         */
1120
 
1121
        rq->cmd_flags |= REQ_QUEUED;
1122
        rq->tag = tag;
1123
        bqt->tag_index[tag] = rq;
1124
        blkdev_dequeue_request(rq);
1125
        list_add(&rq->queuelist, &q->tag_busy_list);
1126
        bqt->busy++;
1127
        return 0;
1128
}
1129
 
1130
EXPORT_SYMBOL(blk_queue_start_tag);
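/*
 * Illustrative sketch, not part of this file: a request_fn using the
 * tag helpers.  A request that cannot get a tag stays on the queue and
 * the driver backs off; on completion the tag is released between
 * end_that_request_first() and end_that_request_last(), as documented
 * above.  example_hw_issue() and the function names are hypothetical.
 * Both functions run with the queue lock held.
 */
#ifdef EXAMPLE_TAGGING_SKETCH
static void example_request_fn(struct request_queue *q)
{
        struct request *rq;

        while ((rq = elv_next_request(q)) != NULL) {
                if (blk_queue_start_tag(q, rq))
                        break;                  /* out of tags, retry later */
                example_hw_issue(rq);           /* hand the request to hardware */
        }
}

static void example_complete(struct request_queue *q, struct request *rq)
{
        if (!end_that_request_first(rq, 1, rq->hard_nr_sectors)) {
                blk_queue_end_tag(q, rq);
                end_that_request_last(rq, 1);
        }
}
#endif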
1131
 
1132
/**
1133
 * blk_queue_invalidate_tags - invalidate all pending tags
1134
 * @q:  the request queue for the device
1135
 *
1136
 *  Description:
1137
 *   Hardware conditions may dictate a need to stop all pending requests.
1138
 *   In this case, we will safely clear the block side of the tag queue and
1139
 *   re-add all requests to the request queue in the right order.
1140
 *
1141
 *  Notes:
1142
 *   queue lock must be held.
1143
 **/
1144
void blk_queue_invalidate_tags(struct request_queue *q)
1145
{
1146
        struct list_head *tmp, *n;
1147
 
1148
        list_for_each_safe(tmp, n, &q->tag_busy_list)
1149
                blk_requeue_request(q, list_entry_rq(tmp));
1150
}
1151
 
1152
EXPORT_SYMBOL(blk_queue_invalidate_tags);
1153
 
1154
void blk_dump_rq_flags(struct request *rq, char *msg)
1155
{
1156
        int bit;
1157
 
1158
        printk("%s: dev %s: type=%x, flags=%x\n", msg,
1159
                rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type,
1160
                rq->cmd_flags);
1161
 
1162
        printk("\nsector %llu, nr/cnr %lu/%u\n", (unsigned long long)rq->sector,
1163
                                                       rq->nr_sectors,
1164
                                                       rq->current_nr_sectors);
1165
        printk("bio %p, biotail %p, buffer %p, data %p, len %u\n", rq->bio, rq->biotail, rq->buffer, rq->data, rq->data_len);
1166
 
1167
        if (blk_pc_request(rq)) {
1168
                printk("cdb: ");
1169
                for (bit = 0; bit < sizeof(rq->cmd); bit++)
1170
                        printk("%02x ", rq->cmd[bit]);
1171
                printk("\n");
1172
        }
1173
}
1174
 
1175
EXPORT_SYMBOL(blk_dump_rq_flags);
1176
 
1177
void blk_recount_segments(struct request_queue *q, struct bio *bio)
1178
{
1179
        struct request rq;
1180
        struct bio *nxt = bio->bi_next;
1181
        rq.q = q;
1182
        rq.bio = rq.biotail = bio;
1183
        bio->bi_next = NULL;
1184
        blk_recalc_rq_segments(&rq);
1185
        bio->bi_next = nxt;
1186
        bio->bi_phys_segments = rq.nr_phys_segments;
1187
        bio->bi_hw_segments = rq.nr_hw_segments;
1188
        bio->bi_flags |= (1 << BIO_SEG_VALID);
1189
}
1190
EXPORT_SYMBOL(blk_recount_segments);
1191
 
1192
static void blk_recalc_rq_segments(struct request *rq)
1193
{
1194
        int nr_phys_segs;
1195
        int nr_hw_segs;
1196
        unsigned int phys_size;
1197
        unsigned int hw_size;
1198
        struct bio_vec *bv, *bvprv = NULL;
1199
        int seg_size;
1200
        int hw_seg_size;
1201
        int cluster;
1202
        struct req_iterator iter;
1203
        int high, highprv = 1;
1204
        struct request_queue *q = rq->q;
1205
 
1206
        if (!rq->bio)
1207
                return;
1208
 
1209
        cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER);
1210
        hw_seg_size = seg_size = 0;
1211
        phys_size = hw_size = nr_phys_segs = nr_hw_segs = 0;
1212
        rq_for_each_segment(bv, rq, iter) {
1213
                /*
1214
                 * the trick here is making sure that a high page is never
1215
                 * considered part of another segment, since that might
1216
                 * change with the bounce page.
1217
                 */
1218
                high = page_to_pfn(bv->bv_page) > q->bounce_pfn;
1219
                if (high || highprv)
1220
                        goto new_hw_segment;
1221
                if (cluster) {
1222
                        if (seg_size + bv->bv_len > q->max_segment_size)
1223
                                goto new_segment;
1224
                        if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv))
1225
                                goto new_segment;
1226
                        if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv))
1227
                                goto new_segment;
1228
                        if (BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len))
1229
                                goto new_hw_segment;
1230
 
1231
                        seg_size += bv->bv_len;
1232
                        hw_seg_size += bv->bv_len;
1233
                        bvprv = bv;
1234
                        continue;
1235
                }
1236
new_segment:
1237
                if (BIOVEC_VIRT_MERGEABLE(bvprv, bv) &&
1238
                    !BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len))
1239
                        hw_seg_size += bv->bv_len;
1240
                else {
1241
new_hw_segment:
1242
                        if (nr_hw_segs == 1 &&
1243
                            hw_seg_size > rq->bio->bi_hw_front_size)
1244
                                rq->bio->bi_hw_front_size = hw_seg_size;
1245
                        hw_seg_size = BIOVEC_VIRT_START_SIZE(bv) + bv->bv_len;
1246
                        nr_hw_segs++;
1247
                }
1248
 
1249
                nr_phys_segs++;
1250
                bvprv = bv;
1251
                seg_size = bv->bv_len;
1252
                highprv = high;
1253
        }
1254
 
1255
        if (nr_hw_segs == 1 &&
1256
            hw_seg_size > rq->bio->bi_hw_front_size)
1257
                rq->bio->bi_hw_front_size = hw_seg_size;
1258
        if (hw_seg_size > rq->biotail->bi_hw_back_size)
1259
                rq->biotail->bi_hw_back_size = hw_seg_size;
1260
        rq->nr_phys_segments = nr_phys_segs;
1261
        rq->nr_hw_segments = nr_hw_segs;
1262
}
1263
 
1264
static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
1265
                                   struct bio *nxt)
1266
{
1267
        if (!(q->queue_flags & (1 << QUEUE_FLAG_CLUSTER)))
1268
                return 0;
1269
 
1270
        if (!BIOVEC_PHYS_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)))
1271
                return 0;
1272
        if (bio->bi_size + nxt->bi_size > q->max_segment_size)
1273
                return 0;
1274
 
1275
        /*
1276
         * bio and nxt are contiguous in memory, check if the queue allows
1277
         * these two to be merged into one
1278
         */
1279
        if (BIO_SEG_BOUNDARY(q, bio, nxt))
1280
                return 1;
1281
 
1282
        return 0;
1283
}
1284
 
1285
static int blk_hw_contig_segment(struct request_queue *q, struct bio *bio,
1286
                                 struct bio *nxt)
1287
{
1288
        if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
1289
                blk_recount_segments(q, bio);
1290
        if (unlikely(!bio_flagged(nxt, BIO_SEG_VALID)))
1291
                blk_recount_segments(q, nxt);
1292
        if (!BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)) ||
1293
            BIOVEC_VIRT_OVERSIZE(bio->bi_hw_back_size + nxt->bi_hw_front_size))
1294
                return 0;
1295
        if (bio->bi_hw_back_size + nxt->bi_hw_front_size > q->max_segment_size)
1296
                return 0;
1297
 
1298
        return 1;
1299
}
1300
 
1301
/*
1302
 * map a request to scatterlist, return number of sg entries setup. Caller
1303
 * must make sure sg can hold rq->nr_phys_segments entries
1304
 */
1305
int blk_rq_map_sg(struct request_queue *q, struct request *rq,
1306
                  struct scatterlist *sglist)
1307
{
1308
        struct bio_vec *bvec, *bvprv;
1309
        struct req_iterator iter;
1310
        struct scatterlist *sg;
1311
        int nsegs, cluster;
1312
 
1313
        nsegs = 0;
1314
        cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER);
1315
 
1316
        /*
1317
         * for each bio in rq
1318
         */
1319
        bvprv = NULL;
1320
        sg = NULL;
1321
        rq_for_each_segment(bvec, rq, iter) {
1322
                int nbytes = bvec->bv_len;
1323
 
1324
                if (bvprv && cluster) {
1325
                        if (sg->length + nbytes > q->max_segment_size)
1326
                                goto new_segment;
1327
 
1328
                        if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec))
1329
                                goto new_segment;
1330
                        if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec))
1331
                                goto new_segment;
1332
 
1333
                        sg->length += nbytes;
1334
                } else {
1335
new_segment:
1336
                        if (!sg)
1337
                                sg = sglist;
1338
                        else {
1339
                                /*
1340
                                 * If the driver previously mapped a shorter
1341
                                 * list, we could see a termination bit
1342
                                 * prematurely unless it fully inits the sg
1343
                                 * table on each mapping. We KNOW that there
1344
                                 * must be more entries here or the driver
1345
                                 * would be buggy, so force clear the
1346
                                 * termination bit to avoid doing a full
1347
                                 * sg_init_table() in drivers for each command.
1348
                                 */
1349
                                sg->page_link &= ~0x02;
1350
                                sg = sg_next(sg);
1351
                        }
1352
 
1353
                        sg_set_page(sg, bvec->bv_page, nbytes, bvec->bv_offset);
1354
                        nsegs++;
1355
                }
1356
                bvprv = bvec;
1357
        } /* segments in rq */
1358
 
1359
        if (sg)
1360
                sg_mark_end(sg);
1361
 
1362
        return nsegs;
1363
}
1364
 
1365
EXPORT_SYMBOL(blk_rq_map_sg);
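/*
 * Illustrative sketch, not part of this file: building a scatterlist
 * for a request before programming a DMA engine.  EXAMPLE_MAX_SEGS,
 * example_fill_descriptor() and the function name are hypothetical;
 * as noted above, the table must hold rq->nr_phys_segments entries.
 */
#ifdef EXAMPLE_MAP_SG_SKETCH
static int example_map_request(struct request_queue *q, struct request *rq,
                               struct scatterlist *sgl)
{
        struct scatterlist *sg;
        int i, count;

        sg_init_table(sgl, EXAMPLE_MAX_SEGS);
        count = blk_rq_map_sg(q, rq, sgl);

        for_each_sg(sgl, sg, count, i) {
                /* one DMA descriptor per mapped segment */
                example_fill_descriptor(sg_page(sg), sg->offset, sg->length);
        }
        return count;
}
#endif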
1366
 
1367
/*
1368
 * the standard queue merge functions, can be overridden with device
1369
 * specific ones if so desired
1370
 */
1371
 
1372
static inline int ll_new_mergeable(struct request_queue *q,
1373
                                   struct request *req,
1374
                                   struct bio *bio)
1375
{
1376
        int nr_phys_segs = bio_phys_segments(q, bio);
1377
 
1378
        if (req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) {
1379
                req->cmd_flags |= REQ_NOMERGE;
1380
                if (req == q->last_merge)
1381
                        q->last_merge = NULL;
1382
                return 0;
1383
        }
1384
 
1385
        /*
1386
         * A hw segment is just getting larger, bump just the phys
1387
         * counter.
1388
         */
1389
        req->nr_phys_segments += nr_phys_segs;
1390
        return 1;
1391
}
1392
 
1393
static inline int ll_new_hw_segment(struct request_queue *q,
1394
                                    struct request *req,
1395
                                    struct bio *bio)
1396
{
1397
        int nr_hw_segs = bio_hw_segments(q, bio);
1398
        int nr_phys_segs = bio_phys_segments(q, bio);
1399
 
1400
        if (req->nr_hw_segments + nr_hw_segs > q->max_hw_segments
1401
            || req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) {
1402
                req->cmd_flags |= REQ_NOMERGE;
1403
                if (req == q->last_merge)
1404
                        q->last_merge = NULL;
1405
                return 0;
1406
        }
1407
 
1408
        /*
1409
         * This will form the start of a new hw segment.  Bump both
1410
         * counters.
1411
         */
1412
        req->nr_hw_segments += nr_hw_segs;
1413
        req->nr_phys_segments += nr_phys_segs;
1414
        return 1;
1415
}
1416
 
1417
static int ll_back_merge_fn(struct request_queue *q, struct request *req,
1418
                            struct bio *bio)
1419
{
1420
        unsigned short max_sectors;
1421
        int len;
1422
 
1423
        if (unlikely(blk_pc_request(req)))
1424
                max_sectors = q->max_hw_sectors;
1425
        else
1426
                max_sectors = q->max_sectors;
1427
 
1428
        if (req->nr_sectors + bio_sectors(bio) > max_sectors) {
1429
                req->cmd_flags |= REQ_NOMERGE;
1430
                if (req == q->last_merge)
1431
                        q->last_merge = NULL;
1432
                return 0;
1433
        }
1434
        if (unlikely(!bio_flagged(req->biotail, BIO_SEG_VALID)))
1435
                blk_recount_segments(q, req->biotail);
1436
        if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
1437
                blk_recount_segments(q, bio);
1438
        len = req->biotail->bi_hw_back_size + bio->bi_hw_front_size;
1439
        if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(req->biotail), __BVEC_START(bio)) &&
1440
            !BIOVEC_VIRT_OVERSIZE(len)) {
1441
                int mergeable =  ll_new_mergeable(q, req, bio);
1442
 
1443
                if (mergeable) {
1444
                        if (req->nr_hw_segments == 1)
1445
                                req->bio->bi_hw_front_size = len;
1446
                        if (bio->bi_hw_segments == 1)
1447
                                bio->bi_hw_back_size = len;
1448
                }
1449
                return mergeable;
1450
        }
1451
 
1452
        return ll_new_hw_segment(q, req, bio);
1453
}
1454
 
1455
static int ll_front_merge_fn(struct request_queue *q, struct request *req,
1456
                             struct bio *bio)
1457
{
1458
        unsigned short max_sectors;
1459
        int len;
1460
 
1461
        if (unlikely(blk_pc_request(req)))
1462
                max_sectors = q->max_hw_sectors;
1463
        else
1464
                max_sectors = q->max_sectors;
1465
 
1466
 
1467
        if (req->nr_sectors + bio_sectors(bio) > max_sectors) {
1468
                req->cmd_flags |= REQ_NOMERGE;
1469
                if (req == q->last_merge)
1470
                        q->last_merge = NULL;
1471
                return 0;
1472
        }
1473
        len = bio->bi_hw_back_size + req->bio->bi_hw_front_size;
1474
        if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
1475
                blk_recount_segments(q, bio);
1476
        if (unlikely(!bio_flagged(req->bio, BIO_SEG_VALID)))
1477
                blk_recount_segments(q, req->bio);
1478
        if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(req->bio)) &&
1479
            !BIOVEC_VIRT_OVERSIZE(len)) {
1480
                int mergeable =  ll_new_mergeable(q, req, bio);
1481
 
1482
                if (mergeable) {
1483
                        if (bio->bi_hw_segments == 1)
1484
                                bio->bi_hw_front_size = len;
1485
                        if (req->nr_hw_segments == 1)
1486
                                req->biotail->bi_hw_back_size = len;
1487
                }
1488
                return mergeable;
1489
        }
1490
 
1491
        return ll_new_hw_segment(q, req, bio);
1492
}
1493
 
1494
static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
1495
                                struct request *next)
1496
{
1497
        int total_phys_segments;
1498
        int total_hw_segments;
1499
 
1500
        /*
1501
         * First check if the either of the requests are re-queued
1502
         * requests.  Can't merge them if they are.
1503
         */
1504
        if (req->special || next->special)
1505
                return 0;
1506
 
1507
        /*
1508
         * Will it become too large?
1509
         */
1510
        if ((req->nr_sectors + next->nr_sectors) > q->max_sectors)
1511
                return 0;
1512
 
1513
        total_phys_segments = req->nr_phys_segments + next->nr_phys_segments;
1514
        if (blk_phys_contig_segment(q, req->biotail, next->bio))
1515
                total_phys_segments--;
1516
 
1517
        if (total_phys_segments > q->max_phys_segments)
1518
                return 0;
1519
 
1520
        total_hw_segments = req->nr_hw_segments + next->nr_hw_segments;
1521
        if (blk_hw_contig_segment(q, req->biotail, next->bio)) {
1522
                int len = req->biotail->bi_hw_back_size + next->bio->bi_hw_front_size;
1523
                /*
1524
                 * propagate the combined length to the end of the requests
1525
                 */
1526
                if (req->nr_hw_segments == 1)
1527
                        req->bio->bi_hw_front_size = len;
1528
                if (next->nr_hw_segments == 1)
1529
                        next->biotail->bi_hw_back_size = len;
1530
                total_hw_segments--;
1531
        }
1532
 
1533
        if (total_hw_segments > q->max_hw_segments)
1534
                return 0;
1535
 
1536
        /* Merge is OK... */
1537
        req->nr_phys_segments = total_phys_segments;
1538
        req->nr_hw_segments = total_hw_segments;
1539
        return 1;
1540
}
1541
 
1542
/*
1543
 * "plug" the device if there are no outstanding requests: this will
1544
 * force the transfer to start only after we have put all the requests
1545
 * on the list.
1546
 *
1547
 * This is called with interrupts off and no requests on the queue and
1548
 * with the queue lock held.
1549
 */
1550
void blk_plug_device(struct request_queue *q)
1551
{
1552
        WARN_ON(!irqs_disabled());
1553
 
1554
        /*
1555
         * don't plug a stopped queue, it must be paired with blk_start_queue()
1556
         * which will restart the queueing
1557
         */
1558
        if (blk_queue_stopped(q))
1559
                return;
1560
 
1561
        if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) {
1562
                mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
1563
                blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG);
1564
        }
1565
}
1566
 
1567
EXPORT_SYMBOL(blk_plug_device);
1568
 
1569
/*
1570
 * remove the queue from the plugged list, if present. called with
1571
 * queue lock held and interrupts disabled.
1572
 */
1573
int blk_remove_plug(struct request_queue *q)
1574
{
1575
        WARN_ON(!irqs_disabled());
1576
 
1577
        if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
1578
                return 0;
1579
 
1580
        del_timer(&q->unplug_timer);
1581
        return 1;
1582
}
1583
 
1584
EXPORT_SYMBOL(blk_remove_plug);
1585
 
1586
/*
1587
 * remove the plug and let it rip..
1588
 */
1589
void __generic_unplug_device(struct request_queue *q)
1590
{
1591
        if (unlikely(blk_queue_stopped(q)))
1592
                return;
1593
 
1594
        if (!blk_remove_plug(q))
1595
                return;
1596
 
1597
        q->request_fn(q);
1598
}
1599
EXPORT_SYMBOL(__generic_unplug_device);
1600
 
1601
/**
1602
 * generic_unplug_device - fire a request queue
1603
 * @q:    The &struct request_queue in question
1604
 *
1605
 * Description:
1606
 *   Linux uses plugging to build bigger request queues before letting
1607
 *   the device have at them. If a queue is plugged, the I/O scheduler
1608
 *   is still adding and merging requests on the queue. Once the queue
1609
 *   gets unplugged, the request_fn defined for the queue is invoked and
1610
 *   transfers started.
1611
 **/
1612
void generic_unplug_device(struct request_queue *q)
1613
{
1614
        spin_lock_irq(q->queue_lock);
1615
        __generic_unplug_device(q);
1616
        spin_unlock_irq(q->queue_lock);
1617
}
1618
EXPORT_SYMBOL(generic_unplug_device);
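 
/*
 * Illustrative usage sketch, not part of the original source: a driver
 * that has queued work while the queue was plugged can force the queue
 * to be run with generic_unplug_device().  It takes the queue lock
 * itself, so it must be called without the lock held.  Relies on the
 * headers already included at the top of this file.
 */
static void example_kick_queue(struct request_queue *q)
{
        /* runs q->request_fn() if the queue was plugged and not stopped */
        generic_unplug_device(q);
}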
1619
 
1620
static void blk_backing_dev_unplug(struct backing_dev_info *bdi,
1621
                                   struct page *page)
1622
{
1623
        struct request_queue *q = bdi->unplug_io_data;
1624
 
1625
        blk_unplug(q);
1626
}
1627
 
1628
static void blk_unplug_work(struct work_struct *work)
1629
{
1630
        struct request_queue *q =
1631
                container_of(work, struct request_queue, unplug_work);
1632
 
1633
        blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
1634
                                q->rq.count[READ] + q->rq.count[WRITE]);
1635
 
1636
        q->unplug_fn(q);
1637
}
1638
 
1639
static void blk_unplug_timeout(unsigned long data)
1640
{
1641
        struct request_queue *q = (struct request_queue *)data;
1642
 
1643
        blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL,
1644
                                q->rq.count[READ] + q->rq.count[WRITE]);
1645
 
1646
        kblockd_schedule_work(&q->unplug_work);
1647
}
1648
 
1649
void blk_unplug(struct request_queue *q)
1650
{
1651
        /*
1652
         * devices don't necessarily have an ->unplug_fn defined
1653
         */
1654
        if (q->unplug_fn) {
1655
                blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
1656
                                        q->rq.count[READ] + q->rq.count[WRITE]);
1657
 
1658
                q->unplug_fn(q);
1659
        }
1660
}
1661
EXPORT_SYMBOL(blk_unplug);
1662
 
1663
/**
1664
 * blk_start_queue - restart a previously stopped queue
1665
 * @q:    The &struct request_queue in question
1666
 *
1667
 * Description:
1668
 *   blk_start_queue() will clear the stop flag on the queue, and call
1669
 *   the request_fn for the queue if it was in a stopped state when
1670
 *   entered. Also see blk_stop_queue(). Queue lock must be held.
1671
 **/
1672
void blk_start_queue(struct request_queue *q)
1673
{
1674
        WARN_ON(!irqs_disabled());
1675
 
1676
        clear_bit(QUEUE_FLAG_STOPPED, &q->queue_flags);
1677
 
1678
        /*
1679
         * one level of recursion is ok and is much faster than kicking
1680
         * the unplug handling
1681
         */
1682
        if (!test_and_set_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) {
1683
                q->request_fn(q);
1684
                clear_bit(QUEUE_FLAG_REENTER, &q->queue_flags);
1685
        } else {
1686
                blk_plug_device(q);
1687
                kblockd_schedule_work(&q->unplug_work);
1688
        }
1689
}
1690
 
1691
EXPORT_SYMBOL(blk_start_queue);
1692
 
1693
/**
1694
 * blk_stop_queue - stop a queue
1695
 * @q:    The &struct request_queue in question
1696
 *
1697
 * Description:
1698
 *   The Linux block layer assumes that a block driver will consume all
1699
 *   entries on the request queue when the request_fn strategy is called.
1700
 *   Often this will not happen, because of hardware limitations (queue
1701
 *   depth settings). If a device driver gets a 'queue full' response,
1702
 *   or if it simply chooses not to queue more I/O at one point, it can
1703
 *   call this function to prevent the request_fn from being called until
1704
 *   the driver has signalled it's ready to go again. This happens by calling
1705
 *   blk_start_queue() to restart queue operations. Queue lock must be held.
1706
 **/
1707
void blk_stop_queue(struct request_queue *q)
1708
{
1709
        blk_remove_plug(q);
1710
        set_bit(QUEUE_FLAG_STOPPED, &q->queue_flags);
1711
}
1712
EXPORT_SYMBOL(blk_stop_queue);
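 
/*
 * Illustrative sketch, not part of the original source: the stop/start
 * pairing described in the kernel-doc above, for a hypothetical driver
 * whose hardware reports "queue full".  example_hw_can_queue() and
 * example_hw_start() are made-up helpers standing in for real hardware
 * access.
 */
static int example_hw_can_queue(void)                  /* made-up helper */
{
        return 1;
}

static void example_hw_start(struct request *rq)       /* made-up helper */
{
}

static void example_request_fn(struct request_queue *q)
{
        struct request *rq;

        /* the request_fn is entered with q->queue_lock held, irqs off */
        while ((rq = elv_next_request(q)) != NULL) {
                if (!example_hw_can_queue()) {
                        /* stop the queue; restarted from the completion irq */
                        blk_stop_queue(q);
                        return;
                }
                blkdev_dequeue_request(rq);
                example_hw_start(rq);
        }
}

static void example_completion_irq(struct request_queue *q)
{
        unsigned long flags;

        spin_lock_irqsave(q->queue_lock, flags);
        blk_start_queue(q);     /* queue lock must be held, see above */
        spin_unlock_irqrestore(q->queue_lock, flags);
}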
1713
 
1714
/**
1715
 * blk_sync_queue - cancel any pending callbacks on a queue
1716
 * @q: the queue
1717
 *
1718
 * Description:
1719
 *     The block layer may perform asynchronous callback activity
1720
 *     on a queue, such as calling the unplug function after a timeout.
1721
 *     A block device may call blk_sync_queue to ensure that any
1722
 *     such activity is cancelled, thus allowing it to release resources
1723
 *     that the callbacks might use. The caller must already have made sure
1724
 *     that its ->make_request_fn will not re-add plugging prior to calling
1725
 *     this function.
1726
 *
1727
 */
1728
void blk_sync_queue(struct request_queue *q)
1729
{
1730
        del_timer_sync(&q->unplug_timer);
1731
        kblockd_flush_work(&q->unplug_work);
1732
}
1733
EXPORT_SYMBOL(blk_sync_queue);
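 
/*
 * Illustrative sketch, not part of the original source: cancelling any
 * pending unplug timer/work before a hypothetical driver frees the
 * resources its callbacks use, as the kernel-doc above requires.
 */
static void example_teardown(struct request_queue *q)
{
        blk_sync_queue(q);      /* no unplug timer or work after this */
        /* ... now it is safe to free driver data used by unplug_fn ... */
}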
1734
 
1735
/**
1736
 * blk_run_queue - run a single device queue
1737
 * @q:  The queue to run
1738
 */
1739
void blk_run_queue(struct request_queue *q)
1740
{
1741
        unsigned long flags;
1742
 
1743
        spin_lock_irqsave(q->queue_lock, flags);
1744
        blk_remove_plug(q);
1745
 
1746
        /*
1747
         * Only recurse once to avoid overrunning the stack, let the unplug
1748
         * handling reinvoke the handler shortly if we already got there.
1749
         */
1750
        if (!elv_queue_empty(q)) {
1751
                if (!test_and_set_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) {
1752
                        q->request_fn(q);
1753
                        clear_bit(QUEUE_FLAG_REENTER, &q->queue_flags);
1754
                } else {
1755
                        blk_plug_device(q);
1756
                        kblockd_schedule_work(&q->unplug_work);
1757
                }
1758
        }
1759
 
1760
        spin_unlock_irqrestore(q->queue_lock, flags);
1761
}
1762
EXPORT_SYMBOL(blk_run_queue);
1763
 
1764
/**
1765
 * blk_cleanup_queue: - release a &struct request_queue when it is no longer needed
1766
 * @kobj:    the kobj belonging to the request queue to be released
1767
 *
1768
 * Description:
1769
 *     blk_cleanup_queue is the pair to blk_init_queue() or
1770
 *     blk_queue_make_request().  It should be called when a request queue is
1771
 *     being released; typically when a block device is being de-registered.
1772
 *     Currently, its primary task is to free all the &struct request
1773
 *     structures that were allocated to the queue and the queue itself.
1774
 *
1775
 * Caveat:
1776
 *     Hopefully the low level driver will have finished any
1777
 *     outstanding requests first...
1778
 **/
1779
static void blk_release_queue(struct kobject *kobj)
1780
{
1781
        struct request_queue *q =
1782
                container_of(kobj, struct request_queue, kobj);
1783
        struct request_list *rl = &q->rq;
1784
 
1785
        blk_sync_queue(q);
1786
 
1787
        if (rl->rq_pool)
1788
                mempool_destroy(rl->rq_pool);
1789
 
1790
        if (q->queue_tags)
1791
                __blk_queue_free_tags(q);
1792
 
1793
        blk_trace_shutdown(q);
1794
 
1795
        bdi_destroy(&q->backing_dev_info);
1796
        kmem_cache_free(requestq_cachep, q);
1797
}
1798
 
1799
void blk_put_queue(struct request_queue *q)
1800
{
1801
        kobject_put(&q->kobj);
1802
}
1803
EXPORT_SYMBOL(blk_put_queue);
1804
 
1805
void blk_cleanup_queue(struct request_queue * q)
1806
{
1807
        mutex_lock(&q->sysfs_lock);
1808
        set_bit(QUEUE_FLAG_DEAD, &q->queue_flags);
1809
        mutex_unlock(&q->sysfs_lock);
1810
 
1811
        if (q->elevator)
1812
                elevator_exit(q->elevator);
1813
 
1814
        blk_put_queue(q);
1815
}
1816
 
1817
EXPORT_SYMBOL(blk_cleanup_queue);
1818
 
1819
static int blk_init_free_list(struct request_queue *q)
1820
{
1821
        struct request_list *rl = &q->rq;
1822
 
1823
        rl->count[READ] = rl->count[WRITE] = 0;
1824
        rl->starved[READ] = rl->starved[WRITE] = 0;
1825
        rl->elvpriv = 0;
1826
        init_waitqueue_head(&rl->wait[READ]);
1827
        init_waitqueue_head(&rl->wait[WRITE]);
1828
 
1829
        rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
1830
                                mempool_free_slab, request_cachep, q->node);
1831
 
1832
        if (!rl->rq_pool)
1833
                return -ENOMEM;
1834
 
1835
        return 0;
1836
}
1837
 
1838
struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
1839
{
1840
        return blk_alloc_queue_node(gfp_mask, -1);
1841
}
1842
EXPORT_SYMBOL(blk_alloc_queue);
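 
/*
 * Illustrative sketch, not part of the original source: a bio-based
 * driver that handles bios directly (no elevator, no request_fn) pairs
 * blk_alloc_queue() with blk_queue_make_request().
 * example_make_request() is a made-up make_request function that just
 * completes every bio.
 */
static int example_make_request(struct request_queue *q, struct bio *bio)
{
        /* a real driver would transfer the data described by the bio */
        bio_endio(bio, 0);
        return 0;
}

static struct request_queue *example_alloc_bio_queue(void)
{
        struct request_queue *q = blk_alloc_queue(GFP_KERNEL);

        if (q)
                blk_queue_make_request(q, example_make_request);
        return q;
}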
1843
 
1844
static struct kobj_type queue_ktype;
1845
 
1846
struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
1847
{
1848
        struct request_queue *q;
1849
        int err;
1850
 
1851
        q = kmem_cache_alloc_node(requestq_cachep,
1852
                                gfp_mask | __GFP_ZERO, node_id);
1853
        if (!q)
1854
                return NULL;
1855
 
1856
        q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
1857
        q->backing_dev_info.unplug_io_data = q;
1858
        err = bdi_init(&q->backing_dev_info);
1859
        if (err) {
1860
                kmem_cache_free(requestq_cachep, q);
1861
                return NULL;
1862
        }
1863
 
1864
        init_timer(&q->unplug_timer);
1865
 
1866
        kobject_set_name(&q->kobj, "%s", "queue");
1867
        q->kobj.ktype = &queue_ktype;
1868
        kobject_init(&q->kobj);
1869
 
1870
        mutex_init(&q->sysfs_lock);
1871
 
1872
        return q;
1873
}
1874
EXPORT_SYMBOL(blk_alloc_queue_node);
1875
 
1876
/**
1877
 * blk_init_queue  - prepare a request queue for use with a block device
1878
 * @rfn:  The function to be called to process requests that have been
1879
 *        placed on the queue.
1880
 * @lock: Request queue spin lock
1881
 *
1882
 * Description:
1883
 *    If a block device wishes to use the standard request handling procedures,
1884
 *    which sorts requests and coalesces adjacent requests, then it must
1885
 *    call blk_init_queue().  The function @rfn will be called when there
1886
 *    are requests on the queue that need to be processed.  If the device
1887
 *    supports plugging, then @rfn may not be called immediately when requests
1888
 *    are available on the queue, but may be called at some time later instead.
1889
 *    Plugged queues are generally unplugged when a buffer belonging to one
1890
 *    of the requests on the queue is needed, or due to memory pressure.
1891
 *
1892
 *    @rfn is not required, or even expected, to remove all requests off the
1893
 *    queue, but only as many as it can handle at a time.  If it does leave
1894
 *    requests on the queue, it is responsible for arranging that the requests
1895
 *    get dealt with eventually.
1896
 *
1897
 *    The queue spin lock must be held while manipulating the requests on the
1898
 *    request queue; this lock will also be taken from interrupt context, so irq
1899
 *    disabling is needed for it.
1900
 *
1901
 *    Function returns a pointer to the initialized request queue, or NULL if
1902
 *    it didn't succeed.
1903
 *
1904
 * Note:
1905
 *    blk_init_queue() must be paired with a blk_cleanup_queue() call
1906
 *    when the block device is deactivated (such as at module unload).
1907
 **/
1908
 
1909
struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
1910
{
1911
        return blk_init_queue_node(rfn, lock, -1);
1912
}
1913
EXPORT_SYMBOL(blk_init_queue);
1914
 
1915
struct request_queue *
1916
blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
1917
{
1918
        struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id);
1919
 
1920
        if (!q)
1921
                return NULL;
1922
 
1923
        q->node = node_id;
1924
        if (blk_init_free_list(q)) {
1925
                kmem_cache_free(requestq_cachep, q);
1926
                return NULL;
1927
        }
1928 81 tac2
 
1929
 
1930 62 marcus.erl
        /*
1931
         * if caller didn't supply a lock, they get per-queue locking with
1932
         * our embedded lock
1933
         */
1934
        if (!lock) {
1935
                spin_lock_init(&q->__queue_lock);
1936
                lock = &q->__queue_lock;
1937
        }
1938
 
1939
        q->request_fn           = rfn;
1940
        q->prep_rq_fn           = NULL;
1941
        q->unplug_fn            = generic_unplug_device;
1942
        q->queue_flags          = (1 << QUEUE_FLAG_CLUSTER);
1943
        q->queue_lock           = lock;
1944
 
1945
        blk_queue_segment_boundary(q, 0xffffffff);
1946
 
1947
        blk_queue_make_request(q, __make_request);
1948 81 tac2
 
1949 62 marcus.erl
        blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE);
1950 81 tac2
 
1951 62 marcus.erl
        blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);
1952 81 tac2
 
1953 62 marcus.erl
        blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);
1954 81 tac2
 
1955 62 marcus.erl
        q->sg_reserved_size = INT_MAX;
1956 81 tac2
 
1957 62 marcus.erl
        /*
1958
         * all done
1959
         */
1960
        if (!elevator_init(q, NULL)) {
1961
                blk_queue_congestion_threshold(q);
1962
                return q;
1963
        }
1964
 
1965
        blk_put_queue(q);
1966
        return NULL;
1967
}
1968
EXPORT_SYMBOL(blk_init_queue_node);
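 
/*
 * Illustrative sketch, not part of the original source: the
 * blk_init_queue()/blk_cleanup_queue() pairing described in the
 * kernel-doc above.  example_strategy() stands in for a real strategy
 * routine (see the request_fn sketch earlier in this file) and
 * example_lock for the driver's queue lock.
 */
static DEFINE_SPINLOCK(example_lock);
static struct request_queue *example_queue;

static void example_strategy(struct request_queue *q)  /* made-up rfn */
{
}

static int __init example_driver_init(void)
{
        example_queue = blk_init_queue(example_strategy, &example_lock);
        if (!example_queue)
                return -ENOMEM;
        return 0;
}

static void __exit example_driver_exit(void)
{
        /* must be paired with blk_init_queue(), per the note above */
        blk_cleanup_queue(example_queue);
}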
1969
 
1970
int blk_get_queue(struct request_queue *q)
1971
{
1972
        if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
1973
                kobject_get(&q->kobj);
1974
                return 0;
1975
        }
1976
 
1977
        return 1;
1978
}
1979
 
1980
EXPORT_SYMBOL(blk_get_queue);
1981
 
1982
static inline void blk_free_request(struct request_queue *q, struct request *rq)
1983
{
1984
        if (rq->cmd_flags & REQ_ELVPRIV)
1985
                elv_put_request(q, rq);
1986
        mempool_free(rq, q->rq.rq_pool);
1987
}
1988
 
1989
static struct request *
1990
blk_alloc_request(struct request_queue *q, int rw, int priv, gfp_t gfp_mask)
1991
{
1992
        struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
1993
 
1994
        if (!rq)
1995
                return NULL;
1996
 
1997
        /*
1998
         * first three bits are identical in rq->cmd_flags and bio->bi_rw,
1999
         * see bio.h and blkdev.h
2000
         */
2001
        rq->cmd_flags = rw | REQ_ALLOCED;
2002
 
2003
        if (priv) {
2004
                if (unlikely(elv_set_request(q, rq, gfp_mask))) {
2005
                        mempool_free(rq, q->rq.rq_pool);
2006
                        return NULL;
2007
                }
2008
                rq->cmd_flags |= REQ_ELVPRIV;
2009
        }
2010
 
2011
        return rq;
2012
}
2013
 
2014
/*
2015
 * ioc_batching returns true if the ioc is a valid batching request and
2016
 * should be given priority access to a request.
2017
 */
2018
static inline int ioc_batching(struct request_queue *q, struct io_context *ioc)
2019
{
2020
        if (!ioc)
2021
                return 0;
2022
 
2023
        /*
2024
         * Make sure the process is able to allocate at least 1 request
2025
         * even if the batch times out, otherwise we could theoretically
2026
         * lose wakeups.
2027
         */
2028
        return ioc->nr_batch_requests == q->nr_batching ||
2029
                (ioc->nr_batch_requests > 0
2030
                && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME));
2031
}
2032
 
2033
/*
2034
 * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This
2035
 * will cause the process to be a "batcher" on all queues in the system. This
2036
 * is the behaviour we want though - once it gets a wakeup it should be given
2037
 * a nice run.
2038
 */
2039
static void ioc_set_batching(struct request_queue *q, struct io_context *ioc)
2040
{
2041
        if (!ioc || ioc_batching(q, ioc))
2042
                return;
2043
 
2044
        ioc->nr_batch_requests = q->nr_batching;
2045
        ioc->last_waited = jiffies;
2046
}
2047
 
2048
static void __freed_request(struct request_queue *q, int rw)
2049
{
2050
        struct request_list *rl = &q->rq;
2051
 
2052
        if (rl->count[rw] < queue_congestion_off_threshold(q))
2053
                blk_clear_queue_congested(q, rw);
2054
 
2055
        if (rl->count[rw] + 1 <= q->nr_requests) {
2056
                if (waitqueue_active(&rl->wait[rw]))
2057
                        wake_up(&rl->wait[rw]);
2058
 
2059
                blk_clear_queue_full(q, rw);
2060
        }
2061
}
2062
 
2063
/*
2064
 * A request has just been released.  Account for it, update the full and
2065
 * congestion status, wake up any waiters.   Called under q->queue_lock.
2066
 */
2067
static void freed_request(struct request_queue *q, int rw, int priv)
2068
{
2069
        struct request_list *rl = &q->rq;
2070
 
2071
        rl->count[rw]--;
2072
        if (priv)
2073
                rl->elvpriv--;
2074
 
2075
        __freed_request(q, rw);
2076
 
2077
        if (unlikely(rl->starved[rw ^ 1]))
2078
                __freed_request(q, rw ^ 1);
2079
}
2080
 
2081
#define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist)
2082
/*
2083
 * Get a free request, queue_lock must be held.
2084
 * Returns NULL on failure, with queue_lock held.
2085
 * Returns !NULL on success, with queue_lock *not held*.
2086
 */
2087
static struct request *get_request(struct request_queue *q, int rw_flags,
2088
                                   struct bio *bio, gfp_t gfp_mask)
2089
{
2090
        struct request *rq = NULL;
2091
        struct request_list *rl = &q->rq;
2092
        struct io_context *ioc = NULL;
2093
        const int rw = rw_flags & 0x01;
2094
        int may_queue, priv;
2095
 
2096
        may_queue = elv_may_queue(q, rw_flags);
2097
        if (may_queue == ELV_MQUEUE_NO)
2098
                goto rq_starved;
2099
 
2100
        if (rl->count[rw]+1 >= queue_congestion_on_threshold(q)) {
2101
                if (rl->count[rw]+1 >= q->nr_requests) {
2102
                        ioc = current_io_context(GFP_ATOMIC, q->node);
2103
                        /*
2104
                         * The queue will fill after this allocation, so set
2105
                         * it as full, and mark this process as "batching".
2106
                         * This process will be allowed to complete a batch of
2107
                         * requests, others will be blocked.
2108
                         */
2109
                        if (!blk_queue_full(q, rw)) {
2110
                                ioc_set_batching(q, ioc);
2111
                                blk_set_queue_full(q, rw);
2112
                        } else {
2113
                                if (may_queue != ELV_MQUEUE_MUST
2114
                                                && !ioc_batching(q, ioc)) {
2115
                                        /*
2116
                                         * The queue is full and the allocating
2117
                                         * process is not a "batcher", and not
2118
                                         * exempted by the IO scheduler
2119
                                         */
2120
                                        goto out;
2121
                                }
2122
                        }
2123
                }
2124
                blk_set_queue_congested(q, rw);
2125
        }
2126
 
2127
        /*
2128
         * Only allow batching queuers to allocate up to 50% over the defined
2129
         * limit of requests, otherwise we could have thousands of requests
2130
         * allocated with any setting of ->nr_requests
2131
         */
2132
        if (rl->count[rw] >= (3 * q->nr_requests / 2))
2133
                goto out;
2134
 
2135
        rl->count[rw]++;
2136
        rl->starved[rw] = 0;
2137
 
2138
        priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
2139
        if (priv)
2140
                rl->elvpriv++;
2141
 
2142
        spin_unlock_irq(q->queue_lock);
2143
 
2144
        rq = blk_alloc_request(q, rw_flags, priv, gfp_mask);
2145
        if (unlikely(!rq)) {
2146
                /*
2147
                 * Allocation failed presumably due to memory. Undo anything
2148
                 * we might have messed up.
2149
                 *
2150
                 * Allocating task should really be put onto the front of the
2151
                 * wait queue, but this is pretty rare.
2152
                 */
2153
                spin_lock_irq(q->queue_lock);
2154
                freed_request(q, rw, priv);
2155
 
2156
                /*
2157
                 * in the very unlikely event that allocation failed and no
2158
                  * requests for this direction were pending, mark us starved
2159
                 * so that freeing of a request in the other direction will
2160
                 * notice us. another possible fix would be to split the
2161
                 * rq mempool into READ and WRITE
2162
                 */
2163
rq_starved:
2164
                if (unlikely(rl->count[rw] == 0))
2165
                        rl->starved[rw] = 1;
2166
 
2167
                goto out;
2168
        }
2169
 
2170
        /*
2171
         * ioc may be NULL here, and ioc_batching will be false. That's
2172
         * OK, if the queue is under the request limit then requests need
2173
         * not count toward the nr_batch_requests limit. There will always
2174
         * be some limit enforced by BLK_BATCH_TIME.
2175
         */
2176
        if (ioc_batching(q, ioc))
2177
                ioc->nr_batch_requests--;
2178
 
2179
        rq_init(q, rq);
2180
 
2181
        blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ);
2182
out:
2183
        return rq;
2184
}
2185
 
2186
/*
2187
 * No available requests for this queue, unplug the device and wait for some
2188
 * requests to become available.
2189
 *
2190
 * Called with q->queue_lock held, and returns with it unlocked.
2191
 */
2192
static struct request *get_request_wait(struct request_queue *q, int rw_flags,
2193
                                        struct bio *bio)
2194
{
2195
        const int rw = rw_flags & 0x01;
2196
        struct request *rq;
2197
 
2198
        rq = get_request(q, rw_flags, bio, GFP_NOIO);
2199
        while (!rq) {
2200
                DEFINE_WAIT(wait);
2201
                struct request_list *rl = &q->rq;
2202
 
2203
                prepare_to_wait_exclusive(&rl->wait[rw], &wait,
2204
                                TASK_UNINTERRUPTIBLE);
2205
 
2206
                rq = get_request(q, rw_flags, bio, GFP_NOIO);
2207
 
2208
                if (!rq) {
2209
                        struct io_context *ioc;
2210
 
2211
                        blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ);
2212
 
2213
                        __generic_unplug_device(q);
2214
                        spin_unlock_irq(q->queue_lock);
2215
                        io_schedule();
2216
 
2217
                        /*
2218
                         * After sleeping, we become a "batching" process and
2219
                         * will be able to allocate at least one request, and
2220
                         * up to a big batch of them for a small period time.
2221
                         * See ioc_batching, ioc_set_batching
2222
                         */
2223
                        ioc = current_io_context(GFP_NOIO, q->node);
2224
                        ioc_set_batching(q, ioc);
2225
 
2226
                        spin_lock_irq(q->queue_lock);
2227
                }
2228
                finish_wait(&rl->wait[rw], &wait);
2229
        }
2230
 
2231
        return rq;
2232
}
2233
 
2234
struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
2235
{
2236
        struct request *rq;
2237
 
2238
        BUG_ON(rw != READ && rw != WRITE);
2239
 
2240
        spin_lock_irq(q->queue_lock);
2241
        if (gfp_mask & __GFP_WAIT) {
2242
                rq = get_request_wait(q, rw, NULL);
2243
        } else {
2244
                rq = get_request(q, rw, NULL, gfp_mask);
2245
                if (!rq)
2246
                        spin_unlock_irq(q->queue_lock);
2247
        }
2248
        /* q->queue_lock is unlocked at this point */
2249
 
2250
        return rq;
2251
}
2252
EXPORT_SYMBOL(blk_get_request);
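 
/*
 * Illustrative sketch, not part of the original source: allocating a
 * request with blk_get_request() and releasing it with
 * blk_put_request().  With a gfp_mask containing __GFP_WAIT (as
 * GFP_KERNEL does) the allocation may sleep until a request is free.
 */
static int example_alloc_and_free(struct request_queue *q)
{
        struct request *rq;

        rq = blk_get_request(q, READ, GFP_KERNEL);
        if (!rq)
                return -ENOMEM;

        /* ... fill in and use the request here ... */

        blk_put_request(rq);
        return 0;
}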
2253
 
2254
/**
2255
 * blk_start_queueing - initiate dispatch of requests to device
2256
 * @q:          request queue to kick into gear
2257
 *
2258
 * This is basically a helper to remove the need to know whether a queue
2259
 * is plugged or not if someone just wants to initiate dispatch of requests
2260
 * for this queue.
2261
 *
2262
 * The queue lock must be held with interrupts disabled.
2263
 */
2264
void blk_start_queueing(struct request_queue *q)
2265
{
2266
        if (!blk_queue_plugged(q))
2267
                q->request_fn(q);
2268
        else
2269
                __generic_unplug_device(q);
2270
}
2271
EXPORT_SYMBOL(blk_start_queueing);
2272
 
2273
/**
2274
 * blk_requeue_request - put a request back on queue
2275
 * @q:          request queue where request should be inserted
2276
 * @rq:         request to be inserted
2277
 *
2278
 * Description:
2279
 *    Drivers often keep queueing requests until the hardware cannot accept
2280
 *    more, when that condition happens we need to put the request back
2281
 *    on the queue. Must be called with queue lock held.
2282
 */
2283
void blk_requeue_request(struct request_queue *q, struct request *rq)
2284
{
2285
        blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
2286
 
2287
        if (blk_rq_tagged(rq))
2288
                blk_queue_end_tag(q, rq);
2289
 
2290
        elv_requeue_request(q, rq);
2291
}
2292
 
2293
EXPORT_SYMBOL(blk_requeue_request);
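 
/*
 * Illustrative sketch, not part of the original source: putting a
 * dequeued request back on the queue when the hardware refuses it, as
 * described in the kernel-doc above.  example_hw_queue_rq() is a
 * made-up helper; this runs with the queue lock held.
 */
static int example_hw_queue_rq(struct request *rq)     /* made-up helper */
{
        return -EBUSY;
}

static void example_issue_or_requeue(struct request_queue *q,
                                     struct request *rq)
{
        if (example_hw_queue_rq(rq) != 0) {
                /* device said "queue full": requeue and stop the queue */
                blk_requeue_request(q, rq);
                blk_stop_queue(q);
        }
}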
2294
 
2295
/**
2296
 * blk_insert_request - insert a special request in to a request queue
2297
 * @q:          request queue where request should be inserted
2298
 * @rq:         request to be inserted
2299
 * @at_head:    insert request at head or tail of queue
2300
 * @data:       private data
2301
 *
2302
 * Description:
2303
 *    Many block devices need to execute commands asynchronously, so they don't
2304
 *    block the whole kernel from preemption during request execution.  This is
2305
 *    accomplished normally by inserting artificial requests tagged as
2306
 *    REQ_SPECIAL in to the corresponding request queue, and letting them be
2307
 *    scheduled for actual execution by the request queue.
2308
 *
2309
 *    We have the option of inserting the head or the tail of the queue.
2310
 *    Typically we use the tail for new ioctls and so forth.  We use the head
2311
 *    of the queue for things like a QUEUE_FULL message from a device, or a
2312
 *    host that is unable to accept a particular command.
2313
 */
2314
void blk_insert_request(struct request_queue *q, struct request *rq,
2315
                        int at_head, void *data)
2316
{
2317
        int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
2318
        unsigned long flags;
2319
 
2320
        /*
2321
         * tell I/O scheduler that this isn't a regular read/write (ie it
2322
         * must not attempt merges on this) and that it acts as a soft
2323
         * barrier
2324
         */
2325
        rq->cmd_type = REQ_TYPE_SPECIAL;
2326
        rq->cmd_flags |= REQ_SOFTBARRIER;
2327
 
2328
        rq->special = data;
2329
 
2330
        spin_lock_irqsave(q->queue_lock, flags);
2331
 
2332
        /*
2333
         * If command is tagged, release the tag
2334
         */
2335
        if (blk_rq_tagged(rq))
2336
                blk_queue_end_tag(q, rq);
2337
 
2338
        drive_stat_acct(rq, 1);
2339
        __elv_add_request(q, rq, where, 0);
2340
        blk_start_queueing(q);
2341
        spin_unlock_irqrestore(q->queue_lock, flags);
2342
}
2343
 
2344
EXPORT_SYMBOL(blk_insert_request);
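 
/*
 * Illustrative sketch, not part of the original source: queueing a
 * driver-private command at the head of the queue, e.g. in response to
 * a QUEUE_FULL condition as described above.  example_cmd is a made-up
 * driver structure that ends up in rq->special.
 */
static void example_insert_at_head(struct request_queue *q,
                                   struct request *rq, void *example_cmd)
{
        /* at_head != 0: head insertion, typically for error handling */
        blk_insert_request(q, rq, 1, example_cmd);
}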
2345
 
2346
static int __blk_rq_unmap_user(struct bio *bio)
2347
{
2348
        int ret = 0;
2349
 
2350
        if (bio) {
2351
                if (bio_flagged(bio, BIO_USER_MAPPED))
2352
                        bio_unmap_user(bio);
2353
                else
2354
                        ret = bio_uncopy_user(bio);
2355
        }
2356
 
2357
        return ret;
2358
}
2359
 
2360
int blk_rq_append_bio(struct request_queue *q, struct request *rq,
2361
                      struct bio *bio)
2362
{
2363
        if (!rq->bio)
2364
                blk_rq_bio_prep(q, rq, bio);
2365
        else if (!ll_back_merge_fn(q, rq, bio))
2366
                return -EINVAL;
2367
        else {
2368
                rq->biotail->bi_next = bio;
2369
                rq->biotail = bio;
2370
 
2371
                rq->data_len += bio->bi_size;
2372
        }
2373
        return 0;
2374
}
2375
EXPORT_SYMBOL(blk_rq_append_bio);
2376
 
2377
static int __blk_rq_map_user(struct request_queue *q, struct request *rq,
2378
                             void __user *ubuf, unsigned int len)
2379
{
2380
        unsigned long uaddr;
2381
        struct bio *bio, *orig_bio;
2382
        int reading, ret;
2383
 
2384
        reading = rq_data_dir(rq) == READ;
2385
 
2386
        /*
2387
         * if alignment requirement is satisfied, map in user pages for
2388
         * direct dma. else, set up kernel bounce buffers
2389
         */
2390
        uaddr = (unsigned long) ubuf;
2391
        if (!(uaddr & queue_dma_alignment(q)) && !(len & queue_dma_alignment(q)))
2392
                bio = bio_map_user(q, NULL, uaddr, len, reading);
2393
        else
2394
                bio = bio_copy_user(q, uaddr, len, reading);
2395
 
2396
        if (IS_ERR(bio))
2397
                return PTR_ERR(bio);
2398
 
2399
        orig_bio = bio;
2400
        blk_queue_bounce(q, &bio);
2401
 
2402
        /*
2403
         * We link the bounce buffer in and could have to traverse it
2404
         * later so we have to get a ref to prevent it from being freed
2405
         */
2406
        bio_get(bio);
2407
 
2408
        ret = blk_rq_append_bio(q, rq, bio);
2409
        if (!ret)
2410
                return bio->bi_size;
2411
 
2412
        /* if it was bounced we must call the end io function */
2413
        bio_endio(bio, 0);
2414
        __blk_rq_unmap_user(orig_bio);
2415
        bio_put(bio);
2416
        return ret;
2417
}
2418
 
2419
/**
2420
 * blk_rq_map_user - map user data to a request, for REQ_BLOCK_PC usage
2421
 * @q:          request queue where request should be inserted
2422
 * @rq:         request structure to fill
2423
 * @ubuf:       the user buffer
2424
 * @len:        length of user data
2425
 *
2426
 * Description:
2427
 *    Data will be mapped directly for zero copy io, if possible. Otherwise
2428
 *    a kernel bounce buffer is used.
2429
 *
2430
 *    A matching blk_rq_unmap_user() must be issued at the end of io, while
2431
 *    still in process context.
2432
 *
2433
 *    Note: The mapped bio may need to be bounced through blk_queue_bounce()
2434
 *    before being submitted to the device, as pages mapped may be out of
2435
 *    reach. It's the caller's responsibility to make sure this happens. The
2436
 *    original bio must be passed back in to blk_rq_unmap_user() for proper
2437
 *    unmapping.
2438
 */
2439
int blk_rq_map_user(struct request_queue *q, struct request *rq,
2440
                    void __user *ubuf, unsigned long len)
2441
{
2442
        unsigned long bytes_read = 0;
2443
        struct bio *bio = NULL;
2444
        int ret;
2445
 
2446
        if (len > (q->max_hw_sectors << 9))
2447
                return -EINVAL;
2448
        if (!len || !ubuf)
2449
                return -EINVAL;
2450
 
2451
        while (bytes_read != len) {
2452
                unsigned long map_len, end, start;
2453
 
2454
                map_len = min_t(unsigned long, len - bytes_read, BIO_MAX_SIZE);
2455
                end = ((unsigned long)ubuf + map_len + PAGE_SIZE - 1)
2456
                                                                >> PAGE_SHIFT;
2457
                start = (unsigned long)ubuf >> PAGE_SHIFT;
2458
 
2459
                /*
2460
                 * A bad offset could cause us to require BIO_MAX_PAGES + 1
2461
                 * pages. If this happens we just lower the requested
2462
                 * mapping len by a page so that we can fit
2463
                 */
2464
                if (end - start > BIO_MAX_PAGES)
2465
                        map_len -= PAGE_SIZE;
2466
 
2467
                ret = __blk_rq_map_user(q, rq, ubuf, map_len);
2468
                if (ret < 0)
2469
                        goto unmap_rq;
2470
                if (!bio)
2471
                        bio = rq->bio;
2472
                bytes_read += ret;
2473
                ubuf += ret;
2474
        }
2475
 
2476
        rq->buffer = rq->data = NULL;
2477
        return 0;
2478
unmap_rq:
2479
        blk_rq_unmap_user(bio);
2480
        return ret;
2481
}
2482
 
2483
EXPORT_SYMBOL(blk_rq_map_user);
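 
/*
 * Illustrative sketch, not part of the original source: mapping a user
 * buffer into a request and unmapping it afterwards.  As the kernel-doc
 * above notes, the original rq->bio must be remembered and handed back
 * to blk_rq_unmap_user().
 */
static int example_map_and_unmap_user(struct request_queue *q,
                                      struct request *rq,
                                      void __user *ubuf, unsigned long len)
{
        struct bio *bio;
        int ret;

        ret = blk_rq_map_user(q, rq, ubuf, len);
        if (ret)
                return ret;

        bio = rq->bio;          /* remember the original bio chain */

        /* ... issue the request, e.g. with blk_execute_rq() ... */

        return blk_rq_unmap_user(bio);
}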
2484
 
2485
/**
2486
 * blk_rq_map_user_iov - map user data to a request, for REQ_BLOCK_PC usage
2487
 * @q:          request queue where request should be inserted
2488
 * @rq:         request to map data to
2489
 * @iov:        pointer to the iovec
2490
 * @iov_count:  number of elements in the iovec
2491
 * @len:        I/O byte count
2492
 *
2493
 * Description:
2494
 *    Data will be mapped directly for zero copy io, if possible. Otherwise
2495
 *    a kernel bounce buffer is used.
2496
 *
2497
 *    A matching blk_rq_unmap_user() must be issued at the end of io, while
2498
 *    still in process context.
2499
 *
2500
 *    Note: The mapped bio may need to be bounced through blk_queue_bounce()
2501
 *    before being submitted to the device, as pages mapped may be out of
2502
 *    reach. It's the caller's responsibility to make sure this happens. The
2503
 *    original bio must be passed back in to blk_rq_unmap_user() for proper
2504
 *    unmapping.
2505
 */
2506
int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
2507
                        struct sg_iovec *iov, int iov_count, unsigned int len)
2508
{
2509
        struct bio *bio;
2510
 
2511
        if (!iov || iov_count <= 0)
2512
                return -EINVAL;
2513
 
2514
        /* we don't allow misaligned data like bio_map_user() does.  If the
2515
         * user is using sg, they're expected to know the alignment constraints
2516
         * and respect them accordingly */
2517
        bio = bio_map_user_iov(q, NULL, iov, iov_count, rq_data_dir(rq)== READ);
2518
        if (IS_ERR(bio))
2519
                return PTR_ERR(bio);
2520
 
2521
        if (bio->bi_size != len) {
2522
                bio_endio(bio, 0);
2523
                bio_unmap_user(bio);
2524
                return -EINVAL;
2525
        }
2526
 
2527
        bio_get(bio);
2528
        blk_rq_bio_prep(q, rq, bio);
2529
        rq->buffer = rq->data = NULL;
2530
        return 0;
2531
}
2532
 
2533
EXPORT_SYMBOL(blk_rq_map_user_iov);
2534
 
2535
/**
2536
 * blk_rq_unmap_user - unmap a request with user data
2537
 * @bio:               start of bio list
2538
 *
2539
 * Description:
2540
 *    Unmap a rq previously mapped by blk_rq_map_user(). The caller must
2541
 *    supply the original rq->bio from the blk_rq_map_user() return, since
2542
 *    the io completion may have changed rq->bio.
2543
 */
2544
int blk_rq_unmap_user(struct bio *bio)
2545
{
2546
        struct bio *mapped_bio;
2547
        int ret = 0, ret2;
2548
 
2549
        while (bio) {
2550
                mapped_bio = bio;
2551
                if (unlikely(bio_flagged(bio, BIO_BOUNCED)))
2552
                        mapped_bio = bio->bi_private;
2553
 
2554
                ret2 = __blk_rq_unmap_user(mapped_bio);
2555
                if (ret2 && !ret)
2556
                        ret = ret2;
2557
 
2558
                mapped_bio = bio;
2559
                bio = bio->bi_next;
2560
                bio_put(mapped_bio);
2561
        }
2562
 
2563
        return ret;
2564
}
2565
 
2566
EXPORT_SYMBOL(blk_rq_unmap_user);
2567
 
2568
/**
2569
 * blk_rq_map_kern - map kernel data to a request, for REQ_BLOCK_PC usage
2570
 * @q:          request queue where request should be inserted
2571
 * @rq:         request to fill
2572
 * @kbuf:       the kernel buffer
2573
 * @len:        length of user data
2574
 * @gfp_mask:   memory allocation flags
2575
 */
2576
int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
2577
                    unsigned int len, gfp_t gfp_mask)
2578
{
2579
        struct bio *bio;
2580
 
2581
        if (len > (q->max_hw_sectors << 9))
2582
                return -EINVAL;
2583
        if (!len || !kbuf)
2584
                return -EINVAL;
2585
 
2586
        bio = bio_map_kern(q, kbuf, len, gfp_mask);
2587
        if (IS_ERR(bio))
2588
                return PTR_ERR(bio);
2589
 
2590
        if (rq_data_dir(rq) == WRITE)
2591
                bio->bi_rw |= (1 << BIO_RW);
2592
 
2593
        blk_rq_bio_prep(q, rq, bio);
2594
        blk_queue_bounce(q, &rq->bio);
2595
        rq->buffer = rq->data = NULL;
2596
        return 0;
2597
}
2598
 
2599
EXPORT_SYMBOL(blk_rq_map_kern);
2600
 
2601
/**
2602
 * blk_execute_rq_nowait - insert a request into queue for execution
2603
 * @q:          queue to insert the request in
2604
 * @bd_disk:    matching gendisk
2605
 * @rq:         request to insert
2606
 * @at_head:    insert request at head or tail of queue
2607
 * @done:       I/O completion handler
2608
 *
2609
 * Description:
2610
 *    Insert a fully prepared request at the back of the io scheduler queue
2611
 *    for execution.  Don't wait for completion.
2612
 */
2613
void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
2614
                           struct request *rq, int at_head,
2615
                           rq_end_io_fn *done)
2616
{
2617
        int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
2618
 
2619
        rq->rq_disk = bd_disk;
2620
        rq->cmd_flags |= REQ_NOMERGE;
2621
        rq->end_io = done;
2622
        WARN_ON(irqs_disabled());
2623
        spin_lock_irq(q->queue_lock);
2624
        __elv_add_request(q, rq, where, 1);
2625
        __generic_unplug_device(q);
2626
        spin_unlock_irq(q->queue_lock);
2627
}
2628
EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
2629
 
2630
/**
2631
 * blk_execute_rq - insert a request into queue for execution
2632
 * @q:          queue to insert the request in
2633
 * @bd_disk:    matching gendisk
2634
 * @rq:         request to insert
2635
 * @at_head:    insert request at head or tail of queue
2636
 *
2637
 * Description:
2638
 *    Insert a fully prepared request at the back of the io scheduler queue
2639
 *    for execution and wait for completion.
2640
 */
2641
int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
2642
                   struct request *rq, int at_head)
2643
{
2644
        DECLARE_COMPLETION_ONSTACK(wait);
2645
        char sense[SCSI_SENSE_BUFFERSIZE];
2646
        int err = 0;
2647
 
2648
        /*
2649
         * we need an extra reference to the request, so we can look at
2650
         * it after io completion
2651
         */
2652
        rq->ref_count++;
2653
 
2654
        if (!rq->sense) {
2655
                memset(sense, 0, sizeof(sense));
2656
                rq->sense = sense;
2657
                rq->sense_len = 0;
2658
        }
2659
 
2660
        rq->end_io_data = &wait;
2661
        blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq);
2662
        wait_for_completion(&wait);
2663
 
2664
        if (rq->errors)
2665
                err = -EIO;
2666
 
2667
        return err;
2668
}
2669
 
2670
EXPORT_SYMBOL(blk_execute_rq);
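 
/*
 * Illustrative sketch, not part of the original source: building a
 * block-pc style request around a kernel buffer with blk_rq_map_kern()
 * and executing it synchronously with blk_execute_rq().  The command
 * bytes in example_cdb[] and the 60 second timeout are made up.
 */
static int example_exec_blockpc(struct request_queue *q,
                                struct gendisk *disk,
                                void *buf, unsigned int len)
{
        unsigned char example_cdb[6] = { 0 };   /* made-up command */
        struct request *rq;
        int err;

        rq = blk_get_request(q, READ, GFP_KERNEL);
        if (!rq)
                return -ENOMEM;

        rq->cmd_type = REQ_TYPE_BLOCK_PC;
        memcpy(rq->cmd, example_cdb, sizeof(example_cdb));
        rq->cmd_len = sizeof(example_cdb);
        rq->timeout = 60 * HZ;

        err = blk_rq_map_kern(q, rq, buf, len, GFP_KERNEL);
        if (!err)
                err = blk_execute_rq(q, disk, rq, 0);

        blk_put_request(rq);
        return err;
}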
2671
 
2672
static void bio_end_empty_barrier(struct bio *bio, int err)
2673
{
2674
        if (err)
2675
                clear_bit(BIO_UPTODATE, &bio->bi_flags);
2676
 
2677
        complete(bio->bi_private);
2678
}
2679
 
2680
/**
2681
 * blkdev_issue_flush - queue a flush
2682
 * @bdev:       blockdev to issue flush for
2683
 * @error_sector:       error sector
2684
 *
2685
 * Description:
2686
 *    Issue a flush for the block device in question. Caller can supply
2687
 *    room for storing the error offset in case of a flush error, if they
2688
 *    wish to.  The flush is issued and waited for synchronously.
2689
 */
2690
int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
2691
{
2692
        DECLARE_COMPLETION_ONSTACK(wait);
2693
        struct request_queue *q;
2694
        struct bio *bio;
2695
        int ret;
2696
 
2697
        if (bdev->bd_disk == NULL)
2698
                return -ENXIO;
2699
 
2700
        q = bdev_get_queue(bdev);
2701
        if (!q)
2702
                return -ENXIO;
2703
 
2704
        bio = bio_alloc(GFP_KERNEL, 0);
2705
        if (!bio)
2706
                return -ENOMEM;
2707
 
2708
        bio->bi_end_io = bio_end_empty_barrier;
2709
        bio->bi_private = &wait;
2710
        bio->bi_bdev = bdev;
2711
        submit_bio(1 << BIO_RW_BARRIER, bio);
2712
 
2713
        wait_for_completion(&wait);
2714
 
2715
        /*
2716
         * The driver must store the error location in ->bi_sector, if
2717
         * it supports it. For non-stacked drivers, this should be copied
2718
         * from rq->sector.
2719
         */
2720
        if (error_sector)
2721
                *error_sector = bio->bi_sector;
2722
 
2723
        ret = 0;
2724
        if (!bio_flagged(bio, BIO_UPTODATE))
2725
                ret = -EIO;
2726
 
2727
        bio_put(bio);
2728
        return ret;
2729
}
2730
 
2731
EXPORT_SYMBOL(blkdev_issue_flush);
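 
/*
 * Illustrative sketch, not part of the original source: issuing a cache
 * flush for a block device and reporting the error location, per the
 * kernel-doc above.
 */
static int example_flush_bdev(struct block_device *bdev)
{
        sector_t error_sector = 0;
        int err;

        err = blkdev_issue_flush(bdev, &error_sector);
        if (err)
                printk(KERN_ERR "example: flush failed near sector %llu\n",
                       (unsigned long long)error_sector);
        return err;
}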
2732
 
2733
static void drive_stat_acct(struct request *rq, int new_io)
2734
{
2735
        int rw = rq_data_dir(rq);
2736
 
2737
        if (!blk_fs_request(rq) || !rq->rq_disk)
2738
                return;
2739
 
2740
        if (!new_io) {
2741
                __disk_stat_inc(rq->rq_disk, merges[rw]);
2742
        } else {
2743
                disk_round_stats(rq->rq_disk);
2744
                rq->rq_disk->in_flight++;
2745
        }
2746
}
2747
 
2748
/*
2749
 * add-request adds a request to the linked list.
2750
 * queue lock is held and interrupts disabled, as we muck with the
2751
 * request queue list.
2752
 */
2753
static inline void add_request(struct request_queue * q, struct request * req)
2754
{
2755
        drive_stat_acct(req, 1);
2756
 
2757
        /*
2758
         * elevator indicated where it wants this request to be
2759
         * inserted at elevator_merge time
2760
         */
2761
        __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);
2762
}
2763
 
2764
/*
2765
 * disk_round_stats()   - Round off the performance stats on a struct
2766
 * disk_stats.
2767
 *
2768
 * The average IO queue length and utilisation statistics are maintained
2769
 * by observing the current state of the queue length and the amount of
2770
 * time it has been in this state.
2771
 *
2772
 * Normally, that accounting is done on IO completion, but that can result
2773
 * in more than a second's worth of IO being accounted for within any one
2774
 * second, leading to >100% utilisation.  To deal with that, we call this
2775
 * function to do a round-off before returning the results when reading
2776
 * /proc/diskstats.  This accounts immediately for all queue usage up to
2777
 * the current jiffies and restarts the counters again.
2778
 */
2779
void disk_round_stats(struct gendisk *disk)
2780
{
2781
        unsigned long now = jiffies;
2782
 
2783
        if (now == disk->stamp)
2784
                return;
2785
 
2786
        if (disk->in_flight) {
2787
                __disk_stat_add(disk, time_in_queue,
2788
                                disk->in_flight * (now - disk->stamp));
2789
                __disk_stat_add(disk, io_ticks, (now - disk->stamp));
2790
        }
2791
        disk->stamp = now;
2792
}
2793
 
2794
EXPORT_SYMBOL_GPL(disk_round_stats);
2795
 
2796
/*
2797
 * queue lock must be held
2798
 */
2799
void __blk_put_request(struct request_queue *q, struct request *req)
2800
{
2801
        if (unlikely(!q))
2802
                return;
2803
        if (unlikely(--req->ref_count))
2804
                return;
2805
 
2806
        elv_completed_request(q, req);
2807
 
2808
        /*
2809
         * Request may not have originated from ll_rw_blk. if not,
2810
         * it didn't come out of our reserved rq pools
2811
         */
2812
        if (req->cmd_flags & REQ_ALLOCED) {
2813
                int rw = rq_data_dir(req);
2814
                int priv = req->cmd_flags & REQ_ELVPRIV;
2815
 
2816
                BUG_ON(!list_empty(&req->queuelist));
2817
                BUG_ON(!hlist_unhashed(&req->hash));
2818
 
2819
                blk_free_request(q, req);
2820
                freed_request(q, rw, priv);
2821
        }
2822
}
2823
 
2824
EXPORT_SYMBOL_GPL(__blk_put_request);
2825
 
2826
void blk_put_request(struct request *req)
2827
{
2828
        unsigned long flags;
2829
        struct request_queue *q = req->q;
2830
 
2831
        /*
2832
         * Gee, IDE calls in w/ NULL q.  Fix IDE and remove the
2833
         * following if (q) test.
2834
         */
2835
        if (q) {
2836
                spin_lock_irqsave(q->queue_lock, flags);
2837
                __blk_put_request(q, req);
2838
                spin_unlock_irqrestore(q->queue_lock, flags);
2839
        }
2840
}
2841
 
2842
EXPORT_SYMBOL(blk_put_request);
2843
 
2844
/**
2845
 * blk_end_sync_rq - executes a completion event on a request
2846
 * @rq: request to complete
2847
 * @error: end io status of the request
2848
 */
2849
void blk_end_sync_rq(struct request *rq, int error)
2850
{
2851
        struct completion *waiting = rq->end_io_data;
2852
 
2853
        rq->end_io_data = NULL;
2854
        __blk_put_request(rq->q, rq);
2855
 
2856
        /*
2857
         * complete last, if this is a stack request the process (and thus
2858
         * the rq pointer) could be invalid right after this complete()
2859
         */
2860
        complete(waiting);
2861
}
2862
EXPORT_SYMBOL(blk_end_sync_rq);
2863
 
2864
/*
2865
 * Has to be called with the request spinlock acquired
2866
 */
2867
static int attempt_merge(struct request_queue *q, struct request *req,
2868
                          struct request *next)
2869
{
2870
        if (!rq_mergeable(req) || !rq_mergeable(next))
2871
                return 0;
2872
 
2873
        /*
2874
         * not contiguous
2875
         */
2876
        if (req->sector + req->nr_sectors != next->sector)
2877
                return 0;
2878
 
2879
        if (rq_data_dir(req) != rq_data_dir(next)
2880
            || req->rq_disk != next->rq_disk
2881
            || next->special)
2882
                return 0;
2883
 
2884
        /*
2885
         * If we are allowed to merge, then append bio list
2886
          * from next to req and release next. merge_requests_fn
2887
         * will have updated segment counts, update sector
2888
         * counts here.
2889
         */
2890
        if (!ll_merge_requests_fn(q, req, next))
2891
                return 0;
2892
 
2893
        /*
2894
         * At this point we have either done a back merge
2895
         * or front merge. We need the smaller start_time of
2896
         * the merged requests to be the current request
2897
         * for accounting purposes.
2898
         */
2899
        if (time_after(req->start_time, next->start_time))
2900
                req->start_time = next->start_time;
2901
 
2902
        req->biotail->bi_next = next->bio;
2903
        req->biotail = next->biotail;
2904
 
2905
        req->nr_sectors = req->hard_nr_sectors += next->hard_nr_sectors;
2906
 
2907
        elv_merge_requests(q, req, next);
2908
 
2909
        if (req->rq_disk) {
2910
                disk_round_stats(req->rq_disk);
2911
                req->rq_disk->in_flight--;
2912
        }
2913
 
2914
        req->ioprio = ioprio_best(req->ioprio, next->ioprio);
2915
 
2916
        __blk_put_request(q, next);
2917
        return 1;
2918
}
2919
 
2920
static inline int attempt_back_merge(struct request_queue *q,
2921
                                     struct request *rq)
2922
{
2923
        struct request *next = elv_latter_request(q, rq);
2924
 
2925
        if (next)
2926
                return attempt_merge(q, rq, next);
2927
 
2928
        return 0;
2929
}
2930
 
2931
static inline int attempt_front_merge(struct request_queue *q,
2932
                                      struct request *rq)
2933
{
2934
        struct request *prev = elv_former_request(q, rq);
2935
 
2936
        if (prev)
2937
                return attempt_merge(q, prev, rq);
2938
 
2939
        return 0;
2940
}
2941
 
2942
static void init_request_from_bio(struct request *req, struct bio *bio)
2943
{
2944
        req->cmd_type = REQ_TYPE_FS;
2945
 
2946
        /*
2947
         * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST)
2948
         */
2949
        if (bio_rw_ahead(bio) || bio_failfast(bio))
2950
                req->cmd_flags |= REQ_FAILFAST;
2951
 
2952
        /*
2953
         * REQ_BARRIER implies no merging, but let's make it explicit
2954
         */
2955
        if (unlikely(bio_barrier(bio)))
2956
                req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
2957
 
2958
        if (bio_sync(bio))
2959
                req->cmd_flags |= REQ_RW_SYNC;
2960
        if (bio_rw_meta(bio))
2961
                req->cmd_flags |= REQ_RW_META;
2962
 
2963
        req->errors = 0;
2964
        req->hard_sector = req->sector = bio->bi_sector;
2965
        req->ioprio = bio_prio(bio);
2966
        req->start_time = jiffies;
2967
        blk_rq_bio_prep(req->q, req, bio);
2968
}
2969
 
2970
static int __make_request(struct request_queue *q, struct bio *bio)
2971
{
2972
        struct request *req;
2973
        int el_ret, nr_sectors, barrier, err;
2974
        const unsigned short prio = bio_prio(bio);
2975
        const int sync = bio_sync(bio);
2976
        int rw_flags;
2977
 
2978
        nr_sectors = bio_sectors(bio);
2979
 
2980
        /*
2981
         * low level driver can indicate that it wants pages above a
2982
         * certain limit bounced to low memory (ie for highmem, or even
2983
         * ISA dma in theory)
2984
         */
2985
        blk_queue_bounce(q, &bio);
2986
 
2987
        barrier = bio_barrier(bio);
2988
        if (unlikely(barrier) && (q->next_ordered == QUEUE_ORDERED_NONE)) {
2989
                err = -EOPNOTSUPP;
2990
                goto end_io;
2991
        }
2992
 
2993
        spin_lock_irq(q->queue_lock);
2994
 
2995
        if (unlikely(barrier) || elv_queue_empty(q))
2996
                goto get_rq;
2997
 
2998
        el_ret = elv_merge(q, &req, bio);
2999
        switch (el_ret) {
3000
                case ELEVATOR_BACK_MERGE:
3001
                        BUG_ON(!rq_mergeable(req));
3002
 
3003
                        if (!ll_back_merge_fn(q, req, bio))
3004
                                break;
3005
 
3006
                        blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
3007
 
3008
                        req->biotail->bi_next = bio;
3009
                        req->biotail = bio;
3010
                        req->nr_sectors = req->hard_nr_sectors += nr_sectors;
3011
                        req->ioprio = ioprio_best(req->ioprio, prio);
3012
                        drive_stat_acct(req, 0);
3013
                        if (!attempt_back_merge(q, req))
3014
                                elv_merged_request(q, req, el_ret);
3015
                        goto out;
3016
 
3017
                case ELEVATOR_FRONT_MERGE:
3018
                        BUG_ON(!rq_mergeable(req));
3019
 
3020
                        if (!ll_front_merge_fn(q, req, bio))
3021
                                break;
3022
 
3023
                        blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
3024
 
3025
                        bio->bi_next = req->bio;
3026
                        req->bio = bio;
3027
 
3028
                        /*
3029
                         * may not be valid. if the low level driver said
3030
                         * it didn't need a bounce buffer then it better
3031
                         * not touch req->buffer either...
3032
                         */
3033
                        req->buffer = bio_data(bio);
3034
                        req->current_nr_sectors = bio_cur_sectors(bio);
3035
                        req->hard_cur_sectors = req->current_nr_sectors;
3036
                        req->sector = req->hard_sector = bio->bi_sector;
3037
                        req->nr_sectors = req->hard_nr_sectors += nr_sectors;
3038
                        req->ioprio = ioprio_best(req->ioprio, prio);
3039
                        drive_stat_acct(req, 0);
3040
                        if (!attempt_front_merge(q, req))
3041
                                elv_merged_request(q, req, el_ret);
3042
                        goto out;
3043
 
3044
                /* ELEVATOR_NO_MERGE: elevator says don't/can't merge. */
3045
                default:
3046
                        ;
3047
        }
3048
 
3049
get_rq:
3050
        /*
3051
         * This sync check and mask will be re-done in init_request_from_bio(),
3052
         * but we need to set it earlier to expose the sync flag to the
3053
         * rq allocator and io schedulers.
3054
         */
3055
        rw_flags = bio_data_dir(bio);
3056
        if (sync)
3057
                rw_flags |= REQ_RW_SYNC;
3058
 
3059
        /*
3060
         * Grab a free request. This might sleep but cannot fail.
3061
         * Returns with the queue unlocked.
3062
         */
3063
        req = get_request_wait(q, rw_flags, bio);
3064
 
3065
        /*
3066
         * After dropping the lock and possibly sleeping here, our request
3067
         * may now be mergeable after it had proven unmergeable (above).
3068
         * We don't worry about that case for efficiency. It won't happen
3069
         * often, and the elevators are able to handle it.
3070
         */
3071
        init_request_from_bio(req, bio);
3072
 
3073
        spin_lock_irq(q->queue_lock);
3074
        if (elv_queue_empty(q))
3075
                blk_plug_device(q);
3076
        add_request(q, req);
3077
out:
3078
        if (sync)
3079
                __generic_unplug_device(q);
3080
 
3081
        spin_unlock_irq(q->queue_lock);
3082
        return 0;
3083
 
3084
end_io:
3085
        bio_endio(bio, err);
3086
        return 0;
3087
}
3088
 
3089
/*
3090
 * If bio->bi_bdev is a partition, remap the location
3091
 */
3092
static inline void blk_partition_remap(struct bio *bio)
3093
{
3094
        struct block_device *bdev = bio->bi_bdev;
3095
 
3096
        if (bio_sectors(bio) && bdev != bdev->bd_contains) {
3097
                struct hd_struct *p = bdev->bd_part;
3098
                const int rw = bio_data_dir(bio);
3099
 
3100
                p->sectors[rw] += bio_sectors(bio);
3101
                p->ios[rw]++;
3102
 
3103
                bio->bi_sector += p->start_sect;
3104
                bio->bi_bdev = bdev->bd_contains;
3105
 
3106
                blk_add_trace_remap(bdev_get_queue(bio->bi_bdev), bio,
3107
                                    bdev->bd_dev, bio->bi_sector,
3108
                                    bio->bi_sector - p->start_sect);
3109
        }
3110
}
3111
 
3112
static void handle_bad_sector(struct bio *bio)
3113
{
3114
        char b[BDEVNAME_SIZE];
3115
 
3116
        printk(KERN_INFO "attempt to access beyond end of device\n");
3117
        printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n",
3118
                        bdevname(bio->bi_bdev, b),
3119
                        bio->bi_rw,
3120
                        (unsigned long long)bio->bi_sector + bio_sectors(bio),
3121
                        (long long)(bio->bi_bdev->bd_inode->i_size >> 9));
3122
 
3123
        set_bit(BIO_EOF, &bio->bi_flags);
3124
}
3125
 
3126
#ifdef CONFIG_FAIL_MAKE_REQUEST
3127
 
3128
static DECLARE_FAULT_ATTR(fail_make_request);
3129
 
3130
static int __init setup_fail_make_request(char *str)
3131
{
3132
        return setup_fault_attr(&fail_make_request, str);
3133
}
3134
__setup("fail_make_request=", setup_fail_make_request);
3135
 
3136
static int should_fail_request(struct bio *bio)
3137
{
3138
        if ((bio->bi_bdev->bd_disk->flags & GENHD_FL_FAIL) ||
3139
            (bio->bi_bdev->bd_part && bio->bi_bdev->bd_part->make_it_fail))
3140
                return should_fail(&fail_make_request, bio->bi_size);
3141
 
3142
        return 0;
3143
}
3144
 
3145
static int __init fail_make_request_debugfs(void)
3146
{
3147
        return init_fault_attr_dentries(&fail_make_request,
3148
                                        "fail_make_request");
3149
}
3150
 
3151
late_initcall(fail_make_request_debugfs);
3152
 
3153
#else /* CONFIG_FAIL_MAKE_REQUEST */
3154
 
3155
static inline int should_fail_request(struct bio *bio)
3156
{
3157
        return 0;
3158
}
3159
 
3160
#endif /* CONFIG_FAIL_MAKE_REQUEST */
3161
 
3162
/*
3163
 * Check whether this bio extends beyond the end of the device.
3164
 */
3165
static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors)
3166
{
3167
        sector_t maxsector;
3168
 
3169
        if (!nr_sectors)
3170
                return 0;
3171
 
3172
        /* Test device or partition size, when known. */
3173
        maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
3174
        if (maxsector) {
3175
                sector_t sector = bio->bi_sector;
3176
 
3177
                if (maxsector < nr_sectors || maxsector - nr_sectors < sector) {
3178
                        /*
3179
                         * This may well happen - the kernel calls bread()
3180
                         * without checking the size of the device, e.g., when
3181
                         * mounting a device.
3182
                         */
3183
                        handle_bad_sector(bio);
3184
                        return 1;
3185
                }
3186
        }
3187
 
3188
        return 0;
3189
}
3190
 
3191
/**
3192
 * generic_make_request: hand a buffer to its device driver for I/O
3193
 * @bio:  The bio describing the location in memory and on the device.
3194
 *
3195
 * generic_make_request() is used to make I/O requests of block
3196
 * devices. It is passed a &struct bio, which describes the I/O that needs
3197
 * to be done.
3198
 *
3199
 * generic_make_request() does not return any status.  The
3200
 * success/failure status of the request, along with notification of
3201
 * completion, is delivered asynchronously through the bio->bi_end_io
3202
 * function described (one day) elsewhere.
3203
 *
3204
 * The caller of generic_make_request must make sure that bi_io_vec
3205
 * are set to describe the memory buffer, and that bi_dev and bi_sector are
3206
 * set to describe the device address, and the
3207
 * bi_end_io and optionally bi_private are set to describe how
3208
 * completion notification should be signaled.
3209
 *
3210
 * generic_make_request and the drivers it calls may use bi_next if this
3211
 * bio happens to be merged with someone else, and may change bi_dev and
3212
 * bi_sector for remaps as it sees fit.  So the values of these fields
3213
 * should NOT be depended on after the call to generic_make_request.
3214
 */
3215
static inline void __generic_make_request(struct bio *bio)
3216
{
3217
        struct request_queue *q;
3218
        sector_t old_sector;
3219
        int ret, nr_sectors = bio_sectors(bio);
3220
        dev_t old_dev;
3221
        int err = -EIO;
3222
 
3223
        might_sleep();
3224
 
3225
        if (bio_check_eod(bio, nr_sectors))
3226
                goto end_io;
3227
 
3228
        /*
3229
         * Resolve the mapping until finished. (drivers are
3230
         * still free to implement/resolve their own stacking
3231
         * by explicitly returning 0)
3232
         *
3233
         * NOTE: we don't repeat the blk_size check for each new device.
3234
         * Stacking drivers are expected to know what they are doing.
3235
         */
3236
        old_sector = -1;
3237
        old_dev = 0;
3238
        do {
3239
                char b[BDEVNAME_SIZE];
3240
 
3241
                q = bdev_get_queue(bio->bi_bdev);
3242
                if (!q) {
3243
                        printk(KERN_ERR
3244
                               "generic_make_request: Trying to access "
3245
                                "nonexistent block-device %s (%Lu)\n",
3246
                                bdevname(bio->bi_bdev, b),
3247
                                (long long) bio->bi_sector);
3248
end_io:
3249
                        bio_endio(bio, err);
3250
                        break;
3251
                }
3252
 
3253
                if (unlikely(nr_sectors > q->max_hw_sectors)) {
3254
                        printk("bio too big device %s (%u > %u)\n",
3255
                                bdevname(bio->bi_bdev, b),
3256
                                bio_sectors(bio),
3257
                                q->max_hw_sectors);
3258
                        goto end_io;
3259
                }
3260
 
3261
                if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
3262
                        goto end_io;
3263
 
3264
                if (should_fail_request(bio))
3265
                        goto end_io;
3266
 
3267
                /*
3268
                 * If this device has partitions, remap block n
3269
                 * of partition p to block n+start(p) of the disk.
3270
                 */
3271
                blk_partition_remap(bio);
3272
 
3273
                if (old_sector != -1)
3274
                        blk_add_trace_remap(q, bio, old_dev, bio->bi_sector,
3275
                                            old_sector);
3276
 
3277
                blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
3278
 
3279
                old_sector = bio->bi_sector;
3280
                old_dev = bio->bi_bdev->bd_dev;
3281
 
3282
                if (bio_check_eod(bio, nr_sectors))
3283
                        goto end_io;
3284
                if (bio_empty_barrier(bio) && !q->prepare_flush_fn) {
3285
                        err = -EOPNOTSUPP;
3286
                        goto end_io;
3287
                }
3288
 
3289
                ret = q->make_request_fn(q, bio);
3290
        } while (ret);
3291
}
3292
 
3293
/*
3294
 * We only want one ->make_request_fn to be active at a time,
3295
 * else stack usage with stacked devices could be a problem.
3296
 * So use current->bio_{list,tail} to keep a list of requests
3297
 * submitted by a make_request_fn function.
3298
 * current->bio_tail is also used as a flag to say if
3299
 * generic_make_request is currently active in this task or not.
3300
 * If it is NULL, then no make_request is active.  If it is non-NULL,
3301
 * then a make_request is active, and new requests should be added
3302
 * at the tail
3303
 */
3304
void generic_make_request(struct bio *bio)
3305
{
3306
        if (current->bio_tail) {
3307
                /* make_request is active */
3308
                *(current->bio_tail) = bio;
3309
                bio->bi_next = NULL;
3310
                current->bio_tail = &bio->bi_next;
3311
                return;
3312
        }
3313
        /* following loop may be a bit non-obvious, and so deserves some
3314
         * explanation.
3315
         * Before entering the loop, bio->bi_next is NULL (as all callers
3316
         * ensure that) so we have a list with a single bio.
3317
         * We pretend that we have just taken it off a longer list, so
3318
         * we assign bio_list to the next (which is NULL) and bio_tail
3319
         * to &bio_list, thus initialising the bio_list of new bios to be
3320
         * added.  __generic_make_request may indeed add some more bios
3321
         * through a recursive call to generic_make_request.  If it
3322
         * did, we find a non-NULL value in bio_list and re-enter the loop
3323
         * from the top.  In this case we really did just take the bio
3324
         * off the top of the list (no pretending) and so fix up bio_list and
3325
         * bio_tail or bi_next, and call into __generic_make_request again.
3326
         *
3327
         * The loop was structured like this to make only one call to
3328
         * __generic_make_request (which is important as it is large and
3329
         * inlined) and to keep the structure simple.
3330
         */
3331
        BUG_ON(bio->bi_next);
3332
        do {
3333
                current->bio_list = bio->bi_next;
3334
                if (bio->bi_next == NULL)
3335
                        current->bio_tail = &current->bio_list;
3336
                else
3337
                        bio->bi_next = NULL;
3338
                __generic_make_request(bio);
3339
                bio = current->bio_list;
3340
        } while (bio);
3341
        current->bio_tail = NULL; /* deactivate */
3342
}
3343
 
3344
EXPORT_SYMBOL(generic_make_request);
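 
/*
 * A minimal stacking-style ->make_request_fn sketch, illustrative only:
 * "struct example_dev", example_remap_request() and the queuedata layout
 * are hypothetical, not part of this file.  Such a function would be
 * registered with blk_queue_make_request().  It redirects a bio to a
 * backing device and resubmits it; thanks to the current->bio_list /
 * bio_tail handling above, the recursive generic_make_request() call does
 * not nest on the stack but is queued and processed by the loop in the
 * original caller.
 */
struct example_dev {
        struct block_device *backing_bdev;
        sector_t start;
};

static int example_remap_request(struct request_queue *q, struct bio *bio)
{
        struct example_dev *dev = q->queuedata;

        /* remap to the backing device and let the block layer iterate */
        bio->bi_bdev = dev->backing_bdev;
        bio->bi_sector += dev->start;

        generic_make_request(bio);
        return 0;
}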
3345
 
3346
/**
3347
 * submit_bio: submit a bio to the block device layer for I/O
3348
 * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
3349
 * @bio: The &struct bio which describes the I/O
3350
 *
3351
 * submit_bio() is very similar in purpose to generic_make_request(), and
3352
 * uses that function to do most of the work. Both are fairly rough
3353
 * interfaces; @bio must be set up and ready for I/O.
3354
 *
3355
 */
3356
void submit_bio(int rw, struct bio *bio)
3357
{
3358
        int count = bio_sectors(bio);
3359
 
3360
        bio->bi_rw |= rw;
3361
 
3362
        /*
3363
         * If it's a regular read/write or a barrier with data attached,
3364
         * go through the normal accounting stuff before submission.
3365
         */
3366
        if (!bio_empty_barrier(bio)) {
3367
 
3368
                BIO_BUG_ON(!bio->bi_size);
3369
                BIO_BUG_ON(!bio->bi_io_vec);
3370
 
3371
                if (rw & WRITE) {
3372
                        count_vm_events(PGPGOUT, count);
3373
                } else {
3374
                        task_io_account_read(bio->bi_size);
3375
                        count_vm_events(PGPGIN, count);
3376
                }
3377
 
3378
                if (unlikely(block_dump)) {
3379
                        char b[BDEVNAME_SIZE];
3380
                        printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n",
3381
                        current->comm, task_pid_nr(current),
3382
                                (rw & WRITE) ? "WRITE" : "READ",
3383
                                (unsigned long long)bio->bi_sector,
3384
                                bdevname(bio->bi_bdev,b));
3385
                }
3386
        }
3387
 
3388
        generic_make_request(bio);
3389
}
3390
 
3391
EXPORT_SYMBOL(submit_bio);
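 
/*
 * A minimal submission sketch, illustrative only: example_end_io() and
 * example_read_page() are hypothetical names, and the two-argument
 * bi_end_io signature is assumed to match the bio_endio() calls used in
 * this file.  It reads one page from sector 0 of a block device the
 * caller already holds open and waits for completion.
 */
static void example_end_io(struct bio *bio, int error)
{
        complete((struct completion *)bio->bi_private);
        bio_put(bio);
}

static int example_read_page(struct block_device *bdev, struct page *page)
{
        DECLARE_COMPLETION_ONSTACK(done);
        struct bio *bio = bio_alloc(GFP_KERNEL, 1);

        if (!bio)
                return -ENOMEM;

        bio->bi_bdev = bdev;
        bio->bi_sector = 0;
        bio->bi_end_io = example_end_io;
        bio->bi_private = &done;
        bio_add_page(bio, page, PAGE_SIZE, 0);

        submit_bio(READ, bio);
        wait_for_completion(&done);
        return 0;
}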
3392
 
3393
static void blk_recalc_rq_sectors(struct request *rq, int nsect)
3394
{
3395
        if (blk_fs_request(rq)) {
3396
                rq->hard_sector += nsect;
3397
                rq->hard_nr_sectors -= nsect;
3398
 
3399
                /*
3400
                 * Move the I/O submission pointers ahead if required.
3401
                 */
3402
                if ((rq->nr_sectors >= rq->hard_nr_sectors) &&
3403
                    (rq->sector <= rq->hard_sector)) {
3404
                        rq->sector = rq->hard_sector;
3405
                        rq->nr_sectors = rq->hard_nr_sectors;
3406
                        rq->hard_cur_sectors = bio_cur_sectors(rq->bio);
3407
                        rq->current_nr_sectors = rq->hard_cur_sectors;
3408
                        rq->buffer = bio_data(rq->bio);
3409
                }
3410
 
3411
                /*
3412
                 * if total number of sectors is less than the first segment
3413
                 * size, something has gone terribly wrong
3414
                 */
3415
                if (rq->nr_sectors < rq->current_nr_sectors) {
3416
                        printk("blk: request botched\n");
3417
                        rq->nr_sectors = rq->current_nr_sectors;
3418
                }
3419
        }
3420
}
3421
 
3422
static int __end_that_request_first(struct request *req, int uptodate,
3423
                                    int nr_bytes)
3424
{
3425
        int total_bytes, bio_nbytes, error, next_idx = 0;
3426
        struct bio *bio;
3427
 
3428
        blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE);
3429
 
3430
        /*
3431
         * extend uptodate bool to allow < 0 value to be direct io error
3432
         */
3433
        error = 0;
3434
        if (end_io_error(uptodate))
3435
                error = !uptodate ? -EIO : uptodate;
3436
 
3437
        /*
3438
         * for a REQ_BLOCK_PC request, we want to carry any eventual
3439
         * sense key with us all the way through
3440
         */
3441
        if (!blk_pc_request(req))
3442
                req->errors = 0;
3443
 
3444
        if (!uptodate) {
3445
                if (blk_fs_request(req) && !(req->cmd_flags & REQ_QUIET))
3446
                        printk("end_request: I/O error, dev %s, sector %llu\n",
3447
                                req->rq_disk ? req->rq_disk->disk_name : "?",
3448
                                (unsigned long long)req->sector);
3449
        }
3450
 
3451
        if (blk_fs_request(req) && req->rq_disk) {
3452
                const int rw = rq_data_dir(req);
3453
 
3454
                disk_stat_add(req->rq_disk, sectors[rw], nr_bytes >> 9);
3455
        }
3456
 
3457
        total_bytes = bio_nbytes = 0;
3458
        while ((bio = req->bio) != NULL) {
3459
                int nbytes;
3460
 
3461
                /*
3462
                 * For an empty barrier request, the low level driver must
3463
                 * store a potential error location in ->sector. We pass
3464
                 * that back up in ->bi_sector.
3465
                 */
3466
                if (blk_empty_barrier(req))
3467
                        bio->bi_sector = req->sector;
3468
 
3469
                if (nr_bytes >= bio->bi_size) {
3470
                        req->bio = bio->bi_next;
3471
                        nbytes = bio->bi_size;
3472
                        req_bio_endio(req, bio, nbytes, error);
3473
                        next_idx = 0;
3474
                        bio_nbytes = 0;
3475
                } else {
3476
                        int idx = bio->bi_idx + next_idx;
3477
 
3478
                        if (unlikely(bio->bi_idx >= bio->bi_vcnt)) {
3479
                                blk_dump_rq_flags(req, "__end_that");
3480
                                printk("%s: bio idx %d >= vcnt %d\n",
3481
                                                __FUNCTION__,
3482
                                                bio->bi_idx, bio->bi_vcnt);
3483
                                break;
3484
                        }
3485
 
3486
                        nbytes = bio_iovec_idx(bio, idx)->bv_len;
3487
                        BIO_BUG_ON(nbytes > bio->bi_size);
3488
 
3489
                        /*
3490
                         * not a complete bvec done
3491
                         */
3492
                        if (unlikely(nbytes > nr_bytes)) {
3493
                                bio_nbytes += nr_bytes;
3494
                                total_bytes += nr_bytes;
3495
                                break;
3496
                        }
3497
 
3498
                        /*
3499
                         * advance to the next vector
3500
                         */
3501
                        next_idx++;
3502
                        bio_nbytes += nbytes;
3503
                }
3504
 
3505
                total_bytes += nbytes;
3506
                nr_bytes -= nbytes;
3507
 
3508
                if ((bio = req->bio)) {
3509
                        /*
3510
                         * end more in this run, or just return 'not-done'
3511
                         */
3512
                        if (unlikely(nr_bytes <= 0))
3513
                                break;
3514
                }
3515
        }
3516
 
3517
        /*
3518
         * completely done
3519
         */
3520
        if (!req->bio)
3521
                return 0;
3522
 
3523
        /*
3524
         * if the request wasn't completed, update state
3525
         */
3526
        if (bio_nbytes) {
3527
                req_bio_endio(req, bio, bio_nbytes, error);
3528
                bio->bi_idx += next_idx;
3529
                bio_iovec(bio)->bv_offset += nr_bytes;
3530
                bio_iovec(bio)->bv_len -= nr_bytes;
3531
        }
3532
 
3533
        blk_recalc_rq_sectors(req, total_bytes >> 9);
3534
        blk_recalc_rq_segments(req);
3535
        return 1;
3536
}
3537
 
3538
/**
3539
 * end_that_request_first - end I/O on a request
3540
 * @req:      the request being processed
3541
 * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error
3542
 * @nr_sectors: number of sectors to end I/O on
3543
 *
3544
 * Description:
3545
 *     Ends I/O on a number of sectors attached to @req, and sets it up
3546
 *     for the next range of segments (if any) in the cluster.
3547
 *
3548
 * Return:
3549
 *     0 - we are done with this request, call end_that_request_last()
3550
 *     1 - still buffers pending for this request
3551
 **/
3552
int end_that_request_first(struct request *req, int uptodate, int nr_sectors)
3553
{
3554
        return __end_that_request_first(req, uptodate, nr_sectors << 9);
3555
}
3556
 
3557
EXPORT_SYMBOL(end_that_request_first);
3558
 
3559
/**
3560
 * end_that_request_chunk - end I/O on a request
3561
 * @req:      the request being processed
3562
 * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error
3563
 * @nr_bytes: number of bytes to complete
3564
 *
3565
 * Description:
3566
 *     Ends I/O on a number of bytes attached to @req, and sets it up
3567
 *     for the next range of segments (if any). Like end_that_request_first(),
3568
 *     but deals with bytes instead of sectors.
3569
 *
3570
 * Return:
3571
 *     0 - we are done with this request, call end_that_request_last()
3572
 *     1 - still buffers pending for this request
3573
 **/
3574
int end_that_request_chunk(struct request *req, int uptodate, int nr_bytes)
3575
{
3576
        return __end_that_request_first(req, uptodate, nr_bytes);
3577
}
3578
 
3579
EXPORT_SYMBOL(end_that_request_chunk);
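 
/*
 * Two-step completion sketch, illustrative only (example_complete_bytes()
 * is a hypothetical helper, and the request is assumed to still be on the
 * queue, as in older drivers): when end_that_request_chunk() returns 0 the
 * whole request is done, so it is dequeued and finished with
 * end_that_request_last() under the queue lock, as required below.
 */
static void example_complete_bytes(struct request_queue *q, struct request *rq,
                                   int uptodate, unsigned int bytes)
{
        if (!end_that_request_chunk(rq, uptodate, bytes)) {
                unsigned long flags;

                spin_lock_irqsave(q->queue_lock, flags);
                blkdev_dequeue_request(rq);
                end_that_request_last(rq, uptodate);
                spin_unlock_irqrestore(q->queue_lock, flags);
        }
}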
3580
 
3581
/*
3582
 * splice the completion data to a local structure and hand off to
3583
 * the queue's softirq_done_fn to complete the requests
3584
 */
3585
static void blk_done_softirq(struct softirq_action *h)
3586
{
3587
        struct list_head *cpu_list, local_list;
3588
 
3589
        local_irq_disable();
3590
        cpu_list = &__get_cpu_var(blk_cpu_done);
3591
        list_replace_init(cpu_list, &local_list);
3592
        local_irq_enable();
3593
 
3594
        while (!list_empty(&local_list)) {
3595
                struct request *rq = list_entry(local_list.next, struct request, donelist);
3596
 
3597
                list_del_init(&rq->donelist);
3598
                rq->q->softirq_done_fn(rq);
3599
        }
3600
}
3601
 
3602
static int __cpuinit blk_cpu_notify(struct notifier_block *self, unsigned long action,
3603
                          void *hcpu)
3604
{
3605
        /*
3606
         * If a CPU goes away, splice its entries to the current CPU
3607
         * and trigger a run of the softirq
3608
         */
3609
        if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
3610
                int cpu = (unsigned long) hcpu;
3611
 
3612
                local_irq_disable();
3613
                list_splice_init(&per_cpu(blk_cpu_done, cpu),
3614
                                 &__get_cpu_var(blk_cpu_done));
3615
                raise_softirq_irqoff(BLOCK_SOFTIRQ);
3616
                local_irq_enable();
3617
        }
3618
 
3619
        return NOTIFY_OK;
3620
}
3621
 
3622
 
3623
static struct notifier_block blk_cpu_notifier __cpuinitdata = {
3624
        .notifier_call  = blk_cpu_notify,
3625
};
3626
 
3627
/**
3628
 * blk_complete_request - end I/O on a request
3629
 * @req:      the request being processed
3630
 *
3631
 * Description:
3632
 *     Ends all I/O on a request. It does not handle partial completions,
3633
 *     unless the driver actually implements this in its completion callback
3634
 *     through requeueing. The actual completion happens out-of-order,
3635
 *     through a softirq handler. The user must have registered a completion
3636
 *     callback through blk_queue_softirq_done().
3637
 **/
3638
 
3639
void blk_complete_request(struct request *req)
3640
{
3641
        struct list_head *cpu_list;
3642
        unsigned long flags;
3643
 
3644
        BUG_ON(!req->q->softirq_done_fn);
3645
 
3646
        local_irq_save(flags);
3647
 
3648
        cpu_list = &__get_cpu_var(blk_cpu_done);
3649
        list_add_tail(&req->donelist, cpu_list);
3650
        raise_softirq_irqoff(BLOCK_SOFTIRQ);
3651
 
3652
        local_irq_restore(flags);
3653
}
3654
 
3655
EXPORT_SYMBOL(blk_complete_request);
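 
/*
 * Softirq completion sketch, illustrative only: example_softirq_done() is a
 * hypothetical callback and assumes the driver already dequeued the request
 * before issuing it to the hardware.  The driver would register it at queue
 * setup time with blk_queue_softirq_done(q, example_softirq_done) and call
 * blk_complete_request(rq) from its interrupt handler; the work below then
 * runs later in BLOCK_SOFTIRQ context.
 */
static void example_softirq_done(struct request *rq)
{
        struct request_queue *q = rq->q;
        int uptodate = !rq->errors;
        unsigned long flags;

        if (!end_that_request_chunk(rq, uptodate, rq->hard_nr_sectors << 9)) {
                spin_lock_irqsave(q->queue_lock, flags);
                end_that_request_last(rq, uptodate);
                spin_unlock_irqrestore(q->queue_lock, flags);
        }
}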
3656
 
3657
/*
3658
 * queue lock must be held
3659
 */
3660
void end_that_request_last(struct request *req, int uptodate)
3661
{
3662
        struct gendisk *disk = req->rq_disk;
3663
        int error;
3664
 
3665
        /*
3666
         * extend uptodate bool to allow < 0 value to be direct io error
3667
         */
3668
        error = 0;
3669
        if (end_io_error(uptodate))
3670
                error = !uptodate ? -EIO : uptodate;
3671
 
3672
        if (unlikely(laptop_mode) && blk_fs_request(req))
3673
                laptop_io_completion();
3674
 
3675
        /*
3676
         * Account IO completion.  bar_rq isn't accounted as a normal
3677
         * IO on queueing nor completion.  Accounting the containing
3678
         * request is enough.
3679
         */
3680
        if (disk && blk_fs_request(req) && req != &req->q->bar_rq) {
3681
                unsigned long duration = jiffies - req->start_time;
3682
                const int rw = rq_data_dir(req);
3683
 
3684
                __disk_stat_inc(disk, ios[rw]);
3685
                __disk_stat_add(disk, ticks[rw], duration);
3686
                disk_round_stats(disk);
3687
                disk->in_flight--;
3688
        }
3689
        if (req->end_io)
3690
                req->end_io(req, error);
3691
        else
3692
                __blk_put_request(req->q, req);
3693
}
3694
 
3695
EXPORT_SYMBOL(end_that_request_last);
3696
 
3697
static inline void __end_request(struct request *rq, int uptodate,
3698
                                 unsigned int nr_bytes, int dequeue)
3699
{
3700
        if (!end_that_request_chunk(rq, uptodate, nr_bytes)) {
3701
                if (dequeue)
3702
                        blkdev_dequeue_request(rq);
3703
                add_disk_randomness(rq->rq_disk);
3704
                end_that_request_last(rq, uptodate);
3705
        }
3706
}
3707
 
3708
static unsigned int rq_byte_size(struct request *rq)
3709
{
3710
        if (blk_fs_request(rq))
3711
                return rq->hard_nr_sectors << 9;
3712
 
3713
        return rq->data_len;
3714
}
3715
 
3716
/**
3717
 * end_queued_request - end all I/O on a queued request
3718
 * @rq:         the request being processed
3719
 * @uptodate:   error value or 0/1 uptodate flag
3720
 *
3721
 * Description:
3722
 *     Ends all I/O on a request, and removes it from the block layer queues.
3723
 *     Not suitable for normal IO completion, unless the driver still has
3724
 *     the request attached to the block layer.
3725
 *
3726
 **/
3727
void end_queued_request(struct request *rq, int uptodate)
3728
{
3729
        __end_request(rq, uptodate, rq_byte_size(rq), 1);
3730
}
3731
EXPORT_SYMBOL(end_queued_request);
3732
 
3733
/**
3734
 * end_dequeued_request - end all I/O on a dequeued request
3735
 * @rq:         the request being processed
3736
 * @uptodate:   error value or 0/1 uptodate flag
3737
 *
3738
 * Description:
3739
 *     Ends all I/O on a request. The request must already have been
3740
 *     dequeued using blkdev_dequeue_request(), as is normally the case
3741
 *     for most drivers.
3742
 *
3743
 **/
3744
void end_dequeued_request(struct request *rq, int uptodate)
3745
{
3746
        __end_request(rq, uptodate, rq_byte_size(rq), 0);
3747
}
3748
EXPORT_SYMBOL(end_dequeued_request);
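 
/*
 * Abort sketch, illustrative only (example_abort_dequeued() is a
 * hypothetical helper): a driver that already pulled the request off the
 * queue and wants to fail it outright.  The caller is assumed to hold the
 * queue lock, since end_that_request_last() runs underneath.
 */
static void example_abort_dequeued(struct request *rq)
{
        rq->cmd_flags |= REQ_QUIET;     /* suppress the I/O error printk */
        end_dequeued_request(rq, 0);    /* 0 == not up to date */
}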
3749
 
3750
 
3751
/**
3752
 * end_request - end I/O on the current segment of the request
3753
 * @req:        the request being processed
3754
 * @uptodate:   error value or 0/1 uptodate flag
3755
 *
3756
 * Description:
3757
 *     Ends I/O on the current segment of a request. If that is the only
3758
 *     remaining segment, the request is also completed and freed.
3759
 *
3760
 *     This is a remnant of how older block drivers handled IO completions.
3761
 *     Modern drivers typically end IO on the full request in one go, unless
3762
 *     they have a residual value to account for. For that case this function
3763
 *     isn't really useful, unless the residual just happens to be the
3764
 *     full current segment. In other words, don't use this function in new
3765
 *     code. Either use end_request_completely(), or the
3766
 *     end_that_request_chunk() (along with end_that_request_last()) for
3767
 *     partial completions.
3768
 *
3769
 **/
3770
void end_request(struct request *req, int uptodate)
3771
{
3772
        __end_request(req, uptodate, req->hard_cur_sectors << 9, 1);
3773
}
3774
EXPORT_SYMBOL(end_request);
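 
/*
 * Old-style ->request_fn sketch using end_request(), illustrative only:
 * example_request_fn() and example_transfer() are hypothetical.  The queue
 * lock is held when ->request_fn runs, which is what the dequeue and
 * end_that_request_last() path inside end_request() expects.
 */
static int example_transfer(struct request *req)
{
        /* placeholder for the real data transfer to/from req->buffer */
        return 0;
}

static void example_request_fn(struct request_queue *q)
{
        struct request *req;

        while ((req = elv_next_request(q)) != NULL) {
                int uptodate = 0;

                if (blk_fs_request(req))
                        uptodate = (example_transfer(req) == 0);

                end_request(req, uptodate);
        }
}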
3775
 
3776
static void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
3777
                            struct bio *bio)
3778
{
3779
        /* first two bits are identical in rq->cmd_flags and bio->bi_rw */
3780
        rq->cmd_flags |= (bio->bi_rw & 3);
3781
 
3782
        rq->nr_phys_segments = bio_phys_segments(q, bio);
3783
        rq->nr_hw_segments = bio_hw_segments(q, bio);
3784
        rq->current_nr_sectors = bio_cur_sectors(bio);
3785
        rq->hard_cur_sectors = rq->current_nr_sectors;
3786
        rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio);
3787
        rq->buffer = bio_data(bio);
3788
        rq->data_len = bio->bi_size;
3789
 
3790
        rq->bio = rq->biotail = bio;
3791
 
3792
        if (bio->bi_bdev)
3793
                rq->rq_disk = bio->bi_bdev->bd_disk;
3794
}
3795
 
3796
int kblockd_schedule_work(struct work_struct *work)
3797
{
3798
        return queue_work(kblockd_workqueue, work);
3799
}
3800
 
3801
EXPORT_SYMBOL(kblockd_schedule_work);
3802
 
3803
void kblockd_flush_work(struct work_struct *work)
3804
{
3805
        cancel_work_sync(work);
3806
}
3807
EXPORT_SYMBOL(kblockd_flush_work);
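 
/*
 * kblockd usage sketch, illustrative only (example_deferred() and
 * example_work are hypothetical): defer non-atomic work from an atomic
 * context onto the shared kblockd workqueue.
 */
static void example_deferred(struct work_struct *work)
{
        /* runs later in process context on the kblockd workqueue */
}

static DECLARE_WORK(example_work, example_deferred);

/* from atomic context:  kblockd_schedule_work(&example_work);  */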
3808
 
3809
int __init blk_dev_init(void)
3810
{
3811
        int i;
3812
 
3813
        kblockd_workqueue = create_workqueue("kblockd");
3814
        if (!kblockd_workqueue)
3815
                panic("Failed to create kblockd\n");
3816
 
3817
        request_cachep = kmem_cache_create("blkdev_requests",
3818
                        sizeof(struct request), 0, SLAB_PANIC, NULL);
3819
 
3820
        requestq_cachep = kmem_cache_create("blkdev_queue",
3821
                        sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
3822
 
3823
        iocontext_cachep = kmem_cache_create("blkdev_ioc",
3824
                        sizeof(struct io_context), 0, SLAB_PANIC, NULL);
3825
 
3826
        for_each_possible_cpu(i)
3827
                INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
3828
 
3829
        open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL);
3830
        register_hotcpu_notifier(&blk_cpu_notifier);
3831
 
3832
        blk_max_low_pfn = max_low_pfn - 1;
3833
        blk_max_pfn = max_pfn - 1;
3834
 
3835
        return 0;
3836
}
3837
 
3838
/*
3839
 * IO Context helper functions
3840
 */
3841
void put_io_context(struct io_context *ioc)
3842
{
3843
        if (ioc == NULL)
3844
                return;
3845
 
3846
        BUG_ON(atomic_read(&ioc->refcount) == 0);
3847
 
3848
        if (atomic_dec_and_test(&ioc->refcount)) {
3849
                struct cfq_io_context *cic;
3850
 
3851
                rcu_read_lock();
3852
                if (ioc->aic && ioc->aic->dtor)
3853
                        ioc->aic->dtor(ioc->aic);
3854
                if (ioc->cic_root.rb_node != NULL) {
3855
                        struct rb_node *n = rb_first(&ioc->cic_root);
3856
 
3857
                        cic = rb_entry(n, struct cfq_io_context, rb_node);
3858
                        cic->dtor(ioc);
3859
                }
3860
                rcu_read_unlock();
3861
 
3862
                kmem_cache_free(iocontext_cachep, ioc);
3863
        }
3864
}
3865
EXPORT_SYMBOL(put_io_context);
3866
 
3867
/* Called by the exiting task */
3868
void exit_io_context(void)
3869
{
3870
        struct io_context *ioc;
3871
        struct cfq_io_context *cic;
3872
 
3873
        task_lock(current);
3874
        ioc = current->io_context;
3875
        current->io_context = NULL;
3876
        task_unlock(current);
3877
 
3878
        ioc->task = NULL;
3879
        if (ioc->aic && ioc->aic->exit)
3880
                ioc->aic->exit(ioc->aic);
3881
        if (ioc->cic_root.rb_node != NULL) {
3882
                cic = rb_entry(rb_first(&ioc->cic_root), struct cfq_io_context, rb_node);
3883
                cic->exit(ioc);
3884
        }
3885
 
3886
        put_io_context(ioc);
3887
}
3888
 
3889
/*
3890
 * If the current task has no IO context then create one and initialise it.
3891
 * Otherwise, return its existing IO context.
3892
 *
3893
 * This returned IO context doesn't have a specifically elevated refcount,
3894
 * but since the current task itself holds a reference, the context can be
3895
 * used in general code, so long as it stays within `current` context.
3896
 */
3897
static struct io_context *current_io_context(gfp_t gfp_flags, int node)
3898
{
3899
        struct task_struct *tsk = current;
3900
        struct io_context *ret;
3901
 
3902
        ret = tsk->io_context;
3903
        if (likely(ret))
3904
                return ret;
3905
 
3906
        ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node);
3907
        if (ret) {
3908
                atomic_set(&ret->refcount, 1);
3909
                ret->task = current;
3910
                ret->ioprio_changed = 0;
3911
                ret->last_waited = jiffies; /* doesn't matter... */
3912
                ret->nr_batch_requests = 0; /* because this is 0 */
3913
                ret->aic = NULL;
3914
                ret->cic_root.rb_node = NULL;
3915
                ret->ioc_data = NULL;
3916
                /* make sure set_task_ioprio() sees the settings above */
3917
                smp_wmb();
3918
                tsk->io_context = ret;
3919
        }
3920
 
3921
        return ret;
3922
}
3923
 
3924
/*
3925
 * If the current task has no IO context then create one and initialise it.
3926
 * If it does have a context, take a ref on it.
3927
 *
3928
 * This is always called in the context of the task which submitted the I/O.
3929
 */
3930
struct io_context *get_io_context(gfp_t gfp_flags, int node)
3931
{
3932
        struct io_context *ret;
3933
        ret = current_io_context(gfp_flags, node);
3934
        if (likely(ret))
3935
                atomic_inc(&ret->refcount);
3936
        return ret;
3937
}
3938
EXPORT_SYMBOL(get_io_context);
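 
/*
 * Refcount sketch, illustrative only (example_save_ioc() is hypothetical):
 * take a reference on the submitting task's io context and hand it to a
 * longer-lived owner, which must eventually drop it with put_io_context().
 */
static struct io_context *example_save_ioc(void)
{
        struct io_context *ioc = get_io_context(GFP_KERNEL, -1);

        /* ... stash ioc somewhere ...; the owner calls put_io_context(ioc) */
        return ioc;
}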
3939
 
3940
void copy_io_context(struct io_context **pdst, struct io_context **psrc)
3941
{
3942
        struct io_context *src = *psrc;
3943
        struct io_context *dst = *pdst;
3944
 
3945
        if (src) {
3946
                BUG_ON(atomic_read(&src->refcount) == 0);
3947
                atomic_inc(&src->refcount);
3948
                put_io_context(dst);
3949
                *pdst = src;
3950
        }
3951
}
3952
EXPORT_SYMBOL(copy_io_context);
3953
 
3954
void swap_io_context(struct io_context **ioc1, struct io_context **ioc2)
3955
{
3956
        struct io_context *temp;
3957
        temp = *ioc1;
3958
        *ioc1 = *ioc2;
3959
        *ioc2 = temp;
3960
}
3961
EXPORT_SYMBOL(swap_io_context);
3962
 
3963
/*
3964
 * sysfs parts below
3965
 */
3966
struct queue_sysfs_entry {
3967
        struct attribute attr;
3968
        ssize_t (*show)(struct request_queue *, char *);
3969
        ssize_t (*store)(struct request_queue *, const char *, size_t);
3970
};
3971
 
3972
static ssize_t
3973
queue_var_show(unsigned int var, char *page)
3974
{
3975
        return sprintf(page, "%d\n", var);
3976
}
3977
 
3978
static ssize_t
3979
queue_var_store(unsigned long *var, const char *page, size_t count)
3980
{
3981
        char *p = (char *) page;
3982
 
3983
        *var = simple_strtoul(p, &p, 10);
3984
        return count;
3985
}
3986
 
3987
static ssize_t queue_requests_show(struct request_queue *q, char *page)
3988
{
3989
        return queue_var_show(q->nr_requests, (page));
3990
}
3991
 
3992
static ssize_t
3993
queue_requests_store(struct request_queue *q, const char *page, size_t count)
3994
{
3995
        struct request_list *rl = &q->rq;
3996
        unsigned long nr;
3997
        int ret = queue_var_store(&nr, page, count);
3998
        if (nr < BLKDEV_MIN_RQ)
3999
                nr = BLKDEV_MIN_RQ;
4000
 
4001
        spin_lock_irq(q->queue_lock);
4002
        q->nr_requests = nr;
4003
        blk_queue_congestion_threshold(q);
4004
 
4005
        if (rl->count[READ] >= queue_congestion_on_threshold(q))
4006
                blk_set_queue_congested(q, READ);
4007
        else if (rl->count[READ] < queue_congestion_off_threshold(q))
4008
                blk_clear_queue_congested(q, READ);
4009
 
4010
        if (rl->count[WRITE] >= queue_congestion_on_threshold(q))
4011
                blk_set_queue_congested(q, WRITE);
4012
        else if (rl->count[WRITE] < queue_congestion_off_threshold(q))
4013
                blk_clear_queue_congested(q, WRITE);
4014
 
4015
        if (rl->count[READ] >= q->nr_requests) {
4016
                blk_set_queue_full(q, READ);
4017
        } else if (rl->count[READ]+1 <= q->nr_requests) {
4018
                blk_clear_queue_full(q, READ);
4019
                wake_up(&rl->wait[READ]);
4020
        }
4021
 
4022
        if (rl->count[WRITE] >= q->nr_requests) {
4023
                blk_set_queue_full(q, WRITE);
4024
        } else if (rl->count[WRITE]+1 <= q->nr_requests) {
4025
                blk_clear_queue_full(q, WRITE);
4026
                wake_up(&rl->wait[WRITE]);
4027
        }
4028
        spin_unlock_irq(q->queue_lock);
4029
        return ret;
4030
}
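 
/*
 * User-space view of this tunable, illustrative only (the device name
 * "sda" is just an example): queue_requests_store() above is reached by
 * writing to /sys/block/<dev>/queue/nr_requests.  Plain C, built outside
 * the kernel:
 *
 *      #include <stdio.h>
 *
 *      int main(void)
 *      {
 *              FILE *f = fopen("/sys/block/sda/queue/nr_requests", "w");
 *
 *              if (!f)
 *                      return 1;
 *              fprintf(f, "%d\n", 256);   // resize the request lists
 *              return fclose(f) ? 1 : 0;
 *      }
 */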
4031
 
4032
static ssize_t queue_ra_show(struct request_queue *q, char *page)
4033
{
4034
        int ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10);
4035
 
4036
        return queue_var_show(ra_kb, (page));
4037
}
4038
 
4039
static ssize_t
4040
queue_ra_store(struct request_queue *q, const char *page, size_t count)
4041
{
4042
        unsigned long ra_kb;
4043
        ssize_t ret = queue_var_store(&ra_kb, page, count);
4044
 
4045
        spin_lock_irq(q->queue_lock);
4046
        q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10);
4047
        spin_unlock_irq(q->queue_lock);
4048
 
4049
        return ret;
4050
}
4051
 
4052
static ssize_t queue_max_sectors_show(struct request_queue *q, char *page)
4053
{
4054
        int max_sectors_kb = q->max_sectors >> 1;
4055
 
4056
        return queue_var_show(max_sectors_kb, (page));
4057
}
4058
 
4059
static ssize_t
4060
queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
4061
{
4062
        unsigned long max_sectors_kb,
4063
                        max_hw_sectors_kb = q->max_hw_sectors >> 1,
4064
                        page_kb = 1 << (PAGE_CACHE_SHIFT - 10);
4065
        ssize_t ret = queue_var_store(&max_sectors_kb, page, count);
4066
 
4067
        if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb)
4068
                return -EINVAL;
4069
        /*
4070
         * Take the queue lock to update the max_sectors value
4071
         * synchronously:
4072
         */
4073
        spin_lock_irq(q->queue_lock);
4074
        q->max_sectors = max_sectors_kb << 1;
4075
        spin_unlock_irq(q->queue_lock);
4076
 
4077
        return ret;
4078
}
4079
 
4080
static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page)
4081
{
4082
        int max_hw_sectors_kb = q->max_hw_sectors >> 1;
4083
 
4084
        return queue_var_show(max_hw_sectors_kb, (page));
4085
}
4086
 
4087
 
4088
static struct queue_sysfs_entry queue_requests_entry = {
4089
        .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
4090
        .show = queue_requests_show,
4091
        .store = queue_requests_store,
4092
};
4093
 
4094
static struct queue_sysfs_entry queue_ra_entry = {
4095
        .attr = {.name = "read_ahead_kb", .mode = S_IRUGO | S_IWUSR },
4096
        .show = queue_ra_show,
4097
        .store = queue_ra_store,
4098
};
4099
 
4100
static struct queue_sysfs_entry queue_max_sectors_entry = {
4101
        .attr = {.name = "max_sectors_kb", .mode = S_IRUGO | S_IWUSR },
4102
        .show = queue_max_sectors_show,
4103
        .store = queue_max_sectors_store,
4104
};
4105
 
4106
static struct queue_sysfs_entry queue_max_hw_sectors_entry = {
4107
        .attr = {.name = "max_hw_sectors_kb", .mode = S_IRUGO },
4108
        .show = queue_max_hw_sectors_show,
4109
};
4110
 
4111
static struct queue_sysfs_entry queue_iosched_entry = {
4112
        .attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR },
4113
        .show = elv_iosched_show,
4114
        .store = elv_iosched_store,
4115
};
4116
 
4117
static struct attribute *default_attrs[] = {
4118
        &queue_requests_entry.attr,
4119
        &queue_ra_entry.attr,
4120
        &queue_max_hw_sectors_entry.attr,
4121
        &queue_max_sectors_entry.attr,
4122
        &queue_iosched_entry.attr,
4123
        NULL,
4124
};
4125
 
4126
#define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr)
4127
 
4128
static ssize_t
4129
queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
4130
{
4131
        struct queue_sysfs_entry *entry = to_queue(attr);
4132
        struct request_queue *q =
4133
                container_of(kobj, struct request_queue, kobj);
4134
        ssize_t res;
4135
 
4136
        if (!entry->show)
4137
                return -EIO;
4138
        mutex_lock(&q->sysfs_lock);
4139
        if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) {
4140
                mutex_unlock(&q->sysfs_lock);
4141
                return -ENOENT;
4142
        }
4143
        res = entry->show(q, page);
4144
        mutex_unlock(&q->sysfs_lock);
4145
        return res;
4146
}
4147
 
4148
static ssize_t
4149
queue_attr_store(struct kobject *kobj, struct attribute *attr,
4150
                    const char *page, size_t length)
4151
{
4152
        struct queue_sysfs_entry *entry = to_queue(attr);
4153
        struct request_queue *q = container_of(kobj, struct request_queue, kobj);
4154
 
4155
        ssize_t res;
4156
 
4157
        if (!entry->store)
4158
                return -EIO;
4159
        mutex_lock(&q->sysfs_lock);
4160
        if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) {
4161
                mutex_unlock(&q->sysfs_lock);
4162
                return -ENOENT;
4163
        }
4164
        res = entry->store(q, page, length);
4165
        mutex_unlock(&q->sysfs_lock);
4166
        return res;
4167
}
4168
 
4169
static struct sysfs_ops queue_sysfs_ops = {
4170
        .show   = queue_attr_show,
4171
        .store  = queue_attr_store,
4172
};
4173
 
4174
static struct kobj_type queue_ktype = {
4175
        .sysfs_ops      = &queue_sysfs_ops,
4176
        .default_attrs  = default_attrs,
4177
        .release        = blk_release_queue,
4178
};
4179
 
4180
int blk_register_queue(struct gendisk *disk)
4181
{
4182
        int ret;
4183
 
4184
        struct request_queue *q = disk->queue;
4185
 
4186
        if (!q || !q->request_fn)
4187
                return -ENXIO;
4188
 
4189
        q->kobj.parent = kobject_get(&disk->kobj);
4190
 
4191
        ret = kobject_add(&q->kobj);
4192
        if (ret < 0)
4193
                return ret;
4194
 
4195
        kobject_uevent(&q->kobj, KOBJ_ADD);
4196
 
4197
        ret = elv_register_queue(q);
4198
        if (ret) {
4199
                kobject_uevent(&q->kobj, KOBJ_REMOVE);
4200
                kobject_del(&q->kobj);
4201
                return ret;
4202
        }
4203
 
4204
        return 0;
4205
}
4206
 
4207
void blk_unregister_queue(struct gendisk *disk)
4208
{
4209
        struct request_queue *q = disk->queue;
4210
 
4211
        if (q && q->request_fn) {
4212
                elv_unregister_queue(q);
4213
 
4214
                kobject_uevent(&q->kobj, KOBJ_REMOVE);
4215
                kobject_del(&q->kobj);
4216
                kobject_put(&disk->kobj);
4217
        }
4218
}
