/*
       *  linux/drivers/block/ll_rw_blk.c
       *
       * Copyright (C) 1991, 1992 Linus Torvalds
       * Copyright (C) 1994,      Karl Keyte: Added support for disk statistics
       * Elevator latency, (C) 2000  Andrea Arcangeli <andrea@suse.de> SuSE
       * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> - July 2000
       */
      
      /*
       * This handles all read/write requests to block devices
       */
      #include <linux/sched.h>
      #include <linux/kernel.h>
      #include <linux/kernel_stat.h>
      #include <linux/errno.h>
      #include <linux/string.h>
      #include <linux/config.h>
      #include <linux/locks.h>
      #include <linux/mm.h>
      #include <linux/init.h>
      #include <linux/smp_lock.h>
      
      #include <asm/system.h>
      #include <asm/io.h>
      #include <linux/blk.h>
      #include <linux/highmem.h>
      #include <linux/raid/md.h>
      
      #include <linux/module.h>
      
      /*
       * MAC Floppy IWM hooks
       */
      
      #ifdef CONFIG_MAC_FLOPPY_IWM
      extern int mac_floppy_init(void);
      #endif
      
      extern int lvm_init(void);
      
      /*
       * For the allocated request tables
       */
      static kmem_cache_t *request_cachep;
      
      /*
       * The "disk" task queue is used to start the actual requests
       * after a plug
       */
      DECLARE_TASK_QUEUE(tq_disk);
      
      /*
       * Protect the request list against multiple users..
       *
 * With this spinlock the Linux block IO subsystem is 100% SMP threaded
 * from the IRQ event side, and almost 100% SMP threaded from the syscall
 * side (we still have to protect against block device array operations,
 * and the do_request() side is still not safe on its own; the kernel
 * lock currently protects that part).
 *
 * There is a fair chance that things will work just fine if these
 * functions are called with no global kernel lock held ...
       */
      spinlock_t io_request_lock = SPIN_LOCK_UNLOCKED;
      
      /* This specifies how many sectors to read ahead on the disk. */
      
      int read_ahead[MAX_BLKDEV];
      
/*
 * blk_dev_struct holds, for each major, the default request queue and
 * an optional queue() function that maps a device to its queue
 * (see __blk_get_queue() below).
 */
      struct blk_dev_struct blk_dev[MAX_BLKDEV]; /* initialized by blk_dev_init() */
      
      /*
 * blk_size contains the size of all block devices in units of
 * 1024-byte blocks (i.e. kilobytes):
       *
       * blk_size[MAJOR][MINOR]
       *
       * if (!blk_size[MAJOR]) then no minor size checking is done.
       */
      int * blk_size[MAX_BLKDEV];
      
      /*
 * blksize_size contains the (soft) block size of all block devices:
       *
       * blksize_size[MAJOR][MINOR]
       *
       * if (!blksize_size[MAJOR]) then 1024 bytes is assumed.
       */
      int * blksize_size[MAX_BLKDEV];
      
      /*
       * hardsect_size contains the size of the hardware sector of a device.
       *
       * hardsect_size[MAJOR][MINOR]
       *
       * if (!hardsect_size[MAJOR])
       *		then 512 bytes is assumed.
       * else
       *		sector_size is hardsect_size[MAJOR][MINOR]
       * This is currently set by some scsi devices and read by the msdos fs driver.
       * Other uses may appear later.
       */
      int * hardsect_size[MAX_BLKDEV];
      
      /*
       * The following tunes the read-ahead algorithm in mm/filemap.c
       */
      int * max_readahead[MAX_BLKDEV];
      
      /*
       * Max number of sectors per request
       */
      int * max_sectors[MAX_BLKDEV];
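
/*
 * These per-major tables are normally filled in by the individual
 * drivers at initialisation time.  A minimal sketch, assuming a
 * hypothetical driver with major MY_MAJOR and MY_MINORS minors
 * (my_sizes_kb[], my_blksizes[], my_hardsects[] and
 * my_capacity_in_kb() are made-up names):
 *
 *	static int my_sizes_kb[MY_MINORS];
 *	static int my_blksizes[MY_MINORS];
 *	static int my_hardsects[MY_MINORS];
 *
 *	for (i = 0; i < MY_MINORS; i++) {
 *		my_sizes_kb[i] = my_capacity_in_kb(i);
 *		my_blksizes[i] = 1024;
 *		my_hardsects[i] = 512;
 *	}
 *	blk_size[MY_MAJOR] = my_sizes_kb;
 *	blksize_size[MY_MAJOR] = my_blksizes;
 *	hardsect_size[MY_MAJOR] = my_hardsects;
 */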
      
 121  static inline int get_max_sectors(kdev_t dev)
      {
 123  	if (!max_sectors[MAJOR(dev)])
 124  		return MAX_SECTORS;
 125  	return max_sectors[MAJOR(dev)][MINOR(dev)];
      }
      
 128  static inline request_queue_t *__blk_get_queue(kdev_t dev)
      {
      	struct blk_dev_struct *bdev = blk_dev + MAJOR(dev);
      
 132  	if (bdev->queue)
 133  		return bdev->queue(dev);
 134  	else
 135  		return &blk_dev[MAJOR(dev)].request_queue;
      }
      
      /*
       * NOTE: the device-specific queue() functions
       * have to be atomic!
       */
 142  request_queue_t *blk_get_queue(kdev_t dev)
      {
      	request_queue_t *ret;
      	unsigned long flags;
      
 147  	spin_lock_irqsave(&io_request_lock,flags);
      	ret = __blk_get_queue(dev);
 149  	spin_unlock_irqrestore(&io_request_lock,flags);
      
 151  	return ret;
      }
      
 154  static int __blk_cleanup_queue(struct list_head *head)
      {
      	struct list_head *entry;
      	struct request *rq;
      	int i = 0;
      
 160  	if (list_empty(head))
 161  		return 0;
      
      	entry = head->next;
 164  	do {
      		rq = list_entry(entry, struct request, table);
      		entry = entry->next;
      		list_del(&rq->table);
      		kmem_cache_free(request_cachep, rq);
      		i++;
 170  	} while (!list_empty(head));
      
 172  	return i;
      }
      
      /**
       * blk_cleanup_queue: - release a &request_queue_t when it is no longer needed
       * @q:    the request queue to be released
       *
       * Description:
       *     blk_cleanup_queue is the pair to blk_init_queue().  It should
       *     be called when a request queue is being released; typically
       *     when a block device is being de-registered.  Currently, its
 *     primary task is to free all the &struct request structures that
       *     were allocated to the queue.
       * Caveat: 
       *     Hopefully the low level driver will have finished any
       *     outstanding requests first...
       **/
 189  void blk_cleanup_queue(request_queue_t * q)
      {
      	int count = QUEUE_NR_REQUESTS;
      
      	count -= __blk_cleanup_queue(&q->request_freelist[READ]);
      	count -= __blk_cleanup_queue(&q->request_freelist[WRITE]);
      
 196  	if (count)
      		printk("blk_cleanup_queue: leaked requests (%d)\n", count);
      
      	memset(q, 0, sizeof(*q));
      }
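
/*
 * A minimal sketch of the init/cleanup pairing described above, for a
 * hypothetical driver (my_queue, my_request_fn, my_init and my_exit
 * are made-up names):
 *
 *	static request_queue_t my_queue;
 *
 *	static int __init my_init(void)
 *	{
 *		blk_init_queue(&my_queue, my_request_fn);
 *		return 0;
 *	}
 *
 *	static void __exit my_exit(void)
 *	{
 *		blk_cleanup_queue(&my_queue);
 *	}
 */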
      
      /**
       * blk_queue_headactive - indicate whether head of request queue may be active
       * @q:       The queue which this applies to.
 * @active:  A flag indicating whether the head of the queue may be active.
       *
       * Description:
       *    The driver for a block device may choose to leave the currently active
       *    request on the request queue, removing it only when it has completed.
       *    The queue handling routines assume this by default for safety reasons
       *    and will not involve the head of the request queue in any merging or
       *    reordering of requests when the queue is unplugged (and thus may be
       *    working on this particular request).
       *
 *    If a driver removes requests from the queue before processing them, then
 *    it may indicate that it does so, thereby allowing the head of the queue
 *    to be involved in merging and reordering.  This is done by calling
 *    blk_queue_headactive() with an @active flag of %0.
       *
       *    If a driver processes several requests at once, it must remove them (or
       *    at least all but one of them) from the request queue.
       *
       *    When a queue is plugged (see blk_queue_pluggable()) the head will be
       *    assumed to be inactive.
       **/
       
 227  void blk_queue_headactive(request_queue_t * q, int active)
      {
      	q->head_active = active;
      }
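
/*
 * Sketch: a driver whose request function dequeues each request before
 * working on it would announce that right after setting up its queue,
 * so the head can take part in merging (my_request_fn is hypothetical):
 *
 *	blk_init_queue(q, my_request_fn);
 *	blk_queue_headactive(q, 0);
 */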
      
      /**
       * blk_queue_pluggable - define a plugging function for a request queue
       * @q:   the request queue to which the function will apply
       * @plug: the function to be called to plug a queue
       *
       * Description:
       *   A request queue will be "plugged" if a request is added to it
       *   while it is empty.  This allows a number of requests to be added
       *   before any are processed, thus providing an opportunity for these
       *   requests to be merged or re-ordered.
       *   The default plugging function (generic_plug_device()) sets the
       *   "plugged" flag for the queue and adds a task to the $tq_disk task
       *   queue to unplug the queue and call the request function at a
       *   later time.
       *
       *   A device driver may provide an alternate plugging function by
       *   passing it to blk_queue_pluggable().  This function should set
 *   the "plugged" flag if it wants calls to the request function to be
 *   blocked, and should place a task on $tq_disk which will unplug
 *   the queue.  Alternatively it can simply do nothing and thereby
 *   disable plugging of the device.
       **/
      
 255  void blk_queue_pluggable (request_queue_t * q, plug_device_fn *plug)
      {
      	q->plug_device_fn = plug;
      }
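
/*
 * Sketch: a driver that never wants to be plugged can install a plug
 * function that does nothing at all (my_noop_plug is hypothetical):
 *
 *	static void my_noop_plug(request_queue_t *q, kdev_t dev)
 *	{
 *	}
 *
 *	blk_queue_pluggable(q, my_noop_plug);
 */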
      
      
      /**
       * blk_queue_make_request - define an alternate make_request function for a device
       * @q:  the request queue for the device to be affected
       * @mfn: the alternate make_request function
       *
       * Description:
       *    The normal way for &struct buffer_heads to be passed to a device
       *    driver is for them to be collected into requests on a request
       *    queue, and then to allow the device driver to select requests
       *    off that queue when it is ready.  This works well for many block
       *    devices. However some block devices (typically virtual devices
       *    such as md or lvm) do not benefit from the processing on the
       *    request queue, and are served best by having the requests passed
       *    directly to them.  This can be achieved by providing a function
       *    to blk_queue_make_request().
       *
       * Caveat:
       *    The driver that does this *must* be able to deal appropriately
       *    with buffers in "highmemory", either by calling bh_kmap() to get
 *    a kernel mapping, or by calling create_bounce() to create a
       *    buffer in normal memory.
       **/
      
 284  void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn)
      {
      	q->make_request_fn = mfn;
      }
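
/*
 * Sketch of how a stacking driver might use this: its make_request
 * function remaps the buffer onto an underlying device and returns
 * nonzero, so generic_make_request() resubmits the buffer to the new
 * device's queue (my_make_request, my_lower_device and my_map_sector
 * are hypothetical):
 *
 *	static int my_make_request(request_queue_t *q, int rw,
 *				   struct buffer_head *bh)
 *	{
 *		kdev_t target = my_lower_device(bh->b_rdev);
 *
 *		bh->b_rsector = my_map_sector(bh->b_rdev, bh->b_rsector);
 *		bh->b_rdev = target;
 *		return 1;
 *	}
 *
 *	blk_queue_make_request(q, my_make_request);
 */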
      
 289  static inline int ll_new_segment(request_queue_t *q, struct request *req, int max_segments)
      {
 291  	if (req->nr_segments < max_segments) {
      		req->nr_segments++;
      		q->elevator.nr_segments++;
 294  		return 1;
      	}
 296  	return 0;
      }
      
 299  static int ll_back_merge_fn(request_queue_t *q, struct request *req, 
      			    struct buffer_head *bh, int max_segments)
      {
 302  	if (req->bhtail->b_data + req->bhtail->b_size == bh->b_data)
 303  		return 1;
 304  	return ll_new_segment(q, req, max_segments);
      }
      
 307  static int ll_front_merge_fn(request_queue_t *q, struct request *req, 
      			     struct buffer_head *bh, int max_segments)
      {
 310  	if (bh->b_data + bh->b_size == req->bh->b_data)
 311  		return 1;
 312  	return ll_new_segment(q, req, max_segments);
      }
      
 315  static int ll_merge_requests_fn(request_queue_t *q, struct request *req,
      				struct request *next, int max_segments)
      {
      	int total_segments = req->nr_segments + next->nr_segments;
      	int same_segment;
      
      	same_segment = 0;
 322  	if (req->bhtail->b_data + req->bhtail->b_size == next->bh->b_data) {
      		total_segments--;
      		same_segment = 1;
      	}
          
 327  	if (total_segments > max_segments)
 328  		return 0;
      
      	q->elevator.nr_segments -= same_segment;
      	req->nr_segments = total_segments;
 332  	return 1;
      }
      
      /*
       * "plug" the device if there are no outstanding requests: this will
       * force the transfer to start only after we have put all the requests
       * on the list.
       *
       * This is called with interrupts off and no requests on the queue.
       * (and with the request spinlock acquired)
       */
 343  static void generic_plug_device(request_queue_t *q, kdev_t dev)
      {
      	/*
      	 * no need to replug device
      	 */
 348  	if (!list_empty(&q->queue_head) || q->plugged)
 349  		return;
      
      	q->plugged = 1;
      	queue_task(&q->plug_tq, &tq_disk);
      }
      
      /*
       * remove the plug and let it rip..
       */
 358  static inline void __generic_unplug_device(request_queue_t *q)
      {
 360  	if (q->plugged) {
      		q->plugged = 0;
 362  		if (!list_empty(&q->queue_head))
      			q->request_fn(q);
      	}
      }
      
 367  static void generic_unplug_device(void *data)
      {
      	request_queue_t *q = (request_queue_t *) data;
      	unsigned long flags;
      
 372  	spin_lock_irqsave(&io_request_lock, flags);
      	__generic_unplug_device(q);
 374  	spin_unlock_irqrestore(&io_request_lock, flags);
      }
      
 377  static void blk_init_free_list(request_queue_t *q)
      {
      	struct request *rq;
      	int i;
      
      	/*
      	 * Divide requests in half between read and write. This used to
      	 * be a 2/3 advantage for reads, but now reads can steal from
      	 * the write free list.
      	 */
 387  	for (i = 0; i < QUEUE_NR_REQUESTS; i++) {
      		rq = kmem_cache_alloc(request_cachep, SLAB_KERNEL);
      		rq->rq_status = RQ_INACTIVE;
      		list_add(&rq->table, &q->request_freelist[i & 1]);
      	}
      
      	init_waitqueue_head(&q->wait_for_request);
 394  	spin_lock_init(&q->request_lock);
      }
      
      static int __make_request(request_queue_t * q, int rw, struct buffer_head * bh);
      
      /**
       * blk_init_queue  - prepare a request queue for use with a block device
       * @q:    The &request_queue_t to be initialised
       * @rfn:  The function to be called to process requests that have been
       *        placed on the queue.
       *
       * Description:
       *    If a block device wishes to use the standard request handling procedures,
       *    which sorts requests and coalesces adjacent requests, then it must
       *    call blk_init_queue().  The function @rfn will be called when there
       *    are requests on the queue that need to be processed.  If the device
       *    supports plugging, then @rfn may not be called immediately when requests
       *    are available on the queue, but may be called at some time later instead.
       *    Plugged queues are generally unplugged when a buffer belonging to one
       *    of the requests on the queue is needed, or due to memory pressure.
       *
       *    @rfn is not required, or even expected, to remove all requests off the
       *    queue, but only as many as it can handle at a time.  If it does leave
       *    requests on the queue, it is responsible for arranging that the requests
       *    get dealt with eventually.
       *
       *    A global spin lock $io_request_lock must be held while manipulating the
       *    requests on the request queue.
       *
       *    The request on the head of the queue is by default assumed to be
       *    potentially active, and it is not considered for re-ordering or merging
       *    whenever the given queue is unplugged. This behaviour can be changed with
       *    blk_queue_headactive().
       *
       * Note:
 *    blk_init_queue() must be paired with a blk_cleanup_queue() call
       *    when the block device is deactivated (such as at module unload).
       **/
 432  void blk_init_queue(request_queue_t * q, request_fn_proc * rfn)
      {
 434  	INIT_LIST_HEAD(&q->queue_head);
 435  	INIT_LIST_HEAD(&q->request_freelist[READ]);
 436  	INIT_LIST_HEAD(&q->request_freelist[WRITE]);
      	elevator_init(&q->elevator, ELEVATOR_LINUS);
      	blk_init_free_list(q);
      	q->request_fn     	= rfn;
      	q->back_merge_fn       	= ll_back_merge_fn;
      	q->front_merge_fn      	= ll_front_merge_fn;
      	q->merge_requests_fn	= ll_merge_requests_fn;
      	q->make_request_fn	= __make_request;
      	q->plug_tq.sync		= 0;
      	q->plug_tq.routine	= &generic_unplug_device;
      	q->plug_tq.data		= q;
      	q->plugged        	= 0;
      	/*
      	 * These booleans describe the queue properties.  We set the
	 * default (and most common) values here.  Drivers can use
	 * the appropriate functions to alter the queue properties
	 * as needed.
      	 */
      	q->plug_device_fn 	= generic_plug_device;
      	q->head_active    	= 1;
      }
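
/*
 * A minimal sketch of the typical initialisation sequence for a
 * conventional driver; MY_MAJOR, my_bdops and my_request_fn are
 * hypothetical, and BLK_DEFAULT_QUEUE() comes from <linux/blk.h>:
 *
 *	if (register_blkdev(MY_MAJOR, "mydev", &my_bdops) < 0)
 *		return -EIO;
 *	blk_init_queue(BLK_DEFAULT_QUEUE(MY_MAJOR), my_request_fn);
 *	read_ahead[MY_MAJOR] = 8;
 */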
      
      
#define blkdev_free_rq(list) list_entry((list)->next, struct request, table)
      /*
       * Get a free request. io_request_lock must be held and interrupts
       * disabled on the way in.
       */
 464  static inline struct request *get_request(request_queue_t *q, int rw)
      {
      	struct list_head *list = &q->request_freelist[rw];
      	struct request *rq;
      
      	/*
      	 * Reads get preferential treatment and are allowed to steal
      	 * from the write free list if necessary.
      	 */
 473  	if (!list_empty(list)) {
      		rq = blkdev_free_rq(list);
 475  		goto got_rq;
      	}
      
      	/*
	 * If the WRITE list is non-empty, we know that rw is READ
	 * and that the READ list is empty; allow reads to 'steal'
      	 * from the WRITE list.
      	 */
 483  	if (!list_empty(&q->request_freelist[WRITE])) {
      		list = &q->request_freelist[WRITE];
      		rq = blkdev_free_rq(list);
 486  		goto got_rq;
      	}
      
 489  	return NULL;
      
      got_rq:
      	list_del(&rq->table);
      	rq->free_list = list;
      	rq->rq_status = RQ_ACTIVE;
      	rq->special = NULL;
      	rq->q = q;
 497  	return rq;
      }
      
      /*
       * No available requests for this queue, unplug the device.
       */
 503  static struct request *__get_request_wait(request_queue_t *q, int rw)
      {
      	register struct request *rq;
      	DECLARE_WAITQUEUE(wait, current);
      
      	add_wait_queue_exclusive(&q->wait_for_request, &wait);
 509  	for (;;) {
 510  		__set_current_state(TASK_UNINTERRUPTIBLE);
 511  		spin_lock_irq(&io_request_lock);
      		rq = get_request(q, rw);
 513  		spin_unlock_irq(&io_request_lock);
 514  		if (rq)
 515  			break;
      		generic_unplug_device(q);
      		schedule();
      	}
      	remove_wait_queue(&q->wait_for_request, &wait);
      	current->state = TASK_RUNNING;
 521  	return rq;
      }
      
 524  static inline struct request *get_request_wait(request_queue_t *q, int rw)
      {
      	register struct request *rq;
      
 528  	spin_lock_irq(&io_request_lock);
      	rq = get_request(q, rw);
 530  	spin_unlock_irq(&io_request_lock);
 531  	if (rq)
 532  		return rq;
 533  	return __get_request_wait(q, rw);
      }
      
      /* RO fail safe mechanism */
      
      static long ro_bits[MAX_BLKDEV][8];
      
 540  int is_read_only(kdev_t dev)
      {
      	int minor,major;
      
      	major = MAJOR(dev);
      	minor = MINOR(dev);
 546  	if (major < 0 || major >= MAX_BLKDEV) return 0;
 547  	return ro_bits[major][minor >> 5] & (1 << (minor & 31));
      }
      
 550  void set_device_ro(kdev_t dev,int flag)
      {
      	int minor,major;
      
      	major = MAJOR(dev);
      	minor = MINOR(dev);
 556  	if (major < 0 || major >= MAX_BLKDEV) return;
 557  	if (flag) ro_bits[major][minor >> 5] |= 1 << (minor & 31);
 558  	else ro_bits[major][minor >> 5] &= ~(1 << (minor & 31));
      }
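
/*
 * Sketch: a driver that detects a write-protected medium can mark the
 * device read-only and query the bit again later
 * (my_write_protected() is hypothetical):
 *
 *	if (my_write_protected(dev))
 *		set_device_ro(dev, 1);
 *
 *	if (is_read_only(dev))
 *		return -EROFS;
 */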
      
 561  inline void drive_stat_acct (kdev_t dev, int rw,
      				unsigned long nr_sectors, int new_io)
      {
      	unsigned int major = MAJOR(dev);
      	unsigned int index;
      
      	index = disk_index(dev);
 568  	if ((index >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
 569  		return;
      
      	kstat.dk_drive[major][index] += new_io;
 572  	if (rw == READ) {
      		kstat.dk_drive_rio[major][index] += new_io;
      		kstat.dk_drive_rblk[major][index] += nr_sectors;
 575  	} else if (rw == WRITE) {
      		kstat.dk_drive_wio[major][index] += new_io;
      		kstat.dk_drive_wblk[major][index] += nr_sectors;
 578  	} else
      		printk(KERN_ERR "drive_stat_acct: cmd not R/W?\n");
      }
      
      /*
 * add_request adds a request to the linked list.
 * The caller must already hold the io_request_lock (with interrupts
 * disabled), as __make_request() does, so that it can muck with the
 * request lists in peace.
       *
       * By this point, req->cmd is always either READ/WRITE, never READA,
       * which is important for drive_stat_acct() above.
       */
      
 592  static inline void add_request(request_queue_t * q, struct request * req,
      			       struct list_head *head, int lat)
      {
      	int major;
      
      	drive_stat_acct(req->rq_dev, req->cmd, req->nr_sectors, 1);
      
      	/*
      	 * let selected elevator insert the request
      	 */
      	q->elevator.elevator_fn(req, &q->elevator, &q->queue_head, head, lat);
      
              /*
      	 * FIXME(eric) I don't understand why there is a need for this
      	 * special case code.  It clearly doesn't fit any more with
      	 * the new queueing architecture, and it got added in 2.3.10.
      	 * I am leaving this in here until I hear back from the COMPAQ
      	 * people.
               */
      	major = MAJOR(req->rq_dev);
 612  	if (major >= COMPAQ_SMART2_MAJOR+0 && major <= COMPAQ_SMART2_MAJOR+7)
      		(q->request_fn)(q);
 614  	if (major >= COMPAQ_CISS_MAJOR+0 && major <= COMPAQ_CISS_MAJOR+7)
                      (q->request_fn)(q);
 616  	if (major >= DAC960_MAJOR+0 && major <= DAC960_MAJOR+7)
      		(q->request_fn)(q);
      }
      
      /*
       * Must be called with io_request_lock held and interrupts disabled
       */
 623  void inline blkdev_release_request(struct request *req)
      {
      	req->rq_status = RQ_INACTIVE;
      
      	/*
      	 * Request may not have originated from ll_rw_blk
      	 */
 630  	if (req->free_list) {
      		list_add(&req->table, req->free_list);
      		req->free_list = NULL;
      		wake_up(&req->q->wait_for_request);
      	}
      }
      
      /*
       * Has to be called with the request spinlock acquired
       */
 640  static void attempt_merge(request_queue_t * q,
      			  struct request *req,
      			  int max_sectors,
      			  int max_segments)
      {
      	struct request *next;
        
      	next = blkdev_next_request(req);
 648  	if (req->sector + req->nr_sectors != next->sector)
 649  		return;
      	if (req->cmd != next->cmd
      	    || req->rq_dev != next->rq_dev
      	    || req->nr_sectors + next->nr_sectors > max_sectors
 653  	    || next->sem)
 654  		return;
      	/*
      	 * If we are not allowed to merge these requests, then
      	 * return.  If we are allowed to merge, then the count
      	 * will have been updated to the appropriate number,
      	 * and we shouldn't do it here too.
      	 */
 661  	if(!(q->merge_requests_fn)(q, req, next, max_segments))
 662  		return;
      
      	req->bhtail->b_reqnext = next->bh;
      	req->bhtail = next->bhtail;
      	req->nr_sectors = req->hard_nr_sectors += next->hard_nr_sectors;
      	list_del(&next->queue);
      	blkdev_release_request(next);
      }
      
 671  static inline void attempt_back_merge(request_queue_t * q,
      				      struct request *req,
      				      int max_sectors,
      				      int max_segments)
      {
 676  	if (&req->queue == q->queue_head.prev)
 677  		return;
      	attempt_merge(q, req, max_sectors, max_segments);
      }
      
 681  static inline void attempt_front_merge(request_queue_t * q,
      				       struct list_head * head,
      				       struct request *req,
      				       int max_sectors,
      				       int max_segments)
      {
      	struct list_head * prev;
      
      	prev = req->queue.prev;
 690  	if (head == prev)
 691  		return;
      	attempt_merge(q, blkdev_entry_to_request(prev), max_sectors, max_segments);
      }
      
 695  static int __make_request(request_queue_t * q, int rw,
      				  struct buffer_head * bh)
      {
      	unsigned int sector, count;
      	int max_segments = MAX_SEGMENTS;
      	struct request * req = NULL, *freereq = NULL;
      	int rw_ahead, max_sectors, el_ret;
      	struct list_head *head;
      	int latency;
      	elevator_t *elevator = &q->elevator;
      
      	count = bh->b_size >> 9;
      	sector = bh->b_rsector;
      
      	rw_ahead = 0;	/* normal case; gets changed below for READA */
 710  	switch (rw) {
 711  		case READA:
      			rw_ahead = 1;
      			rw = READ;	/* drop into READ */
 714  		case READ:
 715  		case WRITE:
 716  			break;
 717  		default:
 718  			BUG();
 719  			goto end_io;
      	}
      
      	/* We'd better have a real physical mapping!
      	   Check this bit only if the buffer was dirty and just locked
      	   down by us so at this point flushpage will block and
      	   won't clear the mapped bit under us. */
 726  	if (!buffer_mapped(bh))
 727  		BUG();
      
      	/*
      	 * Temporary solution - in 2.5 this will be done by the lowlevel
      	 * driver. Create a bounce buffer if the buffer data points into
      	 * high memory - keep the original buffer otherwise.
      	 */
#ifdef CONFIG_HIGHMEM
      	bh = create_bounce(rw, bh);
      #endif
      
	/*
	 * Try to coalesce the new request with old requests before
	 * looking for a free one.
	 */
      	max_sectors = get_max_sectors(bh->b_rdev);
      
      	latency = elevator_request_latency(elevator, rw);
      
      	/*
	 * Now we acquire the request spinlock; we have to be mega careful
	 * not to schedule or do anything non-atomic.
      	 */
      again:
 751  	spin_lock_irq(&io_request_lock);
      
      	/*
      	 * skip first entry, for devices with active queue head
      	 */
      	head = &q->queue_head;
 757  	if (q->head_active && !q->plugged)
      		head = head->next;
      
 760  	if (list_empty(head)) {
      		q->plug_device_fn(q, bh->b_rdev); /* is atomic */
 762  		goto get_rq;
      	}
      
      	el_ret = elevator->elevator_merge_fn(q, &req, bh, rw,
      					     &max_sectors, &max_segments);
 767  	switch (el_ret) {
      
 769  		case ELEVATOR_BACK_MERGE:
 770  			if (!q->back_merge_fn(q, req, bh, max_segments))
 771  				break;
      			req->bhtail->b_reqnext = bh;
      			req->bhtail = bh;
      			req->nr_sectors = req->hard_nr_sectors += count;
      			req->e = elevator;
      			drive_stat_acct(req->rq_dev, req->cmd, count, 0);
      			attempt_back_merge(q, req, max_sectors, max_segments);
 778  			goto out;
      
 780  		case ELEVATOR_FRONT_MERGE:
 781  			if (!q->front_merge_fn(q, req, bh, max_segments))
 782  				break;
      			bh->b_reqnext = req->bh;
      			req->bh = bh;
      			req->buffer = bh->b_data;
      			req->current_nr_sectors = count;
      			req->sector = req->hard_sector = sector;
      			req->nr_sectors = req->hard_nr_sectors += count;
      			req->e = elevator;
      			drive_stat_acct(req->rq_dev, req->cmd, count, 0);
      			attempt_front_merge(q, head, req, max_sectors, max_segments);
 792  			goto out;
      		/*
      		 * elevator says don't/can't merge. get new request
      		 */
 796  		case ELEVATOR_NO_MERGE:
 797  			break;
      
 799  		default:
      			printk("elevator returned crap (%d)\n", el_ret);
 801  			BUG();
      	}
      		
      	/*
	 * Grab a free request from the freelist. Reads first try their
      	 * own queue - if that is empty, we steal from the write list.
      	 * Writes must block if the write list is empty, and read aheads
      	 * are not crucial.
      	 */
      get_rq:
 811  	if (freereq) {
      		req = freereq;
      		freereq = NULL;
 814  	} else if ((req = get_request(q, rw)) == NULL) {
 815  		spin_unlock_irq(&io_request_lock);
 816  		if (rw_ahead)
 817  			goto end_io;
      
      		freereq = __get_request_wait(q, rw);
 820  		goto again;
      	}
      
      /* fill up the request-info, and add it to the queue */
      	req->cmd = rw;
      	req->errors = 0;
      	req->hard_sector = req->sector = sector;
      	req->hard_nr_sectors = req->nr_sectors = count;
      	req->current_nr_sectors = count;
      	req->nr_segments = 1; /* Always 1 for a new request. */
      	req->nr_hw_segments = 1; /* Always 1 for a new request. */
      	req->buffer = bh->b_data;
      	req->sem = NULL;
      	req->bh = bh;
      	req->bhtail = bh;
      	req->rq_dev = bh->b_rdev;
      	req->e = elevator;
      	add_request(q, req, head, latency);
      out:
 839  	if (!q->plugged)
      		(q->request_fn)(q);
 841  	if (freereq)
      		blkdev_release_request(freereq);
 843  	spin_unlock_irq(&io_request_lock);
 844  	return 0;
      end_io:
      	bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
 847  	return 0;
      }
      
      /**
 * generic_make_request: hand a buffer head to its device driver for I/O
       * @rw:  READ, WRITE, or READA - what sort of I/O is desired.
       * @bh:  The buffer head describing the location in memory and on the device.
       *
       * generic_make_request() is used to make I/O requests of block
       * devices. It is passed a &struct buffer_head and a &rw value.  The
       * %READ and %WRITE options are (hopefully) obvious in meaning.  The
       * %READA value means that a read is required, but that the driver is
       * free to fail the request if, for example, it cannot get needed
       * resources immediately.
       *
       * generic_make_request() does not return any status.  The
       * success/failure status of the request, along with notification of
       * completion, is delivered asynchronously through the bh->b_end_io
 * function described (one day) elsewhere.
       *
       * The caller of generic_make_request must make sure that b_page,
 * b_data, b_size are set to describe the memory buffer, that b_rdev
       * and b_rsector are set to describe the device address, and the
       * b_end_io and optionally b_private are set to describe how
       * completion notification should be signaled.  BH_Mapped should also
       * be set (to confirm that b_dev and b_blocknr are valid).
       *
       * generic_make_request and the drivers it calls may use b_reqnext,
       * and may change b_rdev and b_rsector.  So the values of these fields
       * should NOT be depended on after the call to generic_make_request.
       * Because of this, the caller should record the device address
       * information in b_dev and b_blocknr.
       *
       * Apart from those fields mentioned above, no other fields, and in
       * particular, no other flags, are changed by generic_make_request or
       * any lower level drivers.
 **/
 884  void generic_make_request (int rw, struct buffer_head * bh)
      {
      	int major = MAJOR(bh->b_rdev);
      	request_queue_t *q;
      
 889  	if (!bh->b_end_io) BUG();
 890  	if (blk_size[major]) {
      		unsigned long maxsector = (blk_size[major][MINOR(bh->b_rdev)] << 1) + 1;
      		unsigned int sector, count;
      
      		count = bh->b_size >> 9;
      		sector = bh->b_rsector;
      
 897  		if (maxsector < count || maxsector - count < sector) {
      			bh->b_state &= (1 << BH_Lock) | (1 << BH_Mapped);
 899  			if (blk_size[major][MINOR(bh->b_rdev)]) {
      				
      				/* This may well happen - the kernel calls bread()
      				   without checking the size of the device, e.g.,
      				   when mounting a device. */
      				printk(KERN_INFO
      				       "attempt to access beyond end of device\n");
      				printk(KERN_INFO "%s: rw=%d, want=%d, limit=%d\n",
      				       kdevname(bh->b_rdev), rw,
      				       (sector + count)>>1,
      				       blk_size[major][MINOR(bh->b_rdev)]);
      			}
      			bh->b_end_io(bh, 0);
 912  			return;
      		}
      	}
      
      	/*
      	 * Resolve the mapping until finished. (drivers are
      	 * still free to implement/resolve their own stacking
      	 * by explicitly returning 0)
      	 */
      	/* NOTE: we don't repeat the blk_size check for each new device.
      	 * Stacking drivers are expected to know what they are doing.
      	 */
 924  	do {
      		q = blk_get_queue(bh->b_rdev);
 926  		if (!q) {
      			printk(KERN_ERR
      			       "generic_make_request: Trying to access nonexistent block-device %s (%ld)\n",
      			       kdevname(bh->b_rdev), bh->b_rsector);
      			buffer_IO_error(bh);
 931  			break;
      		}
      
      	}
 935  	while (q->make_request_fn(q, rw, bh));
      }
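
/*
 * Sketch of a caller preparing a buffer_head by hand as described
 * above and handing it over; my_end_io, my_note_completion, page, dev,
 * sector and my_cookie are all hypothetical, and a real caller
 * normally gets its buffer_head from the buffer or page cache:
 *
 *	static void my_end_io(struct buffer_head *bh, int uptodate)
 *	{
 *		my_note_completion(bh->b_private, uptodate);
 *	}
 *
 *	bh->b_page = page;
 *	bh->b_data = page_address(page);
 *	bh->b_size = 512;
 *	bh->b_rdev = dev;
 *	bh->b_rsector = sector;
 *	bh->b_end_io = my_end_io;
 *	bh->b_private = my_cookie;
 *	set_bit(BH_Mapped, &bh->b_state);
 *	generic_make_request(READ, bh);
 */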
      
      
      /**
 * submit_bh: submit a buffer_head to the block device layer for I/O
 * @rw: whether to %READ or %WRITE, or maybe %READA (read ahead)
       * @bh: The &struct buffer_head which describes the I/O
       *
       * submit_bh() is very similar in purpose to generic_make_request(), and
       * uses that function to do most of the work.
       *
       * The extra functionality provided by submit_bh is to determine
       * b_rsector from b_blocknr and b_size, and to set b_rdev from b_dev.
 * This is appropriate for IO requests that come from the buffer
       * cache and page cache which (currently) always use aligned blocks.
       */
 952  void submit_bh(int rw, struct buffer_head * bh)
      {
 954  	if (!test_bit(BH_Lock, &bh->b_state))
 955  		BUG();
      
      	set_bit(BH_Req, &bh->b_state);
      
      	/*
      	 * First step, 'identity mapping' - RAID or LVM might
      	 * further remap this.
      	 */
      	bh->b_rdev = bh->b_dev;
      	bh->b_rsector = bh->b_blocknr * (bh->b_size>>9);
      
      	generic_make_request(rw, bh);
      
 968  	switch (rw) {
 969  		case WRITE:
      			kstat.pgpgout++;
 971  			break;
 972  		default:
      			kstat.pgpgin++;
 974  			break;
      	}
      }
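
/*
 * Sketch: a caller that wants its own completion routine locks the
 * buffer itself, installs the handler and then submits
 * (my_end_io is hypothetical; compare ll_rw_block() below, which
 * installs end_buffer_io_sync instead):
 *
 *	if (test_and_set_bit(BH_Lock, &bh->b_state))
 *		return;
 *	bh->b_end_io = my_end_io;
 *	submit_bh(READ, bh);
 */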
      
      /*
       * Default IO end handler, used by "ll_rw_block()".
       */
 981  static void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
      {
      	mark_buffer_uptodate(bh, uptodate);
      	unlock_buffer(bh);
      }
      
      /**
       * ll_rw_block: low-level access to block devices
       * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
       * @nr: number of &struct buffer_heads in the array
       * @bhs: array of pointers to &struct buffer_head
       *
       * ll_rw_block() takes an array of pointers to &struct buffer_heads,
       * and requests an I/O operation on them, either a %READ or a %WRITE.
       * The third %READA option is described in the documentation for
       * generic_make_request() which ll_rw_block() calls.
       *
       * This function provides extra functionality that is not in
       * generic_make_request() that is relevant to buffers in the buffer
       * cache or page cache.  In particular it drops any buffer that it
       * cannot get a lock on (with the BH_Lock state bit), any buffer that
       * appears to be clean when doing a write request, and any buffer that
 * appears to be up-to-date when doing a read request.  Further, it marks
 * as clean buffers that are processed for writing (the buffer cache
 * won't assume that they are actually clean until the buffer gets
 * unlocked).
 *
 * ll_rw_block sets b_end_io to a simple completion handler that marks
 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
 * any waiters.  Any client that needs a more interesting completion
       * routine should call submit_bh() (or generic_make_request())
       * directly.
       *
       * Caveat:
       *  All of the buffers must be for the same device, and must also be
       *  of the current approved size for the device.  */
      
1018  void ll_rw_block(int rw, int nr, struct buffer_head * bhs[])
      {
      	unsigned int major;
      	int correct_size;
      	int i;
      
      	major = MAJOR(bhs[0]->b_dev);
      
      	/* Determine correct block size for this device. */
      	correct_size = BLOCK_SIZE;
1028  	if (blksize_size[major]) {
      		i = blksize_size[major][MINOR(bhs[0]->b_dev)];
1030  		if (i)
      			correct_size = i;
      	}
      
      	/* Verify requested block sizes. */
1035  	for (i = 0; i < nr; i++) {
      		struct buffer_head *bh;
      		bh = bhs[i];
1038  		if (bh->b_size != correct_size) {
      			printk(KERN_NOTICE "ll_rw_block: device %s: "
      			       "only %d-char blocks implemented (%u)\n",
      			       kdevname(bhs[0]->b_dev),
      			       correct_size, bh->b_size);
1043  			goto sorry;
      		}
      	}
      
1047  	if ((rw & WRITE) && is_read_only(bhs[0]->b_dev)) {
      		printk(KERN_NOTICE "Can't write to read-only device %s\n",
      		       kdevname(bhs[0]->b_dev));
1050  		goto sorry;
      	}
      
1053  	for (i = 0; i < nr; i++) {
      		struct buffer_head *bh;
      		bh = bhs[i];
      
      		/* Only one thread can actually submit the I/O. */
1058  		if (test_and_set_bit(BH_Lock, &bh->b_state))
1059  			continue;
      
      		/* We have the buffer lock */
      		bh->b_end_io = end_buffer_io_sync;
      
1064  		switch(rw) {
1065  		case WRITE:
1066  			if (!atomic_set_buffer_clean(bh))
      				/* Hmmph! Nothing to write */
1068  				goto end_io;
      			__mark_buffer_clean(bh);
1070  			break;
      
1072  		case READA:
1073  		case READ:
1074  			if (buffer_uptodate(bh))
      				/* Hmmph! Already have it */
1076  				goto end_io;
1077  			break;
1078  		default:
1079  			BUG();
      	end_io:
      			bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
1082  			continue;
      		}
      
      		submit_bh(rw, bh);
      	}
1087  	return;
      
      sorry:
      	/* Make sure we don't get infinite dirty retries.. */
1091  	for (i = 0; i < nr; i++)
      		mark_buffer_clean(bhs[i]);
      }
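
/*
 * Sketch of the classic synchronous use of ll_rw_block(): submit one
 * buffer for reading and wait for it to complete (bh is assumed to
 * come from getblk() or similar, so b_dev and b_blocknr are valid):
 *
 *	ll_rw_block(READ, 1, &bh);
 *	wait_on_buffer(bh);
 *	if (!buffer_uptodate(bh))
 *		return -EIO;
 */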
      
      #ifdef CONFIG_STRAM_SWAP
      extern int stram_device_init (void);
      #endif
      
      /*
       * First step of what used to be end_request
       *
 * A return value of 0 means the request is complete: continue with
 * end_that_request_last().  A return value of 1 means there are still
 * buffer heads left to process on this request.
       */
      
1106  int end_that_request_first (struct request *req, int uptodate, char *name)
      {
      	struct buffer_head * bh;
      	int nsect;
      
      	req->errors = 0;
1112  	if (!uptodate)
      		printk("end_request: I/O error, dev %s (%s), sector %lu\n",
      			kdevname(req->rq_dev), name, req->sector);
      
1116  	if ((bh = req->bh) != NULL) {
      		nsect = bh->b_size >> 9;
      		req->bh = bh->b_reqnext;
      		bh->b_reqnext = NULL;
      		bh->b_end_io(bh, uptodate);
1121  		if ((bh = req->bh) != NULL) {
      			req->hard_sector += nsect;
      			req->hard_nr_sectors -= nsect;
      			req->sector = req->hard_sector;
      			req->nr_sectors = req->hard_nr_sectors;
      
      			req->current_nr_sectors = bh->b_size >> 9;
1128  			if (req->nr_sectors < req->current_nr_sectors) {
      				req->nr_sectors = req->current_nr_sectors;
      				printk("end_request: buffer-list destroyed\n");
      			}
      			req->buffer = bh->b_data;
1133  			return 1;
      		}
      	}
1136  	return 0;
      }
      
1139  void end_that_request_last(struct request *req)
      {
1141  	if (req->e) {
      		printk("end_that_request_last called with non-dequeued req\n");
1143  		BUG();
      	}
1145  	if (req->sem != NULL)
      		up(req->sem);
      
      	blkdev_release_request(req);
      }
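
/*
 * Sketch of the usual completion sequence in a driver, essentially
 * what the end_request() helper in <linux/blk.h> boils down to;
 * "req" and "uptodate" are whatever the driver has at hand:
 *
 *	if (!end_that_request_first(req, uptodate, "mydev")) {
 *		blkdev_dequeue_request(req);
 *		end_that_request_last(req);
 *	}
 */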
      
1151  int __init blk_dev_init(void)
      {
      	struct blk_dev_struct *dev;
      
      	request_cachep = kmem_cache_create("blkdev_requests",
      					   sizeof(struct request),
      					   0, SLAB_HWCACHE_ALIGN, NULL, NULL);
      
1159  	if (!request_cachep)
      		panic("Can't create request pool slab cache\n");
      
1162  	for (dev = blk_dev + MAX_BLKDEV; dev-- != blk_dev;)
      		dev->queue = NULL;
      
      	memset(ro_bits,0,sizeof(ro_bits));
      	memset(max_readahead, 0, sizeof(max_readahead));
      	memset(max_sectors, 0, sizeof(max_sectors));
      #ifdef CONFIG_AMIGA_Z2RAM
      	z2_init();
      #endif
      #ifdef CONFIG_STRAM_SWAP
      	stram_device_init();
      #endif
      #ifdef CONFIG_BLK_DEV_RAM
      	rd_init();
      #endif
      #ifdef CONFIG_BLK_DEV_LOOP
      	loop_init();
      #endif
      #ifdef CONFIG_ISP16_CDI
      	isp16_init();
      #endif
      #if defined(CONFIG_IDE) && defined(CONFIG_BLK_DEV_IDE)
      	ide_init();		/* this MUST precede hd_init */
      #endif
      #if defined(CONFIG_IDE) && defined(CONFIG_BLK_DEV_HD)
      	hd_init();
      #endif
      #ifdef CONFIG_BLK_DEV_PS2
      	ps2esdi_init();
      #endif
      #ifdef CONFIG_BLK_DEV_XD
      	xd_init();
      #endif
      #ifdef CONFIG_BLK_DEV_MFM
      	mfm_init();
      #endif
      #ifdef CONFIG_PARIDE
      	{ extern void paride_init(void); paride_init(); };
      #endif
      #ifdef CONFIG_MAC_FLOPPY
      	swim3_init();
      #endif
      #ifdef CONFIG_BLK_DEV_SWIM_IOP
      	swimiop_init();
      #endif
      #ifdef CONFIG_AMIGA_FLOPPY
      	amiga_floppy_init();
      #endif
      #ifdef CONFIG_ATARI_FLOPPY
      	atari_floppy_init();
      #endif
      #ifdef CONFIG_BLK_DEV_FD
      	floppy_init();
      #else
      #if defined(__i386__)	/* Do we even need this? */
      	outb_p(0xc, 0x3f2);
      #endif
      #endif
      #ifdef CONFIG_CDU31A
      	cdu31a_init();
      #endif
      #ifdef CONFIG_ATARI_ACSI
      	acsi_init();
      #endif
      #ifdef CONFIG_MCD
      	mcd_init();
      #endif
      #ifdef CONFIG_MCDX
      	mcdx_init();
      #endif
      #ifdef CONFIG_SBPCD
      	sbpcd_init();
      #endif
      #ifdef CONFIG_AZTCD
      	aztcd_init();
      #endif
      #ifdef CONFIG_CDU535
      	sony535_init();
      #endif
      #ifdef CONFIG_GSCD
      	gscd_init();
      #endif
      #ifdef CONFIG_CM206
      	cm206_init();
      #endif
      #ifdef CONFIG_OPTCD
      	optcd_init();
      #endif
      #ifdef CONFIG_SJCD
      	sjcd_init();
      #endif
      #ifdef CONFIG_APBLOCK
      	ap_init();
      #endif
      #ifdef CONFIG_DDV
      	ddv_init();
      #endif
      #ifdef CONFIG_BLK_DEV_NBD
      	nbd_init();
      #endif
      #ifdef CONFIG_MDISK
      	mdisk_init();
      #endif
      #ifdef CONFIG_DASD
      	dasd_init();
      #endif
      #ifdef CONFIG_SUN_JSFLASH
      	jsfd_init();
      #endif
      #ifdef CONFIG_BLK_DEV_LVM
      	lvm_init();
      #endif
1274  	return 0;
}
      
      EXPORT_SYMBOL(io_request_lock);
      EXPORT_SYMBOL(end_that_request_first);
      EXPORT_SYMBOL(end_that_request_last);
      EXPORT_SYMBOL(blk_init_queue);
      EXPORT_SYMBOL(blk_get_queue);
      EXPORT_SYMBOL(blk_cleanup_queue);
      EXPORT_SYMBOL(blk_queue_headactive);
      EXPORT_SYMBOL(blk_queue_pluggable);
      EXPORT_SYMBOL(blk_queue_make_request);
      EXPORT_SYMBOL(generic_make_request);
      EXPORT_SYMBOL(blkdev_release_request);