[kernel] r15772 - in dists/sid/linux-2.6/debian: . patches/features/all/openvz
Maximilian Attems
maks at alioth.debian.org
Wed May 26 05:20:28 UTC 2010
Author: maks
Date: Wed May 26 05:20:20 2010
New Revision: 15772
Log:
update openvz patch:
full blkio backport.
Modified:
dists/sid/linux-2.6/debian/changelog
dists/sid/linux-2.6/debian/patches/features/all/openvz/openvz.patch
Modified: dists/sid/linux-2.6/debian/changelog
==============================================================================
--- dists/sid/linux-2.6/debian/changelog Wed May 26 04:02:28 2010 (r15771)
+++ dists/sid/linux-2.6/debian/changelog Wed May 26 05:20:20 2010 (r15772)
@@ -28,6 +28,7 @@
[ maximilian attems]
* Add stable 2.6.32.14-rc1.
* Add drm changes from stable 2.6.33.5.
+ * Update openvz patch to 509eb1f29c43.
-- Ben Hutchings <ben at decadent.org.uk> Tue, 18 May 2010 02:13:44 +0100
Modified: dists/sid/linux-2.6/debian/patches/features/all/openvz/openvz.patch
==============================================================================
--- dists/sid/linux-2.6/debian/patches/features/all/openvz/openvz.patch Wed May 26 04:02:28 2010 (r15771)
+++ dists/sid/linux-2.6/debian/patches/features/all/openvz/openvz.patch Wed May 26 05:20:20 2010 (r15772)
@@ -1,3 +1,1613 @@
+commit 509eb1f29c4301126a0ccda8e001dfd0af0d56d2
+Author: Pavel Emelyanov <xemul at openvz.org>
+Date: Mon May 24 14:27:05 2010 +0400
+
+ OpenVZ kernel 2.6.32-balandin released
+
+ Named after Aleksandr Nikolayevich Balandin - a Russian cosmonaut.
+
+ Signed-off-by: Pavel Emelyanov <xemul at openvz.org>
+
+commit eb28ec67376e267760e72c96ca3d54346d39a56f
+Author: Pavel Emelyanov <xemul at openvz.org>
+Date: Mon May 24 15:10:31 2010 +0400
+
+ sysctl: Compilation fix after merge of sysctl fixes
+
+ Signed-off-by: Pavel Emelyanov <xemul at openvz.org>
+
+commit 0bb7a0e0615e134b7ae9f7e2e2737be5ff76881b
+Author: Cyrill Gorcunov <gorcunov at openvz.org>
+Date: Mon May 24 14:23:28 2010 +0400
+
+ fs: Don't list non-VE fs in /proc/filesystems
+
+ This is due to the lack of a virtualized filesystems filter.
+ Implement one.
+
+ http://bugzilla.openvz.org/show_bug.cgi?id=1504
+
+ Reported-by: Kir Kolyshkin <kir at openvz.org>
+ Signed-off-by: Cyrill Gorcunov <gorcunov at openvz.org>
+ Signed-off-by: Pavel Emelyanov <xemul at openvz.org>
+
+commit 866f4866b2d988c1ac1222f0397efd1e6e64d443
+Author: Andrey Vagin <avagin at openvz.org>
+Date: Mon May 24 13:14:58 2010 +0400
+
+ Fix sysctl warnings about an unknown binary sysctl number
+
+ Switch this entry over to CTL_UNNUMBERED, because
+ nobody uses it via sys_sysctl.
+
+ http://bugzilla.openvz.org/show_bug.cgi?id=1463
+
+ Signed-off-by: Andrey Vagin <avagin at openvz.org>
+ Signed-off-by: Pavel Emelyanov <xemul at openvz.org>
+
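For reference, a 2.6.32-era ctl_table entry that is reachable only through /proc/sys, and therefore carries no binary number for sys_sysctl, looks roughly like the sketch below; the knob name and backing variable are made up for illustration:

    #include <linux/sysctl.h>

    static int example_knob;                          /* hypothetical tunable */

    static ctl_table example_table[] = {
            {
                    .ctl_name     = CTL_UNNUMBERED,   /* /proc/sys only, no binary number */
                    .procname     = "example_knob",
                    .data         = &example_knob,
                    .maxlen       = sizeof(int),
                    .mode         = 0644,
                    .proc_handler = &proc_dointvec,
                    /* no .strategy: sys_sysctl cannot reach this entry */
            },
            { .ctl_name = 0 }
    };

Such an entry keeps the /proc/sys interface intact while avoiding warnings about binary sysctl numbers that nothing uses.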
+commit 2412f2cf0853b5303af7740000c99179eeece3e4
+Author: Andrey Vagin <avagin at openvz.org>
+Date: Mon May 24 13:15:37 2010 +0400
+
+ sysctl: Add sysctl_data_ve helper
+
+ This helper is analogous to proc_dointvec_ve.
+
+ It adds a generic method for sys_sysctl access to per-VE values.
+
+ The extra1 field of ctl_table contains the data field's offset from the
+ beginning of ve_struct; without CONFIG_VE the address from the .data field is used.
+
+ Signed-off-by: Andrey Vagin <avagin at openvz.org>
+ Signed-off-by: Pavel Emelyanov <xemul at openvz.org>
+
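A plain-C illustration of the offset-based lookup described in the commit above; the structures are stand-ins, only the offsetof() technique mirrors what the helper does:

    #include <stddef.h>
    #include <stdio.h>

    struct ve_struct {                  /* stand-in for the real per-VE struct */
            int some_value;
    };

    struct ctl_entry {                  /* stand-in for ctl_table */
            void   *data;               /* address used when CONFIG_VE is off */
            size_t  ve_offset;          /* offset of the field inside ve_struct */
    };

    /* Resolve the address of the value for the current container, if any. */
    static void *resolve_data(const struct ctl_entry *e, struct ve_struct *ve)
    {
            if (ve)                                     /* CONFIG_VE case */
                    return (char *)ve + e->ve_offset;
            return e->data;                             /* !CONFIG_VE fallback */
    }

    int main(void)
    {
            struct ve_struct ve = { .some_value = 42 };
            struct ctl_entry e = {
                    .data      = NULL,
                    .ve_offset = offsetof(struct ve_struct, some_value),
            };

            printf("%d\n", *(int *)resolve_data(&e, &ve));   /* prints 42 */
            return 0;
    }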
+commit 39f9a055139faf313a1ad823b145e535d5485f5c
+Author: Andrey Vagin <avagin at openvz.org>
+Date: Mon May 24 13:16:11 2010 +0400
+
+ Fix sysctl warnings about missing strategy for randomize_va_space
+
+ http://bugzilla.openvz.org/show_bug.cgi?id=1463
+
+ Signed-off-by: Andrey Vagin <avagin at openvz.org>
+ Signed-off-by: Pavel Emelyanov <xemul at openvz.org>
+
+commit de3a7aab2eeab095a81f414d0e5e855da1d99c61
+Author: Andrey Vagin <avagin at openvz.ru>
+Date: Mon May 24 13:13:36 2010 +0400
+
+ cpt: use shmem_file to dump inode content of shm
+
+ Files with shm_file_operations store in private_data a link to
+ the file with shmem_file_operations. For dumping inode content
+ we use read from shmem_file_operations, but pass the file with
+ shm_file_operations.
+
+ shmem_file_operations uses do_sync_read, which relies on file->f_op->aio_read,
+ but that is absent in shm_file_operations.
+
+ do_read
+ do_sync_read(*f, ...)
+ f->f_op->aio_read -> Oops
+
+ http://bugzilla.openvz.org/show_bug.cgi?id=1500
+
+ Signed-off-by: Andrey Vagin <avagin at openvz.org>
+ Signed-off-by: Pavel Emelyanov <xemul at openvz.org>
+
+commit 1c4eba47b2d5d3d26c186485de8adf8ef293ebb5
+Author: Stanislav Kinsbursky <skinsbursky at openvz.org>
+Date: Mon May 24 14:05:44 2010 +0400
+
+ tun: device_create_file omitted if net level is not init_net
+
+ device_create_file() calls are now omitted in tun_set_iff() if the net namespace is inside a container.
+ The same condition check as in netdev_register_kobject() is used.
+
+ http://bugzilla.openvz.org/show_bug.cgi?id=1497
+
+ Signed-off-by: Stanislav Kinsbursky <skinsbursky at openvz.org>
+ Signed-off-by: Pavel Emelyanov <xemul at openvz.org>
+
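The borrowed check amounts to creating the sysfs attributes only when the device lives in the initial network namespace; a minimal kernel-style sketch of the idea (the attribute names are examples, not verified against this tree):

    /* Sketch: mirror the guard used by netdev_register_kobject() and only
     * create the per-device sysfs files for devices in the host namespace. */
    if (net_eq(dev_net(tun->dev), &init_net)) {
            if (device_create_file(&tun->dev->dev, &dev_attr_tun_flags) ||
                device_create_file(&tun->dev->dev, &dev_attr_owner) ||
                device_create_file(&tun->dev->dev, &dev_attr_group))
                    pr_err("Failed to create tun sysfs files\n");
    }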
+commit 98447fa5c37746da0699b9f8d8bbd59d8147d9bc
+Author: Kir Kolyshkin <kir at openvz.org>
+Date: Mon May 24 13:04:17 2010 +0400
+
+ Revert "mm mmap zero length kludge"
+
+ This kludge was made for really old rpm versions which have since been
+ fixed (see references to RH bugzilla in OpenVZ bug #893). Moreover,
+ it now makes rpm itself segfault in our templates when a locale is set;
+ details are in OpenVZ bug #1502. So remove it and hope for the best.
+
+ http://bugzilla.openvz.org/1502
+ http://bugzilla.openvz.org/893
+
+ This reverts commit d252a93b32d6d251fcc73863b75b91edaa801b95.
+
+ Signed-off-by: Kir Kolyshkin <kir at openvz.org>
+ Signed-off-by: Pavel Emelyanov <xemul at openvz.org>
+
+commit 57358efc0e639282309d8b6aea8efb8ae3d6d9ad
+Merge: 42a0a10 1cd8211
+Author: Pavel Emelyanov <xemul at openvz.org>
+Date: Mon May 24 12:59:24 2010 +0400
+
+ Merged linux-2.6.32.13
+
+ Conflicts:
+
+ Makefile
+
+ Signed-off-by: Pavel Emelyanov <xemul at openvz.org>
+
+commit 42a0a1071d3872af254373c1cc07085b9bf24d3a
+Author: Konstantin Khlebnikov <khlebnikov at openvz.org>
+Date: Mon May 24 12:56:47 2010 +0400
+
+ ioprio: Make it possible to set ve ioprio finally
+
+ Add ioprio compat call for blk-cgroup. Simulate the old ioprio with
+ the new blk-cgroup weight.
+
+ Signed-off-by: Konstantin Khlebnikov <khlebnikov at openvz.org>
+ Signed-off-by: Pavel Emelyanov <xemul at openvz.org>
+
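The compat call has to translate the legacy per-VE ioprio scale (0-7) into the blk-cgroup weight range (100-1000, bigger meaning a larger share). A purely hypothetical linear mapping, just to show the shape of such a conversion; the real constants used by the patch are not quoted here:

    /* Hypothetical: map a legacy ioprio value 0..7 onto a blk-cgroup
     * weight in 100..1000, with a larger ioprio getting a larger weight. */
    static unsigned int ioprio_to_weight(int ioprio)
    {
            if (ioprio < 0)
                    ioprio = 0;
            if (ioprio > 7)
                    ioprio = 7;
            return 100 + ioprio * (1000 - 100) / 7;
    }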
+commit a4452f1cc33f6e4f7d8f58abab818ede313cdfbc
+Author: Konstantin Khlebnikov <khlebnikov at openvz.org>
+Date: Mon May 24 12:55:43 2010 +0400
+
+ cgroup-lite: Set task css properly
+
+ Fix task moving between cgroups at ve create and enter.
+ Add a helper to attach a task to a cgroup set (based on the
+ cgroup_attach_task).
+
+ Signed-off-by: Konstantin Khlebnikov <khlebnikov at openvz.org>
+ Signed-off-by: Pavel Emelyanov <xemul at openvz.org>
+
+commit 983bb0952f838b55130f20a9486a04c92ae5826b
+Author: Konstantin Khlebnikov <khlebnikov at openvz.org>
+Date: Mon May 24 12:54:09 2010 +0400
+
+ cgroup-lite: add cgroup-id for blk-cgroups
+
+ Use one id for all subsystems in one cgroup. Store the id right
+ on the cgroup struct instead of hacking around css_id structures.
+
+ Plus add other cgroup tree related functions required by blk-cgroup.
+
+ Signed-off-by: Konstantin Khlebnikov <khlebnikov at openvz.org>
+ Signed-off-by: Pavel Emelyanov <xemul at openvz.org>
+
+commit f54f5b3e0a014f3bb5c530b4c13d443a2fc92b52
+Author: Konstantin Khlebnikov <khlebnikov at openvz.org>
+Date: Mon May 24 12:50:31 2010 +0400
+
+ cgroup-lite: fix subsys state refcnt
+
+ Add a missed __css_put and fix the refcnt initial state: for a live css the refcnt
+ starts from 1; see init_cgroup_css and cgroup_clear_css_refs.
+
+ Signed-off-by: Konstantin Khlebnikov <khlebnikov at openvz.org>
+ Signed-off-by: Pavel Emelyanov <xemul at openvz.org>
+
+commit 1cd8211f07663ebeac04b19ac849de7ed5eef969
+Author: Greg Kroah-Hartman <gregkh at suse.de>
+Date: Wed May 12 15:11:42 2010 -0700
+
+ Revert "module: fix __module_ref_addr()"
+
+ This reverts commit d150a2b96558a7349cbf3a72a279c37bc67d50fb.
+
+ Thanks to Jiri Benc for finding the problem that this patch is
+ not correct for the 2.6.32-stable series.
+
+ Cc: Jiri Kosina <jkosina at suse.cz>
+ Signed-off-by: Greg Kroah-Hartman <gregkh at suse.de>
+
+commit dd480cee5d48b5fd88f4f074743b542fab6d9e70
+Author: Shaohua Li <shaohua.li at intel.com>
+Date: Tue Apr 27 16:52:01 2010 +0400
+
+ cfq-iosched: split seeky coop queues after one slice
+
+ Currently we split seeky coop queues after 1s, which is too long. This patch
+ sets the split_coop flag on a seeky coop queue after one slice. After that, if new
+ requests come in, the queues will be split. The patch was suggested by Corrado.
+
+ Signed-off-by: Shaohua Li <shaohua.li at intel.com>
+ Reviewed-by: Corrado Zoccolo <czoccolo at gmail.com>
+ Acked-by: Jeff Moyer <jmoyer at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 187231a1fad899839137f76c08dd016a81245abb
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:51:57 2010 +0400
+
+ cfq-iosched: Do not idle on async queues
+
+ Few weeks back, Shaohua Li had posted similar patch. I am reposting it
+ with more test results.
+
+ This patch does two things.
+
+ - Do not idle on async queues.
+
+ - It also changes the write queue depth CFQ drives (cfq_may_dispatch()).
+ Currently, we always seem to be driving a queue depth of 1 for WRITES. This is
+ true even if there is only one write queue in the system; the logic
+ of infinite queue depth in the case of a single busy queue, as well as the slowly
+ increasing queue depth based on the last delayed sync request, does not seem to
+ be kicking in at all.
+
+ This patch will allow deeper WRITE queue depths (subject to the other
+ WRITE queue depth constraints like cfq_quantum and the last delayed sync
+ request).
+
+ Shaohua Li had reported getting more out of his SSD. For me, I have got
+ one Lun exported from an HP EVA and when pure buffered writes are on, I
+ can get more out of the system. Following are test results of pure
+ buffered writes (with end_fsync=1) with the vanilla and patched kernels. These
+ results are the average of 3 sets of runs with an increasing number of threads.
+
+ AVERAGE[bufwfs][vanilla]
+ -------
+ job Set NR ReadBW(KB/s) MaxClat(us) WriteBW(KB/s) MaxClat(us)
+ --- --- -- ------------ ----------- ------------- -----------
+ bufwfs 3 1 0 0 95349 474141
+ bufwfs 3 2 0 0 100282 806926
+ bufwfs 3 4 0 0 109989 2.7301e+06
+ bufwfs 3 8 0 0 116642 3762231
+ bufwfs 3 16 0 0 118230 6902970
+
+ AVERAGE[bufwfs] [patched kernel]
+ -------
+ bufwfs 3 1 0 0 270722 404352
+ bufwfs 3 2 0 0 206770 1.06552e+06
+ bufwfs 3 4 0 0 195277 1.62283e+06
+ bufwfs 3 8 0 0 260960 2.62979e+06
+ bufwfs 3 16 0 0 299260 1.70731e+06
+
+ I also ran buffered writes along with some sequential reads and some
+ buffered reads going on in the system on a SATA disk, because the potential
+ risk is that we should not drive the queue depth higher in the presence
+ of sync IO, in order to keep the max clat low.
+
+ With some random and sequential reads going on in the system on one SATA
+ disk I did not see any significant increase in max clat. So it looks like
+ other WRITE queue depth control logic is doing its job. Here are the
+ results.
+
+ AVERAGE[brr, bsr, bufw together] [vanilla]
+ -------
+ job Set NR ReadBW(KB/s) MaxClat(us) WriteBW(KB/s) MaxClat(us)
+ --- --- -- ------------ ----------- ------------- -----------
+ brr 3 1 850 546345 0 0
+ bsr 3 1 14650 729543 0 0
+ bufw 3 1 0 0 23908 8274517
+
+ brr 3 2 981.333 579395 0 0
+ bsr 3 2 14149.7 1175689 0 0
+ bufw 3 2 0 0 21921 1.28108e+07
+
+ brr 3 4 898.333 1.75527e+06 0 0
+ bsr 3 4 12230.7 1.40072e+06 0 0
+ bufw 3 4 0 0 19722.3 2.4901e+07
+
+ brr 3 8 900 3160594 0 0
+ bsr 3 8 9282.33 1.91314e+06 0 0
+ bufw 3 8 0 0 18789.3 23890622
+
+ AVERAGE[brr, bsr, bufw mixed] [patched kernel]
+ -------
+ job Set NR ReadBW(KB/s) MaxClat(us) WriteBW(KB/s) MaxClat(us)
+ --- --- -- ------------ ----------- ------------- -----------
+ brr 3 1 837 417973 0 0
+ bsr 3 1 14357.7 591275 0 0
+ bufw 3 1 0 0 24869.7 8910662
+
+ brr 3 2 1038.33 543434 0 0
+ bsr 3 2 13351.3 1205858 0 0
+ bufw 3 2 0 0 18626.3 13280370
+
+ brr 3 4 913 1.86861e+06 0 0
+ bsr 3 4 12652.3 1430974 0 0
+ bufw 3 4 0 0 15343.3 2.81305e+07
+
+ brr 3 8 890 2.92695e+06 0 0
+ bsr 3 8 9635.33 1.90244e+06 0 0
+ bufw 3 8 0 0 17200.3 24424392
+
+ So it looks like it might make sense to include this patch.
+
+ Thanks
+ Vivek
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 9027160e254ff7ea55338a1857843144445d57aa
+Author: Gui Jianfeng <guijianfeng at cn.fujitsu.com>
+Date: Tue Apr 27 16:51:53 2010 +0400
+
+ blk-cgroup: Fix potential deadlock in blk-cgroup
+
+ I triggered a lockdep warning as follows.
+
+ =======================================================
+ [ INFO: possible circular locking dependency detected ]
+ 2.6.33-rc2 #1
+ -------------------------------------------------------
+ test_io_control/7357 is trying to acquire lock:
+ (blkio_list_lock){+.+...}, at: [<c053a990>] blkiocg_weight_write+0x82/0x9e
+
+ but task is already holding lock:
+ (&(&blkcg->lock)->rlock){......}, at: [<c053a949>] blkiocg_weight_write+0x3b/0x9e
+
+ which lock already depends on the new lock.
+
+ the existing dependency chain (in reverse order) is:
+
+ -> #2 (&(&blkcg->lock)->rlock){......}:
+ [<c04583b7>] validate_chain+0x8bc/0xb9c
+ [<c0458dba>] __lock_acquire+0x723/0x789
+ [<c0458eb0>] lock_acquire+0x90/0xa7
+ [<c0692b0a>] _raw_spin_lock_irqsave+0x27/0x5a
+ [<c053a4e1>] blkiocg_add_blkio_group+0x1a/0x6d
+ [<c053cac7>] cfq_get_queue+0x225/0x3de
+ [<c053eec2>] cfq_set_request+0x217/0x42d
+ [<c052c8a6>] elv_set_request+0x17/0x26
+ [<c0532a0f>] get_request+0x203/0x2c5
+ [<c0532ae9>] get_request_wait+0x18/0x10e
+ [<c0533470>] __make_request+0x2ba/0x375
+ [<c0531985>] generic_make_request+0x28d/0x30f
+ [<c0532da7>] submit_bio+0x8a/0x8f
+ [<c04d827a>] submit_bh+0xf0/0x10f
+ [<c04d91d2>] ll_rw_block+0xc0/0xf9
+ [<f86e9705>] ext3_find_entry+0x319/0x544 [ext3]
+ [<f86eae58>] ext3_lookup+0x2c/0xb9 [ext3]
+ [<c04c3e1b>] do_lookup+0xd3/0x172
+ [<c04c56c8>] link_path_walk+0x5fb/0x95c
+ [<c04c5a65>] path_walk+0x3c/0x81
+ [<c04c5b63>] do_path_lookup+0x21/0x8a
+ [<c04c66cc>] do_filp_open+0xf0/0x978
+ [<c04c0c7e>] open_exec+0x1b/0xb7
+ [<c04c1436>] do_execve+0xbb/0x266
+ [<c04081a9>] sys_execve+0x24/0x4a
+ [<c04028a2>] ptregs_execve+0x12/0x18
+
+ -> #1 (&(&q->__queue_lock)->rlock){..-.-.}:
+ [<c04583b7>] validate_chain+0x8bc/0xb9c
+ [<c0458dba>] __lock_acquire+0x723/0x789
+ [<c0458eb0>] lock_acquire+0x90/0xa7
+ [<c0692b0a>] _raw_spin_lock_irqsave+0x27/0x5a
+ [<c053dd2a>] cfq_unlink_blkio_group+0x17/0x41
+ [<c053a6eb>] blkiocg_destroy+0x72/0xc7
+ [<c0467df0>] cgroup_diput+0x4a/0xb2
+ [<c04ca473>] dentry_iput+0x93/0xb7
+ [<c04ca4b3>] d_kill+0x1c/0x36
+ [<c04cb5c5>] dput+0xf5/0xfe
+ [<c04c6084>] do_rmdir+0x95/0xbe
+ [<c04c60ec>] sys_rmdir+0x10/0x12
+ [<c04027cc>] sysenter_do_call+0x12/0x32
+
+ -> #0 (blkio_list_lock){+.+...}:
+ [<c0458117>] validate_chain+0x61c/0xb9c
+ [<c0458dba>] __lock_acquire+0x723/0x789
+ [<c0458eb0>] lock_acquire+0x90/0xa7
+ [<c06929fd>] _raw_spin_lock+0x1e/0x4e
+ [<c053a990>] blkiocg_weight_write+0x82/0x9e
+ [<c0467f1e>] cgroup_file_write+0xc6/0x1c0
+ [<c04bd2f3>] vfs_write+0x8c/0x116
+ [<c04bd7c6>] sys_write+0x3b/0x60
+ [<c04027cc>] sysenter_do_call+0x12/0x32
+
+ other info that might help us debug this:
+
+ 1 lock held by test_io_control/7357:
+ #0: (&(&blkcg->lock)->rlock){......}, at: [<c053a949>] blkiocg_weight_write+0x3b/0x9e
+ stack backtrace:
+ Pid: 7357, comm: test_io_control Not tainted 2.6.33-rc2 #1
+ Call Trace:
+ [<c045754f>] print_circular_bug+0x91/0x9d
+ [<c0458117>] validate_chain+0x61c/0xb9c
+ [<c0458dba>] __lock_acquire+0x723/0x789
+ [<c0458eb0>] lock_acquire+0x90/0xa7
+ [<c053a990>] ? blkiocg_weight_write+0x82/0x9e
+ [<c06929fd>] _raw_spin_lock+0x1e/0x4e
+ [<c053a990>] ? blkiocg_weight_write+0x82/0x9e
+ [<c053a990>] blkiocg_weight_write+0x82/0x9e
+ [<c0467f1e>] cgroup_file_write+0xc6/0x1c0
+ [<c0454df5>] ? trace_hardirqs_off+0xb/0xd
+ [<c044d93a>] ? cpu_clock+0x2e/0x44
+ [<c050e6ec>] ? security_file_permission+0xf/0x11
+ [<c04bcdda>] ? rw_verify_area+0x8a/0xad
+ [<c0467e58>] ? cgroup_file_write+0x0/0x1c0
+ [<c04bd2f3>] vfs_write+0x8c/0x116
+ [<c04bd7c6>] sys_write+0x3b/0x60
+ [<c04027cc>] sysenter_do_call+0x12/0x32
+
+ To prevent deadlock, we should take locks in the following sequence:
+
+ blkio_list_lock -> queue_lock -> blkcg_lock.
+
+ The following patch should fix this bug.
+
+ Signed-off-by: Gui Jianfeng <guijianfeng at cn.fujitsu.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
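The rule the fix enforces can be shown with a small userspace sketch: any path that nests these locks must take them in the one agreed order, so no circular chain like the one above can form (the lock names echo the report; everything else is illustrative):

    #include <pthread.h>

    /* Userspace stand-ins for the three locks from the lockdep report. */
    static pthread_mutex_t blkio_list_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t queue_lock      = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t blkcg_lock      = PTHREAD_MUTEX_INITIALIZER;

    /* Agreed order: blkio_list_lock -> queue_lock -> blkcg_lock.
     * blkiocg_weight_write() deadlocked because it took blkcg_lock first
     * and blkio_list_lock afterwards, inverting this order. */
    static void nested_path(void)
    {
            pthread_mutex_lock(&blkio_list_lock);   /* outermost */
            pthread_mutex_lock(&queue_lock);
            pthread_mutex_lock(&blkcg_lock);        /* innermost */
            /* ... work that needs all three ... */
            pthread_mutex_unlock(&blkcg_lock);
            pthread_mutex_unlock(&queue_lock);
            pthread_mutex_unlock(&blkio_list_lock);
    }

    int main(void)
    {
            nested_path();
            return 0;
    }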
+commit 0460ada9ec82e679632588772a3084652c1db996
+Author: Divyesh Shah <dpshah at google.com>
+Date: Tue Apr 27 16:51:48 2010 +0400
+
+ cfq-iosched: Respect ioprio_class when preempting
+
+ In cfq_should_preempt(), we currently allow some cases where a non-RT request
+ can preempt an ongoing RT cfqq timeslice. This should not happen.
+ Examples include:
+
+ o A sync_noidle wl type non-RT request pre-empting a sync_noidle wl type cfqq
+ on which we are idling.
+ o Once we have per-cgroup async queues, a non-RT sync request pre-empting an RT
+ async cfqq.
+
+ Signed-off-by: Divyesh Shah<dpshah at google.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 58244fb9adfe3f58b17be18c9f27d59dbf4977fe
+Author: Shaohua Li <shaohua.li at intel.com>
+Date: Tue Apr 27 16:51:44 2010 +0400
+
+ cfq-iosched: don't regard requests with long distance as close
+
+ seek_mean can sometimes be very big; using it as the closeness criterion is meaningless
+ as this doesn't improve performance. So if it's big, let's fall back to the
+ default value.
+
+ Reviewed-by: Corrado Zoccolo <czoccolo at gmail.com>
+ Signed-off-by: Shaohua Li<shaohua.li at intel.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 875add11b7efa93199cd179e17786c8c83cf77ea
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:51:39 2010 +0400
+
+ cfq-iosched: Remove prio_change logic for workload selection
+
+ o CFQ now internally divides cfq queues into three workload categories: sync-idle,
+ sync-noidle and async. Which workload to run depends primarily on the rb_key
+ offset across the three service trees, which is a combination of multiple things
+ including the time the queue got queued on the service tree.
+
+ There is one exception though. That is, if we switched the prio class, say
+ we served some RT tasks and again started serving the BE class, then within the
+ BE class we always started with the sync-noidle workload irrespective of the rb_key
+ offset in the service trees.
+
+ This can provide better latencies for the sync-noidle workload in the presence
+ of RT tasks.
+
+ o This patch gets rid of that exception, and which workload to run within a
+ class always depends on the lowest rb_key across the service trees. The reason
+ is that we now have multiple BE class groups, and if we always switch
+ to the sync-noidle workload within a group, we can potentially starve a sync-idle
+ workload within that group. The same is true for the async workload, which will be in the
+ root group. Also, workload switching within a group would become very
+ unpredictable as it would then depend on whether some RT workload was running in
+ the system or not.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Reviewed-by: Gui Jianfeng <guijianfeng at cn.fujitsu.com>
+ Acked-by: Corrado Zoccolo <czoccolo at gmail.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 98a3d07b1fe96e53a15cbab963ea26b68b573194
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:51:35 2010 +0400
+
+ cfq-iosched: Get rid of nr_groups
+
+ o Currently code does not seem to be using cfqd->nr_groups. Get rid of it.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Reviewed-by: Gui Jianfeng <guijianfeng at cn.fujitsu.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit af90feaf148382f0f79b9411fc50d88bd861710a
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:51:31 2010 +0400
+
+ cfq-iosched: Remove the check for same cfq group from allow_merge
+
+ o allow_merge() already checks whether the submitting task is pointing to the same cfqq
+ as the rq has been queued in. If everything is fine, we should not have
+ a task in one cgroup holding a pointer to a cfqq in another cgroup.
+
+ Well, I guess in some situations it can happen, and that is when a random
+ IO queue has been moved into the root cgroup for group_isolation=0. In
+ this case the task's cgroup/group is different from where the cfqq actually is,
+ but this is intentional and in this case merging should be allowed.
+
+ The second situation is where, due to the close cooperator patches, multiple
+ processes can be sharing a cfqq. If everything is implemented right, we should
+ not end up in a situation where tasks from different processes in different
+ groups are sharing the same cfqq, as we allow merging of cooperating queues
+ only if they are in the same group.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Reviewed-by: Gui Jianfeng <guijianfeng at cn.fujitsu.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 76160ce0edc2aeeaa4df9292700aecdd0c4c36cb
+Author: Gui Jianfeng <guijianfeng at cn.fujitsu.com>
+Date: Tue Apr 27 16:51:27 2010 +0400
+
+ cfq: set workload as expired if it doesn't have any slice left
+
+ When a group is resumed, if it doesn't have any workload slice left,
+ we should mark workload_expires as expired. Otherwise, we might
+ erroneously start from where we left off in the previous group.
+ Thanks to Corrado for the idea.
+
+ Signed-off-by: Gui Jianfeng <guijianfeng at cn.fujitsu.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 6a78ef2e36ba6a63c5617326b38e268820cdd893
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:51:23 2010 +0400
+
+ Fix a CFQ crash in "for-2.6.33" branch of block tree
+
+ I think my previous patch introduced a bug which can lead to CFQ hitting
+ BUG_ON().
+
+ The offending commit in for-2.6.33 branch is.
+
+ commit 7667aa0630407bc07dc38dcc79d29cc0a65553c1
+ Author: Vivek Goyal <vgoyal at redhat.com>
+ Date: Tue Dec 8 17:52:58 2009 -0500
+
+ cfq-iosched: Take care of corner cases of group losing share due to deletion
+
+ While doing some stress testing on my box, I encountered the following.
+
+ login: [ 3165.148841] BUG: scheduling while
+ atomic: swapper/0/0x10000100
+ [ 3165.149821] Modules linked in: cfq_iosched dm_multipath qla2xxx igb
+ scsi_transport_fc dm_snapshot [last unloaded: scsi_wait_scan]
+ [ 3165.149821] Pid: 0, comm: swapper Not tainted
+ 2.6.32-block-for-33-merged-new #3
+ [ 3165.149821] Call Trace:
+ [ 3165.149821] <IRQ> [<ffffffff8103fab8>] __schedule_bug+0x5c/0x60
+ [ 3165.149821] [<ffffffff8103afd7>] ? __wake_up+0x44/0x4d
+ [ 3165.149821] [<ffffffff8153a979>] schedule+0xe3/0x7bc
+ [ 3165.149821] [<ffffffff8103a796>] ? cpumask_next+0x1d/0x1f
+ [ 3165.149821] [<ffffffffa000b21d>] ? cfq_dispatch_requests+0x6ba/0x93e
+ [cfq_iosched]
+ [ 3165.149821] [<ffffffff810422d8>] __cond_resched+0x2a/0x35
+ [ 3165.149821] [<ffffffffa000b21d>] ? cfq_dispatch_requests+0x6ba/0x93e
+ [cfq_iosched]
+ [ 3165.149821] [<ffffffff8153b1ee>] _cond_resched+0x2c/0x37
+ [ 3165.149821] [<ffffffff8100e2db>] is_valid_bugaddr+0x16/0x2f
+ [ 3165.149821] [<ffffffff811e4161>] report_bug+0x18/0xac
+ [ 3165.149821] [<ffffffff8100f1fc>] die+0x39/0x63
+ [ 3165.149821] [<ffffffff8153cde1>] do_trap+0x11a/0x129
+ [ 3165.149821] [<ffffffff8100d470>] do_invalid_op+0x96/0x9f
+ [ 3165.149821] [<ffffffffa000b21d>] ? cfq_dispatch_requests+0x6ba/0x93e
+ [cfq_iosched]
+ [ 3165.149821] [<ffffffff81034b4d>] ? enqueue_task+0x5c/0x67
+ [ 3165.149821] [<ffffffff8103ae83>] ? task_rq_unlock+0x11/0x13
+ [ 3165.149821] [<ffffffff81041aae>] ? try_to_wake_up+0x292/0x2a4
+ [ 3165.149821] [<ffffffff8100c935>] invalid_op+0x15/0x20
+ [ 3165.149821] [<ffffffffa000b21d>] ? cfq_dispatch_requests+0x6ba/0x93e
+ [cfq_iosched]
+ [ 3165.149821] [<ffffffff810df5a6>] ? virt_to_head_page+0xe/0x2f
+ [ 3165.149821] [<ffffffff811d8c2a>] blk_peek_request+0x191/0x1a7
+ [ 3165.149821] [<ffffffff811e5b8d>] ? kobject_get+0x1a/0x21
+ [ 3165.149821] [<ffffffff812c8d4c>] scsi_request_fn+0x82/0x3df
+ [ 3165.149821] [<ffffffff8110b2de>] ? bio_fs_destructor+0x15/0x17
+ [ 3165.149821] [<ffffffff810df5a6>] ? virt_to_head_page+0xe/0x2f
+ [ 3165.149821] [<ffffffff811d931f>] __blk_run_queue+0x42/0x71
+ [ 3165.149821] [<ffffffff811d9403>] blk_run_queue+0x26/0x3a
+ [ 3165.149821] [<ffffffff812c8761>] scsi_run_queue+0x2de/0x375
+ [ 3165.149821] [<ffffffff812b60ac>] ? put_device+0x17/0x19
+ [ 3165.149821] [<ffffffff812c92d7>] scsi_next_command+0x3b/0x4b
+ [ 3165.149821] [<ffffffff812c9b9f>] scsi_io_completion+0x1c9/0x3f5
+ [ 3165.149821] [<ffffffff812c3c36>] scsi_finish_command+0xb5/0xbe
+
+ I think I have hit the following BUG_ON() in cfq_dispatch_request().
+
+ BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
+
+ Please find attached the patch to fix it. I have done some stress testing
+ with it and have not seen it happening again.
+
+ o We should wait on a queue even after slice expiry only if it is empty. If
+ the queue is not empty, then continue to expire it.
+
+ o If we decide to keep the queue then set cfqq=NULL. Otherwise select_queue()
+ will return a valid cfqq and cfq_dispatch_request() can hit the following
+ BUG_ON().
+
+ BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list))
+
+ Reviewed-by: Jeff Moyer <jmoyer at redhat.com>
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 086fcfd4a9aec3209a9a8b2c591734850bbca097
+Author: Gui Jianfeng <guijianfeng at cn.fujitsu.com>
+Date: Tue Apr 27 16:51:18 2010 +0400
+
+ cfq: Remove wait_request flag when idle time is being deleted
+
+ Remove the wait_request flag when the idle time is being deleted; otherwise
+ it'll hit this path every time a request is enqueued.
+
+ Signed-off-by: Gui Jianfeng <guijianfeng at cn.fujitsu.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 9714cf0030da3ceaea312be05cc056d4b36fe118
+Author: Corrado Zoccolo <czoccolo at gmail.com>
+Date: Tue Apr 27 16:51:14 2010 +0400
+
+ cfq-iosched: commenting non-obvious initialization
+
+ Added a comment to explain the initialization of last_delayed_sync.
+
+ Signed-off-by: Corrado Zoccolo <czoccolo at gmail.com>
+ Acked-by: Jeff Moyer <jmoyer at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 75e3bc83c0d1f9c909bd0bce56ac377623c22807
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:51:10 2010 +0400
+
+ cfq-iosched: Take care of corner cases of group losing share due to deletion
+
+ If there is a sequential reader running in a group, we wait for the next request
+ to come in that group after slice expiry, and once the new request is in, we expire
+ the queue. Otherwise we delete the group from the service tree and the group loses
+ its fair share.
+
+ So far I was marking a queue as wait_busy if it had consumed its slice and
+ it was the last queue in the group. But this condition did not cover the following
+ two cases.
+
+ 1. If a request completed and the slice has not expired yet. The next request comes
+ in and is dispatched to disk. Now select_queue() hits and the slice has expired.
+ This group will be deleted. Because the request is still in the disk, this queue
+ will never get a chance to wait_busy.
+
+ 2. If a request completed and the slice has not expired yet. Before the next request
+ comes in (delay due to think time), select_queue() hits and expires the
+ queue, hence the group. This queue never got a chance to wait busy.
+
+ Gui was hitting boundary condition 1 and not getting fairness numbers
+ proportional to weight.
+
+ This patch adds checks for the above two conditions and improves the fairness
+ numbers for sequential workloads on rotational media. The check in select_queue()
+ takes care of case 1 and an additional check in should_wait_busy() takes care
+ of case 2.
+
+ Reported-by: Gui Jianfeng <guijianfeng at cn.fujitsu.com>
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 6c866a0686a169f5098da254fb6b0f8812318469
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:51:06 2010 +0400
+
+ cfq-iosched: Get rid of cfqq wait_busy_done flag
+
+ o Get rid of the wait_busy_done flag. This flag only tells that we were doing wait
+ busy on a queue and that the queue got a request, so expire it. That information
+ can easily be obtained from (cfq_cfqq_wait_busy() && queue_is_not_empty). So
+ remove this flag and keep the code simple.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 44c156f1191391dddb02f1abff022a61c2f94a17
+Author: Gui Jianfeng <guijianfeng at cn.fujitsu.com>
+Date: Tue Apr 27 16:51:02 2010 +0400
+
+ cfq: Optimization for close cooperating queue searching
+
+ It doesn't make any sense to try to find a close cooperating
+ queue if the current cfqq is the only one in the group.
+
+ Signed-off-by: Gui Jianfeng <guijianfeng at cn.fujitsu.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit bd4386b49b4ba2c012dc22c7a80512681a5ade15
+Author: Corrado Zoccolo <czoccolo at gmail.com>
+Date: Tue Apr 27 16:50:58 2010 +0400
+
+ cfq-iosched: reduce write depth only if sync was delayed
+
+ The introduction of ramp-up formula for async queue depths has
+ slowed down dirty page reclaim, by reducing async write performance.
+ This patch makes sure the formula kicks in only when sync request
+ was recently delayed.
+
+ Signed-off-by: Corrado Zoccolo <czoccolo at gmail.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 433c9d47f26fcb9141f1a1c3f15245a8391c5a08
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:50:52 2010 +0400
+
+ cfq-iosched: Do not access cfqq after freeing it
+
+ Fix a crash during boot reported by Jeff Moyer. Fix the issue of accessing
+ cfqq after freeing it.
+
+ Reported-by: Jeff Moyer <jmoyer at redhat.com>
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Reviewed-by: Jeff Moyer <jmoyer at redhat.com>
+ Signed-off-by: Jens Axboe <axboe at carl.(none)>
+
+commit 21e7ec5499dfae1930bc103e1f2430b262ac0c61
+Author: Stephen Rothwell <sfr at canb.auug.org.au>
+Date: Tue Apr 27 16:50:48 2010 +0400
+
+ block: include linux/err.h to use ERR_PTR
+
+ Signed-off-by: Stephen Rothwell <sfr at canb.auug.org.au>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit ba750bcbce0558bfe7ea2fd4a9b9ca74e1eac70f
+Author: Jens Axboe <jens.axboe at oracle.com>
+Date: Tue Apr 27 16:50:44 2010 +0400
+
+ cfq-iosched: use call_rcu() instead of doing grace period stall on queue exit
+
+ After the merge of the IO controller patches, booting on my megaraid
+ box ran much slower. Vivek Goyal traced it down to megaraid discovery
+ creating tons of devices, each suffering a grace period when they later
+ kill that queue (if no device is found).
+
+ So let's use call_rcu() to batch these deferred frees, instead of taking
+ the grace period hit for each one.
+
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
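The change is the standard switch from a blocking grace-period wait to a callback-based deferred free; a generic kernel-style sketch (the structure and function names are illustrative, not the actual cfq ones):

    /* Before: every teardown paid a full grace period.
     *         synchronize_rcu();
     *         kfree(obj);
     *
     * After: queue the free and return at once; the RCU core batches the
     * callbacks, so thousands of short-lived queues no longer wait one
     * grace period each. */
    struct example_obj {
            struct rcu_head rcu;
            /* ... payload ... */
    };

    static void example_free_rcu(struct rcu_head *head)
    {
            struct example_obj *obj = container_of(head, struct example_obj, rcu);

            kfree(obj);
    }

    static void example_release(struct example_obj *obj)
    {
            call_rcu(&obj->rcu, example_free_rcu);
    }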
+commit 291282276037c26045453190e5dd441ff03e319a
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:50:40 2010 +0400
+
+ blkio: Implement dynamic io controlling policy registration
+
+ o One of the goals of the block IO controller is that it should be able to
+ support multiple IO control policies, some of which may be operational at a
+ higher level in the storage hierarchy.
+
+ o To begin with, we had one IO controlling policy implemented by CFQ, and
+ I hard-coded the CFQ functions called by blkio. This created issues when
+ CFQ is compiled as a module.
+
+ o This patch implements basic dynamic IO controlling policy registration
+ functionality in blkio. This is similar to the elevator functionality where
+ IO schedulers register their functions dynamically.
+
+ o Now, in the future, when more IO controlling policies are implemented, they
+ can dynamically register with the block IO controller.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
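The registration model mirrors the elevator framework: CFQ hands the blkio core a table of callbacks at module init and removes it on exit, so blkio no longer references CFQ symbols directly. A rough sketch with assumed structure and callback names, since the actual definitions are not quoted here:

    /* Assumed shape of the dynamic registration. */
    static struct blkio_policy_type blkio_policy_cfq = {
            .ops = {
                    .blkio_unlink_group_fn        = cfq_unlink_blkio_group,
                    .blkio_update_group_weight_fn = cfq_update_blkio_group_weight,
            },
    };

    static int __init cfq_init(void)
    {
            /* ... elevator registration ... */
            blkio_policy_register(&blkio_policy_cfq);
            return 0;
    }

    static void __exit cfq_exit(void)
    {
            blkio_policy_unregister(&blkio_policy_cfq);
            /* ... elevator unregistration ... */
    }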
+commit 7701338499b73355707c41ae27358a4dd5bc4b84
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:50:36 2010 +0400
+
+ blkio: Export some symbols from blkio as its user CFQ can be a module
+
+ o blkio controller is inside the kernel and cfq makes use of interfaces
+ exported by blkio. CFQ can be a module too, hence export symbols used
+ by CFQ.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 8dfe981d81c7a967b6040d73fcae9780ef1519ae
+Author: Shaohua Li <shaohua.li at intel.com>
+Date: Tue Apr 27 16:50:31 2010 +0400
+
+ cfq-iosched: make nonrot check logic consistent
+
+ cfq_arm_slice_timer() has logic to disable the idle window for SSD devices. The same
+ thing should be done in cfq_select_queue() too, otherwise we will still see the
+ idle window. This makes the nonrot check logic consistent in cfq.
+ In tests on an Intel SSD with the low_latency knob off, this patch can triple disk
+ throughput for multi-threaded sequential reads.
+
+ Signed-off-by: Shaohua Li <shaohua.li at intel.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit e1853aca5799c76d0dd8ff97c5bed8c2e6059fa2
+Author: Jens Axboe <jens.axboe at oracle.com>
+Date: Tue Apr 27 16:50:28 2010 +0400
+
+ cfq-iosched: move IO controller declarations to a header file
+
+ They should not be declared inside some other file that's not related
+ to CFQ.
+
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 16ca6c55c9c1961dbd748a5c94883ab1d65bb04f
+Author: Jens Axboe <jens.axboe at oracle.com>
+Date: Tue Apr 27 16:50:24 2010 +0400
+
+ cfq-iosched: fix compile problem with !CONFIG_CGROUP
+
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 5260a89b72023fcad7242552059312e31a864bf2
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:50:19 2010 +0400
+
+ blkio: Wait on sync-noidle queue even if rq_noidle = 1
+
+ o rq_noidle() is supposed to tell cfq not to expect a request after this
+ one, hence don't idle. But this does not seem to work very well. For example,
+ for direct random readers, rq_noidle = 1 but there is a next request coming
+ after this one. Not idling leads to a group not getting its share even if
+ group_isolation=1.
+
+ o The right solution for this issue is to scan the higher layers and set the
+ right flag (WRITE_SYNC or WRITE_ODIRECT). For the time being, this
+ single-line fix helps. This should not have any significant impact when we are
+ not using cgroups. I will later figure out the IO paths in the higher layers and
+ fix it.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 3647d976033973a4502696fb45a980baa8cf1350
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:50:15 2010 +0400
+
+ blkio: Implement group_isolation tunable
+
+ o If a group is running only a random reader, then it will not have enough
+ traffic to keep the disk busy and we will reduce overall throughput. This
+ should result in better latencies for the random reader though. If we don't
+ idle on the random reader service tree, then this random reader will experience
+ large latencies if there are other groups present in the system with sequential
+ readers running in them.
+
+ o One solution suggested by Corrado is to keep the random readers
+ or sync-noidle workload in the root group by default, so that during one dispatch round
+ we idle only once on the sync-noidle tree. This means that all the sync-idle
+ workload queues will be in their respective groups and we will see service
+ differentiation in those, but not on the sync-noidle workload.
+
+ o Provide a tunable group_isolation. If set, this will make sure that even
+ sync-noidle queues go into their respective groups and we wait on these. This
+ provides stronger isolation between groups, but at the expense of throughput
+ if a group does not have enough traffic to keep the disk busy.
+
+ o By default group_isolation = 0
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
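Because the tunable is exposed as a CFQ io scheduler attribute, enabling it from user space is just a sysfs write; a small runnable C helper, with the device path being an assumption about the target system:

    #include <stdio.h>

    /* Turn on stronger per-cgroup isolation for one disk.  The path assumes
     * CFQ is the active scheduler for /dev/sda. */
    int main(void)
    {
            const char *path = "/sys/block/sda/queue/iosched/group_isolation";
            FILE *f = fopen(path, "w");

            if (!f) {
                    perror(path);
                    return 1;
            }
            fputs("1\n", f);    /* 0 = default (throughput), 1 = isolation */
            fclose(f);
            return 0;
    }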
+commit d7d266e74623a5ff4a196c9ba35edb33d844078d
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:50:09 2010 +0400
+
+ blkio: Determine async workload length based on total number of queues
+
+ o Async queues are not per group. Instead these are system wide and maintained
+ in root group. Hence their workload slice length should be calculated
+ based on total number of queues in the system and not just queues in the
+ root group.
+
+ o As root group's default weight is 1000, make sure to charge async queue
+ more in terms of vtime so that it does not get more time on disk because
+ root group has higher weight.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 853b022fdecf1394bc6f56ed4391acfcdac76a77
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:50:06 2010 +0400
+
+ blkio: Wait for cfq queue to get backlogged if group is empty
+
+ o If a queue consumes its slice and then gets deleted from the service tree, its
+ associated group will also get deleted from the service tree if this was the
+ only queue in the group. That will make the group lose its share.
+
+ o For the queues on which we are idling, if they have used up their
+ slice, wait a bit for these queues to get backlogged again and then
+ expire them so that the group does not lose its share.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 6b1099c5bbc770dc0e00e447c91cc2c70abfcd4d
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:49:55 2010 +0400
+
+ blkio: Propagate cgroup weight updates to cfq groups
+
+ o Propagate blkio cgroup weight updates to the associated cfq groups.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit b8e49f6ef8a5b19dcc3596a957b10ff7783ca8e3
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:49:51 2010 +0400
+
+ blkio: Drop the reference to queue once the task changes cgroup
+
+ o If a task changes cgroup, drop the reference to the cfqq associated with the io
+ context and set the cfqq pointer stored in the ioc to NULL, so that upon the next
+ request arrival we will allocate a new queue in the new group.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit f0939f2fb5a93f52e4c38c96dd403a20412635ac
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:49:47 2010 +0400
+
+ blkio: Provide some isolation between groups
+
+ o Do not allow the following three operations across groups, for isolation:
+ - selection of co-operating queues
+ - preemptions across groups
+ - request merging across groups.
+
+ o Async queues are currently global and not per group. Allow preemption of
+ an async queue if a sync queue in another group gets backlogged.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 3e5835588e20983417074286dc9c46aeff4bdcb5
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:49:43 2010 +0400
+
+ blkio: Export disk time and sectors used by a group to user space
+
+ o Export the disk time and sectors used by a group to user space through the cgroup
+ interface.
+
+ o Also export a "dequeue" interface to cgroup which keeps track of how many
+ times a group was deleted from the service tree. Helps in debugging.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 5050a2e923c23fee20e5d20350da94328c028ea7
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:49:38 2010 +0400
+
+ blkio: Some debugging aids for CFQ
+
+ o Some debugging aids for CFQ.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 32227ad5a49cdf40d128fff9f573e770326fb2a1
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:49:33 2010 +0400
+
+ blkio: Take care of cgroup deletion and cfq group reference counting
+
+ o One can choose to change elevator or delete a cgroup. Implement group
+ reference counting so that both elevator exit and cgroup deletion can
+ take place gracefully.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Nauman Rafique <nauman at google.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit c80d513227c069c5f15e1722ef3d63096aa2652b
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:49:29 2010 +0400
+
+ blkio: Dynamic cfq group creation based on the cgroup the task belongs to
+
+ o Determine the cgroup the IO-submitting task belongs to and create the cfq
+ group if it does not already exist.
+
+ o Also link cfqq and associated cfq group.
+
+ o Currently all async IO is mapped to root group.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit e890b41384a11cd0eaaf4901d72de44cd21e2b65
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:49:25 2010 +0400
+
+ blkio: Group time used accounting and workload context save restore
+
+ o This patch introduces the functionality to do the accounting of group time
+ when a queue expires. This time used decides which group goes
+ next.
+
+ o Also introduce the functionality to save and restore the workload type
+ context within a group. It might happen that once we expire the cfq queue
+ and group, a different group will schedule in and we will lose the context
+ of the workload type. Hence save and restore it upon queue expiry.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit fb6067d930baa1b510aba82153ddad866aa0cf65
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:49:21 2010 +0400
+
+ blkio: Implement per cfq group latency target and busy queue avg
+
+ o So far we had a 300ms soft target latency system-wide. Now, with the
+ introduction of cfq groups, divide that latency by the number of groups so
+ that one can come up with a group target latency, which will be helpful
+ in determining the workload slice within a group and also the dynamic
+ slice length of the cfq queue.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 0fee1302172d62ee9eb34c37d792ac05e30fe2d7
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:49:17 2010 +0400
+
+ blkio: Introduce per cfq group weights and vdisktime calculations
+
+ o Bring in the per cfq group weight and how vdisktime is calculated for the
+ group. Also bring in the functionality of updating the min_vdisktime of
+ the group service tree.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit a31a7a44995ded913fd031f922cffa9e457b2a83
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:49:12 2010 +0400
+
+ blkio: Introduce blkio controller cgroup interface
+
+ o This is a basic implementation of the blkio controller cgroup interface. This is
+ the common interface visible to user space and should be used by the different
+ IO control policies as we implement them.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
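On the user-space side this interface shows up as a handful of cgroup files (blkio.weight being the central one). A sketch of how such files are declared through the stock cgroup cftype mechanism; the handler names here are assumptions:

    /* Sketch: cgroup files exported by the controller. */
    static struct cftype blkio_files[] = {
            {
                    .name      = "weight",              /* blkio.weight, 100..1000 */
                    .read_u64  = blkiocg_weight_read,
                    .write_u64 = blkiocg_weight_write,
            },
            {
                    .name      = "time",                /* per-device disk time used */
                    .read_map  = blkiocg_time_read,
            },
    };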
+commit 82041001ee5b7a662d488238f46b8912cc440160
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:49:06 2010 +0400
+
+ blkio: Introduce the root service tree for cfq groups
+
+ o So far we just had one cfq_group in cfq_data. To create space for more than
+ one cfq_group, we need to have a service tree of groups where all the groups
+ can be queued if they have active cfq queues backlogged in them.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 1b290883254f64d396f11a071b74598d97e1b3d3
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:49:02 2010 +0400
+
+ blkio: Keep queue on service tree until we expire it
+
+ o Currently cfq deletes a queue from the service tree if it is empty (even if
+ we might idle on the queue). This patch keeps the queue on the service tree,
+ hence the associated group remains on the service tree, until we decide that
+ we are not going to idle on the queue and expire it.
+
+ o This just helps in time accounting for the queue/group and in the implementation
+ of the rest of the patches.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit d0d70b93083a4fc811bd3bfed1df04870102d538
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:48:58 2010 +0400
+
+ blkio: Implement macro to traverse each service tree in group
+
+ o Implement a macro to traverse each service tree in the group. This avoids
+ the use of a double for loop and a special condition for the idle tree four times.
+
+ o The macro is a little twisted because of the special handling of the idle-class
+ service tree.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 4fea5fccf125349a109304569acbeda86c9ab67f
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:48:54 2010 +0400
+
+ blkio: Introduce the notion of cfq groups
+
+ o This patch introduces the notion of cfq groups. Soon we will be able to have multiple
+ groups of different weights in the system.
+
+ o Various service trees (prioclass and workload type trees) will become per
+ cfq group. So the hierarchy looks as follows.
+
+ cfq_groups
+ |
+ workload type
+ |
+ cfq queue
+
+ o When a scheduling decision has to be taken, first we select the cfq group,
+ then the workload within the group and then the cfq queue within the workload
+ type.
+
+ o This patch just makes the various workload service trees per cfq group and
+ introduces the function to be able to choose a group for scheduling.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 14d52ec9524545c8eb9c13d05925c53f1bd2b3ff
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:48:49 2010 +0400
+
+ blkio: Set must_dispatch only if we decided to not dispatch the request
+
+ o must_dispatch flag should be set only if we decided not to run the queue
+ and dispatch the request.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit a6a0574d5ab33877885943183de7645e157ed16e
+Author: Shaohua Li <shaohua.li at intel.com>
+Date: Tue Apr 27 16:48:46 2010 +0400
+
+ cfq-iosched: no dispatch limit for single queue
+
+ Since commit 2f5cb7381b737e24c8046fd4aeab571fb71315f5, each queue can send
+ up to 4 * 4 requests if only one queue exists. I wonder why we have such a limit.
+ A device supporting tags can send more requests. For example, AHCI can send 31
+ requests. Tests (direct aio randread) show the limit reduces disk
+ throughput by about 4%.
+ On the other hand, since we send one request at a time, if another queue
+ pops up while the current one is sending more than cfq_quantum requests, the current
+ queue will stop sending requests soon after one request, so it sounds like there is no big latency.
+
+ Signed-off-by: Shaohua Li <shaohua.li at intel.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 886ef3fce890295b04063e286c1a82c97574b737
+Author: Jens Axboe <jens.axboe at oracle.com>
+Date: Tue Apr 27 16:48:41 2010 +0400
+
+ Revert "cfq: Make use of service count to estimate the rb_key offset"
+
+ This reverts commit 3586e917f2c7df769d173c4ec99554cb40a911e5.
+
+ Corrado Zoccolo <czoccolo at gmail.com> correctly points out that we need
+ consistency of rb_key offset across groups. This means we cannot properly
+ use the per-service_tree service count. Revert this change.
+
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 80216a50226739cd997445d5ff2335a4c944fba7
+Author: Corrado Zoccolo <czoccolo at gmail.com>
+Date: Tue Apr 27 16:48:36 2010 +0400
+
+ cfq-iosched: fix corner cases in idling logic
+
+ Idling logic was disabled in some corner cases, leading to unfair share
+ for noidle queues.
+ * the idle timer was not armed if there were other requests in the
+ driver. Unfortunately, those requests could come from other workloads,
+ or queues for which we don't enable idling. So we will check only
+ pending requests from the active queue
+ * rq_noidle check on no-idle queue could disable the end of tree idle if
+ the last completed request was rq_noidle. Now, we will disable that
+ idle only if all the queues served in the no-idle tree had rq_noidle
+ requests.
+
+ Reported-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Corrado Zoccolo <czoccolo at gmail.com>
+ Acked-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit fed0ad86edd704970417ce78b1a130b1951f7bb8
+Author: Corrado Zoccolo <czoccolo at gmail.com>
+Date: Tue Apr 27 16:48:32 2010 +0400
+
+ cfq-iosched: idling on deep seeky sync queues
+
+ Seeky sync queues with large depth can gain unfairly big share of disk
+ time, at the expense of other seeky queues. This patch ensures that
+ idling will be enabled for queues with I/O depth at least 4, and small
+ think time. The decision to enable idling is sticky, until an idle
+ window times out without seeing a new request.
+
+ The reasoning behind the decision is that, if an application is using
+ large I/O depth, it is already optimized to make full utilization of
+ the hardware, and therefore we reserve a slice of exclusive use for it.
+
+ Reported-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Corrado Zoccolo <czoccolo at gmail.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 989d070f4d3594f485df16fa5b5786db8188e837
+Author: Corrado Zoccolo <czoccolo at gmail.com>
+Date: Tue Apr 27 16:48:28 2010 +0400
+
+ cfq-iosched: fix no-idle preemption logic
+
+ An incoming no-idle queue should preempt the active no-idle queue
+ only if the active queue is idling due to service tree empty.
+ Previous code was buggy in two ways:
+ * it relied on service_tree field to be set on the active queue, while
+ it is not set when the code is idling for a new request
+ * it didn't check for the service tree empty condition, so could lead to
+ LIFO behaviour if multiple queues with depth > 1 were preempting each
+ other on a non-NCQ device.
+
+ Reported-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Corrado Zoccolo <czoccolo at gmail.com>
+ Acked-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 1baaab33a240924a5542eeb7a275d2915dc09518
+Author: Corrado Zoccolo <czoccolo at gmail.com>
+Date: Tue Apr 27 16:48:23 2010 +0400
+
+ cfq-iosched: fix ncq detection code
+
+ CFQ's detection of queueing devices initially assumes a queuing device
+ and detects if the queue depth reaches a certain threshold.
+ However, it will reconsider this choice periodically.
+
+ Unfortunately, if device is considered not queuing, CFQ will force a
+ unit queue depth for some workloads, thus defeating the detection logic.
+ This leads to poor performance on queuing hardware,
+ since the idle window remains enabled.
+
+ Given this premise, switching to hw_tag = 0 after we have proved at
+ least once that the device is NCQ capable is not a good choice.
+
+ The new detection code starts in an indeterminate state, in which CFQ behaves
+ as if hw_tag = 1, and then, if for a long observation period we never saw
+ large depth, we switch to hw_tag = 0, otherwise we stick to hw_tag = 1,
+ without reconsidering it again.
+
+ Signed-off-by: Corrado Zoccolo <czoccolo at gmail.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 43090d5ccb1b6adcd28b2d4d54cc8ddf6c96a212
+Author: Corrado Zoccolo <czoccolo at gmail.com>
+Date: Tue Apr 27 16:48:19 2010 +0400
+
+ cfq-iosched: cleanup unreachable code
+
+ cfq_should_idle returns false for no-idle queues that are not the last,
+ so the control flow will never reach the removed code in a state that
+ satisfies the if condition.
+ The unreachable code was added to emulate previous cfq behaviour for
+ non-NCQ rotational devices. My tests show that even without it, the
+ performance and fairness are comparable with the previous cfq, thanks to
+ the fact that all seeky queues are grouped together, and that we idle at
+ the end of the tree.
+
+ Signed-off-by: Corrado Zoccolo <czoccolo at gmail.com>
+ Acked-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit ea4004872f1e7a3a3651319fd5df6df17e9c7e66
+Author: Gui Jianfeng <guijianfeng at cn.fujitsu.com>
+Date: Tue Apr 27 16:48:15 2010 +0400
+
+ cfq: Make use of service count to estimate the rb_key offset
+
+ For the moment, different workload cfq queues are put into different
+ service trees. But CFQ still uses "busy_queues" to estimate rb_key
+ offset when inserting a cfq queue into a service tree. I think this
+ isn't appropriate, and it should make use of service tree count to do
+ this estimation. This patch is for for-2.6.33 branch.
+
+ Signed-off-by: Gui Jianfeng <guijianfeng at cn.fujitsu.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 4c49bbef74b78184ecdc8d4c14c6d531f9edea42
+Author: Randy Dunlap <randy.dunlap at oracle.com>
+Date: Tue Apr 27 16:46:51 2010 +0400
+
+ block: jiffies fixes
+
+ Use HZ-independent calculation of milliseconds.
+ Add jiffies.h where it was missing since functions or macros
+ from it are used.
+
+ Signed-off-by: Randy Dunlap <randy.dunlap at oracle.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
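The HZ-independent form the patch converts to is the standard jiffies helper; a minimal before/after sketch:

    #include <linux/jiffies.h>

    static void example_timeouts(void)
    {
            /* Wrong: "100" is 100 ticks, i.e. 100 ms only when HZ happens to be 1000. */
            unsigned long timeout_ticks = jiffies + 100;

            /* Right: always 100 milliseconds, regardless of the configured HZ. */
            unsigned long timeout_ms    = jiffies + msecs_to_jiffies(100);

            (void)timeout_ticks;
            (void)timeout_ms;
    }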
+commit f96f26aeb96cc338693fe5c2d48ab04e799f0187
+Author: Corrado Zoccolo <czoccolo at gmail.com>
+Date: Tue Apr 27 16:46:46 2010 +0400
+
+ cfq-iosched: fix next_rq computation
+
+ Cfq has a bug in the computation of next_rq that affects the transition
+ between multiple sequential request streams in a single queue
+ (e.g.: two sequential buffered writers of the same priority),
+ causing the alternation between the two streams for a transient period.
+
+ 8,0 1 18737 0.260400660 5312 D W 141653311 + 256
+ 8,0 1 20839 0.273239461 5400 D W 141653567 + 256
+ 8,0 1 20841 0.276343885 5394 D W 142803919 + 256
+ 8,0 1 20843 0.279490878 5394 D W 141668927 + 256
+ 8,0 1 20845 0.292459993 5400 D W 142804175 + 256
+ 8,0 1 20847 0.295537247 5400 D W 141668671 + 256
+ 8,0 1 20849 0.298656337 5400 D W 142804431 + 256
+ 8,0 1 20851 0.311481148 5394 D W 141668415 + 256
+ 8,0 1 20853 0.314421305 5394 D W 142804687 + 256
+ 8,0 1 20855 0.318960112 5400 D W 142804943 + 256
+
+ The fix makes sure that the next_rq is computed from the last
+ dispatched request, and not affected by merging.
+
+ 8,0 1 37776 4.305161306 0 D W 141738087 + 256
+ 8,0 1 37778 4.308298091 0 D W 141738343 + 256
+ 8,0 1 37780 4.312885190 0 D W 141738599 + 256
+ 8,0 1 37782 4.315933291 0 D W 141738855 + 256
+ 8,0 1 37784 4.319064459 0 D W 141739111 + 256
+ 8,0 1 37786 4.331918431 5672 D W 142803007 + 256
+ 8,0 1 37788 4.334930332 5672 D W 142803263 + 256
+ 8,0 1 37790 4.337902723 5672 D W 142803519 + 256
+ 8,0 1 37792 4.342359774 5672 D W 142803775 + 256
+ 8,0 1 37794 4.345318286 0 D W 142804031 + 256
+
+ Signed-off-by: Corrado Zoccolo <czoccolo at gmail.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit b12c0189dd602b89f3c6d82e050a7579f5813a09
+Author: Jens Axboe <jens.axboe at oracle.com>
+Date: Tue Apr 27 16:44:33 2010 +0400
+
+ cfq-iosched: get rid of the coop_preempt flag
+
+ We need to rework this logic post the cooperating cfq_queue merging,
+ for now just get rid of it and Jeff Moyer will fix the fall out.
+
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 738f35df496b0c4a214f08b356f1a08d6f87b70e
+Author: Jens Axboe <jens.axboe at oracle.com>
+Date: Tue Apr 27 16:44:28 2010 +0400
+
+ cfq-iosched: fix merge error
+
+ We ended up with testing the same condition twice, pretty
+ pointless. Remove that first if.
+
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit d70f9c5005fd87d2d9bcfe5a1dd831e119d497b5
+Author: Jens Axboe <jens.axboe at oracle.com>
+Date: Tue Apr 27 16:42:34 2010 +0400
+
+ cfq-iosched: fix style issue in cfq_get_avg_queues()
+
+ Line breaks and bad brace placement.
+
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 1ad5fcfc2beacbe333bd947a6a95acb9ee810891
+Author: Corrado Zoccolo <czoccolo at gmail.com>
+Date: Tue Apr 27 16:42:30 2010 +0400
+
+ cfq-iosched: fairness for sync no-idle queues
+
+ Currently, no-idle queues in cfq are not serviced fairly:
+ even though they can only dispatch a small number of requests at a time,
+ they have to compete with idling queues to be serviced, and so they
+ experience large latencies.
+
+ Note, instead, that no-idle queues are the ones that would benefit
+ most from low latency; in fact they are any of:
+ * processes with large think times (e.g. interactive ones like file
+ managers)
+ * seeky processes (e.g. programs faulting in their code at startup)
+ * or queues marked as no-idle from upper levels, to improve the latency
+ of those requests.
+
+ This patch improves the fairness and latency for those queues by:
+ * separating sync idle, sync no-idle and async queues into separate
+ service_trees, for each priority
+ * servicing all no-idle queues together
+ * idling when the last no-idle queue has been serviced, to
+ anticipate more no-idle work
+ * computing the timeslices allotted to the idle and no-idle
+ service_trees proportionally to the number of processes in each set.
+
+ Servicing all no-idle queues together should provide a performance boost
+ on NCQ-capable drives, without compromising fairness.
+
+ Signed-off-by: Corrado Zoccolo <czoccolo at gmail.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
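The separation described above shows up later in this patch as the per-group service_trees[prio][type] arrays plus the cfqq_prio(), cfqq_type() and service_tree_for() helpers in the cfq-iosched.c hunk. A condensed sketch of how a queue maps onto its tree, reusing those same helpers:

/* Condensed sketch of the tree selection; cfq_class_idle(), cfqq_prio()
 * and cfqq_type() are the helpers defined in the cfq-iosched.c changes
 * further down in this patch. */
static struct cfq_rb_root *
tree_for_queue_sketch(struct cfq_group *cfqg, struct cfq_queue *cfqq)
{
	if (cfq_class_idle(cfqq))
		return &cfqg->service_tree_idle;

	/* one tree per (RT/BE priority class, async/sync-noidle/sync type) */
	return &cfqg->service_trees[cfqq_prio(cfqq)][cfqq_type(cfqq)];
}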
+commit e2d27033102f717078e4bfdc9229ef84dbd8088c
+Author: Corrado Zoccolo <czoccolo at gmail.com>
+Date: Tue Apr 27 16:42:03 2010 +0400
+
+ cfq-iosched: enable idling for last queue on priority class
+
+ cfq can disable idling for queues in various circumstances.
+ When workloads of different priorities are competing, if the higher
+ priority queue has idling disabled, lower priority queues may steal
+ its disk share. For example, in a scenario with an RT process
+ performing seeky reads vs a BE process performing sequential reads,
+ on NCQ-enabled hardware with low_latency unset,
+ the RT process will dispatch only its few pending requests once per full
+ slice of service given to the BE process.
+
+ The patch solves this issue by always performing idle on the last
+ queue at a given priority class > idle. If the same process, or one
+ that can pre-empt it (so at the same priority or higher), submits a
+ new request within the idle window, the lower priority queue won't
+ dispatch, saving the disk bandwidth for higher priority ones.
+
+ Note: this doesn't touch the non_rotational + NCQ case (no hardware
+ to test if this is a benefit in that case).
+
+ Signed-off-by: Corrado Zoccolo <czoccolo at gmail.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
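A hedged sketch of the rule described above (the actual implementation lives in the cfq-iosched.c changes, not reproduced in this excerpt): idle-class queues never idle, queues with the idle window enabled always may, and otherwise we still idle when the queue is the last busy one on its service tree.

/* Illustrative only: cfqq->service_tree and its ->count field are added
 * by this patch series; cfq_class_idle() and cfq_cfqq_idle_window() are
 * existing cfq helpers. */
static bool should_idle_sketch(struct cfq_queue *cfqq)
{
	if (cfq_class_idle(cfqq))
		return false;
	if (cfq_cfqq_idle_window(cfqq))
		return true;
	/* last busy queue on its service tree: protect its disk share */
	return cfqq->service_tree && cfqq->service_tree->count == 1;
}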
+commit ddc6295b4d6c3461a02f98ba75cbfe900a087ee4
+Author: Corrado Zoccolo <czoccolo at gmail.com>
+Date: Tue Apr 27 16:41:59 2010 +0400
+
+ cfq-iosched: reimplement priorities using different service trees
+
+ We use different service trees for different priority classes.
+ This allows a simplification of the service tree insertion code, which no
+ longer has to consider priority while walking the tree.
+
+ Signed-off-by: Corrado Zoccolo <czoccolo at gmail.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit b1ca547aa679a0605bf9cfbc2ee8c4d0f9738e90
+Author: Corrado Zoccolo <czoccolo at gmail.com>
+Date: Tue Apr 27 16:41:55 2010 +0400
+
+ cfq-iosched: preparation to handle multiple service trees
+
+ We embed a pointer to the service tree in each queue, to handle multiple
+ service trees easily.
+ Service trees are enriched with a counter.
+ cfq_add_rq_rb is invoked after putting the rq in the fifo, to ensure
+ that all fields in rq are properly initialized.
+
+ Signed-off-by: Corrado Zoccolo <czoccolo at gmail.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 72c938338cfb00497c498fd05901c23f2fa9e6ce
+Author: Corrado Zoccolo <czoccolo at gmail.com>
+Date: Tue Apr 27 16:41:50 2010 +0400
+
+ cfq-iosched: adapt slice to number of processes doing I/O
+
+ When the number of processes performing I/O concurrently increases,
+ a fixed time slice per process will cause large latencies.
+
+ If low_latency mode is enabled, this patch scales the time slice
+ assigned to each process according to a 300ms target latency.
+
+ In order to keep fairness among processes:
+ * The number of active processes is computed using a special form of
+ running average that quickly follows sudden increases (to keep latency low)
+ and decreases slowly (to preserve fairness in spite of rapid decreases of this
+ value).
+
+ To safeguard sequential bandwidth, we impose a minimum time slice
+ (computed using 2*cfq_slice_idle as base, adjusted according to priority
+ and async-ness).
+
+ Signed-off-by: Corrado Zoccolo <czoccolo at gmail.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
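The scaling itself lands in cfq_set_prio_slice() further down in this patch. A worked example, assuming a single cfq group (so the group slice equals the 300 ms target) and the default cfq tunables quoted in milliseconds: with 6 busy queues of 100 ms sync slice each, the expected latency would be 600 ms, so each slice is scaled down to 50 ms, bounded below by the 2 * cfq_slice_idle floor.

#include <linux/kernel.h>

/* Numbers in milliseconds; constants mirror the cfq defaults mentioned
 * above (sync slice 100 ms, target latency 300 ms, slice_idle 8 ms). */
static unsigned int scaled_slice_example(unsigned int iq)
{
	unsigned int sync_slice = 100;
	unsigned int target_latency = 300;
	unsigned int slice = sync_slice;
	unsigned int low_slice = min(slice, 2 * 8 * slice / sync_slice);

	if (sync_slice * iq > target_latency)
		slice = max(slice * target_latency / (sync_slice * iq), low_slice);

	return slice;	/* scaled_slice_example(6) == 50 */
}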
+commit ca34f4ef05e2b5abcb60af65a69a367ea9f5148e
+Author: Corrado Zoccolo <czoccolo at gmail.com>
+Date: Tue Apr 27 16:41:46 2010 +0400
+
+ cfq-iosched: simplify prio-unboost code
+
+ Eliminate redundant checks.
+
+ Signed-off-by: Corrado Zoccolo <czoccolo at gmail.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit ab7d66cd0bd0aff8fe977d03cd20afd1ff3a5dfd
+Author: Shaohua Li <shaohua.li at intel.com>
+Date: Tue Apr 27 16:41:42 2010 +0400
+
+ cfq-iosched: improve hw_tag detection
+
+ If the active queue doesn't have enough requests and the idle window opens,
+ cfq will not dispatch sufficient requests to the hardware. In that situation
+ the current code zeroes hw_tag, but the low queue depth is caused by cfq not
+ dispatching enough requests, not by the hardware queue failing to work.
+ Don't zero hw_tag in that case.
+
+ Signed-off-by: Shaohua Li <shaohua.li at intel.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 00b99100690429e98f3a8efe7f59fe124814bc67
+Author: Jeff Moyer <jmoyer at redhat.com>
+Date: Tue Apr 27 16:41:38 2010 +0400
+
+ cfq: break apart merged cfqqs if they stop cooperating
+
+ cfq_queues are merged if they are issuing requests within the mean seek
+ distance of one another. This patch detects when the cooperation stops and
+ breaks the queues back up.
+
+ Signed-off-by: Jeff Moyer <jmoyer at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 9186d4378bed803bf7cca93c1abc4d74adab2ed2
+Author: Jeff Moyer <jmoyer at redhat.com>
+Date: Tue Apr 27 16:32:26 2010 +0400
+
+ cfq: change the meaning of the cfqq_coop flag
+
+ The flag used to indicate that a cfqq was allowed to jump ahead in the
+ scheduling order due to submitting a request close to the queue that
+ just executed. Since closely cooperating queues are now merged, the flag
+ holds little meaning. Change it to indicate that multiple queues were
+ merged. This will later be used to allow the breaking up of merged queues
+ when they are no longer cooperating.
+
+ Signed-off-by: Jeff Moyer <jmoyer at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit de85cbb1eaf76b988bbe96f89b4761352adf4614
+Author: Jeff Moyer <jmoyer at redhat.com>
+Date: Tue Apr 27 16:32:20 2010 +0400
+
+ cfq: merge cooperating cfq_queues
+
+ Currently, when cooperating cfq_queues are detected, they are allowed to
+ skip ahead in the scheduling order. It is much more efficient to
+ automatically share the cfq_queue data structure between cooperating processes.
+ Performance of the read-test2 benchmark (which is written to emulate the
+ dump(8) utility) went from 12MB/s to 90MB/s on my SATA disk. NFS servers
+ with multiple nfsd threads also saw performance increases.
+
+ Signed-off-by: Jeff Moyer <jmoyer at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit e09d12221f4d1c7fcb00fd687ae6e759c39054c6
+Author: Jeff Moyer <jmoyer at redhat.com>
+Date: Tue Apr 27 16:18:17 2010 +0400
+
+ cfq: calculate the seek_mean per cfq_queue not per cfq_io_context
+
+ async cfq_queues are already shared between processes within the same
+ priority, and forthcoming patches will change the mapping of cic to sync
+ cfq_queue from 1:1 to 1:N. So, calculate the seekiness of a process
+ based on the cfq_queue instead of the cfq_io_context.
+
+ Signed-off-by: Jeff Moyer <jmoyer at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
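The per-queue fields this commit introduces (seek_samples, seek_total, seek_mean) and the CFQQ_SEEKY() test built on them are visible in the cfq-iosched.c hunk further down. A small sketch of how the per-queue state is consulted (the running-average update itself is not reproduced here):

/* sample_valid() and CFQQ_SEEKY() are defined in the cfq-iosched.c
 * changes below; a queue counts as seeky once enough samples exist
 * and its mean seek distance exceeds CFQQ_SEEK_THR. */
static bool queue_is_seeky_sketch(struct cfq_queue *cfqq)
{
	return sample_valid(cfqq->seek_samples) && CFQQ_SEEKY(cfqq);
}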
commit c05f95fcb04e896c898218d12a8f37c43d2f9cc6
Author: Pavel Emelyanov <xemul at openvz.org>
Date: Tue Apr 27 15:10:13 2010 +0400
@@ -4590,14 +6200,14 @@
+library. If this is what you want to do, use the GNU Library General
+Public License instead of this License.
diff --git a/Makefile b/Makefile
-index 573578f..12ba193 100644
+index 801d0e1..4eac9f7 100644
--- a/Makefile
+++ b/Makefile
@@ -2,6 +2,7 @@ VERSION = 2
PATCHLEVEL = 6
SUBLEVEL = 32
EXTRAVERSION =
-+VZVERSION = avdeyev
++VZVERSION = balandin
NAME = Man-Eating Seals of Antiquity
# *DOCUMENTATION*
@@ -4621,10 +6231,10 @@
define filechk_version.h
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
-index 4fdb669..1334638 100644
+index fbc161d..e6cc64c 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
-@@ -2069,6 +2069,8 @@ config HAVE_ATOMIC_IOMAP
+@@ -2074,6 +2074,8 @@ config HAVE_ATOMIC_IOMAP
def_bool y
depends on X86_32
@@ -4633,7 +6243,7 @@
source "net/Kconfig"
source "drivers/Kconfig"
-@@ -2086,3 +2088,5 @@ source "crypto/Kconfig"
+@@ -2091,3 +2093,5 @@ source "crypto/Kconfig"
source "arch/x86/kvm/Kconfig"
source "lib/Kconfig"
@@ -5178,7 +6788,7 @@
regs.bx = (unsigned long) fn;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
-index 6eabe90..490f4f5 100644
+index 868fdb4..0cc650d 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -25,8 +25,10 @@
@@ -5230,7 +6840,7 @@
}
void release_thread(struct task_struct *dead_task)
-@@ -681,3 +684,20 @@ unsigned long KSTK_ESP(struct task_struct *task)
+@@ -680,3 +683,20 @@ unsigned long KSTK_ESP(struct task_struct *task)
return (test_tsk_thread_flag(task, TIF_IA32)) ?
(task_pt_regs(task)->sp) : ((task)->thread.usersp);
}
@@ -5365,263 +6975,3193 @@
printk(" passed.\n");
}
-+#ifdef CONFIG_VE
-+ /* TSC reset. kill whatever might rely on old values */
-+ VE_TASK_INFO(current)->wakeup_stamp = 0;
-+#endif
++#ifdef CONFIG_VE
++ /* TSC reset. kill whatever might rely on old values */
++ VE_TASK_INFO(current)->wakeup_stamp = 0;
++#endif
+ /*
+ * Reset it - just in case we boot another CPU later:
+ */
+diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
+index 3909e3b..bbfa7af 100644
+--- a/arch/x86/kernel/x8664_ksyms_64.c
++++ b/arch/x86/kernel/x8664_ksyms_64.c
+@@ -3,6 +3,7 @@
+
+ #include <linux/module.h>
+ #include <linux/smp.h>
++#include <linux/syscalls.h>
+
+ #include <net/checksum.h>
+
+@@ -17,6 +18,7 @@
+ EXPORT_SYMBOL(mcount);
+ #endif
+
++EXPORT_SYMBOL(kernel_execve);
+ EXPORT_SYMBOL(kernel_thread);
+
+ EXPORT_SYMBOL(__get_user_1);
+diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
+index f4cee90..3e549cd 100644
+--- a/arch/x86/mm/fault.c
++++ b/arch/x86/mm/fault.c
+@@ -689,7 +689,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
+ if (!printk_ratelimit())
+ return;
+
+- printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
++ ve_printk(VE_LOG, "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
+ task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
+ tsk->comm, task_pid_nr(tsk), address,
+ (void *)regs->ip, (void *)regs->sp, error_code);
+@@ -909,7 +909,7 @@ spurious_fault(unsigned long error_code, unsigned long address)
+ return ret;
+ }
+
+-int show_unhandled_signals = 1;
++int show_unhandled_signals = 0;
+
+ static inline int
+ access_error(unsigned long error_code, int write, struct vm_area_struct *vma)
+diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
+index f46c340..6b7330c 100644
+--- a/arch/x86/mm/hugetlbpage.c
++++ b/arch/x86/mm/hugetlbpage.c
+@@ -12,6 +12,7 @@
+ #include <linux/slab.h>
+ #include <linux/err.h>
+ #include <linux/sysctl.h>
++#include <linux/module.h>
+ #include <asm/mman.h>
+ #include <asm/tlb.h>
+ #include <asm/tlbflush.h>
+@@ -230,6 +231,7 @@ int pud_huge(pud_t pud)
+ {
+ return !!(pud_val(pud) & _PAGE_PSE);
+ }
++EXPORT_SYMBOL(pmd_huge);
+
+ struct page *
+ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
+diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
+index c9ba9de..589a93b 100644
+--- a/arch/x86/mm/pgtable.c
++++ b/arch/x86/mm/pgtable.c
+@@ -4,7 +4,8 @@
+ #include <asm/tlb.h>
+ #include <asm/fixmap.h>
+
+-#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
++#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO | __GFP_UBC
++#define PGALLOC_KERN_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
+
+ #ifdef CONFIG_HIGHPTE
+ #define PGALLOC_USER_GFP __GFP_HIGHMEM
+@@ -16,7 +17,7 @@ gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP;
+
+ pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
+ {
+- return (pte_t *)__get_free_page(PGALLOC_GFP);
++ return (pte_t *)__get_free_page(PGALLOC_KERN_GFP);
+ }
+
+ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
+diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
+index 36fe08e..42445e5 100644
+--- a/arch/x86/mm/tlb.c
++++ b/arch/x86/mm/tlb.c
+@@ -256,6 +256,8 @@ void flush_tlb_mm(struct mm_struct *mm)
+ preempt_enable();
+ }
+
++EXPORT_SYMBOL(flush_tlb_mm);
++
+ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
+ {
+ struct mm_struct *mm = vma->vm_mm;
+diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
+index 58bc00f..b7028c5 100644
+--- a/arch/x86/vdso/vdso32-setup.c
++++ b/arch/x86/vdso/vdso32-setup.c
+@@ -17,6 +17,8 @@
+ #include <linux/err.h>
+ #include <linux/module.h>
+
++#include <bc/vmpages.h>
++
+ #include <asm/cpufeature.h>
+ #include <asm/msr.h>
+ #include <asm/pgtable.h>
+@@ -37,6 +39,8 @@ enum {
+ #else
+ #define VDSO_DEFAULT VDSO_ENABLED
+ #endif
++#undef VDSO_DEFAULT
++#define VDSO_DEFAULT VDSO_DISABLED
+
+ #ifdef CONFIG_X86_64
+ #define vdso_enabled sysctl_vsyscall32
+@@ -193,7 +197,8 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr)
+ }
+ }
+
+-static struct page *vdso32_pages[1];
++struct page *vdso32_pages[1];
++EXPORT_SYMBOL_GPL(vdso32_pages);
+
+ #ifdef CONFIG_X86_64
+
+@@ -309,16 +314,30 @@ int __init sysenter_setup(void)
+ return 0;
+ }
+
++EXPORT_SYMBOL_GPL(VDSO32_SYSENTER_RETURN);
++EXPORT_SYMBOL_GPL(VDSO32_PRELINK);
++
+ /* Setup a VMA at program startup for the vsyscall page */
+-int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
++int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp,
++ unsigned long map_address)
+ {
+ struct mm_struct *mm = current->mm;
+- unsigned long addr;
++ unsigned long addr = map_address;
+ int ret = 0;
+ bool compat;
++ unsigned long flags;
+
+- if (vdso_enabled == VDSO_DISABLED)
++ if (vdso_enabled == VDSO_DISABLED && map_address == 0) {
++ current->mm->context.vdso = NULL;
+ return 0;
++ }
++
++ flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC | VM_MAYWRITE |
++ mm->def_flags;
++
++ ret = -ENOMEM;
++ if (ub_memory_charge(mm, PAGE_SIZE, flags, NULL, UB_SOFT))
++ goto err_charge;
+
+ down_write(&mm->mmap_sem);
+
+@@ -328,19 +347,18 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
+
+ map_compat_vdso(compat);
+
+- if (compat)
+- addr = VDSO_HIGH_BASE;
+- else {
+- addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
++ if (!compat || map_address) {
++ addr = get_unmapped_area(NULL, addr, PAGE_SIZE, 0, 0);
+ if (IS_ERR_VALUE(addr)) {
+ ret = addr;
+ goto up_fail;
+ }
+- }
++ } else
++ addr = VDSO_HIGH_BASE;
+
+ current->mm->context.vdso = (void *)addr;
+
+- if (compat_uses_vma || !compat) {
++ if (compat_uses_vma || !compat || map_address) {
+ /*
+ * MAYWRITE to allow gdb to COW and set breakpoints
+ *
+@@ -368,9 +386,13 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
+ current->mm->context.vdso = NULL;
+
+ up_write(&mm->mmap_sem);
++ if (ret < 0)
++ ub_memory_uncharge(mm, PAGE_SIZE, flags, NULL);
++err_charge:
+
+ return ret;
+ }
++EXPORT_SYMBOL_GPL(arch_setup_additional_pages);
+
+ #ifdef CONFIG_X86_64
+
+diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
+index 21e1aeb..507ba17 100644
+--- a/arch/x86/vdso/vma.c
++++ b/arch/x86/vdso/vma.c
+@@ -4,6 +4,7 @@
+ * Subject to the GPL, v.2
+ */
+ #include <linux/mm.h>
++#include <linux/module.h>
+ #include <linux/err.h>
+ #include <linux/sched.h>
+ #include <linux/init.h>
+@@ -99,17 +100,23 @@ static unsigned long vdso_addr(unsigned long start, unsigned len)
+
+ /* Setup a VMA at program startup for the vsyscall page.
+ Not called for compat tasks */
+-int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
++int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp,
++ unsigned long map_address)
+ {
+ struct mm_struct *mm = current->mm;
+ unsigned long addr;
+ int ret;
+
+- if (!vdso_enabled)
++ if (!vdso_enabled && map_address == 0) {
++ current->mm->context.vdso = NULL;
+ return 0;
++ }
+
+ down_write(&mm->mmap_sem);
+- addr = vdso_addr(mm->start_stack, vdso_size);
++ if (map_address)
++ addr = map_address;
++ else
++ addr = vdso_addr(mm->start_stack, vdso_size);
+ addr = get_unmapped_area(NULL, addr, vdso_size, 0, 0);
+ if (IS_ERR_VALUE(addr)) {
+ ret = addr;
+@@ -132,6 +139,7 @@ up_fail:
+ up_write(&mm->mmap_sem);
+ return ret;
+ }
++EXPORT_SYMBOL_GPL(arch_setup_additional_pages);
+
+ static __init int vdso_setup(char *s)
+ {
+diff --git a/block/Kconfig b/block/Kconfig
+index 9be0b56..e20fbde 100644
+--- a/block/Kconfig
++++ b/block/Kconfig
+@@ -77,6 +77,28 @@ config BLK_DEV_INTEGRITY
+ T10/SCSI Data Integrity Field or the T13/ATA External Path
+ Protection. If in doubt, say N.
+
++config BLK_CGROUP
++ bool
++ depends on CGROUPS
++ default n
++ ---help---
++ Generic block IO controller cgroup interface. This is the common
++ cgroup interface which should be used by various IO controlling
++ policies.
++
++ Currently, CFQ IO scheduler uses it to recognize task groups and
++ control disk bandwidth allocation (proportional time slice allocation)
++ to such task groups.
++
++config DEBUG_BLK_CGROUP
++ bool
++ depends on BLK_CGROUP
++ default n
++ ---help---
++ Enable some debugging help. Currently it stores the cgroup path
++ in the blk group which can be used by cfq for tracing various
++ group related activity.
++
+ endif # BLOCK
+
+ config BLOCK_COMPAT
+diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
+index 7e803fc..9c5f0b5 100644
+--- a/block/Kconfig.iosched
++++ b/block/Kconfig.iosched
+@@ -40,6 +40,23 @@ config IOSCHED_CFQ
+ working environment, suitable for desktop systems.
+ This is the default I/O scheduler.
+
++config CFQ_GROUP_IOSCHED
++ bool "CFQ Group Scheduling support"
++ depends on IOSCHED_CFQ && CGROUPS
++ select BLK_CGROUP
++ default n
++ ---help---
++ Enable group IO scheduling in CFQ.
++
++config DEBUG_CFQ_IOSCHED
++ bool "Debug CFQ Scheduling"
++ depends on CFQ_GROUP_IOSCHED
++ select DEBUG_BLK_CGROUP
++ default n
++ ---help---
++ Enable CFQ IO scheduling debugging in CFQ. Currently it makes
++ blktrace output more verbose.
++
+ choice
+ prompt "Default I/O scheduler"
+ default DEFAULT_CFQ
+diff --git a/block/Makefile b/block/Makefile
+index ba74ca6..16334c9 100644
+--- a/block/Makefile
++++ b/block/Makefile
+@@ -8,6 +8,7 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
+ blk-iopoll.o ioctl.o genhd.o scsi_ioctl.o
+
+ obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
++obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o
+ obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
+ obj-$(CONFIG_IOSCHED_AS) += as-iosched.o
+ obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
+diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
+new file mode 100644
+index 0000000..444f20b
+--- /dev/null
++++ b/block/blk-cgroup.c
+@@ -0,0 +1,366 @@
++/*
++ * Common Block IO controller cgroup interface
++ *
++ * Based on ideas and code from CFQ, CFS and BFQ:
++ * Copyright (C) 2003 Jens Axboe <axboe at kernel.dk>
++ *
++ * Copyright (C) 2008 Fabio Checconi <fabio at gandalf.sssup.it>
++ * Paolo Valente <paolo.valente at unimore.it>
++ *
++ * Copyright (C) 2009 Vivek Goyal <vgoyal at redhat.com>
++ * Nauman Rafique <nauman at google.com>
++ */
++#include <linux/ioprio.h>
++#include <linux/seq_file.h>
++#include <linux/kdev_t.h>
++#include <linux/module.h>
++#include <linux/err.h>
++#include "blk-cgroup.h"
++
++static DEFINE_SPINLOCK(blkio_list_lock);
++static LIST_HEAD(blkio_list);
++
++struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
++EXPORT_SYMBOL_GPL(blkio_root_cgroup);
++
++bool blkiocg_css_tryget(struct blkio_cgroup *blkcg)
++{
++ if (!css_tryget(&blkcg->css))
++ return false;
++ return true;
++}
++EXPORT_SYMBOL_GPL(blkiocg_css_tryget);
++
++void blkiocg_css_put(struct blkio_cgroup *blkcg)
++{
++ css_put(&blkcg->css);
++}
++EXPORT_SYMBOL_GPL(blkiocg_css_put);
++
++struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
++{
++ return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
++ struct blkio_cgroup, css);
++}
++EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
++
++void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
++ unsigned long time, unsigned long sectors)
++{
++ blkg->time += time;
++ blkg->sectors += sectors;
++}
++EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_stats);
++
++void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
++ struct blkio_group *blkg, void *key, dev_t dev)
++{
++ unsigned long flags;
++
++ spin_lock_irqsave(&blkcg->lock, flags);
++ rcu_assign_pointer(blkg->key, key);
++ blkg->blkcg_id = css_id(&blkcg->css);
++ hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
++ spin_unlock_irqrestore(&blkcg->lock, flags);
++#ifdef CONFIG_DEBUG_BLK_CGROUP
++ /* Need to take css reference ? */
++ cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
++#endif
++ blkg->dev = dev;
++}
++EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group);
++
++static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
++{
++ hlist_del_init_rcu(&blkg->blkcg_node);
++ blkg->blkcg_id = 0;
++}
++
++/*
++ * returns 0 if blkio_group was still on the cgroup list. Otherwise returns 1,
++ * indicating that the blkio_group was unhashed by the time we got to it.
++ */
++int blkiocg_del_blkio_group(struct blkio_group *blkg)
++{
++ struct blkio_cgroup *blkcg;
++ unsigned long flags;
++ struct cgroup_subsys_state *css;
++ int ret = 1;
++
++ rcu_read_lock();
++ css = css_lookup(&blkio_subsys, blkg->blkcg_id);
++ if (!css)
++ goto out;
++
++ blkcg = container_of(css, struct blkio_cgroup, css);
++ spin_lock_irqsave(&blkcg->lock, flags);
++ if (!hlist_unhashed(&blkg->blkcg_node)) {
++ __blkiocg_del_blkio_group(blkg);
++ ret = 0;
++ }
++ spin_unlock_irqrestore(&blkcg->lock, flags);
++out:
++ rcu_read_unlock();
++ return ret;
++}
++EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);
++
++/* called under rcu_read_lock(). */
++struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
++{
++ struct blkio_group *blkg;
++ struct hlist_node *n;
++ void *__key;
++
++ hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
++ __key = blkg->key;
++ if (__key == key)
++ return blkg;
++ }
++
++ return NULL;
++}
++EXPORT_SYMBOL_GPL(blkiocg_lookup_group);
++
++#define SHOW_FUNCTION(__VAR) \
++static u64 blkiocg_##__VAR##_read(struct cgroup *cgroup, \
++ struct cftype *cftype) \
++{ \
++ struct blkio_cgroup *blkcg; \
++ \
++ blkcg = cgroup_to_blkio_cgroup(cgroup); \
++ return (u64)blkcg->__VAR; \
++}
++
++SHOW_FUNCTION(weight);
++#undef SHOW_FUNCTION
++
++static int
++blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
++{
++ struct blkio_cgroup *blkcg;
++ struct blkio_group *blkg;
++ struct hlist_node *n;
++ struct blkio_policy_type *blkiop;
++
++ if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
++ return -EINVAL;
++
++ blkcg = cgroup_to_blkio_cgroup(cgroup);
++ spin_lock(&blkio_list_lock);
++ spin_lock_irq(&blkcg->lock);
++ blkcg->weight = (unsigned int)val;
++ hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
++ list_for_each_entry(blkiop, &blkio_list, list)
++ blkiop->ops.blkio_update_group_weight_fn(blkg,
++ blkcg->weight);
++ }
++ spin_unlock_irq(&blkcg->lock);
++ spin_unlock(&blkio_list_lock);
++ return 0;
++}
++
++int blkiocg_set_weight(struct cgroup *cgroup, u64 val)
++{
++ return blkiocg_weight_write(cgroup, NULL, val);
++}
++
++#define SHOW_FUNCTION_PER_GROUP(__VAR) \
++static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \
++ struct cftype *cftype, struct seq_file *m) \
++{ \
++ struct blkio_cgroup *blkcg; \
++ struct blkio_group *blkg; \
++ struct hlist_node *n; \
++ \
++ if (!cgroup_lock_live_group(cgroup)) \
++ return -ENODEV; \
++ \
++ blkcg = cgroup_to_blkio_cgroup(cgroup); \
++ rcu_read_lock(); \
++ hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\
++ if (blkg->dev) \
++ seq_printf(m, "%u:%u %lu\n", MAJOR(blkg->dev), \
++ MINOR(blkg->dev), blkg->__VAR); \
++ } \
++ rcu_read_unlock(); \
++ cgroup_unlock(); \
++ return 0; \
++}
++
++SHOW_FUNCTION_PER_GROUP(time);
++SHOW_FUNCTION_PER_GROUP(sectors);
++#ifdef CONFIG_DEBUG_BLK_CGROUP
++SHOW_FUNCTION_PER_GROUP(dequeue);
++#endif
++#undef SHOW_FUNCTION_PER_GROUP
++
++#ifdef CONFIG_DEBUG_BLK_CGROUP
++void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg,
++ unsigned long dequeue)
++{
++ blkg->dequeue += dequeue;
++}
++EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_dequeue_stats);
++#endif
++
++struct cftype blkio_files[] = {
++ {
++ .name = "weight",
++ .read_u64 = blkiocg_weight_read,
++ .write_u64 = blkiocg_weight_write,
++ },
++ {
++ .name = "time",
++ .read_seq_string = blkiocg_time_read,
++ },
++ {
++ .name = "sectors",
++ .read_seq_string = blkiocg_sectors_read,
++ },
++#ifdef CONFIG_DEBUG_BLK_CGROUP
++ {
++ .name = "dequeue",
++ .read_seq_string = blkiocg_dequeue_read,
++ },
++#endif
++};
++
++static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
++{
++ return cgroup_add_files(cgroup, subsys, blkio_files,
++ ARRAY_SIZE(blkio_files));
++}
++
++static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
++{
++ struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
++ unsigned long flags;
++ struct blkio_group *blkg;
++ void *key;
++ struct blkio_policy_type *blkiop;
++
++ rcu_read_lock();
++remove_entry:
++ spin_lock_irqsave(&blkcg->lock, flags);
++
++ if (hlist_empty(&blkcg->blkg_list)) {
++ spin_unlock_irqrestore(&blkcg->lock, flags);
++ goto done;
++ }
++
++ blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
++ blkcg_node);
++ key = rcu_dereference(blkg->key);
++ __blkiocg_del_blkio_group(blkg);
++
++ spin_unlock_irqrestore(&blkcg->lock, flags);
++
++ /*
++ * This blkio_group is being unlinked as associated cgroup is going
++ * away. Let all the IO controlling policies know about this event.
++ *
++ * Currently this is a static call to one io controlling policy. Once
++ * we have more policies in place, we need some dynamic registration
++ * of callback function.
++ */
++ spin_lock(&blkio_list_lock);
++ list_for_each_entry(blkiop, &blkio_list, list)
++ blkiop->ops.blkio_unlink_group_fn(key, blkg);
++ spin_unlock(&blkio_list_lock);
++ goto remove_entry;
++done:
++ free_css_id(&blkio_subsys, &blkcg->css);
++ rcu_read_unlock();
++ kfree(blkcg);
++}
++
++static struct cgroup_subsys_state *
++blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
++{
++ struct blkio_cgroup *blkcg, *parent_blkcg;
++
++ if (!cgroup->parent) {
++ blkcg = &blkio_root_cgroup;
++ goto done;
++ }
++
++ /* Currently we do not support hierarchy deeper than two levels (0,1) */
++ parent_blkcg = cgroup_to_blkio_cgroup(cgroup->parent);
++ if (css_depth(&parent_blkcg->css) > 0)
++ return ERR_PTR(-EINVAL);
++
++ blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
++ if (!blkcg)
++ return ERR_PTR(-ENOMEM);
++
++ blkcg->weight = BLKIO_WEIGHT_DEFAULT;
++done:
++ spin_lock_init(&blkcg->lock);
++ INIT_HLIST_HEAD(&blkcg->blkg_list);
++
++ return &blkcg->css;
++}
++
++/*
++ * We cannot support shared io contexts, as we have no mean to support
++ * two tasks with the same ioc in two different groups without major rework
++ * of the main cic data structures. For now we allow a task to change
++ * its cgroup only if it's the only owner of its ioc.
++ */
++static int blkiocg_can_attach(struct cgroup_subsys *subsys,
++ struct cgroup *cgroup, struct task_struct *tsk,
++ bool threadgroup)
++{
++ struct io_context *ioc;
++ int ret = 0;
++
++ /* task_lock() is needed to avoid races with exit_io_context() */
++ task_lock(tsk);
++ ioc = tsk->io_context;
++ if (ioc && atomic_read(&ioc->nr_tasks) > 1)
++ ret = -EINVAL;
++ task_unlock(tsk);
++
++ return ret;
++}
++
++static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup,
++ struct cgroup *prev, struct task_struct *tsk,
++ bool threadgroup)
++{
++ struct io_context *ioc;
++
++ task_lock(tsk);
++ ioc = tsk->io_context;
++ if (ioc)
++ ioc->cgroup_changed = 1;
++ task_unlock(tsk);
++}
++
++struct cgroup_subsys blkio_subsys = {
++ .name = "blkio",
++ .create = blkiocg_create,
++ .can_attach = blkiocg_can_attach,
++ .attach = blkiocg_attach,
++ .destroy = blkiocg_destroy,
++ .populate = blkiocg_populate,
++ .subsys_id = blkio_subsys_id,
++ .use_id = 1,
++};
++
++void blkio_policy_register(struct blkio_policy_type *blkiop)
++{
++ spin_lock(&blkio_list_lock);
++ list_add_tail(&blkiop->list, &blkio_list);
++ spin_unlock(&blkio_list_lock);
++}
++EXPORT_SYMBOL_GPL(blkio_policy_register);
++
++void blkio_policy_unregister(struct blkio_policy_type *blkiop)
++{
++ spin_lock(&blkio_list_lock);
++ list_del_init(&blkiop->list);
++ spin_unlock(&blkio_list_lock);
++}
++EXPORT_SYMBOL_GPL(blkio_policy_unregister);
+diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
+new file mode 100644
+index 0000000..4d316df
+--- /dev/null
++++ b/block/blk-cgroup.h
+@@ -0,0 +1,127 @@
++#ifndef _BLK_CGROUP_H
++#define _BLK_CGROUP_H
++/*
++ * Common Block IO controller cgroup interface
++ *
++ * Based on ideas and code from CFQ, CFS and BFQ:
++ * Copyright (C) 2003 Jens Axboe <axboe at kernel.dk>
++ *
++ * Copyright (C) 2008 Fabio Checconi <fabio at gandalf.sssup.it>
++ * Paolo Valente <paolo.valente at unimore.it>
++ *
++ * Copyright (C) 2009 Vivek Goyal <vgoyal at redhat.com>
++ * Nauman Rafique <nauman at google.com>
++ */
++
++#include <linux/cgroup.h>
++
++#ifdef CONFIG_BLK_CGROUP
++
++struct blkio_cgroup {
++ struct cgroup_subsys_state css;
++ unsigned int weight;
++ spinlock_t lock;
++ struct hlist_head blkg_list;
++};
++
++struct blkio_group {
++ /* An rcu protected unique identifier for the group */
++ void *key;
++ struct hlist_node blkcg_node;
++ unsigned short blkcg_id;
++#ifdef CONFIG_DEBUG_BLK_CGROUP
++ /* Store cgroup path */
++ char path[128];
++ /* How many times this group has been removed from service tree */
++ unsigned long dequeue;
++#endif
++ /* The device MKDEV(major, minor), this group has been created for */
++ dev_t dev;
++
++ /* total disk time and nr sectors dispatched by this group */
++ unsigned long time;
++ unsigned long sectors;
++};
++
++extern bool blkiocg_css_tryget(struct blkio_cgroup *blkcg);
++extern void blkiocg_css_put(struct blkio_cgroup *blkcg);
++
++typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg);
++typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg,
++ unsigned int weight);
++
++struct blkio_policy_ops {
++ blkio_unlink_group_fn *blkio_unlink_group_fn;
++ blkio_update_group_weight_fn *blkio_update_group_weight_fn;
++};
++
++struct blkio_policy_type {
++ struct list_head list;
++ struct blkio_policy_ops ops;
++};
++
++/* Blkio controller policy registration */
++extern void blkio_policy_register(struct blkio_policy_type *);
++extern void blkio_policy_unregister(struct blkio_policy_type *);
++
++#else
++
++struct blkio_group {
++};
++
++struct blkio_policy_type {
++};
++
++static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { }
++static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { }
++
++#endif
++
++#define BLKIO_WEIGHT_MIN 100
++#define BLKIO_WEIGHT_MAX 1000
++#define BLKIO_WEIGHT_DEFAULT 500
++
++#ifdef CONFIG_DEBUG_BLK_CGROUP
++static inline char *blkg_path(struct blkio_group *blkg)
++{
++ return blkg->path;
++}
++void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg,
++ unsigned long dequeue);
++#else
++static inline char *blkg_path(struct blkio_group *blkg) { return NULL; }
++static inline void blkiocg_update_blkio_group_dequeue_stats(
++ struct blkio_group *blkg, unsigned long dequeue) {}
++#endif
++
++#ifdef CONFIG_BLK_CGROUP
++extern struct blkio_cgroup blkio_root_cgroup;
++extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup);
++extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
++ struct blkio_group *blkg, void *key, dev_t dev);
++extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
++extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
++ void *key);
++void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
++ unsigned long time, unsigned long sectors);
++#else
++struct cgroup;
++static inline struct blkio_cgroup *
++cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; }
++
++static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
++ struct blkio_group *blkg, void *key, dev_t dev)
++{
++}
++
++static inline int
++blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
++
++static inline struct blkio_group *
++blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; }
++static inline void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
++ unsigned long time, unsigned long sectors)
++{
++}
++#endif
++#endif /* _BLK_CGROUP_H */
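For reference, a hypothetical minimal consumer of the interface declared above (CFQ is the real consumer, wired up in the cfq-iosched.c changes below, and this assumes CONFIG_BLK_CGROUP): a policy fills in blkio_policy_ops and registers itself so it is notified when a blkio_group is unlinked or its weight changes. All names here are illustrative only.

#include <linux/init.h>
#include "blk-cgroup.h"

static void example_unlink_group(void *key, struct blkio_group *blkg)
{
	/* tear down the policy's per-group state hanging off blkg */
}

static void example_update_weight(struct blkio_group *blkg, unsigned int weight)
{
	/* fold the new cgroup weight into future scheduling decisions */
}

static struct blkio_policy_type example_blkio_policy = {
	.ops = {
		.blkio_unlink_group_fn         = example_unlink_group,
		.blkio_update_group_weight_fn  = example_update_weight,
	},
};

static int __init example_blkio_policy_init(void)
{
	blkio_policy_register(&example_blkio_policy);
	return 0;
}

static void __exit example_blkio_policy_exit(void)
{
	blkio_policy_unregister(&example_blkio_policy);
}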
+diff --git a/block/blk-settings.c b/block/blk-settings.c
+index 9651c0a..06c6694 100644
+--- a/block/blk-settings.c
++++ b/block/blk-settings.c
+@@ -9,6 +9,7 @@
+ #include <linux/bootmem.h> /* for max_pfn/max_low_pfn */
+ #include <linux/gcd.h>
+ #include <linux/lcm.h>
++#include <linux/jiffies.h>
+
+ #include "blk.h"
+
+@@ -142,7 +143,7 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
+ q->nr_batching = BLK_BATCH_REQ;
+
+ q->unplug_thresh = 4; /* hmm */
+- q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */
++ q->unplug_delay = msecs_to_jiffies(3); /* 3 milliseconds */
+ if (q->unplug_delay == 0)
+ q->unplug_delay = 1;
+
+diff --git a/block/bsg.c b/block/bsg.c
+index 0676301..a9fd2d8 100644
+--- a/block/bsg.c
++++ b/block/bsg.c
+@@ -15,6 +15,7 @@
+ #include <linux/blkdev.h>
+ #include <linux/poll.h>
+ #include <linux/cdev.h>
++#include <linux/jiffies.h>
+ #include <linux/percpu.h>
+ #include <linux/uio.h>
+ #include <linux/idr.h>
+@@ -197,7 +198,7 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq,
+ rq->cmd_len = hdr->request_len;
+ rq->cmd_type = REQ_TYPE_BLOCK_PC;
+
+- rq->timeout = (hdr->timeout * HZ) / 1000;
++ rq->timeout = msecs_to_jiffies(hdr->timeout);
+ if (!rq->timeout)
+ rq->timeout = q->sg_timeout;
+ if (!rq->timeout)
+diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
+index aa1e953..023f4e6 100644
+--- a/block/cfq-iosched.c
++++ b/block/cfq-iosched.c
+@@ -9,9 +9,11 @@
+ #include <linux/module.h>
+ #include <linux/blkdev.h>
+ #include <linux/elevator.h>
++#include <linux/jiffies.h>
+ #include <linux/rbtree.h>
+ #include <linux/ioprio.h>
+ #include <linux/blktrace_api.h>
++#include "blk-cgroup.h"
+
+ /*
+ * tunables
+@@ -27,6 +29,8 @@ static const int cfq_slice_sync = HZ / 10;
+ static int cfq_slice_async = HZ / 25;
+ static const int cfq_slice_async_rq = 2;
+ static int cfq_slice_idle = HZ / 125;
++static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
++static const int cfq_hist_divisor = 4;
+
+ /*
+ * offset from end of service tree
+@@ -40,6 +44,10 @@ static int cfq_slice_idle = HZ / 125;
+
+ #define CFQ_SLICE_SCALE (5)
+ #define CFQ_HW_QUEUE_MIN (5)
++#define CFQ_SERVICE_SHIFT 12
++
++#define CFQQ_SEEK_THR 8 * 1024
++#define CFQQ_SEEKY(cfqq) ((cfqq)->seek_mean > CFQQ_SEEK_THR)
+
+ #define RQ_CIC(rq) \
+ ((struct cfq_io_context *) (rq)->elevator_private)
+@@ -57,6 +65,7 @@ static DEFINE_SPINLOCK(ioc_gone_lock);
+ #define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
+
+ #define sample_valid(samples) ((samples) > 80)
++#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node)
+
+ /*
+ * Most of our rbtree usage is for sorting with min extraction, so
+@@ -67,8 +76,12 @@ static DEFINE_SPINLOCK(ioc_gone_lock);
+ struct cfq_rb_root {
+ struct rb_root rb;
+ struct rb_node *left;
++ unsigned count;
++ u64 min_vdisktime;
++ struct rb_node *active;
++ unsigned total_weight;
+ };
+-#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, }
++#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, 0, 0, }
+
+ /*
+ * Per process-grouping structure
+@@ -99,6 +112,11 @@ struct cfq_queue {
+ /* fifo list of requests in sort_list */
+ struct list_head fifo;
+
++ /* time when queue got scheduled in to dispatch first request. */
++ unsigned long dispatch_start;
++ unsigned int allocated_slice;
++ /* time when first request from queue completed and slice started. */
++ unsigned long slice_start;
+ unsigned long slice_end;
+ long slice_resid;
+ unsigned int slice_dispatch;
+@@ -112,7 +130,70 @@ struct cfq_queue {
+ unsigned short ioprio, org_ioprio;
+ unsigned short ioprio_class, org_ioprio_class;
+
++ unsigned int seek_samples;
++ u64 seek_total;
++ sector_t seek_mean;
++ sector_t last_request_pos;
++
+ pid_t pid;
++
++ struct cfq_rb_root *service_tree;
++ struct cfq_queue *new_cfqq;
++ struct cfq_group *cfqg;
++ struct cfq_group *orig_cfqg;
++ /* Sectors dispatched in current dispatch round */
++ unsigned long nr_sectors;
++};
++
++/*
++ * First index in the service_trees.
++ * IDLE is handled separately, so it has negative index
++ */
++enum wl_prio_t {
++ BE_WORKLOAD = 0,
++ RT_WORKLOAD = 1,
++ IDLE_WORKLOAD = 2,
++};
++
++/*
++ * Second index in the service_trees.
++ */
++enum wl_type_t {
++ ASYNC_WORKLOAD = 0,
++ SYNC_NOIDLE_WORKLOAD = 1,
++ SYNC_WORKLOAD = 2
++};
++
++/* This is per cgroup per device grouping structure */
++struct cfq_group {
++ /* group service_tree member */
++ struct rb_node rb_node;
++
++ /* group service_tree key */
++ u64 vdisktime;
++ unsigned int weight;
++ bool on_st;
++
++ /* number of cfqq currently on this group */
++ int nr_cfqq;
++
++ /* Per group busy queues average. Useful for workload slice calc. */
++ unsigned int busy_queues_avg[2];
++ /*
++ * rr lists of queues with requests, one rr for each priority class.
++ * Counts are embedded in the cfq_rb_root
++ */
++ struct cfq_rb_root service_trees[2][3];
++ struct cfq_rb_root service_tree_idle;
++
++ unsigned long saved_workload_slice;
++ enum wl_type_t saved_workload;
++ enum wl_prio_t saved_serving_prio;
++ struct blkio_group blkg;
++#ifdef CONFIG_CFQ_GROUP_IOSCHED
++ struct hlist_node cfqd_node;
++ atomic_t ref;
++#endif
+ };
+
+ /*
+@@ -120,11 +201,18 @@ struct cfq_queue {
+ */
+ struct cfq_data {
+ struct request_queue *queue;
++ /* Root service tree for cfq_groups */
++ struct cfq_rb_root grp_service_tree;
++ struct cfq_group root_group;
+
+ /*
+- * rr list of queues with requests and the count of them
++ * The priority currently being served
+ */
+- struct cfq_rb_root service_tree;
++ enum wl_prio_t serving_prio;
++ enum wl_type_t serving_type;
++ unsigned long workload_expires;
++ struct cfq_group *serving_group;
++ bool noidle_tree_requires_idle;
+
+ /*
+ * Each priority tree is sorted by next_request position. These
+@@ -143,8 +231,14 @@ struct cfq_data {
+ */
+ int rq_queued;
+ int hw_tag;
+- int hw_tag_samples;
+- int rq_in_driver_peak;
++ /*
++ * hw_tag can be
++ * -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection)
++ * 1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
++ * 0 => no NCQ
++ */
++ int hw_tag_est_depth;
++ unsigned int hw_tag_samples;
+
+ /*
+ * idle window management
+@@ -174,6 +268,7 @@ struct cfq_data {
+ unsigned int cfq_slice_async_rq;
+ unsigned int cfq_slice_idle;
+ unsigned int cfq_latency;
++ unsigned int cfq_group_isolation;
+
+ struct list_head cic_list;
+
+@@ -182,9 +277,28 @@ struct cfq_data {
+ */
+ struct cfq_queue oom_cfqq;
+
+- unsigned long last_end_sync_rq;
++ unsigned long last_delayed_sync;
++
++ /* List of cfq groups being managed on this device*/
++ struct hlist_head cfqg_list;
++ struct rcu_head rcu;
+ };
+
++static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
++
++static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
++ enum wl_prio_t prio,
++ enum wl_type_t type)
++{
++ if (!cfqg)
++ return NULL;
++
++ if (prio == IDLE_WORKLOAD)
++ return &cfqg->service_tree_idle;
++
++ return &cfqg->service_trees[prio][type];
++}
++
+ enum cfqq_state_flags {
+ CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */
+ CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */
+@@ -195,8 +309,10 @@ enum cfqq_state_flags {
+ CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */
+ CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */
+ CFQ_CFQQ_FLAG_sync, /* synchronous queue */
+- CFQ_CFQQ_FLAG_coop, /* has done a coop jump of the queue */
+- CFQ_CFQQ_FLAG_coop_preempt, /* coop preempt */
++ CFQ_CFQQ_FLAG_coop, /* cfqq is shared */
++ CFQ_CFQQ_FLAG_split_coop, /* shared cfqq will be split */
++ CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */
++ CFQ_CFQQ_FLAG_wait_busy, /* Waiting for next request */
+ };
+
+ #define CFQ_CFQQ_FNS(name) \
+@@ -223,14 +339,78 @@ CFQ_CFQQ_FNS(prio_changed);
+ CFQ_CFQQ_FNS(slice_new);
+ CFQ_CFQQ_FNS(sync);
+ CFQ_CFQQ_FNS(coop);
+-CFQ_CFQQ_FNS(coop_preempt);
++CFQ_CFQQ_FNS(split_coop);
++CFQ_CFQQ_FNS(deep);
++CFQ_CFQQ_FNS(wait_busy);
+ #undef CFQ_CFQQ_FNS
+
++#ifdef CONFIG_DEBUG_CFQ_IOSCHED
++#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
++ blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
++ cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
++ blkg_path(&(cfqq)->cfqg->blkg), ##args);
++
++#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \
++ blk_add_trace_msg((cfqd)->queue, "%s " fmt, \
++ blkg_path(&(cfqg)->blkg), ##args); \
++
++#else
+ #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
+ blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
++#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0);
++#endif
+ #define cfq_log(cfqd, fmt, args...) \
+ blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
+
++/* Traverses through cfq group service trees */
++#define for_each_cfqg_st(cfqg, i, j, st) \
++ for (i = 0; i <= IDLE_WORKLOAD; i++) \
++ for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\
++ : &cfqg->service_tree_idle; \
++ (i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \
++ (i == IDLE_WORKLOAD && j == 0); \
++ j++, st = i < IDLE_WORKLOAD ? \
++ &cfqg->service_trees[i][j]: NULL) \
++
++
++static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq)
++{
++ if (cfq_class_idle(cfqq))
++ return IDLE_WORKLOAD;
++ if (cfq_class_rt(cfqq))
++ return RT_WORKLOAD;
++ return BE_WORKLOAD;
++}
++
++
++static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
++{
++ if (!cfq_cfqq_sync(cfqq))
++ return ASYNC_WORKLOAD;
++ if (!cfq_cfqq_idle_window(cfqq))
++ return SYNC_NOIDLE_WORKLOAD;
++ return SYNC_WORKLOAD;
++}
++
++static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl,
++ struct cfq_data *cfqd,
++ struct cfq_group *cfqg)
++{
++ if (wl == IDLE_WORKLOAD)
++ return cfqg->service_tree_idle.count;
++
++ return cfqg->service_trees[wl][ASYNC_WORKLOAD].count
++ + cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count
++ + cfqg->service_trees[wl][SYNC_WORKLOAD].count;
++}
++
++static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
++ struct cfq_group *cfqg)
++{
++ return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count
++ + cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
++}
++
+ static void cfq_dispatch_insert(struct request_queue *, struct request *);
+ static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool,
+ struct io_context *, gfp_t);
+@@ -279,7 +459,7 @@ static int cfq_queue_empty(struct request_queue *q)
+ {
+ struct cfq_data *cfqd = q->elevator->elevator_data;
+
+- return !cfqd->busy_queues;
++ return !cfqd->rq_queued;
+ }
+
+ /*
+@@ -303,10 +483,110 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+ return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
+ }
+
++static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
++{
++ u64 d = delta << CFQ_SERVICE_SHIFT;
++
++ d = d * BLKIO_WEIGHT_DEFAULT;
++ do_div(d, cfqg->weight);
++ return d;
++}
++
++static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
++{
++ s64 delta = (s64)(vdisktime - min_vdisktime);
++ if (delta > 0)
++ min_vdisktime = vdisktime;
++
++ return min_vdisktime;
++}
++
++static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
++{
++ s64 delta = (s64)(vdisktime - min_vdisktime);
++ if (delta < 0)
++ min_vdisktime = vdisktime;
++
++ return min_vdisktime;
++}
++
++static void update_min_vdisktime(struct cfq_rb_root *st)
++{
++ u64 vdisktime = st->min_vdisktime;
++ struct cfq_group *cfqg;
++
++ if (st->active) {
++ cfqg = rb_entry_cfqg(st->active);
++ vdisktime = cfqg->vdisktime;
++ }
++
++ if (st->left) {
++ cfqg = rb_entry_cfqg(st->left);
++ vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime);
++ }
++
++ st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime);
++}
++
++/*
++ * get averaged number of queues of RT/BE priority.
++ * average is updated, with a formula that gives more weight to higher numbers,
++ * to quickly follow sudden increases and decrease slowly
++ */
++
++static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
++ struct cfq_group *cfqg, bool rt)
++{
++ unsigned min_q, max_q;
++ unsigned mult = cfq_hist_divisor - 1;
++ unsigned round = cfq_hist_divisor / 2;
++ unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg);
++
++ min_q = min(cfqg->busy_queues_avg[rt], busy);
++ max_q = max(cfqg->busy_queues_avg[rt], busy);
++ cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
++ cfq_hist_divisor;
++ return cfqg->busy_queues_avg[rt];
++}
++
++static inline unsigned
++cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
++{
++ struct cfq_rb_root *st = &cfqd->grp_service_tree;
++
++ return cfq_target_latency * cfqg->weight / st->total_weight;
++}
++
+ static inline void
+ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+ {
+- cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies;
++ unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
++ if (cfqd->cfq_latency) {
++ /*
++ * interested queues (we consider only the ones with the same
++ * priority class in the cfq group)
++ */
++ unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg,
++ cfq_class_rt(cfqq));
++ unsigned sync_slice = cfqd->cfq_slice[1];
++ unsigned expect_latency = sync_slice * iq;
++ unsigned group_slice = cfq_group_slice(cfqd, cfqq->cfqg);
++
++ if (expect_latency > group_slice) {
++ unsigned base_low_slice = 2 * cfqd->cfq_slice_idle;
++ /* scale low_slice according to IO priority
++ * and sync vs async */
++ unsigned low_slice =
++ min(slice, base_low_slice * slice / sync_slice);
++ /* the adapted slice value is scaled to fit all iqs
++ * into the target latency */
++ slice = max(slice * group_slice / expect_latency,
++ low_slice);
++ }
++ }
++ cfqq->slice_start = jiffies;
++ cfqq->slice_end = jiffies + slice;
++ cfqq->allocated_slice = slice;
+ cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
+ }
+
+@@ -331,9 +611,9 @@ static inline bool cfq_slice_used(struct cfq_queue *cfqq)
+ * behind the head is penalized and only allowed to a certain extent.
+ */
+ static struct request *
+-cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2)
++cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last)
+ {
+- sector_t last, s1, s2, d1 = 0, d2 = 0;
++ sector_t s1, s2, d1 = 0, d2 = 0;
+ unsigned long back_max;
+ #define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */
+ #define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */
+@@ -356,8 +636,6 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2)
+ s1 = blk_rq_pos(rq1);
+ s2 = blk_rq_pos(rq2);
+
+- last = cfqd->last_position;
+-
+ /*
+ * by definition, 1KiB is 2 sectors
+ */
+@@ -425,6 +703,10 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2)
+ */
+ static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
+ {
++ /* Service tree is empty */
++ if (!root->count)
++ return NULL;
++
+ if (!root->left)
+ root->left = rb_first(&root->rb);
+
+@@ -434,6 +716,17 @@ static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
+ return NULL;
+ }
+
++static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
++{
++ if (!root->left)
++ root->left = rb_first(&root->rb);
++
++ if (root->left)
++ return rb_entry_cfqg(root->left);
++
++ return NULL;
++}
++
+ static void rb_erase_init(struct rb_node *n, struct rb_root *root)
+ {
+ rb_erase(n, root);
+@@ -445,6 +738,7 @@ static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
+ if (root->left == n)
+ root->left = NULL;
+ rb_erase_init(n, &root->rb);
++ --root->count;
+ }
+
+ /*
+@@ -471,7 +765,7 @@ cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
+ next = rb_entry_rq(rbnext);
+ }
+
+- return cfq_choose_req(cfqd, next, prev);
++ return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
+ }
+
+ static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
+@@ -480,12 +774,334 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
+ /*
+ * just an approximation, should be ok.
+ */
+- return (cfqd->busy_queues - 1) * (cfq_prio_slice(cfqd, 1, 0) -
++ return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
+ cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
+ }
+
++static inline s64
++cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
++{
++ return cfqg->vdisktime - st->min_vdisktime;
++}
++
++static void
++__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
++{
++ struct rb_node **node = &st->rb.rb_node;
++ struct rb_node *parent = NULL;
++ struct cfq_group *__cfqg;
++ s64 key = cfqg_key(st, cfqg);
++ int left = 1;
++
++ while (*node != NULL) {
++ parent = *node;
++ __cfqg = rb_entry_cfqg(parent);
++
++ if (key < cfqg_key(st, __cfqg))
++ node = &parent->rb_left;
++ else {
++ node = &parent->rb_right;
++ left = 0;
++ }
++ }
++
++ if (left)
++ st->left = &cfqg->rb_node;
++
++ rb_link_node(&cfqg->rb_node, parent, node);
++ rb_insert_color(&cfqg->rb_node, &st->rb);
++}
++
++static void
++cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
++{
++ struct cfq_rb_root *st = &cfqd->grp_service_tree;
++ struct cfq_group *__cfqg;
++ struct rb_node *n;
++
++ cfqg->nr_cfqq++;
++ if (cfqg->on_st)
++ return;
++
++ /*
++ * Currently put the group at the end. Later implement something
++ * so that groups get lesser vtime based on their weights, so that
++ * a group does not lose everything if it was not continuously backlogged.
++ */
++ n = rb_last(&st->rb);
++ if (n) {
++ __cfqg = rb_entry_cfqg(n);
++ cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
++ } else
++ cfqg->vdisktime = st->min_vdisktime;
++
++ __cfq_group_service_tree_add(st, cfqg);
++ cfqg->on_st = true;
++ st->total_weight += cfqg->weight;
++}
++
++static void
++cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
++{
++ struct cfq_rb_root *st = &cfqd->grp_service_tree;
++
++ if (st->active == &cfqg->rb_node)
++ st->active = NULL;
++
++ BUG_ON(cfqg->nr_cfqq < 1);
++ cfqg->nr_cfqq--;
++
++ /* If there are other cfq queues under this group, don't delete it */
++ if (cfqg->nr_cfqq)
++ return;
++
++ cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
++ cfqg->on_st = false;
++ st->total_weight -= cfqg->weight;
++ if (!RB_EMPTY_NODE(&cfqg->rb_node))
++ cfq_rb_erase(&cfqg->rb_node, st);
++ cfqg->saved_workload_slice = 0;
++ blkiocg_update_blkio_group_dequeue_stats(&cfqg->blkg, 1);
++}
++
++static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
++{
++ unsigned int slice_used;
++
++ /*
++ * Queue got expired before even a single request completed or
++ * got expired immediately after first request completion.
++ */
++ if (!cfqq->slice_start || cfqq->slice_start == jiffies) {
++ /*
++ * Also charge the seek time incurred to the group, otherwise
++ * if there are multiple queues in the group, each can dispatch
++ * a single request on seeky media and cause lots of seek time
++ * and group will never know it.
++ */
++ slice_used = max_t(unsigned, (jiffies - cfqq->dispatch_start),
++ 1);
++ } else {
++ slice_used = jiffies - cfqq->slice_start;
++ if (slice_used > cfqq->allocated_slice)
++ slice_used = cfqq->allocated_slice;
++ }
++
++ cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u sect=%lu", slice_used,
++ cfqq->nr_sectors);
++ return slice_used;
++}
++
++static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
++ struct cfq_queue *cfqq)
++{
++ struct cfq_rb_root *st = &cfqd->grp_service_tree;
++ unsigned int used_sl, charge_sl;
++ int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
++ - cfqg->service_tree_idle.count;
++
++ BUG_ON(nr_sync < 0);
++ used_sl = charge_sl = cfq_cfqq_slice_usage(cfqq);
++
++ if (!cfq_cfqq_sync(cfqq) && !nr_sync)
++ charge_sl = cfqq->allocated_slice;
++
++ /* Can't update vdisktime while group is on service tree */
++ cfq_rb_erase(&cfqg->rb_node, st);
++ cfqg->vdisktime += cfq_scale_slice(charge_sl, cfqg);
++ __cfq_group_service_tree_add(st, cfqg);
++
++ /* This group is being expired. Save the context */
++ if (time_after(cfqd->workload_expires, jiffies)) {
++ cfqg->saved_workload_slice = cfqd->workload_expires
++ - jiffies;
++ cfqg->saved_workload = cfqd->serving_type;
++ cfqg->saved_serving_prio = cfqd->serving_prio;
++ } else
++ cfqg->saved_workload_slice = 0;
++
++ cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
++ st->min_vdisktime);
++ blkiocg_update_blkio_group_stats(&cfqg->blkg, used_sl,
++ cfqq->nr_sectors);
++}
++
++#ifdef CONFIG_CFQ_GROUP_IOSCHED
++static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
++{
++ if (blkg)
++ return container_of(blkg, struct cfq_group, blkg);
++ return NULL;
++}
++
++void
++cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight)
++{
++ cfqg_of_blkg(blkg)->weight = weight;
++}
++
++static struct cfq_group *
++cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
++{
++ struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
++ struct cfq_group *cfqg = NULL;
++ void *key = cfqd;
++ int i, j;
++ struct cfq_rb_root *st;
++ struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
++ unsigned int major, minor;
++
++ /* Do we need to take this reference */
++ if (!blkiocg_css_tryget(blkcg))
++ return NULL;
++
++ cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
++ if (cfqg || !create)
++ goto done;
++
++ cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
++ if (!cfqg)
++ goto done;
++
++ cfqg->weight = blkcg->weight;
++ for_each_cfqg_st(cfqg, i, j, st)
++ *st = CFQ_RB_ROOT;
++ RB_CLEAR_NODE(&cfqg->rb_node);
++
++ /*
++ * Take the initial reference that will be released on destroy
++ * This can be thought of a joint reference by cgroup and
++ * elevator which will be dropped by either elevator exit
++ * or cgroup deletion path depending on who is exiting first.
++ */
++ atomic_set(&cfqg->ref, 1);
++
++ /* Add group onto cgroup list */
++ sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
++ blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
++ MKDEV(major, minor));
++
++ /* Add group on cfqd list */
++ hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
++
++done:
++ blkiocg_css_put(blkcg);
++ return cfqg;
++}
++
++/*
++ * Search for the cfq group current task belongs to. If create = 1, then also
++ * create the cfq group if it does not exist. request_queue lock must be held.
++ */
++static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
++{
++ struct cgroup *cgroup;
++ struct cfq_group *cfqg = NULL;
++
++ rcu_read_lock();
++ cgroup = task_cgroup(current, blkio_subsys_id);
++ cfqg = cfq_find_alloc_cfqg(cfqd, cgroup, create);
++ if (!cfqg && create)
++ cfqg = &cfqd->root_group;
++ rcu_read_unlock();
++ return cfqg;
++}
++
++static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
++{
++ /* Currently, all async queues are mapped to root group */
++ if (!cfq_cfqq_sync(cfqq))
++ cfqg = &cfqq->cfqd->root_group;
++
++ cfqq->cfqg = cfqg;
++ /* cfqq reference on cfqg */
++ atomic_inc(&cfqq->cfqg->ref);
++}
++
++static void cfq_put_cfqg(struct cfq_group *cfqg)
++{
++ struct cfq_rb_root *st;
++ int i, j;
++
++ BUG_ON(atomic_read(&cfqg->ref) <= 0);
++ if (!atomic_dec_and_test(&cfqg->ref))
++ return;
++ for_each_cfqg_st(cfqg, i, j, st)
++ BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL);
++ kfree(cfqg);
++}
++
++static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
++{
++ /* Something wrong if we are trying to remove same group twice */
++ BUG_ON(hlist_unhashed(&cfqg->cfqd_node));
++
++ hlist_del_init(&cfqg->cfqd_node);
++
++ /*
++ * Put the reference taken at the time of creation so that when all
++ * queues are gone, group can be destroyed.
++ */
++ cfq_put_cfqg(cfqg);
++}
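
(Aside, not part of the patch: a minimal userspace sketch of the reference rule the comments above describe, namely one "creation" reference shared by cgroup and elevator, one extra reference per linked cfqq, and a free on the final put. All names and the plain int counter below are made up for illustration.)

#include <stdio.h>
#include <stdlib.h>

struct demo_group {
	int ref;			/* atomic_t in the real code */
};

static struct demo_group *demo_group_create(void)
{
	struct demo_group *g = calloc(1, sizeof(*g));

	g->ref = 1;			/* joint cgroup/elevator reference */
	return g;
}

static void demo_group_get(struct demo_group *g)
{
	g->ref++;
}

static void demo_group_put(struct demo_group *g)
{
	if (--g->ref == 0) {		/* last holder frees, as in cfq_put_cfqg() */
		free(g);
		printf("group freed\n");
	}
}

int main(void)
{
	struct demo_group *g = demo_group_create();

	demo_group_get(g);		/* a cfqq links to the group */
	demo_group_put(g);		/* destroy path drops the creation ref */
	demo_group_put(g);		/* last cfqq reference goes away, group freed */
	return 0;
}
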
++
++static void cfq_release_cfq_groups(struct cfq_data *cfqd)
++{
++ struct hlist_node *pos, *n;
++ struct cfq_group *cfqg;
++
++ hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) {
++ /*
++ * If cgroup removal path got to blk_group first and removed
++ * it from cgroup list, then it will take care of destroying
++ * cfqg also.
++ */
++ if (!blkiocg_del_blkio_group(&cfqg->blkg))
++ cfq_destroy_cfqg(cfqd, cfqg);
++ }
++}
++
++/*
++ * Blk cgroup controller notification saying that blkio_group object is being
++ * delinked as associated cgroup object is going away. That also means that
++ * no new IO will come in this group. So get rid of this group as soon as
++ * any pending IO in the group is finished.
++ *
++ * This function is called under rcu_read_lock(). key is the rcu protected
++ * pointer. That means "key" is a valid cfq_data pointer as long as we are
++ * under rcu read lock.
++ *
++ * "key" was fetched from blkio_group under blkio_cgroup->lock. That means
++ * it should not be NULL as even if elevator was exiting, the cgroup deletion
++ * path got to it first.
++ */
++void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg)
++{
++ unsigned long flags;
++ struct cfq_data *cfqd = key;
++
++ spin_lock_irqsave(cfqd->queue->queue_lock, flags);
++ cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg));
++ spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
++}
++
++#else /* GROUP_IOSCHED */
++static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
++{
++ return &cfqd->root_group;
++}
++static inline void
++cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
++ cfqq->cfqg = cfqg;
++}
++
++static void cfq_release_cfq_groups(struct cfq_data *cfqd) {}
++static inline void cfq_put_cfqg(struct cfq_group *cfqg) {}
++
++#endif /* GROUP_IOSCHED */
++
+ /*
+- * The cfqd->service_tree holds all pending cfq_queue's that have
++ * The cfqd->service_trees holds all pending cfq_queue's that have
+ * requests waiting to be processed. It is sorted in the order that
+ * we will service the queues.
+ */
+@@ -495,11 +1111,42 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
+ struct rb_node **p, *parent;
+ struct cfq_queue *__cfqq;
+ unsigned long rb_key;
++ struct cfq_rb_root *service_tree;
+ int left;
++ int new_cfqq = 1;
++ int group_changed = 0;
++
++#ifdef CONFIG_CFQ_GROUP_IOSCHED
++ if (!cfqd->cfq_group_isolation
++ && cfqq_type(cfqq) == SYNC_NOIDLE_WORKLOAD
++ && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) {
++ /* Move this cfq to root group */
++ cfq_log_cfqq(cfqd, cfqq, "moving to root group");
++ if (!RB_EMPTY_NODE(&cfqq->rb_node))
++ cfq_group_service_tree_del(cfqd, cfqq->cfqg);
++ cfqq->orig_cfqg = cfqq->cfqg;
++ cfqq->cfqg = &cfqd->root_group;
++ atomic_inc(&cfqd->root_group.ref);
++ group_changed = 1;
++ } else if (!cfqd->cfq_group_isolation
++ && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
++ /* cfqq is sequential now and needs to go to its original group */
++ BUG_ON(cfqq->cfqg != &cfqd->root_group);
++ if (!RB_EMPTY_NODE(&cfqq->rb_node))
++ cfq_group_service_tree_del(cfqd, cfqq->cfqg);
++ cfq_put_cfqg(cfqq->cfqg);
++ cfqq->cfqg = cfqq->orig_cfqg;
++ cfqq->orig_cfqg = NULL;
++ group_changed = 1;
++ cfq_log_cfqq(cfqd, cfqq, "moved to origin group");
++ }
++#endif
+
++ service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
++ cfqq_type(cfqq));
+ if (cfq_class_idle(cfqq)) {
+ rb_key = CFQ_IDLE_DELAY;
+- parent = rb_last(&cfqd->service_tree.rb);
++ parent = rb_last(&service_tree->rb);
+ if (parent && parent != &cfqq->rb_node) {
+ __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
+ rb_key += __cfqq->rb_key;
+@@ -517,23 +1164,27 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
+ cfqq->slice_resid = 0;
+ } else {
+ rb_key = -HZ;
+- __cfqq = cfq_rb_first(&cfqd->service_tree);
++ __cfqq = cfq_rb_first(service_tree);
+ rb_key += __cfqq ? __cfqq->rb_key : jiffies;
+ }
+
+ if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
++ new_cfqq = 0;
+ /*
+ * same position, nothing more to do
+ */
+- if (rb_key == cfqq->rb_key)
++ if (rb_key == cfqq->rb_key &&
++ cfqq->service_tree == service_tree)
+ return;
+
+- cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree);
++ cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
++ cfqq->service_tree = NULL;
+ }
+
+ left = 1;
+ parent = NULL;
+- p = &cfqd->service_tree.rb.rb_node;
++ cfqq->service_tree = service_tree;
++ p = &service_tree->rb.rb_node;
+ while (*p) {
+ struct rb_node **n;
+
+@@ -541,35 +1192,28 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
+ __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
+
+ /*
+- * sort RT queues first, we always want to give
+- * preference to them. IDLE queues goes to the back.
+- * after that, sort on the next service time.
++ * sort by key, that represents service time.
+ */
+- if (cfq_class_rt(cfqq) > cfq_class_rt(__cfqq))
++ if (time_before(rb_key, __cfqq->rb_key))
+ n = &(*p)->rb_left;
+- else if (cfq_class_rt(cfqq) < cfq_class_rt(__cfqq))
+- n = &(*p)->rb_right;
+- else if (cfq_class_idle(cfqq) < cfq_class_idle(__cfqq))
+- n = &(*p)->rb_left;
+- else if (cfq_class_idle(cfqq) > cfq_class_idle(__cfqq))
+- n = &(*p)->rb_right;
+- else if (time_before(rb_key, __cfqq->rb_key))
+- n = &(*p)->rb_left;
+- else
++ else {
+ n = &(*p)->rb_right;
+-
+- if (n == &(*p)->rb_right)
+ left = 0;
++ }
+
+ p = n;
+ }
+
+ if (left)
+- cfqd->service_tree.left = &cfqq->rb_node;
++ service_tree->left = &cfqq->rb_node;
+
+ cfqq->rb_key = rb_key;
+ rb_link_node(&cfqq->rb_node, parent, p);
+- rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb);
++ rb_insert_color(&cfqq->rb_node, &service_tree->rb);
++ service_tree->count++;
++ if ((add_front || !new_cfqq) && !group_changed)
++ return;
++ cfq_group_service_tree_add(cfqd, cfqq->cfqg);
+ }
+
+ static struct cfq_queue *
+@@ -671,13 +1315,16 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+ BUG_ON(!cfq_cfqq_on_rr(cfqq));
+ cfq_clear_cfqq_on_rr(cfqq);
+
+- if (!RB_EMPTY_NODE(&cfqq->rb_node))
+- cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree);
++ if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
++ cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
++ cfqq->service_tree = NULL;
++ }
+ if (cfqq->p_root) {
+ rb_erase(&cfqq->p_node, cfqq->p_root);
+ cfqq->p_root = NULL;
+ }
+
++ cfq_group_service_tree_del(cfqd, cfqq->cfqg);
+ BUG_ON(!cfqd->busy_queues);
+ cfqd->busy_queues--;
+ }
+@@ -688,7 +1335,6 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+ static void cfq_del_rq_rb(struct request *rq)
+ {
+ struct cfq_queue *cfqq = RQ_CFQQ(rq);
+- struct cfq_data *cfqd = cfqq->cfqd;
+ const int sync = rq_is_sync(rq);
+
+ BUG_ON(!cfqq->queued[sync]);
+@@ -696,8 +1342,17 @@ static void cfq_del_rq_rb(struct request *rq)
+
+ elv_rb_del(&cfqq->sort_list, rq);
+
+- if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
+- cfq_del_cfqq_rr(cfqd, cfqq);
++ if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) {
++ /*
++ * Queue will be deleted from service tree when we actually
++ * expire it later. Right now just remove it from prio tree
++ * as it is empty.
++ */
++ if (cfqq->p_root) {
++ rb_erase(&cfqq->p_node, cfqq->p_root);
++ cfqq->p_root = NULL;
++ }
++ }
+ }
+
+ static void cfq_add_rq_rb(struct request *rq)
+@@ -722,7 +1377,7 @@ static void cfq_add_rq_rb(struct request *rq)
+ * check if this request is a better next-serve candidate
+ */
+ prev = cfqq->next_rq;
+- cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq);
++ cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position);
+
+ /*
+ * adjust priority tree position, if ->next_rq changes
+@@ -829,6 +1484,7 @@ static void
+ cfq_merged_requests(struct request_queue *q, struct request *rq,
+ struct request *next)
+ {
++ struct cfq_queue *cfqq = RQ_CFQQ(rq);
+ /*
+ * reposition in fifo if next is older than rq
+ */
+@@ -838,6 +1494,8 @@ cfq_merged_requests(struct request_queue *q, struct request *rq,
+ rq_set_fifo_time(rq, rq_fifo_time(next));
+ }
+
++ if (cfqq->next_rq == next)
++ cfqq->next_rq = rq;
+ cfq_remove_request(next);
+ }
+
+@@ -871,8 +1529,12 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd,
+ {
+ if (cfqq) {
+ cfq_log_cfqq(cfqd, cfqq, "set_active");
++ cfqq->slice_start = 0;
++ cfqq->dispatch_start = jiffies;
++ cfqq->allocated_slice = 0;
+ cfqq->slice_end = 0;
+ cfqq->slice_dispatch = 0;
++ cfqq->nr_sectors = 0;
+
+ cfq_clear_cfqq_wait_request(cfqq);
+ cfq_clear_cfqq_must_dispatch(cfqq);
+@@ -899,6 +1561,16 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
+ del_timer(&cfqd->idle_slice_timer);
+
+ cfq_clear_cfqq_wait_request(cfqq);
++ cfq_clear_cfqq_wait_busy(cfqq);
++
++ /*
++ * If this cfqq is shared between multiple processes, check to
++ * make sure that those processes are still issuing I/Os within
++ * the mean seek distance. If not, it may be time to break the
++ * queues apart again.
++ */
++ if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq))
++ cfq_mark_cfqq_split_coop(cfqq);
+
+ /*
+ * store what was left of this slice, if the queue idled/timed out
+@@ -908,11 +1580,19 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
+ cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid);
+ }
+
++ cfq_group_served(cfqd, cfqq->cfqg, cfqq);
++
++ if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
++ cfq_del_cfqq_rr(cfqd, cfqq);
++
+ cfq_resort_rr_list(cfqd, cfqq);
+
+ if (cfqq == cfqd->active_queue)
+ cfqd->active_queue = NULL;
+
++ if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active)
++ cfqd->grp_service_tree.active = NULL;
++
+ if (cfqd->active_cic) {
+ put_io_context(cfqd->active_cic->ioc);
+ cfqd->active_cic = NULL;
+@@ -933,10 +1613,39 @@ static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
+ */
+ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
+ {
+- if (RB_EMPTY_ROOT(&cfqd->service_tree.rb))
++ struct cfq_rb_root *service_tree =
++ service_tree_for(cfqd->serving_group, cfqd->serving_prio,
++ cfqd->serving_type);
++
++ if (!cfqd->rq_queued)
+ return NULL;
+
+- return cfq_rb_first(&cfqd->service_tree);
++ /* There is nothing to dispatch */
++ if (!service_tree)
++ return NULL;
++ if (RB_EMPTY_ROOT(&service_tree->rb))
++ return NULL;
++ return cfq_rb_first(service_tree);
++}
++
++static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
++{
++ struct cfq_group *cfqg;
++ struct cfq_queue *cfqq;
++ int i, j;
++ struct cfq_rb_root *st;
++
++ if (!cfqd->rq_queued)
++ return NULL;
++
++ cfqg = cfq_get_next_cfqg(cfqd);
++ if (!cfqg)
++ return NULL;
++
++ for_each_cfqg_st(cfqg, i, j, st)
++ if ((cfqq = cfq_rb_first(st)) != NULL)
++ return cfqq;
++ return NULL;
+ }
+
+ /*
+@@ -945,14 +1654,8 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
+ static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
+ struct cfq_queue *cfqq)
+ {
+- if (!cfqq) {
++ if (!cfqq)
+ cfqq = cfq_get_next_queue(cfqd);
+- if (cfqq && !cfq_cfqq_coop_preempt(cfqq))
+- cfq_clear_cfqq_coop(cfqq);
+- }
+-
+- if (cfqq)
+- cfq_clear_cfqq_coop_preempt(cfqq);
+
+ __cfq_set_active_queue(cfqd, cfqq);
+ return cfqq;
+@@ -967,16 +1670,17 @@ static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
+ return cfqd->last_position - blk_rq_pos(rq);
+ }
+
+-#define CIC_SEEK_THR 8 * 1024
+-#define CIC_SEEKY(cic) ((cic)->seek_mean > CIC_SEEK_THR)
+-
+-static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq)
++static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq,
++ struct request *rq, bool for_preempt)
+ {
+- struct cfq_io_context *cic = cfqd->active_cic;
+- sector_t sdist = cic->seek_mean;
++ sector_t sdist = cfqq->seek_mean;
++
++ if (!sample_valid(cfqq->seek_samples))
++ sdist = CFQQ_SEEK_THR;
+
+- if (!sample_valid(cic->seek_samples))
+- sdist = CIC_SEEK_THR;
++ /* if seek_mean is big, using it as close criteria is meaningless */
++ if (sdist > CFQQ_SEEK_THR && !for_preempt)
++ sdist = CFQQ_SEEK_THR;
+
+ return cfq_dist_from_last(cfqd, rq) <= sdist;
+ }
+@@ -1005,7 +1709,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
+ * will contain the closest sector.
+ */
+ __cfqq = rb_entry(parent, struct cfq_queue, p_node);
+- if (cfq_rq_close(cfqd, __cfqq->next_rq))
++ if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq, false))
+ return __cfqq;
+
+ if (blk_rq_pos(__cfqq->next_rq) < sector)
+@@ -1016,7 +1720,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
+ return NULL;
+
+ __cfqq = rb_entry(node, struct cfq_queue, p_node);
+- if (cfq_rq_close(cfqd, __cfqq->next_rq))
++ if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq, false))
+ return __cfqq;
+
+ return NULL;
+@@ -1033,16 +1737,19 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
+ * assumption.
+ */
+ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
+- struct cfq_queue *cur_cfqq,
+- bool probe)
++ struct cfq_queue *cur_cfqq)
+ {
+ struct cfq_queue *cfqq;
+
++ if (!cfq_cfqq_sync(cur_cfqq))
++ return NULL;
++ if (CFQQ_SEEKY(cur_cfqq))
++ return NULL;
++
+ /*
+- * A valid cfq_io_context is necessary to compare requests against
+- * the seek_mean of the current cfqq.
++ * Don't search priority tree if it's the only queue in the group.
+ */
+- if (!cfqd->active_cic)
++ if (cur_cfqq->cfqg->nr_cfqq == 1)
+ return NULL;
+
+ /*
+@@ -1054,14 +1761,55 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
+ if (!cfqq)
+ return NULL;
+
+- if (cfq_cfqq_coop(cfqq))
++ /* If new queue belongs to different cfq_group, don't choose it */
++ if (cur_cfqq->cfqg != cfqq->cfqg)
++ return NULL;
++
++ /*
++ * It only makes sense to merge sync queues.
++ */
++ if (!cfq_cfqq_sync(cfqq))
++ return NULL;
++ if (CFQQ_SEEKY(cfqq))
++ return NULL;
++
++ /*
++ * Do not merge queues of different priority classes
++ */
++ if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq))
+ return NULL;
+
+- if (!probe)
+- cfq_mark_cfqq_coop(cfqq);
+ return cfqq;
+ }
+
++/*
++ * Determine whether we should enforce idle window for this queue.
++ */
++
++static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
++{
++ enum wl_prio_t prio = cfqq_prio(cfqq);
++ struct cfq_rb_root *service_tree = cfqq->service_tree;
++
++ BUG_ON(!service_tree);
++ BUG_ON(!service_tree->count);
++
++ /* We never do for idle class queues. */
++ if (prio == IDLE_WORKLOAD)
++ return false;
++
++ /* We do for queues that were marked with idle window flag. */
++ if (cfq_cfqq_idle_window(cfqq) &&
++ !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag))
++ return true;
++
++ /*
++ * Otherwise, we do only if they are the last ones
++ * in their service tree.
++ */
++ return service_tree->count == 1 && cfq_cfqq_sync(cfqq);
++}
++
+ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
+ {
+ struct cfq_queue *cfqq = cfqd->active_queue;
+@@ -1082,13 +1830,13 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
+ /*
+ * idle is disabled, either manually or by past process history
+ */
+- if (!cfqd->cfq_slice_idle || !cfq_cfqq_idle_window(cfqq))
++ if (!cfqd->cfq_slice_idle || !cfq_should_idle(cfqd, cfqq))
+ return;
+
+ /*
+- * still requests with the driver, don't idle
++ * still active requests from this queue, don't idle
+ */
+- if (rq_in_driver(cfqd))
++ if (cfqq->dispatched)
+ return;
+
+ /*
+@@ -1109,14 +1857,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
+
+ cfq_mark_cfqq_wait_request(cfqq);
+
+- /*
+- * we don't want to idle for seeks, but we do want to allow
+- * fair distribution of slice time for a process doing back-to-back
+- * seeks. so allow a little bit of time for him to submit a new rq
+- */
+ sl = cfqd->cfq_slice_idle;
+- if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic))
+- sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT));
+
+ mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
+ cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl);
+@@ -1139,6 +1880,7 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
+
+ if (cfq_cfqq_sync(cfqq))
+ cfqd->sync_flight++;
++ cfqq->nr_sectors += blk_rq_sectors(rq);
+ }
+
+ /*
+@@ -1175,6 +1917,186 @@ cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+ }
+
+ /*
++ * Must be called with the queue_lock held.
++ */
++static int cfqq_process_refs(struct cfq_queue *cfqq)
++{
++ int process_refs, io_refs;
++
++ io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
++ process_refs = atomic_read(&cfqq->ref) - io_refs;
++ BUG_ON(process_refs < 0);
++ return process_refs;
++}
++
++static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
++{
++ int process_refs, new_process_refs;
++ struct cfq_queue *__cfqq;
++
++ /* Avoid a circular list and skip interim queue merges */
++ while ((__cfqq = new_cfqq->new_cfqq)) {
++ if (__cfqq == cfqq)
++ return;
++ new_cfqq = __cfqq;
++ }
++
++ process_refs = cfqq_process_refs(cfqq);
++ /*
++ * If the process for the cfqq has gone away, there is no
++ * sense in merging the queues.
++ */
++ if (process_refs == 0)
++ return;
++
++ /*
++ * Merge in the direction of the lesser amount of work.
++ */
++ new_process_refs = cfqq_process_refs(new_cfqq);
++ if (new_process_refs >= process_refs) {
++ cfqq->new_cfqq = new_cfqq;
++ atomic_add(process_refs, &new_cfqq->ref);
++ } else {
++ new_cfqq->new_cfqq = cfqq;
++ atomic_add(new_process_refs, &cfqq->ref);
++ }
++}
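
(Aside, not part of the patch: a toy example, with invented numbers, of the "merge in the direction of the lesser amount of work" rule implemented by cfq_setup_merge() above.)

#include <stdio.h>

/* process_refs = total refs minus refs held by in-flight IO, i.e. roughly
 * "how many tasks still care about this queue", as in cfqq_process_refs(). */
struct toy_queue {
	int total_refs;
	int io_refs;
};

static int process_refs(const struct toy_queue *q)
{
	return q->total_refs - q->io_refs;
}

int main(void)
{
	struct toy_queue cur = { .total_refs = 5, .io_refs = 2 };	/* 3 tasks */
	struct toy_queue new = { .total_refs = 2, .io_refs = 1 };	/* 1 task  */

	/* The queue with fewer process refs is redirected at the busier one,
	 * so fewer references have to be migrated. */
	if (process_refs(&new) >= process_refs(&cur))
		printf("cur->new_cfqq = new (move %d refs)\n", process_refs(&cur));
	else
		printf("new->new_cfqq = cur (move %d refs)\n", process_refs(&new));
	return 0;
}
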
++
++static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
++ struct cfq_group *cfqg, enum wl_prio_t prio)
++{
++ struct cfq_queue *queue;
++ int i;
++ bool key_valid = false;
++ unsigned long lowest_key = 0;
++ enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
++
++ for (i = 0; i <= SYNC_WORKLOAD; ++i) {
++ /* select the one with lowest rb_key */
++ queue = cfq_rb_first(service_tree_for(cfqg, prio, i));
++ if (queue &&
++ (!key_valid || time_before(queue->rb_key, lowest_key))) {
++ lowest_key = queue->rb_key;
++ cur_best = i;
++ key_valid = true;
++ }
++ }
++
++ return cur_best;
++}
++
++static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
++{
++ unsigned slice;
++ unsigned count;
++ struct cfq_rb_root *st;
++ unsigned group_slice;
++
++ if (!cfqg) {
++ cfqd->serving_prio = IDLE_WORKLOAD;
++ cfqd->workload_expires = jiffies + 1;
++ return;
++ }
++
++ /* Choose next priority. RT > BE > IDLE */
++ if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
++ cfqd->serving_prio = RT_WORKLOAD;
++ else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
++ cfqd->serving_prio = BE_WORKLOAD;
++ else {
++ cfqd->serving_prio = IDLE_WORKLOAD;
++ cfqd->workload_expires = jiffies + 1;
++ return;
++ }
++
++ /*
++ * For RT and BE, we have to choose also the type
++ * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
++ * expiration time
++ */
++ st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
++ count = st->count;
++
++ /*
++ * check workload expiration, and that we still have other queues ready
++ */
++ if (count && !time_after(jiffies, cfqd->workload_expires))
++ return;
++
++ /* otherwise select new workload type */
++ cfqd->serving_type =
++ cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
++ st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
++ count = st->count;
++
++ /*
++ * the workload slice is computed as a fraction of target latency
++ * proportional to the number of queues in that workload, over
++ * all the queues in the same priority class
++ */
++ group_slice = cfq_group_slice(cfqd, cfqg);
++
++ slice = group_slice * count /
++ max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio],
++ cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg));
++
++ if (cfqd->serving_type == ASYNC_WORKLOAD) {
++ unsigned int tmp;
++
++ /*
++ * Async queues are currently system wide. Just taking
++ * proportion of queues within the same group will lead to a higher
++ * async ratio system wide as generally root group is going
++ * to have higher weight. A more accurate thing would be to
++ * calculate system wide async/sync ratio.
++ */
++ tmp = cfq_target_latency * cfqg_busy_async_queues(cfqd, cfqg);
++ tmp = tmp/cfqd->busy_queues;
++ slice = min_t(unsigned, slice, tmp);
++
++ /* async workload slice is scaled down according to
++ * the sync/async slice ratio. */
++ slice = slice * cfqd->cfq_slice[0] / cfqd->cfq_slice[1];
++ } else
++ /* sync workload slice is at least 2 * cfq_slice_idle */
++ slice = max(slice, 2 * cfqd->cfq_slice_idle);
++
++ slice = max_t(unsigned, slice, CFQ_MIN_TT);
++ cfqd->workload_expires = jiffies + slice;
++ cfqd->noidle_tree_requires_idle = false;
++}
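
(Illustrative only, outside the patch: a rough worked example of the slice sizing done in choose_service_tree() above. The numbers are invented and the helpers are local stand-ins, not the kernel's.)

#include <stdio.h>

static unsigned umax(unsigned a, unsigned b)
{
	return a > b ? a : b;
}

int main(void)
{
	unsigned group_slice = 100;	/* ms this group gets, derived from its weight  */
	unsigned count = 2;		/* queues on the workload tree being served     */
	unsigned busy_avg = 4;		/* busy_queues_avg for the serving prio class   */
	unsigned slice_sync = 100, slice_async = 40, slice_idle = 8;
	unsigned slice;

	/* the workload gets a share of the group slice proportional to its queues */
	slice = group_slice * count / umax(busy_avg, 1);
	printf("base workload slice : %u ms\n", slice);		/* 100*2/4 = 50 ms */

	/* async workloads are scaled down by the async/sync slice ratio */
	printf("async-scaled slice  : %u ms\n", slice * slice_async / slice_sync);

	/* sync workloads get at least 2 * slice_idle */
	printf("sync slice w/ floor : %u ms\n", umax(slice, 2 * slice_idle));
	return 0;
}
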
++
++static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
++{
++ struct cfq_rb_root *st = &cfqd->grp_service_tree;
++ struct cfq_group *cfqg;
++
++ if (RB_EMPTY_ROOT(&st->rb))
++ return NULL;
++ cfqg = cfq_rb_first_group(st);
++ st->active = &cfqg->rb_node;
++ update_min_vdisktime(st);
++ return cfqg;
++}
++
++static void cfq_choose_cfqg(struct cfq_data *cfqd)
++{
++ struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);
++
++ cfqd->serving_group = cfqg;
++
++ /* Restore the workload type data */
++ if (cfqg->saved_workload_slice) {
++ cfqd->workload_expires = jiffies + cfqg->saved_workload_slice;
++ cfqd->serving_type = cfqg->saved_workload;
++ cfqd->serving_prio = cfqg->saved_serving_prio;
++ } else
++ cfqd->workload_expires = jiffies - 1;
++
++ choose_service_tree(cfqd, cfqg);
++}
++
++/*
+ * Select a queue for service. If we have a current active queue,
+ * check whether to continue servicing it, or retrieve and set a new one.
+ */
+@@ -1186,13 +2108,37 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
+ if (!cfqq)
+ goto new_queue;
+
++ if (!cfqd->rq_queued)
++ return NULL;
++
+ /*
+- * The active queue has run out of time, expire it and select new.
++ * We were waiting for group to get backlogged. Expire the queue
+ */
+- if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq))
++ if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list))
+ goto expire;
+
+ /*
++ * The active queue has run out of time, expire it and select new.
++ */
++ if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) {
++ /*
++ * If slice had not expired at the completion of last request
++ * we might not have turned on wait_busy flag. Don't expire
++ * the queue yet. Allow the group to get backlogged.
++ *
++ * The very fact that we have used the slice means we
++ * have been idling all along on this queue and it should be
++ * ok to wait for this request to complete.
++ */
++ if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list)
++ && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
++ cfqq = NULL;
++ goto keep_queue;
++ } else
++ goto expire;
++ }
++
++ /*
+ * The active queue has requests and isn't expired, allow it to
+ * dispatch.
+ */
+@@ -1203,11 +2149,14 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
+ * If another queue has a request waiting within our mean seek
+ * distance, let it run. The expire code will check for close
+ * cooperators and put the close queue at the front of the service
+- * tree.
++ * tree. If possible, merge the expiring queue with the new cfqq.
+ */
+- new_cfqq = cfq_close_cooperator(cfqd, cfqq, 0);
+- if (new_cfqq)
++ new_cfqq = cfq_close_cooperator(cfqd, cfqq);
++ if (new_cfqq) {
++ if (!cfqq->new_cfqq)
++ cfq_setup_merge(cfqq, new_cfqq);
+ goto expire;
++ }
+
+ /*
+ * No requests pending. If the active queue still has requests in
+@@ -1215,7 +2164,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
+ * conditions to happen (or time out) before selecting a new queue.
+ */
+ if (timer_pending(&cfqd->idle_slice_timer) ||
+- (cfqq->dispatched && cfq_cfqq_idle_window(cfqq))) {
++ (cfqq->dispatched && cfq_should_idle(cfqd, cfqq))) {
+ cfqq = NULL;
+ goto keep_queue;
+ }
+@@ -1223,6 +2172,13 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
+ expire:
+ cfq_slice_expired(cfqd, 0);
+ new_queue:
++ /*
++ * Current queue expired. Check if we have to switch to a new
++ * service tree
++ */
++ if (!new_cfqq)
++ cfq_choose_cfqg(cfqd);
++
+ cfqq = cfq_set_active_queue(cfqd, new_cfqq);
+ keep_queue:
+ return cfqq;
+@@ -1238,6 +2194,9 @@ static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
+ }
+
+ BUG_ON(!list_empty(&cfqq->fifo));
++
++ /* By default cfqq is not expired if it is empty. Do it explicitly */
++ __cfq_slice_expired(cfqq->cfqd, cfqq, 0);
+ return dispatched;
+ }
+
+@@ -1250,11 +2209,10 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
+ struct cfq_queue *cfqq;
+ int dispatched = 0;
+
+- while ((cfqq = cfq_rb_first(&cfqd->service_tree)) != NULL)
++ while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL)
+ dispatched += __cfq_forced_dispatch_cfqq(cfqq);
+
+ cfq_slice_expired(cfqd, 0);
+-
+ BUG_ON(cfqd->busy_queues);
+
+ cfq_log(cfqd, "forced_dispatch=%d", dispatched);
+@@ -1268,7 +2226,7 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+ /*
+ * Drain async requests before we start sync IO
+ */
+- if (cfq_cfqq_idle_window(cfqq) && cfqd->rq_in_driver[BLK_RW_ASYNC])
++ if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_driver[BLK_RW_ASYNC])
+ return false;
+
+ /*
+@@ -1298,9 +2256,9 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+ return false;
+
+ /*
+- * Sole queue user, allow bigger slice
++ * Sole queue user, no limit
+ */
+- max_dispatch *= 4;
++ max_dispatch = -1;
+ }
+
+ /*
+@@ -1309,7 +2267,7 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+ * based on the last sync IO we serviced
+ */
+ if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) {
+- unsigned long last_sync = jiffies - cfqd->last_end_sync_rq;
++ unsigned long last_sync = jiffies - cfqd->last_delayed_sync;
+ unsigned int depth;
+
+ depth = last_sync / cfqd->cfq_slice[1];
+@@ -1407,11 +2365,13 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
+ * task holds one reference to the queue, dropped when task exits. each rq
+ * in-flight on this queue also holds a reference, dropped when rq is freed.
+ *
++ * Each cfq queue took a reference on the parent group. Drop it now.
+ * queue lock must be held here.
+ */
+ static void cfq_put_queue(struct cfq_queue *cfqq)
+ {
+ struct cfq_data *cfqd = cfqq->cfqd;
++ struct cfq_group *cfqg, *orig_cfqg;
+
+ BUG_ON(atomic_read(&cfqq->ref) <= 0);
+
+@@ -1421,14 +2381,19 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
+ cfq_log_cfqq(cfqd, cfqq, "put_queue");
+ BUG_ON(rb_first(&cfqq->sort_list));
+ BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
+- BUG_ON(cfq_cfqq_on_rr(cfqq));
++ cfqg = cfqq->cfqg;
++ orig_cfqg = cfqq->orig_cfqg;
+
+ if (unlikely(cfqd->active_queue == cfqq)) {
+ __cfq_slice_expired(cfqd, cfqq, 0);
+ cfq_schedule_dispatch(cfqd);
+ }
+
++ BUG_ON(cfq_cfqq_on_rr(cfqq));
+ kmem_cache_free(cfq_pool, cfqq);
++ cfq_put_cfqg(cfqg);
++ if (orig_cfqg)
++ cfq_put_cfqg(orig_cfqg);
+ }
+
+ /*
+@@ -1518,11 +2483,29 @@ static void cfq_free_io_context(struct io_context *ioc)
+
+ static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+ {
++ struct cfq_queue *__cfqq, *next;
++
+ if (unlikely(cfqq == cfqd->active_queue)) {
+ __cfq_slice_expired(cfqd, cfqq, 0);
+ cfq_schedule_dispatch(cfqd);
+ }
+
++ /*
++ * If this queue was scheduled to merge with another queue, be
++ * sure to drop the reference taken on that queue (and others in
++ * the merge chain). See cfq_setup_merge and cfq_merge_cfqqs.
++ */
++ __cfqq = cfqq->new_cfqq;
++ while (__cfqq) {
++ if (__cfqq == cfqq) {
++ WARN(1, "cfqq->new_cfqq loop detected\n");
++ break;
++ }
++ next = __cfqq->new_cfqq;
++ cfq_put_queue(__cfqq);
++ __cfqq = next;
++ }
++
+ cfq_put_queue(cfqq);
+ }
+
+@@ -1703,14 +2686,51 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
+ cfqq->pid = pid;
+ }
+
++#ifdef CONFIG_CFQ_GROUP_IOSCHED
++static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic)
++{
++ struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1);
++ struct cfq_data *cfqd = cic->key;
++ unsigned long flags;
++ struct request_queue *q;
++
++ if (unlikely(!cfqd))
++ return;
++
++ q = cfqd->queue;
++
++ spin_lock_irqsave(q->queue_lock, flags);
++
++ if (sync_cfqq) {
++ /*
++ * Drop reference to sync queue. A new sync queue will be
++ * assigned in new group upon arrival of a fresh request.
++ */
++ cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup");
++ cic_set_cfqq(cic, NULL, 1);
++ cfq_put_queue(sync_cfqq);
++ }
++
++ spin_unlock_irqrestore(q->queue_lock, flags);
++}
++
++static void cfq_ioc_set_cgroup(struct io_context *ioc)
++{
++ call_for_each_cic(ioc, changed_cgroup);
++ ioc->cgroup_changed = 0;
++}
++#endif /* CONFIG_CFQ_GROUP_IOSCHED */
++
+ static struct cfq_queue *
+ cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync,
+ struct io_context *ioc, gfp_t gfp_mask)
+ {
+ struct cfq_queue *cfqq, *new_cfqq = NULL;
+ struct cfq_io_context *cic;
++ struct cfq_group *cfqg;
+
+ retry:
++ cfqg = cfq_get_cfqg(cfqd, 1);
+ cic = cfq_cic_lookup(cfqd, ioc);
+ /* cic always exists here */
+ cfqq = cic_to_cfqq(cic, is_sync);
+@@ -1741,6 +2761,7 @@ retry:
+ if (cfqq) {
+ cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
+ cfq_init_prio_data(cfqq, ioc);
++ cfq_link_cfqq_cfqg(cfqq, cfqg);
+ cfq_log_cfqq(cfqd, cfqq, "alloced");
+ } else
+ cfqq = &cfqd->oom_cfqq;
+@@ -1932,6 +2953,10 @@ out:
+ if (unlikely(ioc->ioprio_changed))
+ cfq_ioc_set_ioprio(ioc);
+
++#ifdef CONFIG_CFQ_GROUP_IOSCHED
++ if (unlikely(ioc->cgroup_changed))
++ cfq_ioc_set_cgroup(ioc);
++#endif
+ return cic;
+ err_free:
+ cfq_cic_free(cic);
+@@ -1952,33 +2977,33 @@ cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
+ }
+
+ static void
+-cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic,
++cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
+ struct request *rq)
+ {
+ sector_t sdist;
+ u64 total;
+
+- if (!cic->last_request_pos)
++ if (!cfqq->last_request_pos)
+ sdist = 0;
+- else if (cic->last_request_pos < blk_rq_pos(rq))
+- sdist = blk_rq_pos(rq) - cic->last_request_pos;
++ else if (cfqq->last_request_pos < blk_rq_pos(rq))
++ sdist = blk_rq_pos(rq) - cfqq->last_request_pos;
+ else
+- sdist = cic->last_request_pos - blk_rq_pos(rq);
++ sdist = cfqq->last_request_pos - blk_rq_pos(rq);
+
/*
- * Reset it - just in case we boot another CPU later:
+ * Don't allow the seek distance to get too large from the
+ * odd fragment, pagein, etc
*/
-diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
-index 3909e3b..bbfa7af 100644
---- a/arch/x86/kernel/x8664_ksyms_64.c
-+++ b/arch/x86/kernel/x8664_ksyms_64.c
-@@ -3,6 +3,7 @@
+- if (cic->seek_samples <= 60) /* second&third seek */
+- sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*1024);
++ if (cfqq->seek_samples <= 60) /* second&third seek */
++ sdist = min(sdist, (cfqq->seek_mean * 4) + 2*1024*1024);
+ else
+- sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*64);
++ sdist = min(sdist, (cfqq->seek_mean * 4) + 2*1024*64);
- #include <linux/module.h>
- #include <linux/smp.h>
-+#include <linux/syscalls.h>
+- cic->seek_samples = (7*cic->seek_samples + 256) / 8;
+- cic->seek_total = (7*cic->seek_total + (u64)256*sdist) / 8;
+- total = cic->seek_total + (cic->seek_samples/2);
+- do_div(total, cic->seek_samples);
+- cic->seek_mean = (sector_t)total;
++ cfqq->seek_samples = (7*cfqq->seek_samples + 256) / 8;
++ cfqq->seek_total = (7*cfqq->seek_total + (u64)256*sdist) / 8;
++ total = cfqq->seek_total + (cfqq->seek_samples/2);
++ do_div(total, cfqq->seek_samples);
++ cfqq->seek_mean = (sector_t)total;
+ }
- #include <net/checksum.h>
+ /*
+@@ -1999,14 +3024,15 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
-@@ -17,6 +18,7 @@
- EXPORT_SYMBOL(mcount);
- #endif
+ enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
-+EXPORT_SYMBOL(kernel_execve);
- EXPORT_SYMBOL(kernel_thread);
++ if (cfqq->queued[0] + cfqq->queued[1] >= 4)
++ cfq_mark_cfqq_deep(cfqq);
++
+ if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
+- (!cfqd->cfq_latency && cfqd->hw_tag && CIC_SEEKY(cic)))
++ (!cfq_cfqq_deep(cfqq) && sample_valid(cfqq->seek_samples)
++ && CFQQ_SEEKY(cfqq)))
+ enable_idle = 0;
+ else if (sample_valid(cic->ttime_samples)) {
+- unsigned int slice_idle = cfqd->cfq_slice_idle;
+- if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic))
+- slice_idle = msecs_to_jiffies(CFQ_MIN_TT);
+- if (cic->ttime_mean > slice_idle)
++ if (cic->ttime_mean > cfqd->cfq_slice_idle)
+ enable_idle = 0;
+ else
+ enable_idle = 1;
+@@ -2035,9 +3061,6 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
+ if (!cfqq)
+ return false;
- EXPORT_SYMBOL(__get_user_1);
-diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
-index f4cee90..3e549cd 100644
---- a/arch/x86/mm/fault.c
-+++ b/arch/x86/mm/fault.c
-@@ -689,7 +689,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
- if (!printk_ratelimit())
- return;
+- if (cfq_slice_used(cfqq))
+- return true;
+-
+ if (cfq_class_idle(new_cfqq))
+ return false;
-- printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
-+ ve_printk(VE_LOG, "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
- task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
- tsk->comm, task_pid_nr(tsk), address,
- (void *)regs->ip, (void *)regs->sp, error_code);
-@@ -909,7 +909,7 @@ spurious_fault(unsigned long error_code, unsigned long address)
- return ret;
- }
+@@ -2045,12 +3068,31 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
+ return true;
--int show_unhandled_signals = 1;
-+int show_unhandled_signals = 0;
+ /*
++ * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice.
++ */
++ if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq))
++ return false;
++
++ /*
+ * if the new request is sync, but the currently running queue is
+ * not, let the sync request have priority.
+ */
+ if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
+ return true;
- static inline int
- access_error(unsigned long error_code, int write, struct vm_area_struct *vma)
-diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
-index f46c340..6b7330c 100644
---- a/arch/x86/mm/hugetlbpage.c
-+++ b/arch/x86/mm/hugetlbpage.c
-@@ -12,6 +12,7 @@
- #include <linux/slab.h>
- #include <linux/err.h>
- #include <linux/sysctl.h>
-+#include <linux/module.h>
- #include <asm/mman.h>
- #include <asm/tlb.h>
- #include <asm/tlbflush.h>
-@@ -230,6 +231,7 @@ int pud_huge(pud_t pud)
- {
- return !!(pud_val(pud) & _PAGE_PSE);
++ if (new_cfqq->cfqg != cfqq->cfqg)
++ return false;
++
++ if (cfq_slice_used(cfqq))
++ return true;
++
++ /* Allow preemption only if we are idling on sync-noidle tree */
++ if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD &&
++ cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
++ new_cfqq->service_tree->count == 2 &&
++ RB_EMPTY_ROOT(&cfqq->sort_list))
++ return true;
++
+ /*
+ * So both queues are sync. Let the new request get disk time if
+ * it's a metadata request and the current queue is doing regular IO.
+@@ -2071,16 +3113,8 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
+ * if this request is as-good as one we would expect from the
+ * current cfqq, let it preempt
+ */
+- if (cfq_rq_close(cfqd, rq) && (!cfq_cfqq_coop(new_cfqq) ||
+- cfqd->busy_queues == 1)) {
+- /*
+- * Mark new queue coop_preempt, so its coop flag will not be
+- * cleared when new queue gets scheduled at the very first time
+- */
+- cfq_mark_cfqq_coop_preempt(new_cfqq);
+- cfq_mark_cfqq_coop(new_cfqq);
++ if (cfq_rq_close(cfqd, cfqq, rq, true))
+ return true;
+- }
+
+ return false;
}
-+EXPORT_SYMBOL(pmd_huge);
+@@ -2121,10 +3155,10 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
+ cfqq->meta_pending++;
- struct page *
- follow_huge_pmd(struct mm_struct *mm, unsigned long address,
-diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
-index c9ba9de..589a93b 100644
---- a/arch/x86/mm/pgtable.c
-+++ b/arch/x86/mm/pgtable.c
-@@ -4,7 +4,8 @@
- #include <asm/tlb.h>
- #include <asm/fixmap.h>
+ cfq_update_io_thinktime(cfqd, cic);
+- cfq_update_io_seektime(cfqd, cic, rq);
++ cfq_update_io_seektime(cfqd, cfqq, rq);
+ cfq_update_idle_window(cfqd, cfqq, cic);
--#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
-+#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO | __GFP_UBC
-+#define PGALLOC_KERN_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
+- cic->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
++ cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
- #ifdef CONFIG_HIGHPTE
- #define PGALLOC_USER_GFP __GFP_HIGHMEM
-@@ -16,7 +17,7 @@ gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP;
+ if (cfqq == cfqd->active_queue) {
+ /*
+@@ -2141,9 +3175,10 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
+ if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
+ cfqd->busy_queues > 1) {
+ del_timer(&cfqd->idle_slice_timer);
+- __blk_run_queue(cfqd->queue);
+- }
+- cfq_mark_cfqq_must_dispatch(cfqq);
++ cfq_clear_cfqq_wait_request(cfqq);
++ __blk_run_queue(cfqd->queue);
++ } else
++ cfq_mark_cfqq_must_dispatch(cfqq);
+ }
+ } else if (cfq_should_preempt(cfqd, cfqq, rq)) {
+ /*
+@@ -2165,10 +3200,9 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
+ cfq_log_cfqq(cfqd, cfqq, "insert_request");
+ cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc);
- pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
- {
-- return (pte_t *)__get_free_page(PGALLOC_GFP);
-+ return (pte_t *)__get_free_page(PGALLOC_KERN_GFP);
- }
+- cfq_add_rq_rb(rq);
+-
+ rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
+ list_add_tail(&rq->queuelist, &cfqq->fifo);
++ cfq_add_rq_rb(rq);
- pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
-diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
-index 36fe08e..42445e5 100644
---- a/arch/x86/mm/tlb.c
-+++ b/arch/x86/mm/tlb.c
-@@ -256,6 +256,8 @@ void flush_tlb_mm(struct mm_struct *mm)
- preempt_enable();
+ cfq_rq_enqueued(cfqd, cfqq, rq);
}
+@@ -2179,23 +3213,64 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
+ */
+ static void cfq_update_hw_tag(struct cfq_data *cfqd)
+ {
+- if (rq_in_driver(cfqd) > cfqd->rq_in_driver_peak)
+- cfqd->rq_in_driver_peak = rq_in_driver(cfqd);
++ struct cfq_queue *cfqq = cfqd->active_queue;
++
++ if (rq_in_driver(cfqd) > cfqd->hw_tag_est_depth)
++ cfqd->hw_tag_est_depth = rq_in_driver(cfqd);
++
++ if (cfqd->hw_tag == 1)
++ return;
-+EXPORT_SYMBOL(flush_tlb_mm);
+ if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
+ rq_in_driver(cfqd) <= CFQ_HW_QUEUE_MIN)
+ return;
+
++ /*
++ * If active queue hasn't enough requests and can idle, cfq might not
++ * dispatch sufficient requests to hardware. Don't zero hw_tag in this
++ * case
++ */
++ if (cfqq && cfq_cfqq_idle_window(cfqq) &&
++ cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] <
++ CFQ_HW_QUEUE_MIN && rq_in_driver(cfqd) < CFQ_HW_QUEUE_MIN)
++ return;
+
- void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
- {
- struct mm_struct *mm = vma->vm_mm;
-diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
-index 58bc00f..b7028c5 100644
---- a/arch/x86/vdso/vdso32-setup.c
-+++ b/arch/x86/vdso/vdso32-setup.c
-@@ -17,6 +17,8 @@
- #include <linux/err.h>
- #include <linux/module.h>
+ if (cfqd->hw_tag_samples++ < 50)
+ return;
-+#include <bc/vmpages.h>
+- if (cfqd->rq_in_driver_peak >= CFQ_HW_QUEUE_MIN)
++ if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN)
+ cfqd->hw_tag = 1;
+ else
+ cfqd->hw_tag = 0;
++}
+
- #include <asm/cpufeature.h>
- #include <asm/msr.h>
- #include <asm/pgtable.h>
-@@ -37,6 +39,8 @@ enum {
- #else
- #define VDSO_DEFAULT VDSO_ENABLED
- #endif
-+#undef VDSO_DEFAULT
-+#define VDSO_DEFAULT VDSO_DISABLED
++static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
++{
++ struct cfq_io_context *cic = cfqd->active_cic;
++
++ /* If there are other queues in the group, don't wait */
++ if (cfqq->cfqg->nr_cfqq > 1)
++ return false;
++
++ if (cfq_slice_used(cfqq))
++ return true;
++
++ /* if slice left is less than think time, wait busy */
++ if (cic && sample_valid(cic->ttime_samples)
++ && (cfqq->slice_end - jiffies < cic->ttime_mean))
++ return true;
++
++ /*
++ * If think time is less than a jiffy then ttime_mean=0 and above
++ * will not be true. It might happen that slice has not expired yet
++ * but will expire soon (4-5 ns) during select_queue(). To cover the
++ * case where think time is less than a jiffy, mark the queue wait
++ * busy if only 1 jiffy is left in the slice.
++ */
++ if (cfqq->slice_end - jiffies == 1)
++ return true;
- #ifdef CONFIG_X86_64
- #define vdso_enabled sysctl_vsyscall32
-@@ -193,7 +197,8 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr)
- }
+- cfqd->hw_tag_samples = 0;
+- cfqd->rq_in_driver_peak = 0;
++ return false;
}
--static struct page *vdso32_pages[1];
-+struct page *vdso32_pages[1];
-+EXPORT_SYMBOL_GPL(vdso32_pages);
+ static void cfq_completed_request(struct request_queue *q, struct request *rq)
+@@ -2206,7 +3281,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
+ unsigned long now;
- #ifdef CONFIG_X86_64
+ now = jiffies;
+- cfq_log_cfqq(cfqd, cfqq, "complete");
++ cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", !!rq_noidle(rq));
-@@ -309,16 +314,30 @@ int __init sysenter_setup(void)
- return 0;
- }
+ cfq_update_hw_tag(cfqd);
-+EXPORT_SYMBOL_GPL(VDSO32_SYSENTER_RETURN);
-+EXPORT_SYMBOL_GPL(VDSO32_PRELINK);
+@@ -2220,7 +3295,8 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
+
+ if (sync) {
+ RQ_CIC(rq)->last_end_request = now;
+- cfqd->last_end_sync_rq = now;
++ if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now))
++ cfqd->last_delayed_sync = now;
+ }
+
+ /*
+@@ -2234,18 +3310,39 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
+ cfq_set_prio_slice(cfqd, cfqq);
+ cfq_clear_cfqq_slice_new(cfqq);
+ }
+
- /* Setup a VMA at program startup for the vsyscall page */
--int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
-+int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp,
-+ unsigned long map_address)
- {
- struct mm_struct *mm = current->mm;
-- unsigned long addr;
-+ unsigned long addr = map_address;
- int ret = 0;
- bool compat;
-+ unsigned long flags;
++ /*
++ * Should we wait for next request to come in before we expire
++ * the queue.
++ */
++ if (cfq_should_wait_busy(cfqd, cfqq)) {
++ cfqq->slice_end = jiffies + cfqd->cfq_slice_idle;
++ cfq_mark_cfqq_wait_busy(cfqq);
++ }
++
+ /*
+- * If there are no requests waiting in this queue, and
+- * there are other queues ready to issue requests, AND
+- * those other queues are issuing requests within our
+- * mean seek distance, give them a chance to run instead
+- * of idling.
++ * Idling is not enabled on:
++ * - expired queues
++ * - idle-priority queues
++ * - async queues
++ * - queues with still some requests queued
++ * - when there is a close cooperator
+ */
+ if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
+ cfq_slice_expired(cfqd, 1);
+- else if (cfqq_empty && !cfq_close_cooperator(cfqd, cfqq, 1) &&
+- sync && !rq_noidle(rq))
+- cfq_arm_slice_timer(cfqd);
++ else if (sync && cfqq_empty &&
++ !cfq_close_cooperator(cfqd, cfqq)) {
++ cfqd->noidle_tree_requires_idle |= !rq_noidle(rq);
++ /*
++ * Idling is enabled for SYNC_WORKLOAD.
++ * SYNC_NOIDLE_WORKLOAD idles at the end of the tree
++ * only if we processed at least one !rq_noidle request
++ */
++ if (cfqd->serving_type == SYNC_WORKLOAD
++ || cfqd->noidle_tree_requires_idle
++ || cfqq->cfqg->nr_cfqq == 1)
++ cfq_arm_slice_timer(cfqd);
++ }
+ }
-- if (vdso_enabled == VDSO_DISABLED)
-+ if (vdso_enabled == VDSO_DISABLED && map_address == 0) {
-+ current->mm->context.vdso = NULL;
- return 0;
+ if (!rq_in_driver(cfqd))
+@@ -2269,12 +3366,10 @@ static void cfq_prio_boost(struct cfq_queue *cfqq)
+ cfqq->ioprio = IOPRIO_NORM;
+ } else {
+ /*
+- * check if we need to unboost the queue
++ * unboost the queue (if needed)
+ */
+- if (cfqq->ioprio_class != cfqq->org_ioprio_class)
+- cfqq->ioprio_class = cfqq->org_ioprio_class;
+- if (cfqq->ioprio != cfqq->org_ioprio)
+- cfqq->ioprio = cfqq->org_ioprio;
++ cfqq->ioprio_class = cfqq->org_ioprio_class;
++ cfqq->ioprio = cfqq->org_ioprio;
+ }
+ }
+
+@@ -2338,6 +3433,35 @@ static void cfq_put_request(struct request *rq)
+ }
+ }
+
++static struct cfq_queue *
++cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic,
++ struct cfq_queue *cfqq)
++{
++ cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
++ cic_set_cfqq(cic, cfqq->new_cfqq, 1);
++ cfq_mark_cfqq_coop(cfqq->new_cfqq);
++ cfq_put_queue(cfqq);
++ return cic_to_cfqq(cic, 1);
++}
++
++/*
++ * Returns NULL if a new cfqq should be allocated, or the old cfqq if this
++ * was the last process referring to said cfqq.
++ */
++static struct cfq_queue *
++split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq)
++{
++ if (cfqq_process_refs(cfqq) == 1) {
++ cfqq->pid = current->pid;
++ cfq_clear_cfqq_coop(cfqq);
++ cfq_clear_cfqq_split_coop(cfqq);
++ return cfqq;
+ }
+
-+ flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC | VM_MAYWRITE |
-+ mm->def_flags;
++ cic_set_cfqq(cic, NULL, 1);
++ cfq_put_queue(cfqq);
++ return NULL;
++}
+ /*
+ * Allocate cfq data structures associated with this request.
+ */
+@@ -2360,10 +3484,30 @@ cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
+ if (!cic)
+ goto queue_fail;
+
++new_queue:
+ cfqq = cic_to_cfqq(cic, is_sync);
+ if (!cfqq || cfqq == &cfqd->oom_cfqq) {
+ cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask);
+ cic_set_cfqq(cic, cfqq, is_sync);
++ } else {
++ /*
++ * If the queue was seeky for too long, break it apart.
++ */
++ if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) {
++ cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq");
++ cfqq = split_cfqq(cic, cfqq);
++ if (!cfqq)
++ goto new_queue;
++ }
+
-+ ret = -ENOMEM;
-+ if (ub_memory_charge(mm, PAGE_SIZE, flags, NULL, UB_SOFT))
-+ goto err_charge;
++ /*
++ * Check to see if this queue is scheduled to merge with
++ * another, closely cooperating queue. The merging of
++ * queues happens here as it must be done in process context.
++ * The reference on new_cfqq was taken in merge_cfqqs.
++ */
++ if (cfqq->new_cfqq)
++ cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq);
+ }
- down_write(&mm->mmap_sem);
+ cfqq->allocated[rw]++;
+@@ -2438,6 +3582,11 @@ static void cfq_idle_slice_timer(unsigned long data)
+ */
+ if (!RB_EMPTY_ROOT(&cfqq->sort_list))
+ goto out_kick;
++
++ /*
++ * Queue depth flag is reset only when the idle didn't succeed
++ */
++ cfq_clear_cfqq_deep(cfqq);
+ }
+ expire:
+ cfq_slice_expired(cfqd, timed_out);
+@@ -2468,6 +3617,11 @@ static void cfq_put_async_queues(struct cfq_data *cfqd)
+ cfq_put_queue(cfqd->async_idle_cfqq);
+ }
-@@ -328,19 +347,18 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
++static void cfq_cfqd_free(struct rcu_head *head)
++{
++ kfree(container_of(head, struct cfq_data, rcu));
++}
++
+ static void cfq_exit_queue(struct elevator_queue *e)
+ {
+ struct cfq_data *cfqd = e->elevator_data;
+@@ -2489,25 +3643,49 @@ static void cfq_exit_queue(struct elevator_queue *e)
+ }
- map_compat_vdso(compat);
+ cfq_put_async_queues(cfqd);
++ cfq_release_cfq_groups(cfqd);
++ blkiocg_del_blkio_group(&cfqd->root_group.blkg);
-- if (compat)
-- addr = VDSO_HIGH_BASE;
-- else {
-- addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
-+ if (!compat || map_address) {
-+ addr = get_unmapped_area(NULL, addr, PAGE_SIZE, 0, 0);
- if (IS_ERR_VALUE(addr)) {
- ret = addr;
- goto up_fail;
- }
-- }
-+ } else
-+ addr = VDSO_HIGH_BASE;
+ spin_unlock_irq(q->queue_lock);
- current->mm->context.vdso = (void *)addr;
+ cfq_shutdown_timer_wq(cfqd);
-- if (compat_uses_vma || !compat) {
-+ if (compat_uses_vma || !compat || map_address) {
- /*
- * MAYWRITE to allow gdb to COW and set breakpoints
- *
-@@ -368,9 +386,13 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
- current->mm->context.vdso = NULL;
+- kfree(cfqd);
++ /* Wait for cfqg->blkg->key accessors to exit their grace periods. */
++ call_rcu(&cfqd->rcu, cfq_cfqd_free);
+ }
- up_write(&mm->mmap_sem);
-+ if (ret < 0)
-+ ub_memory_uncharge(mm, PAGE_SIZE, flags, NULL);
-+err_charge:
+ static void *cfq_init_queue(struct request_queue *q)
+ {
+ struct cfq_data *cfqd;
+- int i;
++ int i, j;
++ struct cfq_group *cfqg;
++ struct cfq_rb_root *st;
- return ret;
- }
-+EXPORT_SYMBOL_GPL(arch_setup_additional_pages);
+ cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
+ if (!cfqd)
+ return NULL;
- #ifdef CONFIG_X86_64
+- cfqd->service_tree = CFQ_RB_ROOT;
++ /* Init root service tree */
++ cfqd->grp_service_tree = CFQ_RB_ROOT;
++
++ /* Init root group */
++ cfqg = &cfqd->root_group;
++ for_each_cfqg_st(cfqg, i, j, st)
++ *st = CFQ_RB_ROOT;
++ RB_CLEAR_NODE(&cfqg->rb_node);
-diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
-index 21e1aeb..507ba17 100644
---- a/arch/x86/vdso/vma.c
-+++ b/arch/x86/vdso/vma.c
-@@ -4,6 +4,7 @@
- * Subject to the GPL, v.2
- */
- #include <linux/mm.h>
-+#include <linux/module.h>
- #include <linux/err.h>
- #include <linux/sched.h>
- #include <linux/init.h>
-@@ -99,17 +100,23 @@ static unsigned long vdso_addr(unsigned long start, unsigned len)
++ /* Give preference to root group over other groups */
++ cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT;
++
++#ifdef CONFIG_CFQ_GROUP_IOSCHED
++ /*
++ * Take a reference to root group which we never drop. This is just
++ * to make sure that cfq_put_cfqg() does not try to kfree root group
++ */
++ atomic_set(&cfqg->ref, 1);
++ blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, (void *)cfqd,
++ 0);
++#endif
+ /*
+ * Not strictly needed (since RB_ROOT just clears the node and we
+ * zeroed cfqd on alloc), but better be safe in case someone decides
+@@ -2523,6 +3701,7 @@ static void *cfq_init_queue(struct request_queue *q)
+ */
+ cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
+ atomic_inc(&cfqd->oom_cfqq.ref);
++ cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group);
+
+ INIT_LIST_HEAD(&cfqd->cic_list);
+
+@@ -2544,8 +3723,14 @@ static void *cfq_init_queue(struct request_queue *q)
+ cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
+ cfqd->cfq_slice_idle = cfq_slice_idle;
+ cfqd->cfq_latency = 1;
+- cfqd->hw_tag = 1;
+- cfqd->last_end_sync_rq = jiffies;
++ cfqd->cfq_group_isolation = 0;
++ cfqd->hw_tag = -1;
++ /*
++ * we optimistically start assuming sync ops weren't delayed in last
++ * second, in order to have larger depth for async operations.
++ */
++ cfqd->last_delayed_sync = jiffies - HZ;
++ INIT_RCU_HEAD(&cfqd->rcu);
+ return cfqd;
+ }
+
+@@ -2614,6 +3799,7 @@ SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
+ SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
+ SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
+ SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
++SHOW_FUNCTION(cfq_group_isolation_show, cfqd->cfq_group_isolation, 0);
+ #undef SHOW_FUNCTION
+
+ #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
+@@ -2646,6 +3832,7 @@ STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
+ STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
+ UINT_MAX, 0);
+ STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
++STORE_FUNCTION(cfq_group_isolation_store, &cfqd->cfq_group_isolation, 0, 1, 0);
+ #undef STORE_FUNCTION
+
+ #define CFQ_ATTR(name) \
+@@ -2662,6 +3849,7 @@ static struct elv_fs_entry cfq_attrs[] = {
+ CFQ_ATTR(slice_async_rq),
+ CFQ_ATTR(slice_idle),
+ CFQ_ATTR(low_latency),
++ CFQ_ATTR(group_isolation),
+ __ATTR_NULL
+ };
- /* Setup a VMA at program startup for the vsyscall page.
- Not called for compat tasks */
--int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
-+int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp,
-+ unsigned long map_address)
+@@ -2691,6 +3879,17 @@ static struct elevator_type iosched_cfq = {
+ .elevator_owner = THIS_MODULE,
+ };
+
++#ifdef CONFIG_CFQ_GROUP_IOSCHED
++static struct blkio_policy_type blkio_policy_cfq = {
++ .ops = {
++ .blkio_unlink_group_fn = cfq_unlink_blkio_group,
++ .blkio_update_group_weight_fn = cfq_update_blkio_group_weight,
++ },
++};
++#else
++static struct blkio_policy_type blkio_policy_cfq;
++#endif
++
+ static int __init cfq_init(void)
{
- struct mm_struct *mm = current->mm;
- unsigned long addr;
- int ret;
+ /*
+@@ -2705,6 +3904,7 @@ static int __init cfq_init(void)
+ return -ENOMEM;
-- if (!vdso_enabled)
-+ if (!vdso_enabled && map_address == 0) {
-+ current->mm->context.vdso = NULL;
- return 0;
-+ }
+ elv_register(&iosched_cfq);
++ blkio_policy_register(&blkio_policy_cfq);
- down_write(&mm->mmap_sem);
-- addr = vdso_addr(mm->start_stack, vdso_size);
-+ if (map_address)
-+ addr = map_address;
-+ else
-+ addr = vdso_addr(mm->start_stack, vdso_size);
- addr = get_unmapped_area(NULL, addr, vdso_size, 0, 0);
- if (IS_ERR_VALUE(addr)) {
- ret = addr;
-@@ -132,6 +139,7 @@ up_fail:
- up_write(&mm->mmap_sem);
- return ret;
+ return 0;
}
-+EXPORT_SYMBOL_GPL(arch_setup_additional_pages);
-
- static __init int vdso_setup(char *s)
+@@ -2712,6 +3912,7 @@ static int __init cfq_init(void)
+ static void __exit cfq_exit(void)
{
+ DECLARE_COMPLETION_ONSTACK(all_gone);
++ blkio_policy_unregister(&blkio_policy_cfq);
+ elv_unregister(&iosched_cfq);
+ ioc_gone = &all_gone;
+ /* ioc_gone's update must be visible before reading ioc_count */
diff --git a/block/elevator.c b/block/elevator.c
index a847046..7e0fe67 100644
--- a/block/elevator.c
@@ -7477,7 +12017,7 @@
if (!sk)
goto out;
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
-index 4fdfa2a..37d414d 100644
+index 4fdfa2a..a052759 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -61,6 +61,7 @@
@@ -7655,7 +12195,21 @@
tun_net_init(dev);
if (strchr(dev->name, '%')) {
-@@ -1316,6 +1340,7 @@ static int tun_chr_open(struct inode *inode, struct file * file)
+@@ -1006,9 +1030,10 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
+ if (err < 0)
+ goto err_free_sk;
+
+- if (device_create_file(&tun->dev->dev, &dev_attr_tun_flags) ||
+- device_create_file(&tun->dev->dev, &dev_attr_owner) ||
+- device_create_file(&tun->dev->dev, &dev_attr_group))
++ if ((dev_net(tun->dev) == &init_net) &&
++ (device_create_file(&tun->dev->dev, &dev_attr_tun_flags) ||
++ device_create_file(&tun->dev->dev, &dev_attr_owner) ||
++ device_create_file(&tun->dev->dev, &dev_attr_group)))
+ printk(KERN_ERR "Failed to create tun sysfs files\n");
+
+ sk->sk_destruct = tun_sock_destruct;
+@@ -1316,6 +1341,7 @@ static int tun_chr_open(struct inode *inode, struct file * file)
tfile->tun = NULL;
tfile->net = get_net(current->nsproxy->net_ns);
file->private_data = tfile;
@@ -7663,7 +12217,7 @@
return 0;
}
-@@ -1457,6 +1482,226 @@ static const struct ethtool_ops tun_ethtool_ops = {
+@@ -1457,6 +1483,226 @@ static const struct ethtool_ops tun_ethtool_ops = {
.set_rx_csum = tun_set_rx_csum
};
@@ -7890,7 +12444,7 @@
static int __init tun_init(void)
{
-@@ -1476,6 +1721,8 @@ static int __init tun_init(void)
+@@ -1476,6 +1722,8 @@ static int __init tun_init(void)
printk(KERN_ERR "tun: Can't register misc device %d\n", TUN_MINOR);
goto err_misc;
}
@@ -7899,7 +12453,7 @@
return 0;
err_misc:
rtnl_link_unregister(&tun_link_ops);
-@@ -1485,6 +1732,7 @@ err_linkops:
+@@ -1485,6 +1733,7 @@ err_linkops:
static void tun_cleanup(void)
{
@@ -11747,7 +16301,7 @@
EXPORT_SYMBOL(get_empty_filp);
diff --git a/fs/filesystems.c b/fs/filesystems.c
-index a24c58e..2723c3e 100644
+index a24c58e..bd5c213 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -14,6 +14,9 @@
@@ -11971,7 +16525,20 @@
tmp = tmp->next;
}
read_unlock(&file_systems_lock);
-@@ -247,7 +356,7 @@ static const struct file_operations filesystems_proc_fops = {
+@@ -224,9 +333,12 @@ static int filesystems_proc_show(struct seq_file *m, void *v)
+ read_lock(&file_systems_lock);
+ tmp = file_systems;
+ while (tmp) {
++ if (!check_ve_fstype(tmp, get_exec_env()))
++ goto next; /* skip in VE */
+ seq_printf(m, "%s\t%s\n",
+ (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
+ tmp->name);
++next:
+ tmp = tmp->next;
+ }
+ read_unlock(&file_systems_lock);
+@@ -247,7 +359,7 @@ static const struct file_operations filesystems_proc_fops = {
static int __init proc_filesystems_init(void)
{
@@ -11980,7 +16547,7 @@
return 0;
}
module_init(proc_filesystems_init);
-@@ -258,8 +367,8 @@ static struct file_system_type *__get_fs_type(const char *name, int len)
+@@ -258,8 +370,8 @@ static struct file_system_type *__get_fs_type(const char *name, int len)
struct file_system_type *fs;
read_lock(&file_systems_lock);
@@ -12434,18 +17001,19 @@
return 0;
}
diff --git a/fs/ioprio.c b/fs/ioprio.c
-index c7c0b28..2a7e8ae 100644
+index c7c0b28..25f7275 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
-@@ -26,6 +26,7 @@
+@@ -26,6 +26,8 @@
#include <linux/syscalls.h>
#include <linux/security.h>
#include <linux/pid_namespace.h>
+#include <linux/nsproxy.h>
++#include <linux/ve_proto.h>
int set_task_ioprio(struct task_struct *task, int ioprio)
{
-@@ -78,8 +79,11 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
+@@ -78,8 +80,11 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
int data = IOPRIO_PRIO_DATA(ioprio);
struct task_struct *p, *g;
struct user_struct *user;
@@ -12458,7 +17026,7 @@
switch (class) {
case IOPRIO_CLASS_RT:
-@@ -137,17 +141,25 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
+@@ -137,17 +142,25 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
if (!user)
break;
@@ -12481,12 +17049,12 @@
+ break;
+ }
+
-+ ret = 0; /* bc_set_ioprio(who, data); */
++ ret = ve_set_ioprio(who, data);
+ break;
default:
ret = -EINVAL;
}
-@@ -192,9 +204,9 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
+@@ -192,9 +205,9 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
{
struct task_struct *g, *p;
struct user_struct *user;
@@ -12497,7 +17065,7 @@
read_lock(&tasklist_lock);
switch (which) {
-@@ -230,7 +242,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
+@@ -230,7 +243,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
if (!user)
break;
@@ -12506,7 +17074,7 @@
if (__task_cred(p)->uid != user->uid)
continue;
tmpio = get_task_ioprio(p);
-@@ -240,7 +252,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
+@@ -240,7 +253,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
ret = tmpio;
else
ret = ioprio_best(ret, tmpio);
@@ -13540,7 +18108,7 @@
mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC);
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
-index 69d6a46..b9a8f89 100644
+index 127ed5c..95a31c8 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -125,6 +125,7 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
@@ -14554,7 +19122,7 @@
mmput(mm);
return 0;
diff --git a/fs/proc/base.c b/fs/proc/base.c
-index 13b0378..eb8a70f 100644
+index a1bb0f6..ef6ee19 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -49,6 +49,7 @@
@@ -25476,6 +30044,41 @@
/**
* has_capability - Determine if a task has a superior capability available
* @t: The task in question
+diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
+index 0008dee..9665343 100644
+--- a/include/linux/cgroup.h
++++ b/include/linux/cgroup.h
+@@ -220,6 +220,8 @@ struct cgroup {
+
+ /* For RCU-protected deletion */
+ struct rcu_head rcu_head;
++
++ int cgroup_lite_id;
+ };
+
+ /*
+@@ -525,6 +527,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
+ void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it);
+ int cgroup_scan_tasks(struct cgroup_scanner *scan);
+ int cgroup_attach_task(struct cgroup *, struct task_struct *);
++int cgroup_set_task_css(struct task_struct *tsk, struct css_set *css);
+
+ /*
+ * CSS ID is ID for cgroup_subsys_state structs under subsys. This only works
+diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
+index 9c8d31b..ccefff0 100644
+--- a/include/linux/cgroup_subsys.h
++++ b/include/linux/cgroup_subsys.h
+@@ -60,3 +60,9 @@ SUBSYS(net_cls)
+ #endif
+
+ /* */
++
++#ifdef CONFIG_BLK_CGROUP
++SUBSYS(blkio)
++#endif
++
++/* */
diff --git a/include/linux/compat.h b/include/linux/compat.h
index af931ee..499d84a 100644
--- a/include/linux/compat.h
@@ -28285,6 +32888,38 @@
#ifdef CONFIG_INOTIFY
/* Kernel API for producing events */
+diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
+index 4da4a75..d61b0b8 100644
+--- a/include/linux/iocontext.h
++++ b/include/linux/iocontext.h
+@@ -40,16 +40,11 @@ struct cfq_io_context {
+ struct io_context *ioc;
+
+ unsigned long last_end_request;
+- sector_t last_request_pos;
+
+ unsigned long ttime_total;
+ unsigned long ttime_samples;
+ unsigned long ttime_mean;
+
+- unsigned int seek_samples;
+- u64 seek_total;
+- sector_t seek_mean;
+-
+ struct list_head queue_list;
+ struct hlist_node cic_list;
+
+@@ -73,6 +68,10 @@ struct io_context {
+ unsigned short ioprio;
+ unsigned short ioprio_changed;
+
++#ifdef CONFIG_BLK_CGROUP
++ unsigned short cgroup_changed;
++#endif
++
+ /*
+ * For request batching
+ */
diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h
index 76dad48..c699950 100644
--- a/include/linux/ioprio.h
@@ -28646,6 +33281,19 @@
extern void put_mnt_ns(struct mnt_namespace *ns);
static inline void get_mnt_ns(struct mnt_namespace *ns)
{
+diff --git a/include/linux/module.h b/include/linux/module.h
+index 460df15..482efc8 100644
+--- a/include/linux/module.h
++++ b/include/linux/module.h
+@@ -455,7 +455,7 @@ void symbol_put_addr(void *addr);
+ static inline local_t *__module_ref_addr(struct module *mod, int cpu)
+ {
+ #ifdef CONFIG_SMP
+- return (local_t *) per_cpu_ptr(mod->refptr, cpu);
++ return (local_t *) (mod->refptr + per_cpu_offset(cpu));
+ #else
+ return &mod->ref;
+ #endif
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 5d52753..f4bf358 100644
--- a/include/linux/mount.h
@@ -31211,10 +35859,10 @@
+#endif
diff --git a/include/linux/ve_proto.h b/include/linux/ve_proto.h
new file mode 100644
-index 0000000..8bc4e01
+index 0000000..5bb93e8
--- /dev/null
+++ b/include/linux/ve_proto.h
-@@ -0,0 +1,96 @@
+@@ -0,0 +1,100 @@
+/*
+ * include/linux/ve_proto.h
+ *
@@ -31246,6 +35894,10 @@
+#endif
+#endif
+
++#define VE_IOPRIO_MIN 0
++#define VE_IOPRIO_MAX 8
++extern int ve_set_ioprio(int veid, int ioprio);
++
+extern struct list_head ve_list_head;
+#define for_each_ve(ve) list_for_each_entry((ve), &ve_list_head, ve_list)
+extern rwlock_t ve_list_lock;
@@ -41040,10 +45692,10 @@
diff --git a/kernel/cgroup_lite.c b/kernel/cgroup_lite.c
new file mode 100644
-index 0000000..0de6d16
+index 0000000..d299cf6
--- /dev/null
+++ b/kernel/cgroup_lite.c
-@@ -0,0 +1,226 @@
+@@ -0,0 +1,342 @@
+/*
+ * lite cgroups engine
+ */
@@ -41065,6 +45717,78 @@
+static struct cgroup init_cgroup;
+static struct cftype *subsys_cftypes[CGROUP_SUBSYS_COUNT];
+
++static struct idr cgroup_idr;
++static DEFINE_SPINLOCK(cgroup_idr_lock);
++
++unsigned short css_id(struct cgroup_subsys_state *css)
++{
++ return css->cgroup->cgroup_lite_id;
++}
++
++unsigned short css_depth(struct cgroup_subsys_state *css)
++{
++ return (css->cgroup == &init_cgroup) ? 0 : 1;
++}
++
++int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
++{
++ snprintf(buf, buflen, "/%d", cgrp->cgroup_lite_id);
++ return 0;
++}
++
++struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
++{
++ struct cgroup *g;
++
++ BUG_ON(!ss->use_id);
++ g = idr_find(&cgroup_idr, id);
++ if (!g)
++ return NULL;
++ return g->subsys[ss->subsys_id];
++}
++
++void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
++{
++}
++
++static int init_cgroup_id(struct cgroup *g)
++{
++ int err, id;
++
++ if (unlikely(!idr_pre_get(&cgroup_idr, GFP_KERNEL)))
++ return -ENOMEM;
++
++ spin_lock(&cgroup_idr_lock);
++ err = idr_get_new_above(&cgroup_idr, g, 1, &id);
++ spin_unlock(&cgroup_idr_lock);
++
++ if (err)
++ return err;
++
++ if (id > USHORT_MAX) {
++ spin_lock(&cgroup_idr_lock);
++ idr_remove(&cgroup_idr, id);
++ spin_unlock(&cgroup_idr_lock);
++ return -ENOSPC;
++ }
++
++ g->cgroup_lite_id = id;
++
++ return 0;
++}
++
++static void fini_cgroup_id(struct cgroup *g)
++{
++ spin_lock(&cgroup_idr_lock);
++ idr_remove(&cgroup_idr, g->cgroup_lite_id);
++ spin_unlock(&cgroup_idr_lock);
++}
++
++void __css_put(struct cgroup_subsys_state *css)
++{
++ atomic_dec(&css->refcnt);
++}
++
+static int init_css_set_subsystems(struct cgroup *g, struct css_set *set)
+{
+ int i;
@@ -41079,7 +45803,7 @@
+
+ g->subsys[i] = ss;
+ set->subsys[i] = ss;
-+ atomic_set(&ss->refcnt, 0);
++ atomic_set(&ss->refcnt, 1);
+ ss->cgroup = g;
+ }
+ return 0;
@@ -41108,6 +45832,10 @@
+ if (cs == NULL)
+ goto err_calloc;
+
++ err = init_cgroup_id(g);
++ if (err)
++ goto err_id;
++
+ g->parent = &init_cgroup;
+ err = init_css_set_subsystems(g, cs);
+ if (err)
@@ -41119,6 +45847,8 @@
+ return 0;
+
+err_subsys:
++ fini_cgroup_id(g);
++err_id:
+ kfree(cs);
+err_calloc:
+ kfree(g);
@@ -41142,13 +45872,14 @@
+ if (cs->pre_destroy)
+ cs->pre_destroy(cs, g);
+
-+ if (atomic_read(&ss->refcnt))
++ if (atomic_read(&ss->refcnt) != 1)
+ printk(KERN_ERR "CG: leaking %d/%s subsys\n",
+ ve->veid, subsys[i]->name);
+ else
+ cs->destroy(cs, g);
+ }
+
++ fini_cgroup_id(g);
+ kfree(g);
+ kfree(css);
+ ve->ve_cgroup = NULL;
@@ -41183,6 +45914,40 @@
+ return -ENODATA;
+}
+
++int cgroup_set_task_css(struct task_struct *tsk, struct css_set *css)
++{
++ int i, err;
++ struct cgroup_subsys *cs;
++ struct css_set *old_css;
++
++ old_css = tsk->cgroups;
++
++ if (old_css == css)
++ return 0;
++
++ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
++ cs = subsys[i];
++ if (!cs->can_attach)
++ continue;
++ err = cs->can_attach(cs, css->subsys[i]->cgroup, tsk, false);
++ if (err)
++ return err;
++ }
++
++ tsk->cgroups = css;
++
++ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
++ cs = subsys[i];
++ if (!cs->attach)
++ continue;
++ cs->attach(cs, css->subsys[i]->cgroup,
++ old_css->subsys[i]->cgroup, tsk, false);
++ }
++
++ return 0;
++}
++EXPORT_SYMBOL(cgroup_set_task_css);
++
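For illustration only, a minimal sketch (not from the patch) of how a caller might use the helper above to move a task into a container's lite css_set; the patch itself calls it this way when a process enters a VE. The function name and the assumption of a valid ve pointer are the editor's, not the patch author's:

	/* Sketch: attach the current task to a VE's css_set via cgroup_set_task_css(). */
	static int example_enter_ve_cgroups(struct ve_struct *ve)
	{
		int err;

		err = cgroup_set_task_css(current, ve->ve_css_set);
		if (err)
			return err;	/* some subsystem's can_attach() refused the move */
		return 0;
	}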
+/*
+ * proc struts
+ */
@@ -41266,6 +46031,9 @@
+{
+ get_ve0()->ve_cgroup = &init_cgroup;
+ get_ve0()->ve_css_set = &init_css_set;
++ idr_init(&cgroup_idr);
++ if (init_cgroup_id(&init_cgroup))
++ panic("CG: Can't init initial cgroup id\n");
+ if (init_css_set_subsystems(&init_cgroup, &init_css_set) != 0)
+ panic("CG: Can't init initial set\n");
+ return 0;
@@ -43690,10 +48458,10 @@
+EXPORT_SYMBOL(lookup_cpt_obj_bypos);
diff --git a/kernel/cpt/cpt_files.c b/kernel/cpt/cpt_files.c
new file mode 100644
-index 0000000..3ada205
+index 0000000..927a4e3
--- /dev/null
+++ b/kernel/cpt/cpt_files.c
-@@ -0,0 +1,1783 @@
+@@ -0,0 +1,1782 @@
+/*
+ *
+ * kernel/cpt/cpt_files.c
@@ -44357,20 +49125,19 @@
+
+ if (file->f_op == &shm_file_operations ||
+ file->f_op == &shmem_file_operations) {
-+ struct file *shm_file = file;
+
+ /* shmget uses shm ops */
+ if (file->f_op == &shm_file_operations) {
+ struct shm_file_data *sfd = file->private_data;
-+ shm_file = sfd->file;
++ file = sfd->file;
+ }
+
-+ cpt_dump_content_sysvshm(shm_file, ctx);
++ cpt_dump_content_sysvshm(file, ctx);
+
-+ do_read = shm_file->f_dentry->d_inode->i_fop->read;
++ do_read = file->f_dentry->d_inode->i_fop->read;
+ if (!do_read) {
+ wprintk_ctx("TMPFS is not configured?\n");
-+ return dump_content_shm(shm_file, ctx);
++ return dump_content_shm(file, ctx);
+ }
+ }
+
@@ -67177,7 +71944,7 @@
+cond_syscall(sys_fairsched_chwt);
+cond_syscall(sys_fairsched_rate);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
-index b8bd058..5ef2188 100644
+index b8bd058..5b754e4 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -50,6 +50,7 @@
@@ -67210,17 +71977,20 @@
extern int latencytop_enabled;
extern int sysctl_nr_open_min, sysctl_nr_open_max;
#ifndef CONFIG_MMU
-@@ -169,6 +185,9 @@ static int proc_taint(struct ctl_table *table, int write,
+@@ -169,6 +185,12 @@ static int proc_taint(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
#endif
+static int proc_dointvec_ve(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos);
++static int sysctl_data_ve(struct ctl_table *table,
++ void __user *oldval, size_t __user *oldlenp,
++ void __user *newval, size_t newlen);
+
static struct ctl_table root_table[];
static struct ctl_table_root sysctl_table_root;
static struct ctl_table_header root_table_header = {
-@@ -178,9 +197,31 @@ static struct ctl_table_header root_table_header = {
+@@ -178,9 +200,31 @@ static struct ctl_table_header root_table_header = {
.root = &sysctl_table_root,
.set = &sysctl_table_root.default_set,
};
@@ -67253,7 +72023,7 @@
};
static struct ctl_table kern_table[];
-@@ -504,6 +545,20 @@ static struct ctl_table kern_table[] = {
+@@ -504,6 +548,20 @@ static struct ctl_table kern_table[] = {
.proc_handler = &proc_dointvec,
},
#endif
@@ -67274,7 +72044,7 @@
#ifdef __hppa__
{
.ctl_name = KERN_HPPA_PWRSW,
-@@ -699,6 +754,24 @@ static struct ctl_table kern_table[] = {
+@@ -699,6 +757,24 @@ static struct ctl_table kern_table[] = {
.extra1 = &pid_max_min,
.extra2 = &pid_max_max,
},
@@ -67299,7 +72069,7 @@
{
.ctl_name = KERN_PANIC_ON_OOPS,
.procname = "panic_on_oops",
-@@ -824,10 +897,12 @@ static struct ctl_table kern_table[] = {
+@@ -824,10 +900,13 @@ static struct ctl_table kern_table[] = {
{
.ctl_name = KERN_RANDOMIZE,
.procname = "randomize_va_space",
@@ -67311,10 +72081,11 @@
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = &proc_dointvec_ve,
++ .strategy = &sysctl_data_ve,
},
#endif
#if defined(CONFIG_S390) && defined(CONFIG_SMP)
-@@ -1424,6 +1499,21 @@ static struct ctl_table vm_table[] = {
+@@ -1424,6 +1503,21 @@ static struct ctl_table vm_table[] = {
.extra2 = &one,
},
#endif
@@ -67336,7 +72107,7 @@
/*
* NOTE: do not add new entries to this table unless you have read
-@@ -1600,6 +1690,13 @@ static struct ctl_table fs_table[] = {
+@@ -1600,6 +1694,13 @@ static struct ctl_table fs_table[] = {
};
static struct ctl_table debug_table[] = {
@@ -67350,7 +72121,7 @@
#if defined(CONFIG_X86) || defined(CONFIG_PPC)
{
.ctl_name = CTL_UNNUMBERED,
-@@ -2150,10 +2247,27 @@ struct ctl_table_header *__register_sysctl_paths(
+@@ -2150,10 +2251,27 @@ struct ctl_table_header *__register_sysctl_paths(
struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
struct ctl_table *table)
{
@@ -67378,7 +72149,7 @@
/**
* register_sysctl_table - register a sysctl table hierarchy
* @table: the top-level table structure
-@@ -2170,6 +2284,14 @@ struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
+@@ -2170,6 +2288,14 @@ struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
return register_sysctl_paths(null_path, table);
}
@@ -67393,7 +72164,7 @@
/**
* unregister_sysctl_table - unregister a sysctl table hierarchy
* @header: the header returned from register_sysctl_table
-@@ -2231,6 +2353,18 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
+@@ -2231,6 +2357,18 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
return NULL;
}
@@ -67412,7 +72183,7 @@
void unregister_sysctl_table(struct ctl_table_header * table)
{
}
-@@ -2902,6 +3036,25 @@ static int proc_do_cad_pid(struct ctl_table *table, int write,
+@@ -2902,6 +3040,25 @@ static int proc_do_cad_pid(struct ctl_table *table, int write,
return 0;
}
@@ -67438,7 +72209,49 @@
#else /* CONFIG_PROC_FS */
int proc_dostring(struct ctl_table *table, int write,
-@@ -3236,6 +3389,56 @@ static int deprecated_sysctl_warning(struct __sysctl_args *args)
+@@ -2996,6 +3153,27 @@ int sysctl_data(struct ctl_table *table,
+ return 1;
+ }
+
++#ifdef CONFIG_VE
++static int sysctl_data_ve(struct ctl_table *table,
++ void __user *oldval, size_t __user *oldlenp,
++ void __user *newval, size_t newlen)
++{
++ struct ctl_table tmp_table;
++
++ tmp_table = *table;
++ tmp_table.data = (char *)get_exec_env() + (unsigned long)table->extra1;
++
++ return sysctl_data(&tmp_table, oldval, oldlenp, newval, newlen);
++}
++#else
++static int sysctl_data_ve(struct ctl_table *table,
++ void __user *oldval, size_t __user *oldlenp,
++ void __user *newval, size_t newlen)
++{
++ return sysctl_data(table, oldval, oldlenp, newval, newlen);
++}
++#endif
++
+ /* The generic string strategy routine: */
+ int sysctl_string(struct ctl_table *table,
+ void __user *oldval, size_t __user *oldlenp,
+@@ -3175,6 +3353,13 @@ int sysctl_data(struct ctl_table *table,
+ return -ENOSYS;
+ }
+
++static int sysctl_data_ve(struct ctl_table *table,
++ void __user *oldval, size_t __user *oldlenp,
++ void __user *newval, size_t newlen)
++{
++ return -ENOSYS;
++}
++
+ int sysctl_string(struct ctl_table *table,
+ void __user *oldval, size_t __user *oldlenp,
+ void __user *newval, size_t newlen)
+@@ -3236,6 +3421,56 @@ static int deprecated_sysctl_warning(struct __sysctl_args *args)
return 0;
}
@@ -67495,7 +72308,7 @@
/*
* No sense putting this after each symbol definition, twice,
* exception granted :-)
-@@ -3249,7 +3452,9 @@ EXPORT_SYMBOL(proc_dostring);
+@@ -3249,7 +3484,9 @@ EXPORT_SYMBOL(proc_dostring);
EXPORT_SYMBOL(proc_doulongvec_minmax);
EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
EXPORT_SYMBOL(register_sysctl_table);
@@ -67924,10 +72737,10 @@
+
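For illustration only, a sketch (not from the patch) of a per-VE sysctl entry wired to the two handlers added above: .extra1 carries the field's offset inside struct ve_struct (as sysctl_data_ve computes its data pointer from get_exec_env() plus that offset), while .data keeps a plain fallback address for !CONFIG_VE builds. The field and variable names are hypothetical, and since both handlers are static to kernel/sysctl.c such an entry would have to live in that file:

	static int some_knob_default;	/* hypothetical fallback used when CONFIG_VE is off */

	static struct ctl_table example_ve_table[] = {
		{
			.ctl_name	= CTL_UNNUMBERED,
			.procname	= "some_knob",
			.data		= &some_knob_default,
			.maxlen		= sizeof(int),
			.mode		= 0644,
			.proc_handler	= &proc_dointvec_ve,
			.strategy	= &sysctl_data_ve,
			.extra1		= (void *)offsetof(struct ve_struct, some_knob),
		},
		{ .ctl_name = 0 }
	};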
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
new file mode 100644
-index 0000000..85c42c3
+index 0000000..907d944
--- /dev/null
+++ b/kernel/ve/ve.c
-@@ -0,0 +1,129 @@
+@@ -0,0 +1,161 @@
+/*
+ * linux/kernel/ve/ve.c
+ *
@@ -68057,9 +72870,41 @@
+
+ wake_up_process(ve_cleanup_thread);
+}
++
++#ifdef CONFIG_BLK_CGROUP
++extern int blkiocg_set_weight(struct cgroup *cgroup, u64 val);
++
++static u64 ioprio_weight[VE_IOPRIO_MAX] = {200, 275, 350, 425, 500, 575, 650, 725};
++
++int ve_set_ioprio(int veid, int ioprio)
++{
++ struct ve_struct *ve;
++ int ret;
++
++ if (ioprio < VE_IOPRIO_MIN || ioprio >= VE_IOPRIO_MAX)
++ return -ERANGE;
++
++ ret = -ESRCH;
++ read_lock(&ve_list_lock);
++ for_each_ve(ve) {
++ if (ve->veid != veid)
++ continue;
++ ret = blkiocg_set_weight(ve->ve_cgroup, ioprio_weight[ioprio]);
++ break;
++ }
++ read_unlock(&ve_list_lock);
++
++ return ret;
++}
++#else
++int ve_set_ioprio(int veid, int ioprio)
++{
++ return -EINVAL;
++}
++#endif /* CONFIG_BLK_CGROUP */
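For illustration only (not part of the patch): the ioprio_weight[] table above is a linear ramp, equivalent to weight = 200 + 75 * ioprio for ioprio in [VE_IOPRIO_MIN, VE_IOPRIO_MAX). A throwaway sketch of the same mapping:

	/* Sketch: identical mapping to the ioprio_weight[] table above. */
	static inline u64 example_ve_ioprio_to_weight(int ioprio)
	{
		return 200 + 75 * (u64)ioprio;	/* 0 -> 200, ..., 7 -> 725 */
	}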
diff --git a/kernel/ve/vecalls.c b/kernel/ve/vecalls.c
new file mode 100644
-index 0000000..cc27878
+index 0000000..9947b57
--- /dev/null
+++ b/kernel/ve/vecalls.c
@@ -0,0 +1,2335 @@
@@ -68905,7 +73750,7 @@
+ atomic_inc(&new->pcounter);
+ get_ve(new);
+
-+ tsk->cgroups = new->ve_css_set;
++ cgroup_set_task_css(tsk, new->ve_css_set);
+
+ new->user_ns = get_user_ns(new_creds->user->user_ns);
+}
@@ -70400,7 +75245,7 @@
+module_exit(vecalls_exit)
diff --git a/kernel/ve/veowner.c b/kernel/ve/veowner.c
new file mode 100644
-index 0000000..50f4d9a
+index 0000000..0726e44
--- /dev/null
+++ b/kernel/ve/veowner.c
@@ -0,0 +1,160 @@
@@ -70514,7 +75359,7 @@
+ .proc_handler = proc_dointvec,
+ },
+ {
-+ .ctl_name = 228,
++ .ctl_name = CTL_UNNUMBERED,
+ .procname = "ve-xattr-policy",
+ .data = &ve_xattr_policy,
+ .maxlen = sizeof(int),
@@ -72075,7 +76920,7 @@
static int do_mlockall(int flags)
{
diff --git a/mm/mmap.c b/mm/mmap.c
-index ae19746..991a1ac 100644
+index ae19746..a5dd0bf 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -29,6 +29,7 @@
@@ -72138,15 +76983,6 @@
goto out;
set_brk:
mm->brk = brk;
-@@ -927,7 +946,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
- prot |= PROT_EXEC;
-
- if (!len)
-- return -EINVAL;
-+ return strncmp(current->comm, "rpm", 3) ? -EINVAL : addr;
-
- if (!(flags & MAP_FIXED))
- addr = round_hint_to_min(addr);
@@ -1106,6 +1125,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
struct rb_node **rb_link, *rb_parent;
unsigned long charged = 0;