[kernel] r15772 - in dists/sid/linux-2.6/debian: . patches/features/all/openvz
Maximilian Attems
maks at alioth.debian.org
Wed May 26 05:20:28 UTC 2010
Author: maks
Date: Wed May 26 05:20:20 2010
New Revision: 15772
Log:
update openvz patch:
full blkio backport.
Modified:
dists/sid/linux-2.6/debian/changelog
dists/sid/linux-2.6/debian/patches/features/all/openvz/openvz.patch
Modified: dists/sid/linux-2.6/debian/changelog
==============================================================================
--- dists/sid/linux-2.6/debian/changelog Wed May 26 04:02:28 2010 (r15771)
+++ dists/sid/linux-2.6/debian/changelog Wed May 26 05:20:20 2010 (r15772)
@@ -28,6 +28,7 @@
[ maximilian attems]
* Add stable 2.6.32.14-rc1.
* Add drm changes from stable 2.6.33.5.
+ * Update openvz patch to 509eb1f29c43.
-- Ben Hutchings <ben at decadent.org.uk> Tue, 18 May 2010 02:13:44 +0100
Modified: dists/sid/linux-2.6/debian/patches/features/all/openvz/openvz.patch
==============================================================================
--- dists/sid/linux-2.6/debian/patches/features/all/openvz/openvz.patch Wed May 26 04:02:28 2010 (r15771)
+++ dists/sid/linux-2.6/debian/patches/features/all/openvz/openvz.patch Wed May 26 05:20:20 2010 (r15772)
@@ -1,3 +1,1613 @@
+commit 509eb1f29c4301126a0ccda8e001dfd0af0d56d2
+Author: Pavel Emelyanov <xemul at openvz.org>
+Date: Mon May 24 14:27:05 2010 +0400
+
+ OpenVZ kernel 2.6.32-balandin released
+
+ Named after Aleksandr Nikolayevich Balandin - a Russian cosmonaut.
+
+ Signed-off-by: Pavel Emelyanov <xemul at openvz.org>
+
+commit eb28ec67376e267760e72c96ca3d54346d39a56f
+Author: Pavel Emelyanov <xemul at openvz.org>
+Date: Mon May 24 15:10:31 2010 +0400
+
+ sysctl: Compilation fix after merge of sysctl fixes
+
+ Signed-off-by: Pavel Emelyanov <xemul at openvz.org>
+
+commit 0bb7a0e0615e134b7ae9f7e2e2737be5ff76881b
+Author: Cyrill Gorcunov <gorcunov at openvz.org>
+Date: Mon May 24 14:23:28 2010 +0400
+
+ fs: Don't list non-VE fs in /proc/filesystems
+
+ This is due to the lack of a virtualized filesystems filter.
+ Implement one.
+
+ http://bugzilla.openvz.org/show_bug.cgi?id=1504
+
+ Reported-by: Kir Kolyshkin <kir at openvz.org>
+ Signed-off-by: Cyrill Gorcunov <gorcunov at openvz.org>
+ Signed-off-by: Pavel Emelyanov <xemul at openvz.org>
+
+commit 866f4866b2d988c1ac1222f0397efd1e6e64d443
+Author: Andrey Vagin <avagin at openvz.org>
+Date: Mon May 24 13:14:58 2010 +0400
+
+ Fix sysctl warnings about an unknown binary sysctl number
+
+ Switch this entry over to CTL_UNNUMBERED, because
+ nobody uses it via sys_sysctl.
+
+ http://bugzilla.openvz.org/show_bug.cgi?id=1463
+
+ Signed-off-by: Andrey Vagin <avagin at openvz.org>
+ Signed-off-by: Pavel Emelyanov <xemul at openvz.org>
+
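For reference, a 2.6.32-era ctl_table entry that is reachable only through /proc/sys, and therefore carries no binary number for sys_sysctl, looks roughly like the sketch below; the knob name and backing variable are made up for illustration:

    #include <linux/sysctl.h>

    static int example_knob;                          /* hypothetical tunable */

    static ctl_table example_table[] = {
            {
                    .ctl_name     = CTL_UNNUMBERED,   /* /proc/sys only, no binary number */
                    .procname     = "example_knob",
                    .data         = &example_knob,
                    .maxlen       = sizeof(int),
                    .mode         = 0644,
                    .proc_handler = &proc_dointvec,
                    /* no .strategy: sys_sysctl cannot reach this entry */
            },
            { .ctl_name = 0 }
    };

Such an entry keeps the /proc/sys interface intact while avoiding warnings about binary sysctl numbers that nothing uses.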
+commit 2412f2cf0853b5303af7740000c99179eeece3e4
+Author: Andrey Vagin <avagin at openvz.org>
+Date: Mon May 24 13:15:37 2010 +0400
+
+ sysctl: Add sysctl_data_ve helper
+
+ This helper is analogous to proc_dointvec_ve.
+
+ It adds a generic method for sys_sysctl access to per-VE values.
+
+ The extra1 field of ctl_table contains the data field's offset from the
+ beginning of ve_struct; without CONFIG_VE the address from the .data field is used.
+
+ Signed-off-by: Andrey Vagin <avagin at openvz.org>
+ Signed-off-by: Pavel Emelyanov <xemul at openvz.org>
+
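A plain-C illustration of the offset-based lookup described in the commit above; the structures are stand-ins, only the offsetof() technique mirrors what the helper does:

    #include <stddef.h>
    #include <stdio.h>

    struct ve_struct {                  /* stand-in for the real per-VE struct */
            int some_value;
    };

    struct ctl_entry {                  /* stand-in for ctl_table */
            void   *data;               /* address used when CONFIG_VE is off */
            size_t  ve_offset;          /* offset of the field inside ve_struct */
    };

    /* Resolve the address of the value for the current container, if any. */
    static void *resolve_data(const struct ctl_entry *e, struct ve_struct *ve)
    {
            if (ve)                                     /* CONFIG_VE case */
                    return (char *)ve + e->ve_offset;
            return e->data;                             /* !CONFIG_VE fallback */
    }

    int main(void)
    {
            struct ve_struct ve = { .some_value = 42 };
            struct ctl_entry e = {
                    .data      = NULL,
                    .ve_offset = offsetof(struct ve_struct, some_value),
            };

            printf("%d\n", *(int *)resolve_data(&e, &ve));   /* prints 42 */
            return 0;
    }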
+commit 39f9a055139faf313a1ad823b145e535d5485f5c
+Author: Andrey Vagin <avagin at openvz.org>
+Date: Mon May 24 13:16:11 2010 +0400
+
+ Fix sysctl warnings about missing strategy for randomize_va_space
+
+ http://bugzilla.openvz.org/show_bug.cgi?id=1463
+
+ Signed-off-by: Andrey Vagin <avagin at openvz.org>
+ Signed-off-by: Pavel Emelyanov <xemul at openvz.org>
+
+commit de3a7aab2eeab095a81f414d0e5e855da1d99c61
+Author: Andrey Vagin <avagin at openvz.ru>
+Date: Mon May 24 13:13:36 2010 +0400
+
+ cpt: use shmem_file to dump inode content of shm
+
+ Files with shm_file_operations store in private_data a link to
+ the file with shmem_file_operations. For dumping inode content
+ we use read from shmem_file_operations, but pass the file with
+ shm_file_operations.
+
+ shmem_file_operations uses do_sync_read, which relies on file->f_op->aio_read,
+ but that is absent in shm_file_operations.
+
+ do_read
+ do_sync_read(*f, ...)
+ f->f_op->aio_read -> Oops
+
+ http://bugzilla.openvz.org/show_bug.cgi?id=1500
+
+ Signed-off-by: Andrey Vagin <avagin at openvz.org>
+ Signed-off-by: Pavel Emelyanov <xemul at openvz.org>
+
+commit 1c4eba47b2d5d3d26c186485de8adf8ef293ebb5
+Author: Stanislav Kinsbursky <skinsbursky at openvz.org>
+Date: Mon May 24 14:05:44 2010 +0400
+
+ tun: device_create_file omitted if net level is not init_net
+
+ device_create_file() calls are now omitted in tun_set_iff() if the net namespace is inside a container.
+ The same condition check as in netdev_register_kobject() is used.
+
+ http://bugzilla.openvz.org/show_bug.cgi?id=1497
+
+ Signed-off-by: Stanislav Kinsbursky <skinsbursky at openvz.org>
+ Signed-off-by: Pavel Emelyanov <xemul at openvz.org>
+
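The borrowed check amounts to creating the sysfs attributes only when the device lives in the initial network namespace; a minimal kernel-style sketch of the idea (the attribute names are examples, not verified against this tree):

    /* Sketch: mirror the guard used by netdev_register_kobject() and only
     * create the per-device sysfs files for devices in the host namespace. */
    if (net_eq(dev_net(tun->dev), &init_net)) {
            if (device_create_file(&tun->dev->dev, &dev_attr_tun_flags) ||
                device_create_file(&tun->dev->dev, &dev_attr_owner) ||
                device_create_file(&tun->dev->dev, &dev_attr_group))
                    pr_err("Failed to create tun sysfs files\n");
    }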
+commit 98447fa5c37746da0699b9f8d8bbd59d8147d9bc
+Author: Kir Kolyshkin <kir at openvz.org>
+Date: Mon May 24 13:04:17 2010 +0400
+
+ Revert "mm mmap zero length kludge"
+
+ This kludge was made for really old rpm versions which have since been
+ fixed (see references to RH bugzilla in OpenVZ bug #893). Moreover,
+ it now makes rpm itself segfault in our templates when a locale is set;
+ details are in OpenVZ bug #1502. So remove it and hope for the best.
+
+ http://bugzilla.openvz.org/1502
+ http://bugzilla.openvz.org/893
+
+ This reverts commit d252a93b32d6d251fcc73863b75b91edaa801b95.
+
+ Signed-off-by: Kir Kolyshkin <kir at openvz.org>
+ Signed-off-by: Pavel Emelyanov <xemul at openvz.org>
+
+commit 57358efc0e639282309d8b6aea8efb8ae3d6d9ad
+Merge: 42a0a10 1cd8211
+Author: Pavel Emelyanov <xemul at openvz.org>
+Date: Mon May 24 12:59:24 2010 +0400
+
+ Merged linux-2.6.32.13
+
+ Conflicts:
+
+ Makefile
+
+ Signed-off-by: Pavel Emelyanov <xemul at openvz.org>
+
+commit 42a0a1071d3872af254373c1cc07085b9bf24d3a
+Author: Konstantin Khlebnikov <khlebnikov at openvz.org>
+Date: Mon May 24 12:56:47 2010 +0400
+
+ ioprio: Make it possible to set ve ioprio finally
+
+ Add ioprio compat call for blk-cgroup. Simulate the old ioprio with
+ the new blk-cgroup weight.
+
+ Signed-off-by: Konstantin Khlebnikov <khlebnikov at openvz.org>
+ Signed-off-by: Pavel Emelyanov <xemul at openvz.org>
+
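The compat call has to translate the legacy per-VE ioprio scale (0-7) into the blk-cgroup weight range (100-1000, bigger meaning a larger share). A purely hypothetical linear mapping, just to show the shape of such a conversion; the real constants used by the patch are not quoted here:

    /* Hypothetical: map a legacy ioprio value 0..7 onto a blk-cgroup
     * weight in 100..1000, with a larger ioprio getting a larger weight. */
    static unsigned int ioprio_to_weight(int ioprio)
    {
            if (ioprio < 0)
                    ioprio = 0;
            if (ioprio > 7)
                    ioprio = 7;
            return 100 + ioprio * (1000 - 100) / 7;
    }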
+commit a4452f1cc33f6e4f7d8f58abab818ede313cdfbc
+Author: Konstantin Khlebnikov <khlebnikov at openvz.org>
+Date: Mon May 24 12:55:43 2010 +0400
+
+ cgroup-lite: Set task css properly
+
+ Fix task moving between cgroups at ve create and enter.
+ Add a helper to attach a task to a cgroup set (based on the
+ cgroup_attach_task).
+
+ Signed-off-by: Konstantin Khlebnikov <khlebnikov at openvz.org>
+ Signed-off-by: Pavel Emelyanov <xemul at openvz.org>
+
+commit 983bb0952f838b55130f20a9486a04c92ae5826b
+Author: Konstantin Khlebnikov <khlebnikov at openvz.org>
+Date: Mon May 24 12:54:09 2010 +0400
+
+ cgroup-lite: add cgroup-id for blk-cgroups
+
+ Use one id for all subsystems in one cgroup. Store the id right
+ on the cgroup struct instead of hacking around css_id structures.
+
+ Plus add other cgroup tree related functions required by blk-cgroup.
+
+ Signed-off-by: Konstantin Khlebnikov <khlebnikov at openvz.org>
+ Signed-off-by: Pavel Emelyanov <xemul at openvz.org>
+
+commit f54f5b3e0a014f3bb5c530b4c13d443a2fc92b52
+Author: Konstantin Khlebnikov <khlebnikov at openvz.org>
+Date: Mon May 24 12:50:31 2010 +0400
+
+ cgroup-lite: fix subsys state refcnt
+
+ Add a missed __css_put and fix the refcnt initial state: for a live css the refcnt
+ starts from 1; see init_cgroup_css and cgroup_clear_css_refs.
+
+ Signed-off-by: Konstantin Khlebnikov <khlebnikov at openvz.org>
+ Signed-off-by: Pavel Emelyanov <xemul at openvz.org>
+
+commit 1cd8211f07663ebeac04b19ac849de7ed5eef969
+Author: Greg Kroah-Hartman <gregkh at suse.de>
+Date: Wed May 12 15:11:42 2010 -0700
+
+ Revert "module: fix __module_ref_addr()"
+
+ This reverts commit d150a2b96558a7349cbf3a72a279c37bc67d50fb.
+
+ Thanks to Jiri Benc for finding the problem that this patch is
+ not correct for the 2.6.32-stable series.
+
+ Cc: Jiri Kosina <jkosina at suse.cz>
+ Signed-off-by: Greg Kroah-Hartman <gregkh at suse.de>
+
+commit dd480cee5d48b5fd88f4f074743b542fab6d9e70
+Author: Shaohua Li <shaohua.li at intel.com>
+Date: Tue Apr 27 16:52:01 2010 +0400
+
+ cfq-iosched: split seeky coop queues after one slice
+
+ Currently we split seeky coop queues after 1s, which is too long. This patch
+ sets the split_coop flag on a seeky coop queue after one slice. After that, if new
+ requests come in, the queues will be split. The patch was suggested by Corrado.
+
+ Signed-off-by: Shaohua Li <shaohua.li at intel.com>
+ Reviewed-by: Corrado Zoccolo <czoccolo at gmail.com>
+ Acked-by: Jeff Moyer <jmoyer at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 187231a1fad899839137f76c08dd016a81245abb
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:51:57 2010 +0400
+
+ cfq-iosched: Do not idle on async queues
+
+ Few weeks back, Shaohua Li had posted similar patch. I am reposting it
+ with more test results.
+
+ This patch does two things.
+
+ - Do not idle on async queues.
+
+ - It also changes the write queue depth CFQ drives (cfq_may_dispatch()).
+ Currently, we always seem to be driving a queue depth of 1 for WRITES. This is
+ true even if there is only one write queue in the system; the logic
+ of infinite queue depth in the case of a single busy queue, as well as the slowly
+ increasing queue depth based on the last delayed sync request, does not seem to
+ be kicking in at all.
+
+ This patch will allow deeper WRITE queue depths (subject to the other
+ WRITE queue depth constraints like cfq_quantum and the last delayed sync
+ request).
+
+ Shaohua Li had reported getting more out of his SSD. For me, I have got
+ one Lun exported from an HP EVA and when pure buffered writes are on, I
+ can get more out of the system. Following are test results of pure
+ buffered writes (with end_fsync=1) with the vanilla and patched kernels. These
+ results are the average of 3 sets of runs with an increasing number of threads.
+
+ AVERAGE[bufwfs][vanilla]
+ -------
+ job Set NR ReadBW(KB/s) MaxClat(us) WriteBW(KB/s) MaxClat(us)
+ --- --- -- ------------ ----------- ------------- -----------
+ bufwfs 3 1 0 0 95349 474141
+ bufwfs 3 2 0 0 100282 806926
+ bufwfs 3 4 0 0 109989 2.7301e+06
+ bufwfs 3 8 0 0 116642 3762231
+ bufwfs 3 16 0 0 118230 6902970
+
+ AVERAGE[bufwfs] [patched kernel]
+ -------
+ bufwfs 3 1 0 0 270722 404352
+ bufwfs 3 2 0 0 206770 1.06552e+06
+ bufwfs 3 4 0 0 195277 1.62283e+06
+ bufwfs 3 8 0 0 260960 2.62979e+06
+ bufwfs 3 16 0 0 299260 1.70731e+06
+
+ I also ran buffered writes along with some sequential reads and some
+ buffered reads going on in the system on a SATA disk, because the potential
+ risk is that we should not drive the queue depth higher in the presence
+ of sync IO, in order to keep the max clat low.
+
+ With some random and sequential reads going on in the system on one SATA
+ disk I did not see any significant increase in max clat. So it looks like
+ other WRITE queue depth control logic is doing its job. Here are the
+ results.
+
+ AVERAGE[brr, bsr, bufw together] [vanilla]
+ -------
+ job Set NR ReadBW(KB/s) MaxClat(us) WriteBW(KB/s) MaxClat(us)
+ --- --- -- ------------ ----------- ------------- -----------
+ brr 3 1 850 546345 0 0
+ bsr 3 1 14650 729543 0 0
+ bufw 3 1 0 0 23908 8274517
+
+ brr 3 2 981.333 579395 0 0
+ bsr 3 2 14149.7 1175689 0 0
+ bufw 3 2 0 0 21921 1.28108e+07
+
+ brr 3 4 898.333 1.75527e+06 0 0
+ bsr 3 4 12230.7 1.40072e+06 0 0
+ bufw 3 4 0 0 19722.3 2.4901e+07
+
+ brr 3 8 900 3160594 0 0
+ bsr 3 8 9282.33 1.91314e+06 0 0
+ bufw 3 8 0 0 18789.3 23890622
+
+ AVERAGE[brr, bsr, bufw mixed] [patched kernel]
+ -------
+ job Set NR ReadBW(KB/s) MaxClat(us) WriteBW(KB/s) MaxClat(us)
+ --- --- -- ------------ ----------- ------------- -----------
+ brr 3 1 837 417973 0 0
+ bsr 3 1 14357.7 591275 0 0
+ bufw 3 1 0 0 24869.7 8910662
+
+ brr 3 2 1038.33 543434 0 0
+ bsr 3 2 13351.3 1205858 0 0
+ bufw 3 2 0 0 18626.3 13280370
+
+ brr 3 4 913 1.86861e+06 0 0
+ bsr 3 4 12652.3 1430974 0 0
+ bufw 3 4 0 0 15343.3 2.81305e+07
+
+ brr 3 8 890 2.92695e+06 0 0
+ bsr 3 8 9635.33 1.90244e+06 0 0
+ bufw 3 8 0 0 17200.3 24424392
+
+ So it looks like it might make sense to include this patch.
+
+ Thanks
+ Vivek
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 9027160e254ff7ea55338a1857843144445d57aa
+Author: Gui Jianfeng <guijianfeng at cn.fujitsu.com>
+Date: Tue Apr 27 16:51:53 2010 +0400
+
+ blk-cgroup: Fix potential deadlock in blk-cgroup
+
+ I triggered a lockdep warning as follows.
+
+ =======================================================
+ [ INFO: possible circular locking dependency detected ]
+ 2.6.33-rc2 #1
+ -------------------------------------------------------
+ test_io_control/7357 is trying to acquire lock:
+ (blkio_list_lock){+.+...}, at: [<c053a990>] blkiocg_weight_write+0x82/0x9e
+
+ but task is already holding lock:
+ (&(&blkcg->lock)->rlock){......}, at: [<c053a949>] blkiocg_weight_write+0x3b/0x9e
+
+ which lock already depends on the new lock.
+
+ the existing dependency chain (in reverse order) is:
+
+ -> #2 (&(&blkcg->lock)->rlock){......}:
+ [<c04583b7>] validate_chain+0x8bc/0xb9c
+ [<c0458dba>] __lock_acquire+0x723/0x789
+ [<c0458eb0>] lock_acquire+0x90/0xa7
+ [<c0692b0a>] _raw_spin_lock_irqsave+0x27/0x5a
+ [<c053a4e1>] blkiocg_add_blkio_group+0x1a/0x6d
+ [<c053cac7>] cfq_get_queue+0x225/0x3de
+ [<c053eec2>] cfq_set_request+0x217/0x42d
+ [<c052c8a6>] elv_set_request+0x17/0x26
+ [<c0532a0f>] get_request+0x203/0x2c5
+ [<c0532ae9>] get_request_wait+0x18/0x10e
+ [<c0533470>] __make_request+0x2ba/0x375
+ [<c0531985>] generic_make_request+0x28d/0x30f
+ [<c0532da7>] submit_bio+0x8a/0x8f
+ [<c04d827a>] submit_bh+0xf0/0x10f
+ [<c04d91d2>] ll_rw_block+0xc0/0xf9
+ [<f86e9705>] ext3_find_entry+0x319/0x544 [ext3]
+ [<f86eae58>] ext3_lookup+0x2c/0xb9 [ext3]
+ [<c04c3e1b>] do_lookup+0xd3/0x172
+ [<c04c56c8>] link_path_walk+0x5fb/0x95c
+ [<c04c5a65>] path_walk+0x3c/0x81
+ [<c04c5b63>] do_path_lookup+0x21/0x8a
+ [<c04c66cc>] do_filp_open+0xf0/0x978
+ [<c04c0c7e>] open_exec+0x1b/0xb7
+ [<c04c1436>] do_execve+0xbb/0x266
+ [<c04081a9>] sys_execve+0x24/0x4a
+ [<c04028a2>] ptregs_execve+0x12/0x18
+
+ -> #1 (&(&q->__queue_lock)->rlock){..-.-.}:
+ [<c04583b7>] validate_chain+0x8bc/0xb9c
+ [<c0458dba>] __lock_acquire+0x723/0x789
+ [<c0458eb0>] lock_acquire+0x90/0xa7
+ [<c0692b0a>] _raw_spin_lock_irqsave+0x27/0x5a
+ [<c053dd2a>] cfq_unlink_blkio_group+0x17/0x41
+ [<c053a6eb>] blkiocg_destroy+0x72/0xc7
+ [<c0467df0>] cgroup_diput+0x4a/0xb2
+ [<c04ca473>] dentry_iput+0x93/0xb7
+ [<c04ca4b3>] d_kill+0x1c/0x36
+ [<c04cb5c5>] dput+0xf5/0xfe
+ [<c04c6084>] do_rmdir+0x95/0xbe
+ [<c04c60ec>] sys_rmdir+0x10/0x12
+ [<c04027cc>] sysenter_do_call+0x12/0x32
+
+ -> #0 (blkio_list_lock){+.+...}:
+ [<c0458117>] validate_chain+0x61c/0xb9c
+ [<c0458dba>] __lock_acquire+0x723/0x789
+ [<c0458eb0>] lock_acquire+0x90/0xa7
+ [<c06929fd>] _raw_spin_lock+0x1e/0x4e
+ [<c053a990>] blkiocg_weight_write+0x82/0x9e
+ [<c0467f1e>] cgroup_file_write+0xc6/0x1c0
+ [<c04bd2f3>] vfs_write+0x8c/0x116
+ [<c04bd7c6>] sys_write+0x3b/0x60
+ [<c04027cc>] sysenter_do_call+0x12/0x32
+
+ other info that might help us debug this:
+
+ 1 lock held by test_io_control/7357:
+ #0: (&(&blkcg->lock)->rlock){......}, at: [<c053a949>] blkiocg_weight_write+0x3b/0x9e
+ stack backtrace:
+ Pid: 7357, comm: test_io_control Not tainted 2.6.33-rc2 #1
+ Call Trace:
+ [<c045754f>] print_circular_bug+0x91/0x9d
+ [<c0458117>] validate_chain+0x61c/0xb9c
+ [<c0458dba>] __lock_acquire+0x723/0x789
+ [<c0458eb0>] lock_acquire+0x90/0xa7
+ [<c053a990>] ? blkiocg_weight_write+0x82/0x9e
+ [<c06929fd>] _raw_spin_lock+0x1e/0x4e
+ [<c053a990>] ? blkiocg_weight_write+0x82/0x9e
+ [<c053a990>] blkiocg_weight_write+0x82/0x9e
+ [<c0467f1e>] cgroup_file_write+0xc6/0x1c0
+ [<c0454df5>] ? trace_hardirqs_off+0xb/0xd
+ [<c044d93a>] ? cpu_clock+0x2e/0x44
+ [<c050e6ec>] ? security_file_permission+0xf/0x11
+ [<c04bcdda>] ? rw_verify_area+0x8a/0xad
+ [<c0467e58>] ? cgroup_file_write+0x0/0x1c0
+ [<c04bd2f3>] vfs_write+0x8c/0x116
+ [<c04bd7c6>] sys_write+0x3b/0x60
+ [<c04027cc>] sysenter_do_call+0x12/0x32
+
+ To prevent deadlock, we should take locks in the following sequence:
+
+ blkio_list_lock -> queue_lock -> blkcg_lock.
+
+ The following patch should fix this bug.
+
+ Signed-off-by: Gui Jianfeng <guijianfeng at cn.fujitsu.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
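The rule the fix enforces can be shown with a small userspace sketch: any path that nests these locks must take them in the one agreed order, so no circular chain like the one above can form (the lock names echo the report; everything else is illustrative):

    #include <pthread.h>

    /* Userspace stand-ins for the three locks from the lockdep report. */
    static pthread_mutex_t blkio_list_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t queue_lock      = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t blkcg_lock      = PTHREAD_MUTEX_INITIALIZER;

    /* Agreed order: blkio_list_lock -> queue_lock -> blkcg_lock.
     * blkiocg_weight_write() deadlocked because it took blkcg_lock first
     * and blkio_list_lock afterwards, inverting this order. */
    static void nested_path(void)
    {
            pthread_mutex_lock(&blkio_list_lock);   /* outermost */
            pthread_mutex_lock(&queue_lock);
            pthread_mutex_lock(&blkcg_lock);        /* innermost */
            /* ... work that needs all three ... */
            pthread_mutex_unlock(&blkcg_lock);
            pthread_mutex_unlock(&queue_lock);
            pthread_mutex_unlock(&blkio_list_lock);
    }

    int main(void)
    {
            nested_path();
            return 0;
    }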
+commit 0460ada9ec82e679632588772a3084652c1db996
+Author: Divyesh Shah <dpshah at google.com>
+Date: Tue Apr 27 16:51:48 2010 +0400
+
+ cfq-iosched: Respect ioprio_class when preempting
+
+ In cfq_should_preempt(), we currently allow some cases where a non-RT request
+ can preempt an ongoing RT cfqq timeslice. This should not happen.
+ Examples include:
+
+ o A sync_noidle wl type non-RT request pre-empting a sync_noidle wl type cfqq
+ on which we are idling.
+ o Once we have per-cgroup async queues, a non-RT sync request pre-empting an RT
+ async cfqq.
+
+ Signed-off-by: Divyesh Shah<dpshah at google.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 58244fb9adfe3f58b17be18c9f27d59dbf4977fe
+Author: Shaohua Li <shaohua.li at intel.com>
+Date: Tue Apr 27 16:51:44 2010 +0400
+
+ cfq-iosched: don't regard requests with long distance as close
+
+ seek_mean can sometimes be very big; using it as the closeness criterion is meaningless
+ as this doesn't improve performance. So if it's big, let's fall back to the
+ default value.
+
+ Reviewed-by: Corrado Zoccolo <czoccolo at gmail.com>
+ Signed-off-by: Shaohua Li<shaohua.li at intel.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 875add11b7efa93199cd179e17786c8c83cf77ea
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:51:39 2010 +0400
+
+ cfq-iosched: Remove prio_change logic for workload selection
+
+ o CFQ now internally divides cfq queues into three workload categories: sync-idle,
+ sync-noidle and async. Which workload to run depends primarily on the rb_key
+ offset across the three service trees, which is a combination of multiple things
+ including the time the queue got queued on the service tree.
+
+ There is one exception though. That is, if we switched the prio class, say
+ we served some RT tasks and again started serving the BE class, then within the
+ BE class we always started with the sync-noidle workload irrespective of the rb_key
+ offset in the service trees.
+
+ This can provide better latencies for the sync-noidle workload in the presence
+ of RT tasks.
+
+ o This patch gets rid of that exception, and which workload to run within a
+ class always depends on the lowest rb_key across the service trees. The reason
+ is that we now have multiple BE class groups, and if we always switch
+ to the sync-noidle workload within a group, we can potentially starve a sync-idle
+ workload within that group. The same is true for the async workload, which will be in the
+ root group. Also, workload switching within a group would become very
+ unpredictable as it would then depend on whether some RT workload was running in
+ the system or not.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Reviewed-by: Gui Jianfeng <guijianfeng at cn.fujitsu.com>
+ Acked-by: Corrado Zoccolo <czoccolo at gmail.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 98a3d07b1fe96e53a15cbab963ea26b68b573194
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:51:35 2010 +0400
+
+ cfq-iosched: Get rid of nr_groups
+
+ o Currently code does not seem to be using cfqd->nr_groups. Get rid of it.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Reviewed-by: Gui Jianfeng <guijianfeng at cn.fujitsu.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit af90feaf148382f0f79b9411fc50d88bd861710a
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:51:31 2010 +0400
+
+ cfq-iosched: Remove the check for same cfq group from allow_merge
+
+ o allow_merge() already checks whether the submitting task is pointing to the same cfqq
+ as the rq has been queued in. If everything is fine, we should not have
+ a task in one cgroup holding a pointer to a cfqq in another cgroup.
+
+ Well, I guess in some situations it can happen, and that is when a random
+ IO queue has been moved into the root cgroup for group_isolation=0. In
+ this case the task's cgroup/group is different from where the cfqq actually is,
+ but this is intentional and in this case merging should be allowed.
+
+ The second situation is where, due to the close cooperator patches, multiple
+ processes can be sharing a cfqq. If everything is implemented right, we should
+ not end up in a situation where tasks from different processes in different
+ groups are sharing the same cfqq, as we allow merging of cooperating queues
+ only if they are in the same group.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Reviewed-by: Gui Jianfeng <guijianfeng at cn.fujitsu.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 76160ce0edc2aeeaa4df9292700aecdd0c4c36cb
+Author: Gui Jianfeng <guijianfeng at cn.fujitsu.com>
+Date: Tue Apr 27 16:51:27 2010 +0400
+
+ cfq: set workload as expired if it doesn't have any slice left
+
+ When a group is resumed, if it doesn't have any workload slice left,
+ we should mark workload_expires as expired. Otherwise, we might
+ erroneously start from where we left off in the previous group.
+ Thanks to Corrado for the idea.
+
+ Signed-off-by: Gui Jianfeng <guijianfeng at cn.fujitsu.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 6a78ef2e36ba6a63c5617326b38e268820cdd893
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:51:23 2010 +0400
+
+ Fix a CFQ crash in "for-2.6.33" branch of block tree
+
+ I think my previous patch introduced a bug which can lead to CFQ hitting
+ BUG_ON().
+
+ The offending commit in for-2.6.33 branch is.
+
+ commit 7667aa0630407bc07dc38dcc79d29cc0a65553c1
+ Author: Vivek Goyal <vgoyal at redhat.com>
+ Date: Tue Dec 8 17:52:58 2009 -0500
+
+ cfq-iosched: Take care of corner cases of group losing share due to deletion
+
+ While doing some stress testing on my box, I encountered the following.
+
+ login: [ 3165.148841] BUG: scheduling while
+ atomic: swapper/0/0x10000100
+ [ 3165.149821] Modules linked in: cfq_iosched dm_multipath qla2xxx igb
+ scsi_transport_fc dm_snapshot [last unloaded: scsi_wait_scan]
+ [ 3165.149821] Pid: 0, comm: swapper Not tainted
+ 2.6.32-block-for-33-merged-new #3
+ [ 3165.149821] Call Trace:
+ [ 3165.149821] <IRQ> [<ffffffff8103fab8>] __schedule_bug+0x5c/0x60
+ [ 3165.149821] [<ffffffff8103afd7>] ? __wake_up+0x44/0x4d
+ [ 3165.149821] [<ffffffff8153a979>] schedule+0xe3/0x7bc
+ [ 3165.149821] [<ffffffff8103a796>] ? cpumask_next+0x1d/0x1f
+ [ 3165.149821] [<ffffffffa000b21d>] ? cfq_dispatch_requests+0x6ba/0x93e
+ [cfq_iosched]
+ [ 3165.149821] [<ffffffff810422d8>] __cond_resched+0x2a/0x35
+ [ 3165.149821] [<ffffffffa000b21d>] ? cfq_dispatch_requests+0x6ba/0x93e
+ [cfq_iosched]
+ [ 3165.149821] [<ffffffff8153b1ee>] _cond_resched+0x2c/0x37
+ [ 3165.149821] [<ffffffff8100e2db>] is_valid_bugaddr+0x16/0x2f
+ [ 3165.149821] [<ffffffff811e4161>] report_bug+0x18/0xac
+ [ 3165.149821] [<ffffffff8100f1fc>] die+0x39/0x63
+ [ 3165.149821] [<ffffffff8153cde1>] do_trap+0x11a/0x129
+ [ 3165.149821] [<ffffffff8100d470>] do_invalid_op+0x96/0x9f
+ [ 3165.149821] [<ffffffffa000b21d>] ? cfq_dispatch_requests+0x6ba/0x93e
+ [cfq_iosched]
+ [ 3165.149821] [<ffffffff81034b4d>] ? enqueue_task+0x5c/0x67
+ [ 3165.149821] [<ffffffff8103ae83>] ? task_rq_unlock+0x11/0x13
+ [ 3165.149821] [<ffffffff81041aae>] ? try_to_wake_up+0x292/0x2a4
+ [ 3165.149821] [<ffffffff8100c935>] invalid_op+0x15/0x20
+ [ 3165.149821] [<ffffffffa000b21d>] ? cfq_dispatch_requests+0x6ba/0x93e
+ [cfq_iosched]
+ [ 3165.149821] [<ffffffff810df5a6>] ? virt_to_head_page+0xe/0x2f
+ [ 3165.149821] [<ffffffff811d8c2a>] blk_peek_request+0x191/0x1a7
+ [ 3165.149821] [<ffffffff811e5b8d>] ? kobject_get+0x1a/0x21
+ [ 3165.149821] [<ffffffff812c8d4c>] scsi_request_fn+0x82/0x3df
+ [ 3165.149821] [<ffffffff8110b2de>] ? bio_fs_destructor+0x15/0x17
+ [ 3165.149821] [<ffffffff810df5a6>] ? virt_to_head_page+0xe/0x2f
+ [ 3165.149821] [<ffffffff811d931f>] __blk_run_queue+0x42/0x71
+ [ 3165.149821] [<ffffffff811d9403>] blk_run_queue+0x26/0x3a
+ [ 3165.149821] [<ffffffff812c8761>] scsi_run_queue+0x2de/0x375
+ [ 3165.149821] [<ffffffff812b60ac>] ? put_device+0x17/0x19
+ [ 3165.149821] [<ffffffff812c92d7>] scsi_next_command+0x3b/0x4b
+ [ 3165.149821] [<ffffffff812c9b9f>] scsi_io_completion+0x1c9/0x3f5
+ [ 3165.149821] [<ffffffff812c3c36>] scsi_finish_command+0xb5/0xbe
+
+ I think I have hit the following BUG_ON() in cfq_dispatch_request().
+
+ BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
+
+ Please find attached the patch to fix it. I have done some stress testing
+ with it and have not seen it happening again.
+
+ o We should wait on a queue even after slice expiry only if it is empty. If
+ the queue is not empty, then continue to expire it.
+
+ o If we decide to keep the queue then set cfqq=NULL. Otherwise select_queue()
+ will return a valid cfqq and cfq_dispatch_request() can hit the following
+ BUG_ON().
+
+ BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list))
+
+ Reviewed-by: Jeff Moyer <jmoyer at redhat.com>
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 086fcfd4a9aec3209a9a8b2c591734850bbca097
+Author: Gui Jianfeng <guijianfeng at cn.fujitsu.com>
+Date: Tue Apr 27 16:51:18 2010 +0400
+
+ cfq: Remove wait_request flag when idle time is being deleted
+
+ Remove the wait_request flag when the idle time is being deleted; otherwise
+ it'll hit this path every time a request is enqueued.
+
+ Signed-off-by: Gui Jianfeng <guijianfeng at cn.fujitsu.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 9714cf0030da3ceaea312be05cc056d4b36fe118
+Author: Corrado Zoccolo <czoccolo at gmail.com>
+Date: Tue Apr 27 16:51:14 2010 +0400
+
+ cfq-iosched: commenting non-obvious initialization
+
+ Added a comment to explain the initialization of last_delayed_sync.
+
+ Signed-off-by: Corrado Zoccolo <czoccolo at gmail.com>
+ Acked-by: Jeff Moyer <jmoyer at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 75e3bc83c0d1f9c909bd0bce56ac377623c22807
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:51:10 2010 +0400
+
+ cfq-iosched: Take care of corner cases of group losing share due to deletion
+
+ If there is a sequential reader running in a group, we wait for the next request
+ to come in that group after slice expiry, and once the new request is in, we expire
+ the queue. Otherwise we delete the group from the service tree and the group loses
+ its fair share.
+
+ So far I was marking a queue as wait_busy if it had consumed its slice and
+ it was the last queue in the group. But this condition did not cover the following
+ two cases.
+
+ 1. If a request completed and the slice has not expired yet. The next request comes
+ in and is dispatched to disk. Now select_queue() hits and the slice has expired.
+ This group will be deleted. Because the request is still in the disk, this queue
+ will never get a chance to wait_busy.
+
+ 2. If a request completed and the slice has not expired yet. Before the next request
+ comes in (delay due to think time), select_queue() hits and expires the
+ queue, hence the group. This queue never got a chance to wait busy.
+
+ Gui was hitting boundary condition 1 and not getting fairness numbers
+ proportional to weight.
+
+ This patch adds checks for the above two conditions and improves the fairness
+ numbers for sequential workloads on rotational media. The check in select_queue()
+ takes care of case 1 and an additional check in should_wait_busy() takes care
+ of case 2.
+
+ Reported-by: Gui Jianfeng <guijianfeng at cn.fujitsu.com>
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 6c866a0686a169f5098da254fb6b0f8812318469
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:51:06 2010 +0400
+
+ cfq-iosched: Get rid of cfqq wait_busy_done flag
+
+ o Get rid of the wait_busy_done flag. This flag only tells that we were doing wait
+ busy on a queue and that the queue got a request, so expire it. That information
+ can easily be obtained from (cfq_cfqq_wait_busy() && queue_is_not_empty). So
+ remove this flag and keep the code simple.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 44c156f1191391dddb02f1abff022a61c2f94a17
+Author: Gui Jianfeng <guijianfeng at cn.fujitsu.com>
+Date: Tue Apr 27 16:51:02 2010 +0400
+
+ cfq: Optimization for close cooperating queue searching
+
+ It doesn't make any sense to try to find a close cooperating
+ queue if the current cfqq is the only one in the group.
+
+ Signed-off-by: Gui Jianfeng <guijianfeng at cn.fujitsu.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit bd4386b49b4ba2c012dc22c7a80512681a5ade15
+Author: Corrado Zoccolo <czoccolo at gmail.com>
+Date: Tue Apr 27 16:50:58 2010 +0400
+
+ cfq-iosched: reduce write depth only if sync was delayed
+
+ The introduction of ramp-up formula for async queue depths has
+ slowed down dirty page reclaim, by reducing async write performance.
+ This patch makes sure the formula kicks in only when sync request
+ was recently delayed.
+
+ Signed-off-by: Corrado Zoccolo <czoccolo at gmail.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 433c9d47f26fcb9141f1a1c3f15245a8391c5a08
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:50:52 2010 +0400
+
+ cfq-iosched: Do not access cfqq after freeing it
+
+ Fix a crash during boot reported by Jeff Moyer. Fix the issue of accessing
+ cfqq after freeing it.
+
+ Reported-by: Jeff Moyer <jmoyer at redhat.com>
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Reviewed-by: Jeff Moyer <jmoyer at redhat.com>
+ Signed-off-by: Jens Axboe <axboe at carl.(none)>
+
+commit 21e7ec5499dfae1930bc103e1f2430b262ac0c61
+Author: Stephen Rothwell <sfr at canb.auug.org.au>
+Date: Tue Apr 27 16:50:48 2010 +0400
+
+ block: include linux/err.h to use ERR_PTR
+
+ Signed-off-by: Stephen Rothwell <sfr at canb.auug.org.au>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit ba750bcbce0558bfe7ea2fd4a9b9ca74e1eac70f
+Author: Jens Axboe <jens.axboe at oracle.com>
+Date: Tue Apr 27 16:50:44 2010 +0400
+
+ cfq-iosched: use call_rcu() instead of doing grace period stall on queue exit
+
+ After the merge of the IO controller patches, booting on my megaraid
+ box ran much slower. Vivek Goyal traced it down to megaraid discovery
+ creating tons of devices, each suffering a grace period when they later
+ kill that queue (if no device is found).
+
+ So let's use call_rcu() to batch these deferred frees, instead of taking
+ the grace period hit for each one.
+
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
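The change is the standard switch from a blocking grace-period wait to a callback-based deferred free; a generic kernel-style sketch (the structure and function names are illustrative, not the actual cfq ones):

    /* Before: every teardown paid a full grace period.
     *         synchronize_rcu();
     *         kfree(obj);
     *
     * After: queue the free and return at once; the RCU core batches the
     * callbacks, so thousands of short-lived queues no longer wait one
     * grace period each. */
    struct example_obj {
            struct rcu_head rcu;
            /* ... payload ... */
    };

    static void example_free_rcu(struct rcu_head *head)
    {
            struct example_obj *obj = container_of(head, struct example_obj, rcu);

            kfree(obj);
    }

    static void example_release(struct example_obj *obj)
    {
            call_rcu(&obj->rcu, example_free_rcu);
    }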
+commit 291282276037c26045453190e5dd441ff03e319a
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:50:40 2010 +0400
+
+ blkio: Implement dynamic io controlling policy registration
+
+ o One of the goals of the block IO controller is that it should be able to
+ support multiple IO control policies, some of which may be operational at a
+ higher level in the storage hierarchy.
+
+ o To begin with, we had one IO controlling policy implemented by CFQ, and
+ I hard-coded the CFQ functions called by blkio. This created issues when
+ CFQ is compiled as a module.
+
+ o This patch implements basic dynamic IO controlling policy registration
+ functionality in blkio. This is similar to the elevator functionality where
+ IO schedulers register their functions dynamically.
+
+ o Now, in the future, when more IO controlling policies are implemented, they
+ can dynamically register with the block IO controller.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
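The registration model mirrors the elevator framework: CFQ hands the blkio core a table of callbacks at module init and removes it on exit, so blkio no longer references CFQ symbols directly. A rough sketch with assumed structure and callback names, since the actual definitions are not quoted here:

    /* Assumed shape of the dynamic registration. */
    static struct blkio_policy_type blkio_policy_cfq = {
            .ops = {
                    .blkio_unlink_group_fn        = cfq_unlink_blkio_group,
                    .blkio_update_group_weight_fn = cfq_update_blkio_group_weight,
            },
    };

    static int __init cfq_init(void)
    {
            /* ... elevator registration ... */
            blkio_policy_register(&blkio_policy_cfq);
            return 0;
    }

    static void __exit cfq_exit(void)
    {
            blkio_policy_unregister(&blkio_policy_cfq);
            /* ... elevator unregistration ... */
    }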
+commit 7701338499b73355707c41ae27358a4dd5bc4b84
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:50:36 2010 +0400
+
+ blkio: Export some symbols from blkio as its user CFQ can be a module
+
+ o blkio controller is inside the kernel and cfq makes use of interfaces
+ exported by blkio. CFQ can be a module too, hence export symbols used
+ by CFQ.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 8dfe981d81c7a967b6040d73fcae9780ef1519ae
+Author: Shaohua Li <shaohua.li at intel.com>
+Date: Tue Apr 27 16:50:31 2010 +0400
+
+ cfq-iosched: make nonrot check logic consistent
+
+ cfq_arm_slice_timer() has logic to disable the idle window for SSD devices. The same
+ thing should be done in cfq_select_queue() too, otherwise we will still see the
+ idle window. This makes the nonrot check logic consistent in cfq.
+ In tests on an Intel SSD with the low_latency knob off, this patch can triple disk
+ throughput for multi-threaded sequential reads.
+
+ Signed-off-by: Shaohua Li <shaohua.li at intel.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit e1853aca5799c76d0dd8ff97c5bed8c2e6059fa2
+Author: Jens Axboe <jens.axboe at oracle.com>
+Date: Tue Apr 27 16:50:28 2010 +0400
+
+ cfq-iosched: move IO controller declarations to a header file
+
+ They should not be declared inside some other file that's not related
+ to CFQ.
+
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 16ca6c55c9c1961dbd748a5c94883ab1d65bb04f
+Author: Jens Axboe <jens.axboe at oracle.com>
+Date: Tue Apr 27 16:50:24 2010 +0400
+
+ cfq-iosched: fix compile problem with !CONFIG_CGROUP
+
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 5260a89b72023fcad7242552059312e31a864bf2
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:50:19 2010 +0400
+
+ blkio: Wait on sync-noidle queue even if rq_noidle = 1
+
+ o rq_noidle() is supposed to tell cfq not to expect a request after this
+ one, hence don't idle. But this does not seem to work very well. For example,
+ for direct random readers, rq_noidle = 1 but there is a next request coming
+ after this one. Not idling leads to a group not getting its share even if
+ group_isolation=1.
+
+ o The right solution for this issue is to scan the higher layers and set the
+ right flag (WRITE_SYNC or WRITE_ODIRECT). For the time being, this
+ single-line fix helps. This should not have any significant impact when we are
+ not using cgroups. I will later figure out the IO paths in the higher layers and
+ fix it.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 3647d976033973a4502696fb45a980baa8cf1350
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:50:15 2010 +0400
+
+ blkio: Implement group_isolation tunable
+
+ o If a group is running only a random reader, then it will not have enough
+ traffic to keep the disk busy and we will reduce overall throughput. This
+ should result in better latencies for the random reader though. If we don't
+ idle on the random reader service tree, then this random reader will experience
+ large latencies if there are other groups present in the system with sequential
+ readers running in them.
+
+ o One solution suggested by Corrado is to keep the random readers
+ or sync-noidle workload in the root group by default, so that during one dispatch round
+ we idle only once on the sync-noidle tree. This means that all the sync-idle
+ workload queues will be in their respective groups and we will see service
+ differentiation in those, but not on the sync-noidle workload.
+
+ o Provide a tunable group_isolation. If set, this will make sure that even
+ sync-noidle queues go into their respective groups and we wait on these. This
+ provides stronger isolation between groups, but at the expense of throughput
+ if a group does not have enough traffic to keep the disk busy.
+
+ o By default group_isolation = 0
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
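Because the tunable is exposed as a CFQ io scheduler attribute, enabling it from user space is just a sysfs write; a small runnable C helper, with the device path being an assumption about the target system:

    #include <stdio.h>

    /* Turn on stronger per-cgroup isolation for one disk.  The path assumes
     * CFQ is the active scheduler for /dev/sda. */
    int main(void)
    {
            const char *path = "/sys/block/sda/queue/iosched/group_isolation";
            FILE *f = fopen(path, "w");

            if (!f) {
                    perror(path);
                    return 1;
            }
            fputs("1\n", f);    /* 0 = default (throughput), 1 = isolation */
            fclose(f);
            return 0;
    }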
+commit d7d266e74623a5ff4a196c9ba35edb33d844078d
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:50:09 2010 +0400
+
+ blkio: Determine async workload length based on total number of queues
+
+ o Async queues are not per group. Instead these are system wide and maintained
+ in root group. Hence their workload slice length should be calculated
+ based on total number of queues in the system and not just queues in the
+ root group.
+
+ o As root group's default weight is 1000, make sure to charge async queue
+ more in terms of vtime so that it does not get more time on disk because
+ root group has higher weight.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 853b022fdecf1394bc6f56ed4391acfcdac76a77
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:50:06 2010 +0400
+
+ blkio: Wait for cfq queue to get backlogged if group is empty
+
+ o If a queue consumes its slice and then gets deleted from the service tree, its
+ associated group will also get deleted from the service tree if this was the
+ only queue in the group. That will make the group lose its share.
+
+ o For the queues on which we are idling, if they have used up their
+ slice, wait a bit for these queues to get backlogged again and then
+ expire them so that the group does not lose its share.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 6b1099c5bbc770dc0e00e447c91cc2c70abfcd4d
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:49:55 2010 +0400
+
+ blkio: Propagate cgroup weight updates to cfq groups
+
+ o Propagate blkio cgroup weight updates to the associated cfq groups.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit b8e49f6ef8a5b19dcc3596a957b10ff7783ca8e3
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:49:51 2010 +0400
+
+ blkio: Drop the reference to queue once the task changes cgroup
+
+ o If a task changes cgroup, drop the reference to the cfqq associated with the io
+ context and set the cfqq pointer stored in the ioc to NULL, so that upon the next
+ request arrival we will allocate a new queue in the new group.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit f0939f2fb5a93f52e4c38c96dd403a20412635ac
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:49:47 2010 +0400
+
+ blkio: Provide some isolation between groups
+
+ o Do not allow the following three operations across groups, for isolation:
+ - selection of co-operating queues
+ - preemptions across groups
+ - request merging across groups.
+
+ o Async queues are currently global and not per group. Allow preemption of
+ an async queue if a sync queue in another group gets backlogged.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 3e5835588e20983417074286dc9c46aeff4bdcb5
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:49:43 2010 +0400
+
+ blkio: Export disk time and sectors used by a group to user space
+
+ o Export the disk time and sectors used by a group to user space through the cgroup
+ interface.
+
+ o Also export a "dequeue" interface to cgroup which keeps track of how many
+ times a group was deleted from the service tree. Helps in debugging.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 5050a2e923c23fee20e5d20350da94328c028ea7
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:49:38 2010 +0400
+
+ blkio: Some debugging aids for CFQ
+
+ o Some debugging aids for CFQ.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 32227ad5a49cdf40d128fff9f573e770326fb2a1
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:49:33 2010 +0400
+
+ blkio: Take care of cgroup deletion and cfq group reference counting
+
+ o One can choose to change elevator or delete a cgroup. Implement group
+ reference counting so that both elevator exit and cgroup deletion can
+ take place gracefully.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Nauman Rafique <nauman at google.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit c80d513227c069c5f15e1722ef3d63096aa2652b
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:49:29 2010 +0400
+
+ blkio: Dynamic cfq group creation based on the cgroup the task belongs to
+
+ o Determine the cgroup the IO-submitting task belongs to and create the cfq
+ group if it does not already exist.
+
+ o Also link cfqq and associated cfq group.
+
+ o Currently all async IO is mapped to root group.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit e890b41384a11cd0eaaf4901d72de44cd21e2b65
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:49:25 2010 +0400
+
+ blkio: Group time used accounting and workload context save restore
+
+ o This patch introduces the functionality to do the accounting of group time
+ when a queue expires. This time used decides which group goes
+ next.
+
+ o Also introduce the functionality to save and restore the workload type
+ context within a group. It might happen that once we expire the cfq queue
+ and group, a different group will schedule in and we will lose the context
+ of the workload type. Hence save and restore it upon queue expiry.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit fb6067d930baa1b510aba82153ddad866aa0cf65
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:49:21 2010 +0400
+
+ blkio: Implement per cfq group latency target and busy queue avg
+
+ o So far we had a 300ms soft target latency system-wide. Now, with the
+ introduction of cfq groups, divide that latency by the number of groups so
+ that one can come up with a group target latency, which will be helpful
+ in determining the workload slice within a group and also the dynamic
+ slice length of the cfq queue.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 0fee1302172d62ee9eb34c37d792ac05e30fe2d7
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:49:17 2010 +0400
+
+ blkio: Introduce per cfq group weights and vdisktime calculations
+
+ o Bring in the per cfq group weight and how vdisktime is calculated for the
+ group. Also bring in the functionality of updating the min_vdisktime of
+ the group service tree.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit a31a7a44995ded913fd031f922cffa9e457b2a83
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:49:12 2010 +0400
+
+ blkio: Introduce blkio controller cgroup interface
+
+ o This is a basic implementation of the blkio controller cgroup interface. This is
+ the common interface visible to user space and should be used by the different
+ IO control policies as we implement them.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
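On the user-space side this interface shows up as a handful of cgroup files (blkio.weight being the central one). A sketch of how such files are declared through the stock cgroup cftype mechanism; the handler names here are assumptions:

    /* Sketch: cgroup files exported by the controller. */
    static struct cftype blkio_files[] = {
            {
                    .name      = "weight",              /* blkio.weight, 100..1000 */
                    .read_u64  = blkiocg_weight_read,
                    .write_u64 = blkiocg_weight_write,
            },
            {
                    .name      = "time",                /* per-device disk time used */
                    .read_map  = blkiocg_time_read,
            },
    };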
+commit 82041001ee5b7a662d488238f46b8912cc440160
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:49:06 2010 +0400
+
+ blkio: Introduce the root service tree for cfq groups
+
+ o So far we just had one cfq_group in cfq_data. To create space for more than
+ one cfq_group, we need to have a service tree of groups where all the groups
+ can be queued if they have active cfq queues backlogged in them.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 1b290883254f64d396f11a071b74598d97e1b3d3
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:49:02 2010 +0400
+
+ blkio: Keep queue on service tree until we expire it
+
+ o Currently cfq deletes a queue from the service tree if it is empty (even if
+ we might idle on the queue). This patch keeps the queue on the service tree,
+ hence the associated group remains on the service tree, until we decide that
+ we are not going to idle on the queue and expire it.
+
+ o This just helps in time accounting for the queue/group and in the implementation
+ of the rest of the patches.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit d0d70b93083a4fc811bd3bfed1df04870102d538
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:48:58 2010 +0400
+
+ blkio: Implement macro to traverse each service tree in group
+
+ o Implement a macro to traverse each service tree in the group. This avoids
+ the use of a double for loop and a special condition for the idle tree four times.
+
+ o The macro is a little twisted because of the special handling of the idle-class
+ service tree.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 4fea5fccf125349a109304569acbeda86c9ab67f
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:48:54 2010 +0400
+
+ blkio: Introduce the notion of cfq groups
+
+ o This patch introduces the notion of cfq groups. Soon we will be able to have multiple
+ groups of different weights in the system.
+
+ o Various service trees (prioclass and workload type trees) will become per
+ cfq group. So the hierarchy looks as follows.
+
+ cfq_groups
+ |
+ workload type
+ |
+ cfq queue
+
+ o When a scheduling decision has to be taken, first we select the cfq group,
+ then the workload within the group and then the cfq queue within the workload
+ type.
+
+ o This patch just makes the various workload service trees per cfq group and
+ introduces the function to be able to choose a group for scheduling.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 14d52ec9524545c8eb9c13d05925c53f1bd2b3ff
+Author: Vivek Goyal <vgoyal at redhat.com>
+Date: Tue Apr 27 16:48:49 2010 +0400
+
+ blkio: Set must_dispatch only if we decided to not dispatch the request
+
+ o must_dispatch flag should be set only if we decided not to run the queue
+ and dispatch the request.
+
+ Signed-off-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit a6a0574d5ab33877885943183de7645e157ed16e
+Author: Shaohua Li <shaohua.li at intel.com>
+Date: Tue Apr 27 16:48:46 2010 +0400
+
+ cfq-iosched: no dispatch limit for single queue
+
+ Since commit 2f5cb7381b737e24c8046fd4aeab571fb71315f5, each queue can send
+ up to 4 * 4 requests if only one queue exists. I wonder why we have such a limit.
+ A device supporting tags can send more requests. For example, AHCI can send 31
+ requests. Tests (direct aio randread) show the limit reduces disk
+ throughput by about 4%.
+ On the other hand, since we send one request at a time, if another queue
+ pops up while the current one is sending more than cfq_quantum requests, the current
+ queue will stop sending requests soon after one request, so it sounds like there is no big latency.
+
+ Signed-off-by: Shaohua Li <shaohua.li at intel.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 886ef3fce890295b04063e286c1a82c97574b737
+Author: Jens Axboe <jens.axboe at oracle.com>
+Date: Tue Apr 27 16:48:41 2010 +0400
+
+ Revert "cfq: Make use of service count to estimate the rb_key offset"
+
+ This reverts commit 3586e917f2c7df769d173c4ec99554cb40a911e5.
+
+ Corrado Zoccolo <czoccolo at gmail.com> correctly points out that we need
+ consistency of rb_key offset across groups. This means we cannot properly
+ use the per-service_tree service count. Revert this change.
+
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 80216a50226739cd997445d5ff2335a4c944fba7
+Author: Corrado Zoccolo <czoccolo at gmail.com>
+Date: Tue Apr 27 16:48:36 2010 +0400
+
+ cfq-iosched: fix corner cases in idling logic
+
+ Idling logic was disabled in some corner cases, leading to unfair share
+ for noidle queues.
+ * the idle timer was not armed if there were other requests in the
+ driver. Unfortunately, those requests could come from other workloads,
+ or queues for which we don't enable idling. So we will check only
+ pending requests from the active queue
+ * rq_noidle check on no-idle queue could disable the end of tree idle if
+ the last completed request was rq_noidle. Now, we will disable that
+ idle only if all the queues served in the no-idle tree had rq_noidle
+ requests.
+
+ Reported-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Corrado Zoccolo <czoccolo at gmail.com>
+ Acked-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit fed0ad86edd704970417ce78b1a130b1951f7bb8
+Author: Corrado Zoccolo <czoccolo at gmail.com>
+Date: Tue Apr 27 16:48:32 2010 +0400
+
+ cfq-iosched: idling on deep seeky sync queues
+
+ Seeky sync queues with large depth can gain unfairly big share of disk
+ time, at the expense of other seeky queues. This patch ensures that
+ idling will be enabled for queues with I/O depth at least 4, and small
+ think time. The decision to enable idling is sticky, until an idle
+ window times out without seeing a new request.
+
+ The reasoning behind the decision is that, if an application is using
+ large I/O depth, it is already optimized to make full utilization of
+ the hardware, and therefore we reserve a slice of exclusive use for it.
+
+ Reported-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Corrado Zoccolo <czoccolo at gmail.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 989d070f4d3594f485df16fa5b5786db8188e837
+Author: Corrado Zoccolo <czoccolo at gmail.com>
+Date: Tue Apr 27 16:48:28 2010 +0400
+
+ cfq-iosched: fix no-idle preemption logic
+
+ An incoming no-idle queue should preempt the active no-idle queue
+ only if the active queue is idling due to service tree empty.
+ Previous code was buggy in two ways:
+ * it relied on service_tree field to be set on the active queue, while
+ it is not set when the code is idling for a new request
+ * it didn't check for the service tree empty condition, so could lead to
+ LIFO behaviour if multiple queues with depth > 1 were preempting each
+ other on a non-NCQ device.
+
+ Reported-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Corrado Zoccolo <czoccolo at gmail.com>
+ Acked-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 1baaab33a240924a5542eeb7a275d2915dc09518
+Author: Corrado Zoccolo <czoccolo at gmail.com>
+Date: Tue Apr 27 16:48:23 2010 +0400
+
+ cfq-iosched: fix ncq detection code
+
+ CFQ's detection of queueing devices initially assumes a queuing device
+ and detects if the queue depth reaches a certain threshold.
+ However, it will reconsider this choice periodically.
+
+ Unfortunately, if device is considered not queuing, CFQ will force a
+ unit queue depth for some workloads, thus defeating the detection logic.
+ This leads to poor performance on queuing hardware,
+ since the idle window remains enabled.
+
+ Given this premise, switching to hw_tag = 0 after we have proved at
+ least once that the device is NCQ capable is not a good choice.
+
+ The new detection code starts in an indeterminate state, in which CFQ behaves
+ as if hw_tag = 1, and then, if for a long observation period we never saw
+ large depth, we switch to hw_tag = 0, otherwise we stick to hw_tag = 1,
+ without reconsidering it again.
+
+ Signed-off-by: Corrado Zoccolo <czoccolo at gmail.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 43090d5ccb1b6adcd28b2d4d54cc8ddf6c96a212
+Author: Corrado Zoccolo <czoccolo at gmail.com>
+Date: Tue Apr 27 16:48:19 2010 +0400
+
+ cfq-iosched: cleanup unreachable code
+
+ cfq_should_idle returns false for no-idle queues that are not the last,
+ so the control flow will never reach the removed code in a state that
+ satisfies the if condition.
+ The unreachable code was added to emulate previous cfq behaviour for
+ non-NCQ rotational devices. My tests show that even without it, the
+ performance and fairness are comparable with the previous cfq, thanks to
+ the fact that all seeky queues are grouped together, and that we idle at
+ the end of the tree.
+
+ Signed-off-by: Corrado Zoccolo <czoccolo at gmail.com>
+ Acked-by: Vivek Goyal <vgoyal at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit ea4004872f1e7a3a3651319fd5df6df17e9c7e66
+Author: Gui Jianfeng <guijianfeng at cn.fujitsu.com>
+Date: Tue Apr 27 16:48:15 2010 +0400
+
+ cfq: Make use of service count to estimate the rb_key offset
+
+ For the moment, different workload cfq queues are put into different
+ service trees. But CFQ still uses "busy_queues" to estimate rb_key
+ offset when inserting a cfq queue into a service tree. I think this
+ isn't appropriate, and it should make use of service tree count to do
+ this estimation. This patch is for for-2.6.33 branch.
+
+ Signed-off-by: Gui Jianfeng <guijianfeng at cn.fujitsu.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 4c49bbef74b78184ecdc8d4c14c6d531f9edea42
+Author: Randy Dunlap <randy.dunlap at oracle.com>
+Date: Tue Apr 27 16:46:51 2010 +0400
+
+ block: jiffies fixes
+
+ Use HZ-independent calculation of milliseconds.
+ Add jiffies.h where it was missing since functions or macros
+ from it are used.
+
+ Signed-off-by: Randy Dunlap <randy.dunlap at oracle.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
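The HZ-independent form the patch converts to is the standard jiffies helper; a minimal before/after sketch:

    #include <linux/jiffies.h>

    static void example_timeouts(void)
    {
            /* Wrong: "100" is 100 ticks, i.e. 100 ms only when HZ happens to be 1000. */
            unsigned long timeout_ticks = jiffies + 100;

            /* Right: always 100 milliseconds, regardless of the configured HZ. */
            unsigned long timeout_ms    = jiffies + msecs_to_jiffies(100);

            (void)timeout_ticks;
            (void)timeout_ms;
    }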
+commit f96f26aeb96cc338693fe5c2d48ab04e799f0187
+Author: Corrado Zoccolo <czoccolo at gmail.com>
+Date: Tue Apr 27 16:46:46 2010 +0400
+
+ cfq-iosched: fix next_rq computation
+
+ Cfq has a bug in the computation of next_rq that affects the transition
+ between multiple sequential request streams in a single queue
+ (e.g.: two sequential buffered writers of the same priority),
+ causing the alternation between the two streams for a transient period.
+
+ 8,0 1 18737 0.260400660 5312 D W 141653311 + 256
+ 8,0 1 20839 0.273239461 5400 D W 141653567 + 256
+ 8,0 1 20841 0.276343885 5394 D W 142803919 + 256
+ 8,0 1 20843 0.279490878 5394 D W 141668927 + 256
+ 8,0 1 20845 0.292459993 5400 D W 142804175 + 256
+ 8,0 1 20847 0.295537247 5400 D W 141668671 + 256
+ 8,0 1 20849 0.298656337 5400 D W 142804431 + 256
+ 8,0 1 20851 0.311481148 5394 D W 141668415 + 256
+ 8,0 1 20853 0.314421305 5394 D W 142804687 + 256
+ 8,0 1 20855 0.318960112 5400 D W 142804943 + 256
+
+ The fix makes sure that the next_rq is computed from the last
+ dispatched request, and not affected by merging.
+
+ 8,0 1 37776 4.305161306 0 D W 141738087 + 256
+ 8,0 1 37778 4.308298091 0 D W 141738343 + 256
+ 8,0 1 37780 4.312885190 0 D W 141738599 + 256
+ 8,0 1 37782 4.315933291 0 D W 141738855 + 256
+ 8,0 1 37784 4.319064459 0 D W 141739111 + 256
+ 8,0 1 37786 4.331918431 5672 D W 142803007 + 256
+ 8,0 1 37788 4.334930332 5672 D W 142803263 + 256
+ 8,0 1 37790 4.337902723 5672 D W 142803519 + 256
+ 8,0 1 37792 4.342359774 5672 D W 142803775 + 256
+ 8,0 1 37794 4.345318286 0 D W 142804031 + 256
+
+ Signed-off-by: Corrado Zoccolo <czoccolo at gmail.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit b12c0189dd602b89f3c6d82e050a7579f5813a09
+Author: Jens Axboe <jens.axboe at oracle.com>
+Date: Tue Apr 27 16:44:33 2010 +0400
+
+ cfq-iosched: get rid of the coop_preempt flag
+
+ We need to rework this logic post the cooperating cfq_queue merging,
+ for now just get rid of it and Jeff Moyer will fix the fall out.
+
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 738f35df496b0c4a214f08b356f1a08d6f87b70e
+Author: Jens Axboe <jens.axboe at oracle.com>
+Date: Tue Apr 27 16:44:28 2010 +0400
+
+ cfq-iosched: fix merge error
+
+ We ended up with testing the same condition twice, pretty
+ pointless. Remove that first if.
+
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit d70f9c5005fd87d2d9bcfe5a1dd831e119d497b5
+Author: Jens Axboe <jens.axboe at oracle.com>
+Date: Tue Apr 27 16:42:34 2010 +0400
+
+ cfq-iosched: fix style issue in cfq_get_avg_queues()
+
+ Line breaks and bad brace placement.
+
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 1ad5fcfc2beacbe333bd947a6a95acb9ee810891
+Author: Corrado Zoccolo <czoccolo at gmail.com>
+Date: Tue Apr 27 16:42:30 2010 +0400
+
+ cfq-iosched: fairness for sync no-idle queues
+
+ Currently, no-idle queues in cfq are not serviced fairly:
+ even though they can only dispatch a small number of requests at a time,
+ they have to compete with idling queues to be serviced, and so they
+ experience large latencies.
+
+ Note, instead, that no-idle queues are the ones that would benefit
+ most from low latency; in fact they are any of:
+ * processes with large think times (e.g. interactive ones like file
+ managers)
+ * seeky processes (e.g. programs faulting in their code at startup)
+ * or queues marked as no-idle from upper levels, to improve the latency
+ of those requests.
+
+ This patch improves the fairness and latency for those queues by:
+ * separating sync idle, sync no-idle and async queues into separate
+ service_trees, for each priority
+ * servicing all no-idle queues together
+ * idling when the last no-idle queue has been serviced, to
+ anticipate more no-idle work
+ * computing the timeslices allotted to the idle and no-idle
+ service_trees proportionally to the number of processes in each set.
+
+ Servicing all no-idle queues together should provide a performance boost
+ on NCQ-capable drives, without compromising fairness.
+
+ Signed-off-by: Corrado Zoccolo <czoccolo at gmail.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
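The separation described above shows up later in this patch as the per-group service_trees[prio][type] arrays plus the cfqq_prio(), cfqq_type() and service_tree_for() helpers in the cfq-iosched.c hunk. A condensed sketch of how a queue maps onto its tree, reusing those same helpers:

/* Condensed sketch of the tree selection; cfq_class_idle(), cfqq_prio()
 * and cfqq_type() are the helpers defined in the cfq-iosched.c changes
 * further down in this patch. */
static struct cfq_rb_root *
tree_for_queue_sketch(struct cfq_group *cfqg, struct cfq_queue *cfqq)
{
	if (cfq_class_idle(cfqq))
		return &cfqg->service_tree_idle;

	/* one tree per (RT/BE priority class, async/sync-noidle/sync type) */
	return &cfqg->service_trees[cfqq_prio(cfqq)][cfqq_type(cfqq)];
}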
+commit e2d27033102f717078e4bfdc9229ef84dbd8088c
+Author: Corrado Zoccolo <czoccolo at gmail.com>
+Date: Tue Apr 27 16:42:03 2010 +0400
+
+ cfq-iosched: enable idling for last queue on priority class
+
+ cfq can disable idling for queues in various circumstances.
+ When workloads of different priorities are competing, if the higher
+ priority queue has idling disabled, lower priority queues may steal
+ its disk share. For example, in a scenario with an RT process
+ performing seeky reads vs a BE process performing sequential reads,
+ on NCQ-enabled hardware with low_latency unset,
+ the RT process will dispatch only its few pending requests once per full
+ slice of service given to the BE process.
+
+ The patch solves this issue by always performing idle on the last
+ queue at a given priority class > idle. If the same process, or one
+ that can pre-empt it (so at the same priority or higher), submits a
+ new request within the idle window, the lower priority queue won't
+ dispatch, saving the disk bandwidth for higher priority ones.
+
+ Note: this doesn't touch the non_rotational + NCQ case (no hardware
+ to test if this is a benefit in that case).
+
+ Signed-off-by: Corrado Zoccolo <czoccolo at gmail.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
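A hedged sketch of the rule described above (the actual implementation lives in the cfq-iosched.c changes, not reproduced in this excerpt): idle-class queues never idle, queues with the idle window enabled always may, and otherwise we still idle when the queue is the last busy one on its service tree.

/* Illustrative only: cfqq->service_tree and its ->count field are added
 * by this patch series; cfq_class_idle() and cfq_cfqq_idle_window() are
 * existing cfq helpers. */
static bool should_idle_sketch(struct cfq_queue *cfqq)
{
	if (cfq_class_idle(cfqq))
		return false;
	if (cfq_cfqq_idle_window(cfqq))
		return true;
	/* last busy queue on its service tree: protect its disk share */
	return cfqq->service_tree && cfqq->service_tree->count == 1;
}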
+commit ddc6295b4d6c3461a02f98ba75cbfe900a087ee4
+Author: Corrado Zoccolo <czoccolo at gmail.com>
+Date: Tue Apr 27 16:41:59 2010 +0400
+
+ cfq-iosched: reimplement priorities using different service trees
+
+ We use different service trees for different priority classes.
+ This allows a simplification of the service tree insertion code, which no
+ longer has to consider priority while walking the tree.
+
+ Signed-off-by: Corrado Zoccolo <czoccolo at gmail.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit b1ca547aa679a0605bf9cfbc2ee8c4d0f9738e90
+Author: Corrado Zoccolo <czoccolo at gmail.com>
+Date: Tue Apr 27 16:41:55 2010 +0400
+
+ cfq-iosched: preparation to handle multiple service trees
+
+ We embed a pointer to the service tree in each queue, to handle multiple
+ service trees easily.
+ Service trees are enriched with a counter.
+ cfq_add_rq_rb is invoked after putting the rq in the fifo, to ensure
+ that all fields in rq are properly initialized.
+
+ Signed-off-by: Corrado Zoccolo <czoccolo at gmail.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 72c938338cfb00497c498fd05901c23f2fa9e6ce
+Author: Corrado Zoccolo <czoccolo at gmail.com>
+Date: Tue Apr 27 16:41:50 2010 +0400
+
+ cfq-iosched: adapt slice to number of processes doing I/O
+
+ When the number of processes performing I/O concurrently increases,
+ a fixed time slice per process will cause large latencies.
+
+ If low_latency mode is enabled, this patch scales the time slice
+ assigned to each process according to a 300ms target latency.
+
+ In order to keep fairness among processes:
+ * The number of active processes is computed using a special form of
+ running average that quickly follows sudden increases (to keep latency low)
+ and decreases slowly (to preserve fairness in spite of rapid decreases of this
+ value).
+
+ To safeguard sequential bandwidth, we impose a minimum time slice
+ (computed using 2*cfq_slice_idle as base, adjusted according to priority
+ and async-ness).
+
+ Signed-off-by: Corrado Zoccolo <czoccolo at gmail.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
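The scaling itself lands in cfq_set_prio_slice() further down in this patch. A worked example, assuming a single cfq group (so the group slice equals the 300 ms target) and the default cfq tunables quoted in milliseconds: with 6 busy queues of 100 ms sync slice each, the expected latency would be 600 ms, so each slice is scaled down to 50 ms, bounded below by the 2 * cfq_slice_idle floor.

#include <linux/kernel.h>

/* Numbers in milliseconds; constants mirror the cfq defaults mentioned
 * above (sync slice 100 ms, target latency 300 ms, slice_idle 8 ms). */
static unsigned int scaled_slice_example(unsigned int iq)
{
	unsigned int sync_slice = 100;
	unsigned int target_latency = 300;
	unsigned int slice = sync_slice;
	unsigned int low_slice = min(slice, 2 * 8 * slice / sync_slice);

	if (sync_slice * iq > target_latency)
		slice = max(slice * target_latency / (sync_slice * iq), low_slice);

	return slice;	/* scaled_slice_example(6) == 50 */
}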
+commit ca34f4ef05e2b5abcb60af65a69a367ea9f5148e
+Author: Corrado Zoccolo <czoccolo at gmail.com>
+Date: Tue Apr 27 16:41:46 2010 +0400
+
+ cfq-iosched: simplify prio-unboost code
+
+ Eliminate redundant checks.
+
+ Signed-off-by: Corrado Zoccolo <czoccolo at gmail.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit ab7d66cd0bd0aff8fe977d03cd20afd1ff3a5dfd
+Author: Shaohua Li <shaohua.li at intel.com>
+Date: Tue Apr 27 16:41:42 2010 +0400
+
+ cfq-iosched: improve hw_tag detection
+
+ If the active queue doesn't have enough requests and the idle window opens,
+ cfq will not dispatch sufficient requests to the hardware. In that situation
+ the current code zeroes hw_tag, but the low queue depth is caused by cfq not
+ dispatching enough requests, not by the hardware queue failing to work.
+ Don't zero hw_tag in that case.
+
+ Signed-off-by: Shaohua Li <shaohua.li at intel.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 00b99100690429e98f3a8efe7f59fe124814bc67
+Author: Jeff Moyer <jmoyer at redhat.com>
+Date: Tue Apr 27 16:41:38 2010 +0400
+
+ cfq: break apart merged cfqqs if they stop cooperating
+
+ cfq_queues are merged if they are issuing requests within the mean seek
+ distance of one another. This patch detects when the cooperation stops and
+ breaks the queues back up.
+
+ Signed-off-by: Jeff Moyer <jmoyer at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit 9186d4378bed803bf7cca93c1abc4d74adab2ed2
+Author: Jeff Moyer <jmoyer at redhat.com>
+Date: Tue Apr 27 16:32:26 2010 +0400
+
+ cfq: change the meaning of the cfqq_coop flag
+
+ The flag used to indicate that a cfqq was allowed to jump ahead in the
+ scheduling order due to submitting a request close to the queue that
+ just executed. Since closely cooperating queues are now merged, the flag
+ holds little meaning. Change it to indicate that multiple queues were
+ merged. This will later be used to allow the breaking up of merged queues
+ when they are no longer cooperating.
+
+ Signed-off-by: Jeff Moyer <jmoyer at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit de85cbb1eaf76b988bbe96f89b4761352adf4614
+Author: Jeff Moyer <jmoyer at redhat.com>
+Date: Tue Apr 27 16:32:20 2010 +0400
+
+ cfq: merge cooperating cfq_queues
+
+ Currently, when cooperating cfq_queues are detected, they are allowed to
+ skip ahead in the scheduling order. It is much more efficient to
+ automatically share the cfq_queue data structure between cooperating processes.
+ Performance of the read-test2 benchmark (which is written to emulate the
+ dump(8) utility) went from 12MB/s to 90MB/s on my SATA disk. NFS servers
+ with multiple nfsd threads also saw performance increases.
+
+ Signed-off-by: Jeff Moyer <jmoyer at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
+commit e09d12221f4d1c7fcb00fd687ae6e759c39054c6
+Author: Jeff Moyer <jmoyer at redhat.com>
+Date: Tue Apr 27 16:18:17 2010 +0400
+
+ cfq: calculate the seek_mean per cfq_queue not per cfq_io_context
+
+ async cfq_queues are already shared between processes within the same
+ priority, and forthcoming patches will change the mapping of cic to sync
+ cfq_queue from 1:1 to 1:N. So, calculate the seekiness of a process
+ based on the cfq_queue instead of the cfq_io_context.
+
+ Signed-off-by: Jeff Moyer <jmoyer at redhat.com>
+ Signed-off-by: Jens Axboe <jens.axboe at oracle.com>
+
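The per-queue fields this commit introduces (seek_samples, seek_total, seek_mean) and the CFQQ_SEEKY() test built on them are visible in the cfq-iosched.c hunk further down. A small sketch of how the per-queue state is consulted (the running-average update itself is not reproduced here):

/* sample_valid() and CFQQ_SEEKY() are defined in the cfq-iosched.c
 * changes below; a queue counts as seeky once enough samples exist
 * and its mean seek distance exceeds CFQQ_SEEK_THR. */
static bool queue_is_seeky_sketch(struct cfq_queue *cfqq)
{
	return sample_valid(cfqq->seek_samples) && CFQQ_SEEKY(cfqq);
}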
commit c05f95fcb04e896c898218d12a8f37c43d2f9cc6
Author: Pavel Emelyanov <xemul at openvz.org>
Date: Tue Apr 27 15:10:13 2010 +0400
@@ -4590,14 +6200,14 @@
+library. If this is what you want to do, use the GNU Library General
+Public License instead of this License.
diff --git a/Makefile b/Makefile
-index 573578f..12ba193 100644
+index 801d0e1..4eac9f7 100644
--- a/Makefile
+++ b/Makefile
@@ -2,6 +2,7 @@ VERSION = 2
PATCHLEVEL = 6
SUBLEVEL = 32
EXTRAVERSION =
-+VZVERSION = avdeyev
++VZVERSION = balandin
NAME = Man-Eating Seals of Antiquity
# *DOCUMENTATION*
@@ -4621,10 +6231,10 @@
define filechk_version.h
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
-index 4fdb669..1334638 100644
+index fbc161d..e6cc64c 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
-@@ -2069,6 +2069,8 @@ config HAVE_ATOMIC_IOMAP
+@@ -2074,6 +2074,8 @@ config HAVE_ATOMIC_IOMAP
def_bool y
depends on X86_32
@@ -4633,7 +6243,7 @@
source "net/Kconfig"
source "drivers/Kconfig"
-@@ -2086,3 +2088,5 @@ source "crypto/Kconfig"
+@@ -2091,3 +2093,5 @@ source "crypto/Kconfig"
source "arch/x86/kvm/Kconfig"
source "lib/Kconfig"
@@ -5178,7 +6788,7 @@
regs.bx = (unsigned long) fn;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
-index 6eabe90..490f4f5 100644
+index 868fdb4..0cc650d 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -25,8 +25,10 @@
@@ -5230,7 +6840,7 @@
}
void release_thread(struct task_struct *dead_task)
-@@ -681,3 +684,20 @@ unsigned long KSTK_ESP(struct task_struct *task)
+@@ -680,3 +683,20 @@ unsigned long KSTK_ESP(struct task_struct *task)
return (test_tsk_thread_flag(task, TIF_IA32)) ?
(task_pt_regs(task)->sp) : ((task)->thread.usersp);
}
@@ -5365,263 +6975,3193 @@
printk(" passed.\n");
}
-+#ifdef CONFIG_VE
-+ /* TSC reset. kill whatever might rely on old values */
-+ VE_TASK_INFO(current)->wakeup_stamp = 0;
-+#endif
++#ifdef CONFIG_VE
++ /* TSC reset. kill whatever might rely on old values */
++ VE_TASK_INFO(current)->wakeup_stamp = 0;
++#endif
+ /*
+ * Reset it - just in case we boot another CPU later:
+ */
+diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
+index 3909e3b..bbfa7af 100644
+--- a/arch/x86/kernel/x8664_ksyms_64.c
++++ b/arch/x86/kernel/x8664_ksyms_64.c
+@@ -3,6 +3,7 @@
+
+ #include <linux/module.h>
+ #include <linux/smp.h>
++#include <linux/syscalls.h>
+
+ #include <net/checksum.h>
+
+@@ -17,6 +18,7 @@
+ EXPORT_SYMBOL(mcount);
+ #endif
+
++EXPORT_SYMBOL(kernel_execve);
+ EXPORT_SYMBOL(kernel_thread);
+
+ EXPORT_SYMBOL(__get_user_1);
+diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
+index f4cee90..3e549cd 100644
+--- a/arch/x86/mm/fault.c
++++ b/arch/x86/mm/fault.c
+@@ -689,7 +689,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
+ if (!printk_ratelimit())
+ return;
+
+- printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
++ ve_printk(VE_LOG, "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
+ task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
+ tsk->comm, task_pid_nr(tsk), address,
+ (void *)regs->ip, (void *)regs->sp, error_code);
+@@ -909,7 +909,7 @@ spurious_fault(unsigned long error_code, unsigned long address)
+ return ret;
+ }
+
+-int show_unhandled_signals = 1;
++int show_unhandled_signals = 0;
+
+ static inline int
+ access_error(unsigned long error_code, int write, struct vm_area_struct *vma)
+diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
+index f46c340..6b7330c 100644
+--- a/arch/x86/mm/hugetlbpage.c
++++ b/arch/x86/mm/hugetlbpage.c
+@@ -12,6 +12,7 @@
+ #include <linux/slab.h>
+ #include <linux/err.h>
+ #include <linux/sysctl.h>
++#include <linux/module.h>
+ #include <asm/mman.h>
+ #include <asm/tlb.h>
+ #include <asm/tlbflush.h>
+@@ -230,6 +231,7 @@ int pud_huge(pud_t pud)
+ {
+ return !!(pud_val(pud) & _PAGE_PSE);
+ }
++EXPORT_SYMBOL(pmd_huge);
+
+ struct page *
+ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
+diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
+index c9ba9de..589a93b 100644
+--- a/arch/x86/mm/pgtable.c
++++ b/arch/x86/mm/pgtable.c
+@@ -4,7 +4,8 @@
+ #include <asm/tlb.h>
+ #include <asm/fixmap.h>
+
+-#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
++#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO | __GFP_UBC
++#define PGALLOC_KERN_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
+
+ #ifdef CONFIG_HIGHPTE
+ #define PGALLOC_USER_GFP __GFP_HIGHMEM
+@@ -16,7 +17,7 @@ gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP;
+
+ pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
+ {
+- return (pte_t *)__get_free_page(PGALLOC_GFP);
++ return (pte_t *)__get_free_page(PGALLOC_KERN_GFP);
+ }
+
+ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
+diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
+index 36fe08e..42445e5 100644
+--- a/arch/x86/mm/tlb.c
++++ b/arch/x86/mm/tlb.c
+@@ -256,6 +256,8 @@ void flush_tlb_mm(struct mm_struct *mm)
+ preempt_enable();
+ }
+
++EXPORT_SYMBOL(flush_tlb_mm);
++
+ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
+ {
+ struct mm_struct *mm = vma->vm_mm;
+diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
+index 58bc00f..b7028c5 100644
+--- a/arch/x86/vdso/vdso32-setup.c
++++ b/arch/x86/vdso/vdso32-setup.c
+@@ -17,6 +17,8 @@
+ #include <linux/err.h>
+ #include <linux/module.h>
+
++#include <bc/vmpages.h>
++
+ #include <asm/cpufeature.h>
+ #include <asm/msr.h>
+ #include <asm/pgtable.h>
+@@ -37,6 +39,8 @@ enum {
+ #else
+ #define VDSO_DEFAULT VDSO_ENABLED
+ #endif
++#undef VDSO_DEFAULT
++#define VDSO_DEFAULT VDSO_DISABLED
+
+ #ifdef CONFIG_X86_64
+ #define vdso_enabled sysctl_vsyscall32
+@@ -193,7 +197,8 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr)
+ }
+ }
+
+-static struct page *vdso32_pages[1];
++struct page *vdso32_pages[1];
++EXPORT_SYMBOL_GPL(vdso32_pages);
+
+ #ifdef CONFIG_X86_64
+
+@@ -309,16 +314,30 @@ int __init sysenter_setup(void)
+ return 0;
+ }
+
++EXPORT_SYMBOL_GPL(VDSO32_SYSENTER_RETURN);
++EXPORT_SYMBOL_GPL(VDSO32_PRELINK);
++
+ /* Setup a VMA at program startup for the vsyscall page */
+-int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
++int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp,
++ unsigned long map_address)
+ {
+ struct mm_struct *mm = current->mm;
+- unsigned long addr;
++ unsigned long addr = map_address;
+ int ret = 0;
+ bool compat;
++ unsigned long flags;
+
+- if (vdso_enabled == VDSO_DISABLED)
++ if (vdso_enabled == VDSO_DISABLED && map_address == 0) {
++ current->mm->context.vdso = NULL;
+ return 0;
++ }
++
++ flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC | VM_MAYWRITE |
++ mm->def_flags;
++
++ ret = -ENOMEM;
++ if (ub_memory_charge(mm, PAGE_SIZE, flags, NULL, UB_SOFT))
++ goto err_charge;
+
+ down_write(&mm->mmap_sem);
+
+@@ -328,19 +347,18 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
+
+ map_compat_vdso(compat);
+
+- if (compat)
+- addr = VDSO_HIGH_BASE;
+- else {
+- addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
++ if (!compat || map_address) {
++ addr = get_unmapped_area(NULL, addr, PAGE_SIZE, 0, 0);
+ if (IS_ERR_VALUE(addr)) {
+ ret = addr;
+ goto up_fail;
+ }
+- }
++ } else
++ addr = VDSO_HIGH_BASE;
+
+ current->mm->context.vdso = (void *)addr;
+
+- if (compat_uses_vma || !compat) {
++ if (compat_uses_vma || !compat || map_address) {
+ /*
+ * MAYWRITE to allow gdb to COW and set breakpoints
+ *
+@@ -368,9 +386,13 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
+ current->mm->context.vdso = NULL;
+
+ up_write(&mm->mmap_sem);
++ if (ret < 0)
++ ub_memory_uncharge(mm, PAGE_SIZE, flags, NULL);
++err_charge:
+
+ return ret;
+ }
++EXPORT_SYMBOL_GPL(arch_setup_additional_pages);
+
+ #ifdef CONFIG_X86_64
+
+diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
+index 21e1aeb..507ba17 100644
+--- a/arch/x86/vdso/vma.c
++++ b/arch/x86/vdso/vma.c
+@@ -4,6 +4,7 @@
+ * Subject to the GPL, v.2
+ */
+ #include <linux/mm.h>
++#include <linux/module.h>
+ #include <linux/err.h>
+ #include <linux/sched.h>
+ #include <linux/init.h>
+@@ -99,17 +100,23 @@ static unsigned long vdso_addr(unsigned long start, unsigned len)
+
+ /* Setup a VMA at program startup for the vsyscall page.
+ Not called for compat tasks */
+-int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
++int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp,
++ unsigned long map_address)
+ {
+ struct mm_struct *mm = current->mm;
+ unsigned long addr;
+ int ret;
+
+- if (!vdso_enabled)
++ if (!vdso_enabled && map_address == 0) {
++ current->mm->context.vdso = NULL;
+ return 0;
++ }
+
+ down_write(&mm->mmap_sem);
+- addr = vdso_addr(mm->start_stack, vdso_size);
++ if (map_address)
++ addr = map_address;
++ else
++ addr = vdso_addr(mm->start_stack, vdso_size);
+ addr = get_unmapped_area(NULL, addr, vdso_size, 0, 0);
+ if (IS_ERR_VALUE(addr)) {
+ ret = addr;
+@@ -132,6 +139,7 @@ up_fail:
+ up_write(&mm->mmap_sem);
+ return ret;
+ }
++EXPORT_SYMBOL_GPL(arch_setup_additional_pages);
+
+ static __init int vdso_setup(char *s)
+ {
+diff --git a/block/Kconfig b/block/Kconfig
+index 9be0b56..e20fbde 100644
+--- a/block/Kconfig
++++ b/block/Kconfig
+@@ -77,6 +77,28 @@ config BLK_DEV_INTEGRITY
+ T10/SCSI Data Integrity Field or the T13/ATA External Path
+ Protection. If in doubt, say N.
+
++config BLK_CGROUP
++ bool
++ depends on CGROUPS
++ default n
++ ---help---
++ Generic block IO controller cgroup interface. This is the common
++ cgroup interface which should be used by various IO controlling
++ policies.
++
++ Currently, CFQ IO scheduler uses it to recognize task groups and
++ control disk bandwidth allocation (proportional time slice allocation)
++ to such task groups.
++
++config DEBUG_BLK_CGROUP
++ bool
++ depends on BLK_CGROUP
++ default n
++ ---help---
++ Enable some debugging help. Currently it stores the cgroup path
++ in the blk group which can be used by cfq for tracing various
++ group related activity.
++
+ endif # BLOCK
+
+ config BLOCK_COMPAT
+diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
+index 7e803fc..9c5f0b5 100644
+--- a/block/Kconfig.iosched
++++ b/block/Kconfig.iosched
+@@ -40,6 +40,23 @@ config IOSCHED_CFQ
+ working environment, suitable for desktop systems.
+ This is the default I/O scheduler.
+
++config CFQ_GROUP_IOSCHED
++ bool "CFQ Group Scheduling support"
++ depends on IOSCHED_CFQ && CGROUPS
++ select BLK_CGROUP
++ default n
++ ---help---
++ Enable group IO scheduling in CFQ.
++
++config DEBUG_CFQ_IOSCHED
++ bool "Debug CFQ Scheduling"
++ depends on CFQ_GROUP_IOSCHED
++ select DEBUG_BLK_CGROUP
++ default n
++ ---help---
++ Enable CFQ IO scheduling debugging in CFQ. Currently it makes
++ blktrace output more verbose.
++
+ choice
+ prompt "Default I/O scheduler"
+ default DEFAULT_CFQ
+diff --git a/block/Makefile b/block/Makefile
+index ba74ca6..16334c9 100644
+--- a/block/Makefile
++++ b/block/Makefile
+@@ -8,6 +8,7 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
+ blk-iopoll.o ioctl.o genhd.o scsi_ioctl.o
+
+ obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
++obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o
+ obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
+ obj-$(CONFIG_IOSCHED_AS) += as-iosched.o
+ obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
+diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
+new file mode 100644
+index 0000000..444f20b
+--- /dev/null
++++ b/block/blk-cgroup.c
+@@ -0,0 +1,366 @@
++/*
++ * Common Block IO controller cgroup interface
++ *
++ * Based on ideas and code from CFQ, CFS and BFQ:
++ * Copyright (C) 2003 Jens Axboe <axboe at kernel.dk>
++ *
++ * Copyright (C) 2008 Fabio Checconi <fabio at gandalf.sssup.it>
++ * Paolo Valente <paolo.valente at unimore.it>
++ *
++ * Copyright (C) 2009 Vivek Goyal <vgoyal at redhat.com>
++ * Nauman Rafique <nauman at google.com>
++ */
++#include <linux/ioprio.h>
++#include <linux/seq_file.h>
++#include <linux/kdev_t.h>
++#include <linux/module.h>
++#include <linux/err.h>
++#include "blk-cgroup.h"
++
++static DEFINE_SPINLOCK(blkio_list_lock);
++static LIST_HEAD(blkio_list);
++
++struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
++EXPORT_SYMBOL_GPL(blkio_root_cgroup);
++
++bool blkiocg_css_tryget(struct blkio_cgroup *blkcg)
++{
++ if (!css_tryget(&blkcg->css))
++ return false;
++ return true;
++}
++EXPORT_SYMBOL_GPL(blkiocg_css_tryget);
++
++void blkiocg_css_put(struct blkio_cgroup *blkcg)
++{
++ css_put(&blkcg->css);
++}
++EXPORT_SYMBOL_GPL(blkiocg_css_put);
++
++struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
++{
++ return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
++ struct blkio_cgroup, css);
++}
++EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
++
++void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
++ unsigned long time, unsigned long sectors)
++{
++ blkg->time += time;
++ blkg->sectors += sectors;
++}
++EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_stats);
++
++void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
++ struct blkio_group *blkg, void *key, dev_t dev)
++{
++ unsigned long flags;
++
++ spin_lock_irqsave(&blkcg->lock, flags);
++ rcu_assign_pointer(blkg->key, key);
++ blkg->blkcg_id = css_id(&blkcg->css);
++ hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
++ spin_unlock_irqrestore(&blkcg->lock, flags);
++#ifdef CONFIG_DEBUG_BLK_CGROUP
++ /* Need to take css reference ? */
++ cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
++#endif
++ blkg->dev = dev;
++}
++EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group);
++
++static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
++{
++ hlist_del_init_rcu(&blkg->blkcg_node);
++ blkg->blkcg_id = 0;
++}
++
++/*
++ * returns 0 if blkio_group was still on the cgroup list. Otherwise returns 1,
++ * indicating that the blkio_group was unhashed by the time we got to it.
++ */
++int blkiocg_del_blkio_group(struct blkio_group *blkg)
++{
++ struct blkio_cgroup *blkcg;
++ unsigned long flags;
++ struct cgroup_subsys_state *css;
++ int ret = 1;
++
++ rcu_read_lock();
++ css = css_lookup(&blkio_subsys, blkg->blkcg_id);
++ if (!css)
++ goto out;
++
++ blkcg = container_of(css, struct blkio_cgroup, css);
++ spin_lock_irqsave(&blkcg->lock, flags);
++ if (!hlist_unhashed(&blkg->blkcg_node)) {
++ __blkiocg_del_blkio_group(blkg);
++ ret = 0;
++ }
++ spin_unlock_irqrestore(&blkcg->lock, flags);
++out:
++ rcu_read_unlock();
++ return ret;
++}
++EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);
++
++/* called under rcu_read_lock(). */
++struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
++{
++ struct blkio_group *blkg;
++ struct hlist_node *n;
++ void *__key;
++
++ hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
++ __key = blkg->key;
++ if (__key == key)
++ return blkg;
++ }
++
++ return NULL;
++}
++EXPORT_SYMBOL_GPL(blkiocg_lookup_group);
++
++#define SHOW_FUNCTION(__VAR) \
++static u64 blkiocg_##__VAR##_read(struct cgroup *cgroup, \
++ struct cftype *cftype) \
++{ \
++ struct blkio_cgroup *blkcg; \
++ \
++ blkcg = cgroup_to_blkio_cgroup(cgroup); \
++ return (u64)blkcg->__VAR; \
++}
++
++SHOW_FUNCTION(weight);
++#undef SHOW_FUNCTION
++
++static int
++blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
++{
++ struct blkio_cgroup *blkcg;
++ struct blkio_group *blkg;
++ struct hlist_node *n;
++ struct blkio_policy_type *blkiop;
++
++ if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
++ return -EINVAL;
++
++ blkcg = cgroup_to_blkio_cgroup(cgroup);
++ spin_lock(&blkio_list_lock);
++ spin_lock_irq(&blkcg->lock);
++ blkcg->weight = (unsigned int)val;
++ hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
++ list_for_each_entry(blkiop, &blkio_list, list)
++ blkiop->ops.blkio_update_group_weight_fn(blkg,
++ blkcg->weight);
++ }
++ spin_unlock_irq(&blkcg->lock);
++ spin_unlock(&blkio_list_lock);
++ return 0;
++}
++
++int blkiocg_set_weight(struct cgroup *cgroup, u64 val)
++{
++ return blkiocg_weight_write(cgroup, NULL, val);
++}
++
++#define SHOW_FUNCTION_PER_GROUP(__VAR) \
++static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \
++ struct cftype *cftype, struct seq_file *m) \
++{ \
++ struct blkio_cgroup *blkcg; \
++ struct blkio_group *blkg; \
++ struct hlist_node *n; \
++ \
++ if (!cgroup_lock_live_group(cgroup)) \
++ return -ENODEV; \
++ \
++ blkcg = cgroup_to_blkio_cgroup(cgroup); \
++ rcu_read_lock(); \
++ hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\
++ if (blkg->dev) \
++ seq_printf(m, "%u:%u %lu\n", MAJOR(blkg->dev), \
++ MINOR(blkg->dev), blkg->__VAR); \
++ } \
++ rcu_read_unlock(); \
++ cgroup_unlock(); \
++ return 0; \
++}
++
++SHOW_FUNCTION_PER_GROUP(time);
++SHOW_FUNCTION_PER_GROUP(sectors);
++#ifdef CONFIG_DEBUG_BLK_CGROUP
++SHOW_FUNCTION_PER_GROUP(dequeue);
++#endif
++#undef SHOW_FUNCTION_PER_GROUP
++
++#ifdef CONFIG_DEBUG_BLK_CGROUP
++void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg,
++ unsigned long dequeue)
++{
++ blkg->dequeue += dequeue;
++}
++EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_dequeue_stats);
++#endif
++
++struct cftype blkio_files[] = {
++ {
++ .name = "weight",
++ .read_u64 = blkiocg_weight_read,
++ .write_u64 = blkiocg_weight_write,
++ },
++ {
++ .name = "time",
++ .read_seq_string = blkiocg_time_read,
++ },
++ {
++ .name = "sectors",
++ .read_seq_string = blkiocg_sectors_read,
++ },
++#ifdef CONFIG_DEBUG_BLK_CGROUP
++ {
++ .name = "dequeue",
++ .read_seq_string = blkiocg_dequeue_read,
++ },
++#endif
++};
++
++static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
++{
++ return cgroup_add_files(cgroup, subsys, blkio_files,
++ ARRAY_SIZE(blkio_files));
++}
++
++static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
++{
++ struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
++ unsigned long flags;
++ struct blkio_group *blkg;
++ void *key;
++ struct blkio_policy_type *blkiop;
++
++ rcu_read_lock();
++remove_entry:
++ spin_lock_irqsave(&blkcg->lock, flags);
++
++ if (hlist_empty(&blkcg->blkg_list)) {
++ spin_unlock_irqrestore(&blkcg->lock, flags);
++ goto done;
++ }
++
++ blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
++ blkcg_node);
++ key = rcu_dereference(blkg->key);
++ __blkiocg_del_blkio_group(blkg);
++
++ spin_unlock_irqrestore(&blkcg->lock, flags);
++
++ /*
++ * This blkio_group is being unlinked as associated cgroup is going
++ * away. Let all the IO controlling policies know about this event.
++ *
++ * Currently this is a static call to one io controlling policy. Once
++ * we have more policies in place, we need some dynamic registration
++ * of callback function.
++ */
++ spin_lock(&blkio_list_lock);
++ list_for_each_entry(blkiop, &blkio_list, list)
++ blkiop->ops.blkio_unlink_group_fn(key, blkg);
++ spin_unlock(&blkio_list_lock);
++ goto remove_entry;
++done:
++ free_css_id(&blkio_subsys, &blkcg->css);
++ rcu_read_unlock();
++ kfree(blkcg);
++}
++
++static struct cgroup_subsys_state *
++blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
++{
++ struct blkio_cgroup *blkcg, *parent_blkcg;
++
++ if (!cgroup->parent) {
++ blkcg = &blkio_root_cgroup;
++ goto done;
++ }
++
++ /* Currently we do not support hierarchy deeper than two levels (0,1) */
++ parent_blkcg = cgroup_to_blkio_cgroup(cgroup->parent);
++ if (css_depth(&parent_blkcg->css) > 0)
++ return ERR_PTR(-EINVAL);
++
++ blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
++ if (!blkcg)
++ return ERR_PTR(-ENOMEM);
++
++ blkcg->weight = BLKIO_WEIGHT_DEFAULT;
++done:
++ spin_lock_init(&blkcg->lock);
++ INIT_HLIST_HEAD(&blkcg->blkg_list);
++
++ return &blkcg->css;
++}
++
++/*
++ * We cannot support shared io contexts, as we have no mean to support
++ * two tasks with the same ioc in two different groups without major rework
++ * of the main cic data structures. For now we allow a task to change
++ * its cgroup only if it's the only owner of its ioc.
++ */
++static int blkiocg_can_attach(struct cgroup_subsys *subsys,
++ struct cgroup *cgroup, struct task_struct *tsk,
++ bool threadgroup)
++{
++ struct io_context *ioc;
++ int ret = 0;
++
++ /* task_lock() is needed to avoid races with exit_io_context() */
++ task_lock(tsk);
++ ioc = tsk->io_context;
++ if (ioc && atomic_read(&ioc->nr_tasks) > 1)
++ ret = -EINVAL;
++ task_unlock(tsk);
++
++ return ret;
++}
++
++static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup,
++ struct cgroup *prev, struct task_struct *tsk,
++ bool threadgroup)
++{
++ struct io_context *ioc;
++
++ task_lock(tsk);
++ ioc = tsk->io_context;
++ if (ioc)
++ ioc->cgroup_changed = 1;
++ task_unlock(tsk);
++}
++
++struct cgroup_subsys blkio_subsys = {
++ .name = "blkio",
++ .create = blkiocg_create,
++ .can_attach = blkiocg_can_attach,
++ .attach = blkiocg_attach,
++ .destroy = blkiocg_destroy,
++ .populate = blkiocg_populate,
++ .subsys_id = blkio_subsys_id,
++ .use_id = 1,
++};
++
++void blkio_policy_register(struct blkio_policy_type *blkiop)
++{
++ spin_lock(&blkio_list_lock);
++ list_add_tail(&blkiop->list, &blkio_list);
++ spin_unlock(&blkio_list_lock);
++}
++EXPORT_SYMBOL_GPL(blkio_policy_register);
++
++void blkio_policy_unregister(struct blkio_policy_type *blkiop)
++{
++ spin_lock(&blkio_list_lock);
++ list_del_init(&blkiop->list);
++ spin_unlock(&blkio_list_lock);
++}
++EXPORT_SYMBOL_GPL(blkio_policy_unregister);
+diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
+new file mode 100644
+index 0000000..4d316df
+--- /dev/null
++++ b/block/blk-cgroup.h
+@@ -0,0 +1,127 @@
++#ifndef _BLK_CGROUP_H
++#define _BLK_CGROUP_H
++/*
++ * Common Block IO controller cgroup interface
++ *
++ * Based on ideas and code from CFQ, CFS and BFQ:
++ * Copyright (C) 2003 Jens Axboe <axboe at kernel.dk>
++ *
++ * Copyright (C) 2008 Fabio Checconi <fabio at gandalf.sssup.it>
++ * Paolo Valente <paolo.valente at unimore.it>
++ *
++ * Copyright (C) 2009 Vivek Goyal <vgoyal at redhat.com>
++ * Nauman Rafique <nauman at google.com>
++ */
++
++#include <linux/cgroup.h>
++
++#ifdef CONFIG_BLK_CGROUP
++
++struct blkio_cgroup {
++ struct cgroup_subsys_state css;
++ unsigned int weight;
++ spinlock_t lock;
++ struct hlist_head blkg_list;
++};
++
++struct blkio_group {
++ /* An rcu protected unique identifier for the group */
++ void *key;
++ struct hlist_node blkcg_node;
++ unsigned short blkcg_id;
++#ifdef CONFIG_DEBUG_BLK_CGROUP
++ /* Store cgroup path */
++ char path[128];
++ /* How many times this group has been removed from service tree */
++ unsigned long dequeue;
++#endif
++ /* The device MKDEV(major, minor), this group has been created for */
++ dev_t dev;
++
++ /* total disk time and nr sectors dispatched by this group */
++ unsigned long time;
++ unsigned long sectors;
++};
++
++extern bool blkiocg_css_tryget(struct blkio_cgroup *blkcg);
++extern void blkiocg_css_put(struct blkio_cgroup *blkcg);
++
++typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg);
++typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg,
++ unsigned int weight);
++
++struct blkio_policy_ops {
++ blkio_unlink_group_fn *blkio_unlink_group_fn;
++ blkio_update_group_weight_fn *blkio_update_group_weight_fn;
++};
++
++struct blkio_policy_type {
++ struct list_head list;
++ struct blkio_policy_ops ops;
++};
++
++/* Blkio controller policy registration */
++extern void blkio_policy_register(struct blkio_policy_type *);
++extern void blkio_policy_unregister(struct blkio_policy_type *);
++
++#else
++
++struct blkio_group {
++};
++
++struct blkio_policy_type {
++};
++
++static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { }
++static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { }
++
++#endif
++
++#define BLKIO_WEIGHT_MIN 100
++#define BLKIO_WEIGHT_MAX 1000
++#define BLKIO_WEIGHT_DEFAULT 500
++
++#ifdef CONFIG_DEBUG_BLK_CGROUP
++static inline char *blkg_path(struct blkio_group *blkg)
++{
++ return blkg->path;
++}
++void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg,
++ unsigned long dequeue);
++#else
++static inline char *blkg_path(struct blkio_group *blkg) { return NULL; }
++static inline void blkiocg_update_blkio_group_dequeue_stats(
++ struct blkio_group *blkg, unsigned long dequeue) {}
++#endif
++
++#ifdef CONFIG_BLK_CGROUP
++extern struct blkio_cgroup blkio_root_cgroup;
++extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup);
++extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
++ struct blkio_group *blkg, void *key, dev_t dev);
++extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
++extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
++ void *key);
++void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
++ unsigned long time, unsigned long sectors);
++#else
++struct cgroup;
++static inline struct blkio_cgroup *
++cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; }
++
++static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
++ struct blkio_group *blkg, void *key, dev_t dev)
++{
++}
++
++static inline int
++blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
++
++static inline struct blkio_group *
++blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; }
++static inline void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
++ unsigned long time, unsigned long sectors)
++{
++}
++#endif
++#endif /* _BLK_CGROUP_H */
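For reference, a hypothetical minimal consumer of the interface declared above (CFQ is the real consumer, wired up in the cfq-iosched.c changes below, and this assumes CONFIG_BLK_CGROUP): a policy fills in blkio_policy_ops and registers itself so it is notified when a blkio_group is unlinked or its weight changes. All names here are illustrative only.

#include <linux/init.h>
#include "blk-cgroup.h"

static void example_unlink_group(void *key, struct blkio_group *blkg)
{
	/* tear down the policy's per-group state hanging off blkg */
}

static void example_update_weight(struct blkio_group *blkg, unsigned int weight)
{
	/* fold the new cgroup weight into future scheduling decisions */
}

static struct blkio_policy_type example_blkio_policy = {
	.ops = {
		.blkio_unlink_group_fn         = example_unlink_group,
		.blkio_update_group_weight_fn  = example_update_weight,
	},
};

static int __init example_blkio_policy_init(void)
{
	blkio_policy_register(&example_blkio_policy);
	return 0;
}

static void __exit example_blkio_policy_exit(void)
{
	blkio_policy_unregister(&example_blkio_policy);
}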
+diff --git a/block/blk-settings.c b/block/blk-settings.c
+index 9651c0a..06c6694 100644
+--- a/block/blk-settings.c
++++ b/block/blk-settings.c
+@@ -9,6 +9,7 @@
+ #include <linux/bootmem.h> /* for max_pfn/max_low_pfn */
+ #include <linux/gcd.h>
+ #include <linux/lcm.h>
++#include <linux/jiffies.h>
+
+ #include "blk.h"
+
+@@ -142,7 +143,7 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
+ q->nr_batching = BLK_BATCH_REQ;
+
+ q->unplug_thresh = 4; /* hmm */
+- q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */
++ q->unplug_delay = msecs_to_jiffies(3); /* 3 milliseconds */
+ if (q->unplug_delay == 0)
+ q->unplug_delay = 1;
+
+diff --git a/block/bsg.c b/block/bsg.c
+index 0676301..a9fd2d8 100644
+--- a/block/bsg.c
++++ b/block/bsg.c
+@@ -15,6 +15,7 @@
+ #include <linux/blkdev.h>
+ #include <linux/poll.h>
+ #include <linux/cdev.h>
++#include <linux/jiffies.h>
+ #include <linux/percpu.h>
+ #include <linux/uio.h>
+ #include <linux/idr.h>
+@@ -197,7 +198,7 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq,
+ rq->cmd_len = hdr->request_len;
+ rq->cmd_type = REQ_TYPE_BLOCK_PC;
+
+- rq->timeout = (hdr->timeout * HZ) / 1000;
++ rq->timeout = msecs_to_jiffies(hdr->timeout);
+ if (!rq->timeout)
+ rq->timeout = q->sg_timeout;
+ if (!rq->timeout)
+diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
+index aa1e953..023f4e6 100644
+--- a/block/cfq-iosched.c
++++ b/block/cfq-iosched.c
+@@ -9,9 +9,11 @@
+ #include <linux/module.h>
+ #include <linux/blkdev.h>
+ #include <linux/elevator.h>
++#include <linux/jiffies.h>
+ #include <linux/rbtree.h>
+ #include <linux/ioprio.h>
+ #include <linux/blktrace_api.h>
++#include "blk-cgroup.h"
+
+ /*
+ * tunables
+@@ -27,6 +29,8 @@ static const int cfq_slice_sync = HZ / 10;
+ static int cfq_slice_async = HZ / 25;
+ static const int cfq_slice_async_rq = 2;
+ static int cfq_slice_idle = HZ / 125;
++static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
++static const int cfq_hist_divisor = 4;
+
+ /*
+ * offset from end of service tree
+@@ -40,6 +44,10 @@ static int cfq_slice_idle = HZ / 125;
+
+ #define CFQ_SLICE_SCALE (5)
+ #define CFQ_HW_QUEUE_MIN (5)
++#define CFQ_SERVICE_SHIFT 12
++
++#define CFQQ_SEEK_THR 8 * 1024
++#define CFQQ_SEEKY(cfqq) ((cfqq)->seek_mean > CFQQ_SEEK_THR)
+
+ #define RQ_CIC(rq) \
+ ((struct cfq_io_context *) (rq)->elevator_private)
+@@ -57,6 +65,7 @@ static DEFINE_SPINLOCK(ioc_gone_lock);
+ #define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
+
+ #define sample_valid(samples) ((samples) > 80)
++#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node)
+
+ /*
+ * Most of our rbtree usage is for sorting with min extraction, so
+@@ -67,8 +76,12 @@ static DEFINE_SPINLOCK(ioc_gone_lock);
+ struct cfq_rb_root {
+ struct rb_root rb;
+ struct rb_node *left;
++ unsigned count;
++ u64 min_vdisktime;
++ struct rb_node *active;
++ unsigned total_weight;
+ };
+-#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, }
++#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, 0, 0, }
+
+ /*
+ * Per process-grouping structure
+@@ -99,6 +112,11 @@ struct cfq_queue {
+ /* fifo list of requests in sort_list */
+ struct list_head fifo;
+
++ /* time when queue got scheduled in to dispatch first request. */
++ unsigned long dispatch_start;
++ unsigned int allocated_slice;
++ /* time when first request from queue completed and slice started. */
++ unsigned long slice_start;
+ unsigned long slice_end;
+ long slice_resid;
+ unsigned int slice_dispatch;
+@@ -112,7 +130,70 @@ struct cfq_queue {
+ unsigned short ioprio, org_ioprio;
+ unsigned short ioprio_class, org_ioprio_class;
+
++ unsigned int seek_samples;
++ u64 seek_total;
++ sector_t seek_mean;
++ sector_t last_request_pos;
++
+ pid_t pid;
++
++ struct cfq_rb_root *service_tree;
++ struct cfq_queue *new_cfqq;
++ struct cfq_group *cfqg;
++ struct cfq_group *orig_cfqg;
++ /* Sectors dispatched in current dispatch round */
++ unsigned long nr_sectors;
++};
++
++/*
++ * First index in the service_trees.
++ * IDLE is handled separately, so it has negative index
++ */
++enum wl_prio_t {
++ BE_WORKLOAD = 0,
++ RT_WORKLOAD = 1,
++ IDLE_WORKLOAD = 2,
++};
++
++/*
++ * Second index in the service_trees.
++ */
++enum wl_type_t {
++ ASYNC_WORKLOAD = 0,
++ SYNC_NOIDLE_WORKLOAD = 1,
++ SYNC_WORKLOAD = 2
++};
++
++/* This is per cgroup per device grouping structure */
++struct cfq_group {
++ /* group service_tree member */
++ struct rb_node rb_node;
++
++ /* group service_tree key */
++ u64 vdisktime;
++ unsigned int weight;
++ bool on_st;
++
++ /* number of cfqq currently on this group */
++ int nr_cfqq;
++
++ /* Per group busy queues average. Useful for workload slice calc. */
++ unsigned int busy_queues_avg[2];
++ /*
++ * rr lists of queues with requests, one rr for each priority class.
++ * Counts are embedded in the cfq_rb_root
++ */
++ struct cfq_rb_root service_trees[2][3];
++ struct cfq_rb_root service_tree_idle;
++
++ unsigned long saved_workload_slice;
++ enum wl_type_t saved_workload;
++ enum wl_prio_t saved_serving_prio;
++ struct blkio_group blkg;
++#ifdef CONFIG_CFQ_GROUP_IOSCHED
++ struct hlist_node cfqd_node;
++ atomic_t ref;
++#endif
+ };
+
+ /*
+@@ -120,11 +201,18 @@ struct cfq_queue {
+ */
+ struct cfq_data {
+ struct request_queue *queue;
++ /* Root service tree for cfq_groups */
++ struct cfq_rb_root grp_service_tree;
++ struct cfq_group root_group;
+
+ /*
+- * rr list of queues with requests and the count of them
++ * The priority currently being served
+ */
+- struct cfq_rb_root service_tree;
++ enum wl_prio_t serving_prio;
++ enum wl_type_t serving_type;
++ unsigned long workload_expires;
++ struct cfq_group *serving_group;
++ bool noidle_tree_requires_idle;
+
+ /*
+ * Each priority tree is sorted by next_request position. These
+@@ -143,8 +231,14 @@ struct cfq_data {
+ */
+ int rq_queued;
+ int hw_tag;
+- int hw_tag_samples;
+- int rq_in_driver_peak;
++ /*
++ * hw_tag can be
++ * -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection)
++ * 1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
++ * 0 => no NCQ
++ */
++ int hw_tag_est_depth;
++ unsigned int hw_tag_samples;
+
+ /*
+ * idle window management
+@@ -174,6 +268,7 @@ struct cfq_data {
+ unsigned int cfq_slice_async_rq;
+ unsigned int cfq_slice_idle;
+ unsigned int cfq_latency;
++ unsigned int cfq_group_isolation;
+
+ struct list_head cic_list;
+
+@@ -182,9 +277,28 @@ struct cfq_data {
+ */
+ struct cfq_queue oom_cfqq;
+
+- unsigned long last_end_sync_rq;
++ unsigned long last_delayed_sync;
++
++ /* List of cfq groups being managed on this device*/
++ struct hlist_head cfqg_list;
++ struct rcu_head rcu;
+ };
+
++static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
++
++static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
++ enum wl_prio_t prio,
++ enum wl_type_t type)
++{
++ if (!cfqg)
++ return NULL;
++
++ if (prio == IDLE_WORKLOAD)
++ return &cfqg->service_tree_idle;
++
++ return &cfqg->service_trees[prio][type];
++}
++
+ enum cfqq_state_flags {
+ CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */
+ CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */
+@@ -195,8 +309,10 @@ enum cfqq_state_flags {
+ CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */
+ CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */
+ CFQ_CFQQ_FLAG_sync, /* synchronous queue */
+- CFQ_CFQQ_FLAG_coop, /* has done a coop jump of the queue */
+- CFQ_CFQQ_FLAG_coop_preempt, /* coop preempt */
++ CFQ_CFQQ_FLAG_coop, /* cfqq is shared */
++ CFQ_CFQQ_FLAG_split_coop, /* shared cfqq will be split */
++ CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */
++ CFQ_CFQQ_FLAG_wait_busy, /* Waiting for next request */
+ };
+
+ #define CFQ_CFQQ_FNS(name) \
+@@ -223,14 +339,78 @@ CFQ_CFQQ_FNS(prio_changed);
+ CFQ_CFQQ_FNS(slice_new);
+ CFQ_CFQQ_FNS(sync);
+ CFQ_CFQQ_FNS(coop);
+-CFQ_CFQQ_FNS(coop_preempt);
++CFQ_CFQQ_FNS(split_coop);
++CFQ_CFQQ_FNS(deep);
++CFQ_CFQQ_FNS(wait_busy);
+ #undef CFQ_CFQQ_FNS
+
++#ifdef CONFIG_DEBUG_CFQ_IOSCHED
++#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
++ blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
++ cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
++ blkg_path(&(cfqq)->cfqg->blkg), ##args);
++
++#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \
++ blk_add_trace_msg((cfqd)->queue, "%s " fmt, \
++ blkg_path(&(cfqg)->blkg), ##args); \
++
++#else
+ #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
+ blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
++#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0);
++#endif
+ #define cfq_log(cfqd, fmt, args...) \
+ blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
+
++/* Traverses through cfq group service trees */
++#define for_each_cfqg_st(cfqg, i, j, st) \
++ for (i = 0; i <= IDLE_WORKLOAD; i++) \
++ for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\
++ : &cfqg->service_tree_idle; \
++ (i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \
++ (i == IDLE_WORKLOAD && j == 0); \
++ j++, st = i < IDLE_WORKLOAD ? \
++ &cfqg->service_trees[i][j]: NULL) \
++
++
++static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq)
++{
++ if (cfq_class_idle(cfqq))
++ return IDLE_WORKLOAD;
++ if (cfq_class_rt(cfqq))
++ return RT_WORKLOAD;
++ return BE_WORKLOAD;
++}
++
++
++static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
++{
++ if (!cfq_cfqq_sync(cfqq))
++ return ASYNC_WORKLOAD;
++ if (!cfq_cfqq_idle_window(cfqq))
++ return SYNC_NOIDLE_WORKLOAD;
++ return SYNC_WORKLOAD;
++}
++
++static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl,
++ struct cfq_data *cfqd,
++ struct cfq_group *cfqg)
++{
++ if (wl == IDLE_WORKLOAD)
++ return cfqg->service_tree_idle.count;
++
++ return cfqg->service_trees[wl][ASYNC_WORKLOAD].count
++ + cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count
++ + cfqg->service_trees[wl][SYNC_WORKLOAD].count;
++}
++
++static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
++ struct cfq_group *cfqg)
++{
++ return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count
++ + cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
++}
++
+ static void cfq_dispatch_insert(struct request_queue *, struct request *);
+ static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool,
+ struct io_context *, gfp_t);
+@@ -279,7 +459,7 @@ static int cfq_queue_empty(struct request_queue *q)
+ {
+ struct cfq_data *cfqd = q->elevator->elevator_data;
+
+- return !cfqd->busy_queues;
++ return !cfqd->rq_queued;
+ }
+
+ /*
+@@ -303,10 +483,110 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+ return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
+ }
+
++static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
++{
++ u64 d = delta << CFQ_SERVICE_SHIFT;
++
++ d = d * BLKIO_WEIGHT_DEFAULT;
++ do_div(d, cfqg->weight);
++ return d;
++}
++
++static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
++{
++ s64 delta = (s64)(vdisktime - min_vdisktime);
++ if (delta > 0)
++ min_vdisktime = vdisktime;
++
++ return min_vdisktime;
++}
++
++static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
++{
++ s64 delta = (s64)(vdisktime - min_vdisktime);
++ if (delta < 0)
++ min_vdisktime = vdisktime;
++
++ return min_vdisktime;
++}
++
++static void update_min_vdisktime(struct cfq_rb_root *st)
++{
++ u64 vdisktime = st->min_vdisktime;
++ struct cfq_group *cfqg;
++
++ if (st->active) {
++ cfqg = rb_entry_cfqg(st->active);
++ vdisktime = cfqg->vdisktime;
++ }
++
++ if (st->left) {
++ cfqg = rb_entry_cfqg(st->left);
++ vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime);
++ }
++
++ st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime);
++}
++
++/*
++ * get averaged number of queues of RT/BE priority.
++ * average is updated, with a formula that gives more weight to higher numbers,
++ * to quickly follow sudden increases and decrease slowly
++ */
++
++static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
++ struct cfq_group *cfqg, bool rt)
++{
++ unsigned min_q, max_q;
++ unsigned mult = cfq_hist_divisor - 1;
++ unsigned round = cfq_hist_divisor / 2;
++ unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg);
++
++ min_q = min(cfqg->busy_queues_avg[rt], busy);
++ max_q = max(cfqg->busy_queues_avg[rt], busy);
++ cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
++ cfq_hist_divisor;
++ return cfqg->busy_queues_avg[rt];
++}
++
++static inline unsigned
++cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
++{
++ struct cfq_rb_root *st = &cfqd->grp_service_tree;
++
++ return cfq_target_latency * cfqg->weight / st->total_weight;
++}
++
+ static inline void
+ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+ {
+- cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies;
++ unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
++ if (cfqd->cfq_latency) {
++ /*
++ * interested queues (we consider only the ones with the same
++ * priority class in the cfq group)
++ */
++ unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg,
++ cfq_class_rt(cfqq));
++ unsigned sync_slice = cfqd->cfq_slice[1];
++ unsigned expect_latency = sync_slice * iq;
++ unsigned group_slice = cfq_group_slice(cfqd, cfqq->cfqg);
++
++ if (expect_latency > group_slice) {
++ unsigned base_low_slice = 2 * cfqd->cfq_slice_idle;
++ /* scale low_slice according to IO priority
++ * and sync vs async */
++ unsigned low_slice =
++ min(slice, base_low_slice * slice / sync_slice);
++ /* the adapted slice value is scaled to fit all iqs
++ * into the target latency */
++ slice = max(slice * group_slice / expect_latency,
++ low_slice);
++ }
++ }
++ cfqq->slice_start = jiffies;
++ cfqq->slice_end = jiffies + slice;
++ cfqq->allocated_slice = slice;
+ cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
+ }
+
+@@ -331,9 +611,9 @@ static inline bool cfq_slice_used(struct cfq_queue *cfqq)
+ * behind the head is penalized and only allowed to a certain extent.
+ */
+ static struct request *
+-cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2)
++cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last)
+ {
+- sector_t last, s1, s2, d1 = 0, d2 = 0;
++ sector_t s1, s2, d1 = 0, d2 = 0;
+ unsigned long back_max;
+ #define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */
+ #define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */
+@@ -356,8 +636,6 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2)
+ s1 = blk_rq_pos(rq1);
+ s2 = blk_rq_pos(rq2);
+
+- last = cfqd->last_position;
+-
+ /*
+ * by definition, 1KiB is 2 sectors
+ */
+@@ -425,6 +703,10 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2)
+ */
+ static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
+ {
++ /* Service tree is empty */
++ if (!root->count)
++ return NULL;
++
+ if (!root->left)
+ root->left = rb_first(&root->rb);
+
+@@ -434,6 +716,17 @@ static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
+ return NULL;
+ }
+
++static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
++{
++ if (!root->left)
++ root->left = rb_first(&root->rb);
++
++ if (root->left)
++ return rb_entry_cfqg(root->left);
++
++ return NULL;
++}
++
+ static void rb_erase_init(struct rb_node *n, struct rb_root *root)
+ {
+ rb_erase(n, root);
+@@ -445,6 +738,7 @@ static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
+ if (root->left == n)
+ root->left = NULL;
+ rb_erase_init(n, &root->rb);
++ --root->count;
+ }
+
+ /*
+@@ -471,7 +765,7 @@ cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
+ next = rb_entry_rq(rbnext);
+ }
+
+- return cfq_choose_req(cfqd, next, prev);
++ return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
+ }
+
+ static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
+@@ -480,12 +774,334 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
+ /*
+ * just an approximation, should be ok.
+ */
+- return (cfqd->busy_queues - 1) * (cfq_prio_slice(cfqd, 1, 0) -
++ return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
+ cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
+ }
+
++static inline s64
++cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
++{
++ return cfqg->vdisktime - st->min_vdisktime;
++}
++
++static void
++__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
++{
++ struct rb_node **node = &st->rb.rb_node;
++ struct rb_node *parent = NULL;
++ struct cfq_group *__cfqg;
++ s64 key = cfqg_key(st, cfqg);
++ int left = 1;
++
++ while (*node != NULL) {
++ parent = *node;
++ __cfqg = rb_entry_cfqg(parent);
++
++ if (key < cfqg_key(st, __cfqg))
++ node = &parent->rb_left;
++ else {
++ node = &parent->rb_right;
++ left = 0;
++ }
++ }
++
++ if (left)
++ st->left = &cfqg->rb_node;
++
++ rb_link_node(&cfqg->rb_node, parent, node);
++ rb_insert_color(&cfqg->rb_node, &st->rb);
++}
++
++static void
++cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
++{
++ struct cfq_rb_root *st = &cfqd->grp_service_tree;
++ struct cfq_group *__cfqg;
++ struct rb_node *n;
++
++ cfqg->nr_cfqq++;
++ if (cfqg->on_st)
++ return;
++
++ /*
++ * Currently put the group at the end. Later implement something
++ * so that groups get lesser vtime based on their weights, so that
++ * a group does not lose everything if it was not continuously backlogged.
++ */
++ n = rb_last(&st->rb);
++ if (n) {
++ __cfqg = rb_entry_cfqg(n);
++ cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
++ } else
++ cfqg->vdisktime = st->min_vdisktime;
++
++ __cfq_group_service_tree_add(st, cfqg);
++ cfqg->on_st = true;
++ st->total_weight += cfqg->weight;
++}
++
++static void
++cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
++{
++ struct cfq_rb_root *st = &cfqd->grp_service_tree;
++
++ if (st->active == &cfqg->rb_node)
++ st->active = NULL;
++
++ BUG_ON(cfqg->nr_cfqq < 1);
++ cfqg->nr_cfqq--;
++
++ /* If there are other cfq queues under this group, don't delete it */
++ if (cfqg->nr_cfqq)
++ return;
++
++ cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
++ cfqg->on_st = false;
++ st->total_weight -= cfqg->weight;
++ if (!RB_EMPTY_NODE(&cfqg->rb_node))
++ cfq_rb_erase(&cfqg->rb_node, st);
++ cfqg->saved_workload_slice = 0;
++ blkiocg_update_blkio_group_dequeue_stats(&cfqg->blkg, 1);
++}
++
++static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
++{
++ unsigned int slice_used;
++
++ /*
++ * Queue got expired before even a single request completed or
++ * got expired immediately after first request completion.
++ */
++ if (!cfqq->slice_start || cfqq->slice_start == jiffies) {
++ /*
++ * Also charge the seek time incurred to the group, otherwise
++ * if there are multiple queues in the group, each can dispatch
++ * a single request on seeky media and cause lots of seek time
++ * and group will never know it.
++ */
++ slice_used = max_t(unsigned, (jiffies - cfqq->dispatch_start),
++ 1);
++ } else {
++ slice_used = jiffies - cfqq->slice_start;
++ if (slice_used > cfqq->allocated_slice)
++ slice_used = cfqq->allocated_slice;
++ }
++
++ cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u sect=%lu", slice_used,
++ cfqq->nr_sectors);
++ return slice_used;
++}
++
++static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
++ struct cfq_queue *cfqq)
++{
++ struct cfq_rb_root *st = &cfqd->grp_service_tree;
++ unsigned int used_sl, charge_sl;
++ int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
++ - cfqg->service_tree_idle.count;
++
++ BUG_ON(nr_sync < 0);
++ used_sl = charge_sl = cfq_cfqq_slice_usage(cfqq);
++
++ if (!cfq_cfqq_sync(cfqq) && !nr_sync)
++ charge_sl = cfqq->allocated_slice;
++
++ /* Can't update vdisktime while group is on service tree */
++ cfq_rb_erase(&cfqg->rb_node, st);
++ cfqg->vdisktime += cfq_scale_slice(charge_sl, cfqg);
++ __cfq_group_service_tree_add(st, cfqg);
++
++ /* This group is being expired. Save the context */
++ if (time_after(cfqd->workload_expires, jiffies)) {
++ cfqg->saved_workload_slice = cfqd->workload_expires
++ - jiffies;
++ cfqg->saved_workload = cfqd->serving_type;
++ cfqg->saved_serving_prio = cfqd->serving_prio;
++ } else
++ cfqg->saved_workload_slice = 0;
++
++ cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
++ st->min_vdisktime);
++ blkiocg_update_blkio_group_stats(&cfqg->blkg, used_sl,
++ cfqq->nr_sectors);
++}
++
++#ifdef CONFIG_CFQ_GROUP_IOSCHED
++static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
++{
++ if (blkg)
++ return container_of(blkg, struct cfq_group, blkg);
++ return NULL;
++}
++
++void
++cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight)
++{
++ cfqg_of_blkg(blkg)->weight = weight;
++}
++
++static struct cfq_group *
++cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
++{
++ struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
++ struct cfq_group *cfqg = NULL;
++ void *key = cfqd;
++ int i, j;
++ struct cfq_rb_root *st;
++ struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
++ unsigned int major, minor;
++
++ /* Do we need to take this reference */
++ if (!blkiocg_css_tryget(blkcg))
++ return NULL;
++
++ cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
++ if (cfqg || !create)
++ goto done;
++
++ cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
++ if (!cfqg)
++ goto done;
++
++ cfqg->weight = blkcg->weight;
++ for_each_cfqg_st(cfqg, i, j, st)
++ *st = CFQ_RB_ROOT;
++ RB_CLEAR_NODE(&cfqg->rb_node);
++
++ /*
++ * Take the initial reference that will be released on destroy
++ * This can be thought of a joint reference by cgroup and
++ * elevator which will be dropped by either elevator exit
++ * or cgroup deletion path depending on who is exiting first.
++ */
++ atomic_set(&cfqg->ref, 1);
++
++ /* Add group onto cgroup list */
++ sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
++ blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
++ MKDEV(major, minor));
++
++ /* Add group on cfqd list */
++ hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
++
++done:
++ blkiocg_css_put(blkcg);
++ return cfqg;
++}
++
++/*
++ * Search for the cfq group current task belongs to. If create = 1, then also
++ * create the cfq group if it does not exist. request_queue lock must be held.
++ */
++static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
++{
++ struct cgroup *cgroup;
++ struct cfq_group *cfqg = NULL;
++
++ rcu_read_lock();
++ cgroup = task_cgroup(current, blkio_subsys_id);
++ cfqg = cfq_find_alloc_cfqg(cfqd, cgroup, create);
++ if (!cfqg && create)
++ cfqg = &cfqd->root_group;
++ rcu_read_unlock();
++ return cfqg;
++}
++
++static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
++{
++ /* Currently, all async queues are mapped to root group */
++ if (!cfq_cfqq_sync(cfqq))
++ cfqg = &cfqq->cfqd->root_group;
++
++ cfqq->cfqg = cfqg;
++ /* cfqq reference on cfqg */
++ atomic_inc(&cfqq->cfqg->ref);
++}
++
++static void cfq_put_cfqg(struct cfq_group *cfqg)
++{
++ struct cfq_rb_root *st;
++ int i, j;
++
++ BUG_ON(atomic_read(&cfqg->ref) <= 0);
++ if (!atomic_dec_and_test(&cfqg->ref))
++ return;
++ for_each_cfqg_st(cfqg, i, j, st)
++ BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL);
++ kfree(cfqg);
++}
++
++static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
++{
++ /* Something wrong if we are trying to remove same group twice */
++ BUG_ON(hlist_unhashed(&cfqg->cfqd_node));
++
++ hlist_del_init(&cfqg->cfqd_node);
++
++ /*
++ * Put the reference taken at the time of creation so that when all
++ * queues are gone, group can be destroyed.
++ */
++ cfq_put_cfqg(cfqg);
++}
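
(Aside, not part of the patch: a minimal userspace sketch of the reference rule the comments above describe, namely one "creation" reference shared by cgroup and elevator, one extra reference per linked cfqq, and a free on the final put. All names and the plain int counter below are made up for illustration.)

#include <stdio.h>
#include <stdlib.h>

struct demo_group {
	int ref;			/* atomic_t in the real code */
};

static struct demo_group *demo_group_create(void)
{
	struct demo_group *g = calloc(1, sizeof(*g));

	g->ref = 1;			/* joint cgroup/elevator reference */
	return g;
}

static void demo_group_get(struct demo_group *g)
{
	g->ref++;
}

static void demo_group_put(struct demo_group *g)
{
	if (--g->ref == 0) {		/* last holder frees, as in cfq_put_cfqg() */
		free(g);
		printf("group freed\n");
	}
}

int main(void)
{
	struct demo_group *g = demo_group_create();

	demo_group_get(g);		/* a cfqq links to the group */
	demo_group_put(g);		/* destroy path drops the creation ref */
	demo_group_put(g);		/* last cfqq reference goes away, group freed */
	return 0;
}
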
++
++static void cfq_release_cfq_groups(struct cfq_data *cfqd)
++{
++ struct hlist_node *pos, *n;
++ struct cfq_group *cfqg;
++
++ hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) {
++ /*
++ * If cgroup removal path got to blk_group first and removed
++ * it from cgroup list, then it will take care of destroying
++ * cfqg also.
++ */
++ if (!blkiocg_del_blkio_group(&cfqg->blkg))
++ cfq_destroy_cfqg(cfqd, cfqg);
++ }
++}
++
++/*
++ * Blk cgroup controller notification saying that blkio_group object is being
++ * delinked as associated cgroup object is going away. That also means that
++ * no new IO will come in this group. So get rid of this group as soon as
++ * any pending IO in the group is finished.
++ *
++ * This function is called under rcu_read_lock(). key is the rcu protected
++ * pointer. That means "key" is a valid cfq_data pointer as long as we are
++ * under rcu read lock.
++ *
++ * "key" was fetched from blkio_group under blkio_cgroup->lock. That means
++ * it should not be NULL as even if elevator was exiting, the cgroup deletion
++ * path got to it first.
++ */
++void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg)
++{
++ unsigned long flags;
++ struct cfq_data *cfqd = key;
++
++ spin_lock_irqsave(cfqd->queue->queue_lock, flags);
++ cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg));
++ spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
++}
++
++#else /* GROUP_IOSCHED */
++static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
++{
++ return &cfqd->root_group;
++}
++static inline void
++cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
++ cfqq->cfqg = cfqg;
++}
++
++static void cfq_release_cfq_groups(struct cfq_data *cfqd) {}
++static inline void cfq_put_cfqg(struct cfq_group *cfqg) {}
++
++#endif /* GROUP_IOSCHED */
++
+ /*
+- * The cfqd->service_tree holds all pending cfq_queue's that have
++ * The cfqd->service_trees holds all pending cfq_queue's that have
+ * requests waiting to be processed. It is sorted in the order that
+ * we will service the queues.
+ */
+@@ -495,11 +1111,42 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
+ struct rb_node **p, *parent;
+ struct cfq_queue *__cfqq;
+ unsigned long rb_key;
++ struct cfq_rb_root *service_tree;
+ int left;
++ int new_cfqq = 1;
++ int group_changed = 0;
++
++#ifdef CONFIG_CFQ_GROUP_IOSCHED
++ if (!cfqd->cfq_group_isolation
++ && cfqq_type(cfqq) == SYNC_NOIDLE_WORKLOAD
++ && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) {
++ /* Move this cfq to root group */
++ cfq_log_cfqq(cfqd, cfqq, "moving to root group");
++ if (!RB_EMPTY_NODE(&cfqq->rb_node))
++ cfq_group_service_tree_del(cfqd, cfqq->cfqg);
++ cfqq->orig_cfqg = cfqq->cfqg;
++ cfqq->cfqg = &cfqd->root_group;
++ atomic_inc(&cfqd->root_group.ref);
++ group_changed = 1;
++ } else if (!cfqd->cfq_group_isolation
++ && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
++ /* cfqq is sequential now and needs to go to its original group */
++ BUG_ON(cfqq->cfqg != &cfqd->root_group);
++ if (!RB_EMPTY_NODE(&cfqq->rb_node))
++ cfq_group_service_tree_del(cfqd, cfqq->cfqg);
++ cfq_put_cfqg(cfqq->cfqg);
++ cfqq->cfqg = cfqq->orig_cfqg;
++ cfqq->orig_cfqg = NULL;
++ group_changed = 1;
++ cfq_log_cfqq(cfqd, cfqq, "moved to origin group");
++ }
++#endif
+
++ service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
++ cfqq_type(cfqq));
+ if (cfq_class_idle(cfqq)) {
+ rb_key = CFQ_IDLE_DELAY;
+- parent = rb_last(&cfqd->service_tree.rb);
++ parent = rb_last(&service_tree->rb);
+ if (parent && parent != &cfqq->rb_node) {
+ __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
+ rb_key += __cfqq->rb_key;
+@@ -517,23 +1164,27 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
+ cfqq->slice_resid = 0;
+ } else {
+ rb_key = -HZ;
+- __cfqq = cfq_rb_first(&cfqd->service_tree);
++ __cfqq = cfq_rb_first(service_tree);
+ rb_key += __cfqq ? __cfqq->rb_key : jiffies;
+ }
+
+ if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
++ new_cfqq = 0;
+ /*
+ * same position, nothing more to do
+ */
+- if (rb_key == cfqq->rb_key)
++ if (rb_key == cfqq->rb_key &&
++ cfqq->service_tree == service_tree)
+ return;
+
+- cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree);
++ cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
++ cfqq->service_tree = NULL;
+ }
+
+ left = 1;
+ parent = NULL;
+- p = &cfqd->service_tree.rb.rb_node;
++ cfqq->service_tree = service_tree;
++ p = &service_tree->rb.rb_node;
+ while (*p) {
+ struct rb_node **n;
+
+@@ -541,35 +1192,28 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
+ __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
+
+ /*
+- * sort RT queues first, we always want to give
+- * preference to them. IDLE queues goes to the back.
+- * after that, sort on the next service time.
++ * sort by key, that represents service time.
+ */
+- if (cfq_class_rt(cfqq) > cfq_class_rt(__cfqq))
++ if (time_before(rb_key, __cfqq->rb_key))
+ n = &(*p)->rb_left;
+- else if (cfq_class_rt(cfqq) < cfq_class_rt(__cfqq))
+- n = &(*p)->rb_right;
+- else if (cfq_class_idle(cfqq) < cfq_class_idle(__cfqq))
+- n = &(*p)->rb_left;
+- else if (cfq_class_idle(cfqq) > cfq_class_idle(__cfqq))
+- n = &(*p)->rb_right;
+- else if (time_before(rb_key, __cfqq->rb_key))
+- n = &(*p)->rb_left;
+- else
++ else {
+ n = &(*p)->rb_right;
+-
+- if (n == &(*p)->rb_right)
+ left = 0;
++ }
+
+ p = n;
+ }
+
+ if (left)
+- cfqd->service_tree.left = &cfqq->rb_node;
++ service_tree->left = &cfqq->rb_node;
+
+ cfqq->rb_key = rb_key;
+ rb_link_node(&cfqq->rb_node, parent, p);
+- rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb);
++ rb_insert_color(&cfqq->rb_node, &service_tree->rb);
++ service_tree->count++;
++ if ((add_front || !new_cfqq) && !group_changed)
++ return;
++ cfq_group_service_tree_add(cfqd, cfqq->cfqg);
+ }
+
+ static struct cfq_queue *
+@@ -671,13 +1315,16 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+ BUG_ON(!cfq_cfqq_on_rr(cfqq));
+ cfq_clear_cfqq_on_rr(cfqq);
+
+- if (!RB_EMPTY_NODE(&cfqq->rb_node))
+- cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree);
++ if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
++ cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
++ cfqq->service_tree = NULL;
++ }
+ if (cfqq->p_root) {
+ rb_erase(&cfqq->p_node, cfqq->p_root);
+ cfqq->p_root = NULL;
+ }
+
++ cfq_group_service_tree_del(cfqd, cfqq->cfqg);
+ BUG_ON(!cfqd->busy_queues);
+ cfqd->busy_queues--;
+ }
+@@ -688,7 +1335,6 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+ static void cfq_del_rq_rb(struct request *rq)
+ {
+ struct cfq_queue *cfqq = RQ_CFQQ(rq);
+- struct cfq_data *cfqd = cfqq->cfqd;
+ const int sync = rq_is_sync(rq);
+
+ BUG_ON(!cfqq->queued[sync]);
+@@ -696,8 +1342,17 @@ static void cfq_del_rq_rb(struct request *rq)
+
+ elv_rb_del(&cfqq->sort_list, rq);
+
+- if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
+- cfq_del_cfqq_rr(cfqd, cfqq);
++ if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) {
++ /*
++ * Queue will be deleted from service tree when we actually
++ * expire it later. Right now just remove it from prio tree
++ * as it is empty.
++ */
++ if (cfqq->p_root) {
++ rb_erase(&cfqq->p_node, cfqq->p_root);
++ cfqq->p_root = NULL;
++ }
++ }
+ }
+
+ static void cfq_add_rq_rb(struct request *rq)
+@@ -722,7 +1377,7 @@ static void cfq_add_rq_rb(struct request *rq)
+ * check if this request is a better next-serve candidate
+ */
+ prev = cfqq->next_rq;
+- cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq);
++ cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position);
+
+ /*
+ * adjust priority tree position, if ->next_rq changes
+@@ -829,6 +1484,7 @@ static void
+ cfq_merged_requests(struct request_queue *q, struct request *rq,
+ struct request *next)
+ {
++ struct cfq_queue *cfqq = RQ_CFQQ(rq);
+ /*
+ * reposition in fifo if next is older than rq
+ */
+@@ -838,6 +1494,8 @@ cfq_merged_requests(struct request_queue *q, struct request *rq,
+ rq_set_fifo_time(rq, rq_fifo_time(next));
+ }
+
++ if (cfqq->next_rq == next)
++ cfqq->next_rq = rq;
+ cfq_remove_request(next);
+ }
+
+@@ -871,8 +1529,12 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd,
+ {
+ if (cfqq) {
+ cfq_log_cfqq(cfqd, cfqq, "set_active");
++ cfqq->slice_start = 0;
++ cfqq->dispatch_start = jiffies;
++ cfqq->allocated_slice = 0;
+ cfqq->slice_end = 0;
+ cfqq->slice_dispatch = 0;
++ cfqq->nr_sectors = 0;
+
+ cfq_clear_cfqq_wait_request(cfqq);
+ cfq_clear_cfqq_must_dispatch(cfqq);
+@@ -899,6 +1561,16 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
+ del_timer(&cfqd->idle_slice_timer);
+
+ cfq_clear_cfqq_wait_request(cfqq);
++ cfq_clear_cfqq_wait_busy(cfqq);
++
++ /*
++ * If this cfqq is shared between multiple processes, check to
++ * make sure that those processes are still issuing I/Os within
++ * the mean seek distance. If not, it may be time to break the
++ * queues apart again.
++ */
++ if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq))
++ cfq_mark_cfqq_split_coop(cfqq);
+
+ /*
+ * store what was left of this slice, if the queue idled/timed out
+@@ -908,11 +1580,19 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
+ cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid);
+ }
+
++ cfq_group_served(cfqd, cfqq->cfqg, cfqq);
++
++ if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
++ cfq_del_cfqq_rr(cfqd, cfqq);
++
+ cfq_resort_rr_list(cfqd, cfqq);
+
+ if (cfqq == cfqd->active_queue)
+ cfqd->active_queue = NULL;
+
++ if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active)
++ cfqd->grp_service_tree.active = NULL;
++
+ if (cfqd->active_cic) {
+ put_io_context(cfqd->active_cic->ioc);
+ cfqd->active_cic = NULL;
+@@ -933,10 +1613,39 @@ static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
+ */
+ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
+ {
+- if (RB_EMPTY_ROOT(&cfqd->service_tree.rb))
++ struct cfq_rb_root *service_tree =
++ service_tree_for(cfqd->serving_group, cfqd->serving_prio,
++ cfqd->serving_type);
++
++ if (!cfqd->rq_queued)
+ return NULL;
+
+- return cfq_rb_first(&cfqd->service_tree);
++ /* There is nothing to dispatch */
++ if (!service_tree)
++ return NULL;
++ if (RB_EMPTY_ROOT(&service_tree->rb))
++ return NULL;
++ return cfq_rb_first(service_tree);
++}
++
++static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
++{
++ struct cfq_group *cfqg;
++ struct cfq_queue *cfqq;
++ int i, j;
++ struct cfq_rb_root *st;
++
++ if (!cfqd->rq_queued)
++ return NULL;
++
++ cfqg = cfq_get_next_cfqg(cfqd);
++ if (!cfqg)
++ return NULL;
++
++ for_each_cfqg_st(cfqg, i, j, st)
++ if ((cfqq = cfq_rb_first(st)) != NULL)
++ return cfqq;
++ return NULL;
+ }
+
+ /*
+@@ -945,14 +1654,8 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
+ static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
+ struct cfq_queue *cfqq)
+ {
+- if (!cfqq) {
++ if (!cfqq)
+ cfqq = cfq_get_next_queue(cfqd);
+- if (cfqq && !cfq_cfqq_coop_preempt(cfqq))
+- cfq_clear_cfqq_coop(cfqq);
+- }
+-
+- if (cfqq)
+- cfq_clear_cfqq_coop_preempt(cfqq);
+
+ __cfq_set_active_queue(cfqd, cfqq);
+ return cfqq;
+@@ -967,16 +1670,17 @@ static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
+ return cfqd->last_position - blk_rq_pos(rq);
+ }
+
+-#define CIC_SEEK_THR 8 * 1024
+-#define CIC_SEEKY(cic) ((cic)->seek_mean > CIC_SEEK_THR)
+-
+-static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq)
++static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq,
++ struct request *rq, bool for_preempt)
+ {
+- struct cfq_io_context *cic = cfqd->active_cic;
+- sector_t sdist = cic->seek_mean;
++ sector_t sdist = cfqq->seek_mean;
++
++ if (!sample_valid(cfqq->seek_samples))
++ sdist = CFQQ_SEEK_THR;
+
+- if (!sample_valid(cic->seek_samples))
+- sdist = CIC_SEEK_THR;
++ /* if seek_mean is big, using it as close criteria is meaningless */
++ if (sdist > CFQQ_SEEK_THR && !for_preempt)
++ sdist = CFQQ_SEEK_THR;
+
+ return cfq_dist_from_last(cfqd, rq) <= sdist;
+ }
+@@ -1005,7 +1709,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
+ * will contain the closest sector.
+ */
+ __cfqq = rb_entry(parent, struct cfq_queue, p_node);
+- if (cfq_rq_close(cfqd, __cfqq->next_rq))
++ if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq, false))
+ return __cfqq;
+
+ if (blk_rq_pos(__cfqq->next_rq) < sector)
+@@ -1016,7 +1720,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
+ return NULL;
+
+ __cfqq = rb_entry(node, struct cfq_queue, p_node);
+- if (cfq_rq_close(cfqd, __cfqq->next_rq))
++ if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq, false))
+ return __cfqq;
+
+ return NULL;
+@@ -1033,16 +1737,19 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
+ * assumption.
+ */
+ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
+- struct cfq_queue *cur_cfqq,
+- bool probe)
++ struct cfq_queue *cur_cfqq)
+ {
+ struct cfq_queue *cfqq;
+
++ if (!cfq_cfqq_sync(cur_cfqq))
++ return NULL;
++ if (CFQQ_SEEKY(cur_cfqq))
++ return NULL;
++
+ /*
+- * A valid cfq_io_context is necessary to compare requests against
+- * the seek_mean of the current cfqq.
++ * Don't search priority tree if it's the only queue in the group.
+ */
+- if (!cfqd->active_cic)
++ if (cur_cfqq->cfqg->nr_cfqq == 1)
+ return NULL;
+
+ /*
+@@ -1054,14 +1761,55 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
+ if (!cfqq)
+ return NULL;
+
+- if (cfq_cfqq_coop(cfqq))
++ /* If new queue belongs to different cfq_group, don't choose it */
++ if (cur_cfqq->cfqg != cfqq->cfqg)
++ return NULL;
++
++ /*
++ * It only makes sense to merge sync queues.
++ */
++ if (!cfq_cfqq_sync(cfqq))
++ return NULL;
++ if (CFQQ_SEEKY(cfqq))
++ return NULL;
++
++ /*
++ * Do not merge queues of different priority classes
++ */
++ if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq))
+ return NULL;
+
+- if (!probe)
+- cfq_mark_cfqq_coop(cfqq);
+ return cfqq;
+ }
+
++/*
++ * Determine whether we should enforce idle window for this queue.
++ */
++
++static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
++{
++ enum wl_prio_t prio = cfqq_prio(cfqq);
++ struct cfq_rb_root *service_tree = cfqq->service_tree;
++
++ BUG_ON(!service_tree);
++ BUG_ON(!service_tree->count);
++
++ /* We never do for idle class queues. */
++ if (prio == IDLE_WORKLOAD)
++ return false;
++
++ /* We do for queues that were marked with idle window flag. */
++ if (cfq_cfqq_idle_window(cfqq) &&
++ !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag))
++ return true;
++
++ /*
++ * Otherwise, we do only if they are the last ones
++ * in their service tree.
++ */
++ return service_tree->count == 1 && cfq_cfqq_sync(cfqq);
++}
++
+ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
+ {
+ struct cfq_queue *cfqq = cfqd->active_queue;
+@@ -1082,13 +1830,13 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
+ /*
+ * idle is disabled, either manually or by past process history
+ */
+- if (!cfqd->cfq_slice_idle || !cfq_cfqq_idle_window(cfqq))
++ if (!cfqd->cfq_slice_idle || !cfq_should_idle(cfqd, cfqq))
+ return;
+
+ /*
+- * still requests with the driver, don't idle
++ * still active requests from this queue, don't idle
+ */
+- if (rq_in_driver(cfqd))
++ if (cfqq->dispatched)
+ return;
+
+ /*
+@@ -1109,14 +1857,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
+
+ cfq_mark_cfqq_wait_request(cfqq);
+
+- /*
+- * we don't want to idle for seeks, but we do want to allow
+- * fair distribution of slice time for a process doing back-to-back
+- * seeks. so allow a little bit of time for him to submit a new rq
+- */
+ sl = cfqd->cfq_slice_idle;
+- if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic))
+- sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT));
+
+ mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
+ cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl);
+@@ -1139,6 +1880,7 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
+
+ if (cfq_cfqq_sync(cfqq))
+ cfqd->sync_flight++;
++ cfqq->nr_sectors += blk_rq_sectors(rq);
+ }
+
+ /*
+@@ -1175,6 +1917,186 @@ cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+ }
+
+ /*
++ * Must be called with the queue_lock held.
++ */
++static int cfqq_process_refs(struct cfq_queue *cfqq)
++{
++ int process_refs, io_refs;
++
++ io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
++ process_refs = atomic_read(&cfqq->ref) - io_refs;
++ BUG_ON(process_refs < 0);
++ return process_refs;
++}
++
++static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
++{
++ int process_refs, new_process_refs;
++ struct cfq_queue *__cfqq;
++
++ /* Avoid a circular list and skip interim queue merges */
++ while ((__cfqq = new_cfqq->new_cfqq)) {
++ if (__cfqq == cfqq)
++ return;
++ new_cfqq = __cfqq;
++ }
++
++ process_refs = cfqq_process_refs(cfqq);
++ /*
++ * If the process for the cfqq has gone away, there is no
++ * sense in merging the queues.
++ */
++ if (process_refs == 0)
++ return;
++
++ /*
++ * Merge in the direction of the lesser amount of work.
++ */
++ new_process_refs = cfqq_process_refs(new_cfqq);
++ if (new_process_refs >= process_refs) {
++ cfqq->new_cfqq = new_cfqq;
++ atomic_add(process_refs, &new_cfqq->ref);
++ } else {
++ new_cfqq->new_cfqq = cfqq;
++ atomic_add(new_process_refs, &cfqq->ref);
++ }
++}
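
(Aside, not part of the patch: a toy example, with invented numbers, of the "merge in the direction of the lesser amount of work" rule implemented by cfq_setup_merge() above.)

#include <stdio.h>

/* process_refs = total refs minus refs held by in-flight IO, i.e. roughly
 * "how many tasks still care about this queue", as in cfqq_process_refs(). */
struct toy_queue {
	int total_refs;
	int io_refs;
};

static int process_refs(const struct toy_queue *q)
{
	return q->total_refs - q->io_refs;
}

int main(void)
{
	struct toy_queue cur = { .total_refs = 5, .io_refs = 2 };	/* 3 tasks */
	struct toy_queue new = { .total_refs = 2, .io_refs = 1 };	/* 1 task  */

	/* The queue with fewer process refs is redirected at the busier one,
	 * so fewer references have to be migrated. */
	if (process_refs(&new) >= process_refs(&cur))
		printf("cur->new_cfqq = new (move %d refs)\n", process_refs(&cur));
	else
		printf("new->new_cfqq = cur (move %d refs)\n", process_refs(&new));
	return 0;
}
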
++
++static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
++ struct cfq_group *cfqg, enum wl_prio_t prio)
++{
++ struct cfq_queue *queue;
++ int i;
++ bool key_valid = false;
++ unsigned long lowest_key = 0;
++ enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
++
++ for (i = 0; i <= SYNC_WORKLOAD; ++i) {
++ /* select the one with lowest rb_key */
++ queue = cfq_rb_first(service_tree_for(cfqg, prio, i));
++ if (queue &&
++ (!key_valid || time_before(queue->rb_key, lowest_key))) {
++ lowest_key = queue->rb_key;
++ cur_best = i;
++ key_valid = true;
++ }
++ }
++
++ return cur_best;
++}
++
++static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
++{
++ unsigned slice;
++ unsigned count;
++ struct cfq_rb_root *st;
++ unsigned group_slice;
++
++ if (!cfqg) {
++ cfqd->serving_prio = IDLE_WORKLOAD;
++ cfqd->workload_expires = jiffies + 1;
++ return;
++ }
++
++ /* Choose next priority. RT > BE > IDLE */
++ if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
++ cfqd->serving_prio = RT_WORKLOAD;
++ else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
++ cfqd->serving_prio = BE_WORKLOAD;
++ else {
++ cfqd->serving_prio = IDLE_WORKLOAD;
++ cfqd->workload_expires = jiffies + 1;
++ return;
++ }
++
++ /*
++ * For RT and BE, we have to choose also the type
++ * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
++ * expiration time
++ */
++ st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
++ count = st->count;
++
++ /*
++ * check workload expiration, and that we still have other queues ready
++ */
++ if (count && !time_after(jiffies, cfqd->workload_expires))
++ return;
++
++ /* otherwise select new workload type */
++ cfqd->serving_type =
++ cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
++ st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
++ count = st->count;
++
++ /*
++ * the workload slice is computed as a fraction of target latency
++ * proportional to the number of queues in that workload, over
++ * all the queues in the same priority class
++ */
++ group_slice = cfq_group_slice(cfqd, cfqg);
++
++ slice = group_slice * count /
++ max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio],
++ cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg));
++
++ if (cfqd->serving_type == ASYNC_WORKLOAD) {
++ unsigned int tmp;
++
++ /*
++ * Async queues are currently system wide. Just taking
++ * proportion of queues within the same group will lead to a higher
++ * async ratio system wide as generally root group is going
++ * to have higher weight. A more accurate thing would be to
++ * calculate system wide async/sync ratio.
++ */
++ tmp = cfq_target_latency * cfqg_busy_async_queues(cfqd, cfqg);
++ tmp = tmp/cfqd->busy_queues;
++ slice = min_t(unsigned, slice, tmp);
++
++ /* async workload slice is scaled down according to
++ * the sync/async slice ratio. */
++ slice = slice * cfqd->cfq_slice[0] / cfqd->cfq_slice[1];
++ } else
++ /* sync workload slice is at least 2 * cfq_slice_idle */
++ slice = max(slice, 2 * cfqd->cfq_slice_idle);
++
++ slice = max_t(unsigned, slice, CFQ_MIN_TT);
++ cfqd->workload_expires = jiffies + slice;
++ cfqd->noidle_tree_requires_idle = false;
++}
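
(Illustrative only, outside the patch: a rough worked example of the slice sizing done in choose_service_tree() above. The numbers are invented and the helpers are local stand-ins, not the kernel's.)

#include <stdio.h>

static unsigned umax(unsigned a, unsigned b)
{
	return a > b ? a : b;
}

int main(void)
{
	unsigned group_slice = 100;	/* ms this group gets, derived from its weight  */
	unsigned count = 2;		/* queues on the workload tree being served     */
	unsigned busy_avg = 4;		/* busy_queues_avg for the serving prio class   */
	unsigned slice_sync = 100, slice_async = 40, slice_idle = 8;
	unsigned slice;

	/* the workload gets a share of the group slice proportional to its queues */
	slice = group_slice * count / umax(busy_avg, 1);
	printf("base workload slice : %u ms\n", slice);		/* 100*2/4 = 50 ms */

	/* async workloads are scaled down by the async/sync slice ratio */
	printf("async-scaled slice  : %u ms\n", slice * slice_async / slice_sync);

	/* sync workloads get at least 2 * slice_idle */
	printf("sync slice w/ floor : %u ms\n", umax(slice, 2 * slice_idle));
	return 0;
}
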
++
++static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
++{
++ struct cfq_rb_root *st = &cfqd->grp_service_tree;
++ struct cfq_group *cfqg;
++
++ if (RB_EMPTY_ROOT(&st->rb))
++ return NULL;
++ cfqg = cfq_rb_first_group(st);
++ st->active = &cfqg->rb_node;
++ update_min_vdisktime(st);
++ return cfqg;
++}
++
++static void cfq_choose_cfqg(struct cfq_data *cfqd)
++{
++ struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);
++
++ cfqd->serving_group = cfqg;
++
++ /* Restore the workload type data */
++ if (cfqg->saved_workload_slice) {
++ cfqd->workload_expires = jiffies + cfqg->saved_workload_slice;
++ cfqd->serving_type = cfqg->saved_workload;
++ cfqd->serving_prio = cfqg->saved_serving_prio;
++ } else
++ cfqd->workload_expires = jiffies - 1;
++
++ choose_service_tree(cfqd, cfqg);
++}
++
++/*
+ * Select a queue for service. If we have a current active queue,
+ * check whether to continue servicing it, or retrieve and set a new one.
+ */
+@@ -1186,13 +2108,37 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
+ if (!cfqq)
+ goto new_queue;
+
++ if (!cfqd->rq_queued)
++ return NULL;
++
+ /*
+- * The active queue has run out of time, expire it and select new.
++ * We were waiting for group to get backlogged. Expire the queue
+ */
+- if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq))
++ if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list))
+ goto expire;
+
+ /*
++ * The active queue has run out of time, expire it and select new.
++ */
++ if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) {
++ /*
++ * If slice had not expired at the completion of last request
++ * we might not have turned on wait_busy flag. Don't expire
++ * the queue yet. Allow the group to get backlogged.
++ *
++ * The very fact that we have used the slice means we
++ * have been idling all along on this queue and it should be
++ * ok to wait for this request to complete.
++ */
++ if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list)
++ && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
++ cfqq = NULL;
++ goto keep_queue;
++ } else
++ goto expire;
++ }
++
++ /*
+ * The active queue has requests and isn't expired, allow it to
+ * dispatch.
+ */
+@@ -1203,11 +2149,14 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
+ * If another queue has a request waiting within our mean seek
+ * distance, let it run. The expire code will check for close
+ * cooperators and put the close queue at the front of the service
+- * tree.
++ * tree. If possible, merge the expiring queue with the new cfqq.
+ */
+- new_cfqq = cfq_close_cooperator(cfqd, cfqq, 0);
+- if (new_cfqq)
++ new_cfqq = cfq_close_cooperator(cfqd, cfqq);
++ if (new_cfqq) {
++ if (!cfqq->new_cfqq)
++ cfq_setup_merge(cfqq, new_cfqq);
+ goto expire;
++ }
+
+ /*
+ * No requests pending. If the active queue still has requests in
+@@ -1215,7 +2164,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
+ * conditions to happen (or time out) before selecting a new queue.
+ */
+ if (timer_pending(&cfqd->idle_slice_timer) ||
+- (cfqq->dispatched && cfq_cfqq_idle_window(cfqq))) {
++ (cfqq->dispatched && cfq_should_idle(cfqd, cfqq))) {
+ cfqq = NULL;
+ goto keep_queue;
+ }
+@@ -1223,6 +2172,13 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
+ expire:
+ cfq_slice_expired(cfqd, 0);
+ new_queue:
++ /*
++ * Current queue expired. Check if we have to switch to a new
++ * service tree
++ */
++ if (!new_cfqq)
++ cfq_choose_cfqg(cfqd);
++
+ cfqq = cfq_set_active_queue(cfqd, new_cfqq);
+ keep_queue:
+ return cfqq;
+@@ -1238,6 +2194,9 @@ static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
+ }
+
+ BUG_ON(!list_empty(&cfqq->fifo));
++
++ /* By default cfqq is not expired if it is empty. Do it explicitly */
++ __cfq_slice_expired(cfqq->cfqd, cfqq, 0);
+ return dispatched;
+ }
+
+@@ -1250,11 +2209,10 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
+ struct cfq_queue *cfqq;
+ int dispatched = 0;
+
+- while ((cfqq = cfq_rb_first(&cfqd->service_tree)) != NULL)
++ while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL)
+ dispatched += __cfq_forced_dispatch_cfqq(cfqq);
+
+ cfq_slice_expired(cfqd, 0);
+-
+ BUG_ON(cfqd->busy_queues);
+
+ cfq_log(cfqd, "forced_dispatch=%d", dispatched);
+@@ -1268,7 +2226,7 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+ /*
+ * Drain async requests before we start sync IO
+ */
+- if (cfq_cfqq_idle_window(cfqq) && cfqd->rq_in_driver[BLK_RW_ASYNC])
++ if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_driver[BLK_RW_ASYNC])
+ return false;
+
+ /*
+@@ -1298,9 +2256,9 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+ return false;
+
+ /*
+- * Sole queue user, allow bigger slice
++ * Sole queue user, no limit
+ */
+- max_dispatch *= 4;
++ max_dispatch = -1;
+ }
+
+ /*
+@@ -1309,7 +2267,7 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+ * based on the last sync IO we serviced
+ */
+ if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) {
+- unsigned long last_sync = jiffies - cfqd->last_end_sync_rq;
++ unsigned long last_sync = jiffies - cfqd->last_delayed_sync;
+ unsigned int depth;
+
+ depth = last_sync / cfqd->cfq_slice[1];
+@@ -1407,11 +2365,13 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
+ * task holds one reference to the queue, dropped when task exits. each rq
+ * in-flight on this queue also holds a reference, dropped when rq is freed.
+ *
++ * Each cfq queue took a reference on the parent group. Drop it now.
+ * queue lock must be held here.
+ */
+ static void cfq_put_queue(struct cfq_queue *cfqq)
+ {
+ struct cfq_data *cfqd = cfqq->cfqd;
++ struct cfq_group *cfqg, *orig_cfqg;
+
+ BUG_ON(atomic_read(&cfqq->ref) <= 0);
+
+@@ -1421,14 +2381,19 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
+ cfq_log_cfqq(cfqd, cfqq, "put_queue");
+ BUG_ON(rb_first(&cfqq->sort_list));
+ BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
+- BUG_ON(cfq_cfqq_on_rr(cfqq));
++ cfqg = cfqq->cfqg;
++ orig_cfqg = cfqq->orig_cfqg;
+
+ if (unlikely(cfqd->active_queue == cfqq)) {
+ __cfq_slice_expired(cfqd, cfqq, 0);
+ cfq_schedule_dispatch(cfqd);
+ }
+
++ BUG_ON(cfq_cfqq_on_rr(cfqq));
+ kmem_cache_free(cfq_pool, cfqq);
++ cfq_put_cfqg(cfqg);
++ if (orig_cfqg)
++ cfq_put_cfqg(orig_cfqg);
+ }
+
+ /*
+@@ -1518,11 +2483,29 @@ static void cfq_free_io_context(struct io_context *ioc)
+
+ static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+ {
++ struct cfq_queue *__cfqq, *next;
++
+ if (unlikely(cfqq == cfqd->active_queue)) {
+ __cfq_slice_expired(cfqd, cfqq, 0);
+ cfq_schedule_dispatch(cfqd);
+ }
+
++ /*
++ * If this queue was scheduled to merge with another queue, be
++ * sure to drop the reference taken on that queue (and others in
++ * the merge chain). See cfq_setup_merge and cfq_merge_cfqqs.
++ */
++ __cfqq = cfqq->new_cfqq;
++ while (__cfqq) {
++ if (__cfqq == cfqq) {
++ WARN(1, "cfqq->new_cfqq loop detected\n");
++ break;
++ }
++ next = __cfqq->new_cfqq;
++ cfq_put_queue(__cfqq);
++ __cfqq = next;
++ }
++
+ cfq_put_queue(cfqq);
+ }
+
+@@ -1703,14 +2686,51 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
+ cfqq->pid = pid;
+ }
+
++#ifdef CONFIG_CFQ_GROUP_IOSCHED
++static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic)
++{
++ struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1);
++ struct cfq_data *cfqd = cic->key;
++ unsigned long flags;
++ struct request_queue *q;
++
++ if (unlikely(!cfqd))
++ return;
++
++ q = cfqd->queue;
++
++ spin_lock_irqsave(q->queue_lock, flags);
++
++ if (sync_cfqq) {
++ /*
++ * Drop reference to sync queue. A new sync queue will be
++ * assigned in new group upon arrival of a fresh request.
++ */
++ cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup");
++ cic_set_cfqq(cic, NULL, 1);
++ cfq_put_queue(sync_cfqq);
++ }
++
++ spin_unlock_irqrestore(q->queue_lock, flags);
++}
++
++static void cfq_ioc_set_cgroup(struct io_context *ioc)
++{
++ call_for_each_cic(ioc, changed_cgroup);
++ ioc->cgroup_changed = 0;
++}
++#endif /* CONFIG_CFQ_GROUP_IOSCHED */
++
+ static struct cfq_queue *
+ cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync,
+ struct io_context *ioc, gfp_t gfp_mask)
+ {
+ struct cfq_queue *cfqq, *new_cfqq = NULL;
+ struct cfq_io_context *cic;
++ struct cfq_group *cfqg;
+
+ retry:
++ cfqg = cfq_get_cfqg(cfqd, 1);
+ cic = cfq_cic_lookup(cfqd, ioc);
+ /* cic always exists here */
+ cfqq = cic_to_cfqq(cic, is_sync);
+@@ -1741,6 +2761,7 @@ retry:
+ if (cfqq) {
+ cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
+ cfq_init_prio_data(cfqq, ioc);
++ cfq_link_cfqq_cfqg(cfqq, cfqg);
+ cfq_log_cfqq(cfqd, cfqq, "alloced");
+ } else
+ cfqq = &cfqd->oom_cfqq;
+@@ -1932,6 +2953,10 @@ out:
+ if (unlikely(ioc->ioprio_changed))
+ cfq_ioc_set_ioprio(ioc);
+
++#ifdef CONFIG_CFQ_GROUP_IOSCHED
++ if (unlikely(ioc->cgroup_changed))
++ cfq_ioc_set_cgroup(ioc);
++#endif
+ return cic;
+ err_free:
+ cfq_cic_free(cic);
+@@ -1952,33 +2977,33 @@ cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
+ }
+
+ static void
+-cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic,
++cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
+ struct request *rq)
+ {
+ sector_t sdist;
+ u64 total;
+
+- if (!cic->last_request_pos)
++ if (!cfqq->last_request_pos)
+ sdist = 0;
+- else if (cic->last_request_pos < blk_rq_pos(rq))
+- sdist = blk_rq_pos(rq) - cic->last_request_pos;
++ else if (cfqq->last_request_pos < blk_rq_pos(rq))
++ sdist = blk_rq_pos(rq) - cfqq->last_request_pos;
+ else
+- sdist = cic->last_request_pos - blk_rq_pos(rq);
++ sdist = cfqq->last_request_pos - blk_rq_pos(rq);
+
/*
- * Reset it - just in case we boot another CPU later:
+ * Don't allow the seek distance to get too large from the
+ * odd fragment, pagein, etc
*/
-diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
-index 3909e3b..bbfa7af 100644
---- a/arch/x86/kernel/x8664_ksyms_64.c
-+++ b/arch/x86/kernel/x8664_ksyms_64.c
-@@ -3,6 +3,7 @@
+- if (cic->seek_samples <= 60) /* second&third seek */
+- sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*1024);
++ if (cfqq->seek_samples <= 60) /* second&third seek */
++ sdist = min(sdist, (cfqq->seek_mean * 4) + 2*1024*1024);
+ else
+- sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*64);
++ sdist = min(sdist, (cfqq->seek_mean * 4) + 2*1024*64);
- #include <linux/module.h>
- #include <linux/smp.h>
-+#include <linux/syscalls.h>
+- cic->seek_samples = (7*cic->seek_samples + 256) / 8;
+- cic->seek_total = (7*cic->seek_total + (u64)256*sdist) / 8;
+- total = cic->seek_total + (cic->seek_samples/2);
+- do_div(total, cic->seek_samples);
+- cic->seek_mean = (sector_t)total;
++ cfqq->seek_samples = (7*cfqq->seek_samples + 256) / 8;
++ cfqq->seek_total = (7*cfqq->seek_total + (u64)256*sdist) / 8;
++ total = cfqq->seek_total + (cfqq->seek_samples/2);
++ do_div(total, cfqq->seek_samples);
++ cfqq->seek_mean = (sector_t)total;
+ }
- #include <net/checksum.h>
+ /*
+@@ -1999,14 +3024,15 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
-@@ -17,6 +18,7 @@
- EXPORT_SYMBOL(mcount);
- #endif
+ enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
-+EXPORT_SYMBOL(kernel_execve);
- EXPORT_SYMBOL(kernel_thread);
++ if (cfqq->queued[0] + cfqq->queued[1] >= 4)
++ cfq_mark_cfqq_deep(cfqq);
++
+ if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
+- (!cfqd->cfq_latency && cfqd->hw_tag && CIC_SEEKY(cic)))
++ (!cfq_cfqq_deep(cfqq) && sample_valid(cfqq->seek_samples)
++ && CFQQ_SEEKY(cfqq)))
+ enable_idle = 0;
+ else if (sample_valid(cic->ttime_samples)) {
+- unsigned int slice_idle = cfqd->cfq_slice_idle;
+- if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic))
+- slice_idle = msecs_to_jiffies(CFQ_MIN_TT);
+- if (cic->ttime_mean > slice_idle)
++ if (cic->ttime_mean > cfqd->cfq_slice_idle)
+ enable_idle = 0;
+ else
+ enable_idle = 1;
+@@ -2035,9 +3061,6 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
+ if (!cfqq)
+ return false;
- EXPORT_SYMBOL(__get_user_1);
-diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
-index f4cee90..3e549cd 100644
---- a/arch/x86/mm/fault.c
-+++ b/arch/x86/mm/fault.c
-@@ -689,7 +689,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
- if (!printk_ratelimit())
- return;
+- if (cfq_slice_used(cfqq))
+- return true;
+-
+ if (cfq_class_idle(new_cfqq))
+ return false;
-- printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
-+ ve_printk(VE_LOG, "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
- task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
- tsk->comm, task_pid_nr(tsk), address,
- (void *)regs->ip, (void *)regs->sp, error_code);
-@@ -909,7 +909,7 @@ spurious_fault(unsigned long error_code, unsigned long address)
- return ret;
- }
+@@ -2045,12 +3068,31 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
+ return true;
--int show_unhandled_signals = 1;
-+int show_unhandled_signals = 0;
+ /*
++ * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice.
++ */
++ if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq))
++ return false;
++
++ /*
+ * if the new request is sync, but the currently running queue is
+ * not, let the sync request have priority.
+ */
+ if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
+ return true;
- static inline int
- access_error(unsigned long error_code, int write, struct vm_area_struct *vma)
-diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
-index f46c340..6b7330c 100644
---- a/arch/x86/mm/hugetlbpage.c
-+++ b/arch/x86/mm/hugetlbpage.c
-@@ -12,6 +12,7 @@
- #include <linux/slab.h>
- #include <linux/err.h>
- #include <linux/sysctl.h>
-+#include <linux/module.h>
- #include <asm/mman.h>
- #include <asm/tlb.h>
- #include <asm/tlbflush.h>
-@@ -230,6 +231,7 @@ int pud_huge(pud_t pud)
- {
- return !!(pud_val(pud) & _PAGE_PSE);
++ if (new_cfqq->cfqg != cfqq->cfqg)
++ return false;
++
++ if (cfq_slice_used(cfqq))
++ return true;
++
++ /* Allow preemption only if we are idling on sync-noidle tree */
++ if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD &&
++ cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
++ new_cfqq->service_tree->count == 2 &&
++ RB_EMPTY_ROOT(&cfqq->sort_list))
++ return true;
++
+ /*
+ * So both queues are sync. Let the new request get disk time if
+ * it's a metadata request and the current queue is doing regular IO.
+@@ -2071,16 +3113,8 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
+ * if this request is as-good as one we would expect from the
+ * current cfqq, let it preempt
+ */
+- if (cfq_rq_close(cfqd, rq) && (!cfq_cfqq_coop(new_cfqq) ||
+- cfqd->busy_queues == 1)) {
+- /*
+- * Mark new queue coop_preempt, so its coop flag will not be
+- * cleared when new queue gets scheduled at the very first time
+- */
+- cfq_mark_cfqq_coop_preempt(new_cfqq);
+- cfq_mark_cfqq_coop(new_cfqq);
++ if (cfq_rq_close(cfqd, cfqq, rq, true))
+ return true;
+- }
+
+ return false;
}
-+EXPORT_SYMBOL(pmd_huge);
+@@ -2121,10 +3155,10 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
+ cfqq->meta_pending++;
- struct page *
- follow_huge_pmd(struct mm_struct *mm, unsigned long address,
-diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
-index c9ba9de..589a93b 100644
---- a/arch/x86/mm/pgtable.c
-+++ b/arch/x86/mm/pgtable.c
-@@ -4,7 +4,8 @@
- #include <asm/tlb.h>
- #include <asm/fixmap.h>
+ cfq_update_io_thinktime(cfqd, cic);
+- cfq_update_io_seektime(cfqd, cic, rq);
++ cfq_update_io_seektime(cfqd, cfqq, rq);
+ cfq_update_idle_window(cfqd, cfqq, cic);
--#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
-+#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO | __GFP_UBC
-+#define PGALLOC_KERN_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
+- cic->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
++ cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
- #ifdef CONFIG_HIGHPTE
- #define PGALLOC_USER_GFP __GFP_HIGHMEM
-@@ -16,7 +17,7 @@ gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP;
+ if (cfqq == cfqd->active_queue) {
+ /*
+@@ -2141,9 +3175,10 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
+ if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
+ cfqd->busy_queues > 1) {
+ del_timer(&cfqd->idle_slice_timer);
+- __blk_run_queue(cfqd->queue);
+- }
+- cfq_mark_cfqq_must_dispatch(cfqq);
++ cfq_clear_cfqq_wait_request(cfqq);
++ __blk_run_queue(cfqd->queue);
++ } else
++ cfq_mark_cfqq_must_dispatch(cfqq);
+ }
+ } else if (cfq_should_preempt(cfqd, cfqq, rq)) {
+ /*
+@@ -2165,10 +3200,9 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
+ cfq_log_cfqq(cfqd, cfqq, "insert_request");
+ cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc);
- pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
- {
-- return (pte_t *)__get_free_page(PGALLOC_GFP);
-+ return (pte_t *)__get_free_page(PGALLOC_KERN_GFP);
- }
+- cfq_add_rq_rb(rq);
+-
+ rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
+ list_add_tail(&rq->queuelist, &cfqq->fifo);
++ cfq_add_rq_rb(rq);
- pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
-diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
-index 36fe08e..42445e5 100644
---- a/arch/x86/mm/tlb.c
-+++ b/arch/x86/mm/tlb.c
-@@ -256,6 +256,8 @@ void flush_tlb_mm(struct mm_struct *mm)
- preempt_enable();
+ cfq_rq_enqueued(cfqd, cfqq, rq);
}
+@@ -2179,23 +3213,64 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
+ */
+ static void cfq_update_hw_tag(struct cfq_data *cfqd)
+ {
+- if (rq_in_driver(cfqd) > cfqd->rq_in_driver_peak)
+- cfqd->rq_in_driver_peak = rq_in_driver(cfqd);
++ struct cfq_queue *cfqq = cfqd->active_queue;
++
++ if (rq_in_driver(cfqd) > cfqd->hw_tag_est_depth)
++ cfqd->hw_tag_est_depth = rq_in_driver(cfqd);
++
++ if (cfqd->hw_tag == 1)
++ return;
-+EXPORT_SYMBOL(flush_tlb_mm);
+ if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
+ rq_in_driver(cfqd) <= CFQ_HW_QUEUE_MIN)
+ return;
+
++ /*
++ * If active queue hasn't enough requests and can idle, cfq might not
++ * dispatch sufficient requests to hardware. Don't zero hw_tag in this
++ * case
++ */
++ if (cfqq && cfq_cfqq_idle_window(cfqq) &&
++ cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] <
++ CFQ_HW_QUEUE_MIN && rq_in_driver(cfqd) < CFQ_HW_QUEUE_MIN)
++ return;
+
- void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
- {
- struct mm_struct *mm = vma->vm_mm;
-diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
-index 58bc00f..b7028c5 100644
---- a/arch/x86/vdso/vdso32-setup.c
-+++ b/arch/x86/vdso/vdso32-setup.c
-@@ -17,6 +17,8 @@
- #include <linux/err.h>
- #include <linux/module.h>
+ if (cfqd->hw_tag_samples++ < 50)
+ return;
-+#include <bc/vmpages.h>
+- if (cfqd->rq_in_driver_peak >= CFQ_HW_QUEUE_MIN)
++ if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN)
+ cfqd->hw_tag = 1;
+ else
+ cfqd->hw_tag = 0;
++}
+
- #include <asm/cpufeature.h>
- #include <asm/msr.h>
- #include <asm/pgtable.h>
-@@ -37,6 +39,8 @@ enum {
- #else
- #define VDSO_DEFAULT VDSO_ENABLED
- #endif
-+#undef VDSO_DEFAULT
-+#define VDSO_DEFAULT VDSO_DISABLED
++static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
++{
++ struct cfq_io_context *cic = cfqd->active_cic;
++
++ /* If there are other queues in the group, don't wait */
++ if (cfqq->cfqg->nr_cfqq > 1)
++ return false;
++
++ if (cfq_slice_used(cfqq))
++ return true;
++
++ /* if slice left is less than think time, wait busy */
++ if (cic && sample_valid(cic->ttime_samples)
++ && (cfqq->slice_end - jiffies < cic->ttime_mean))
++ return true;
++
++ /*
++ * If think time is less than a jiffy then ttime_mean=0 and above
++ * will not be true. It might happen that slice has not expired yet
++ * but will expire soon (4-5 ns) during select_queue(). To cover the
++ * case where think time is less than a jiffy, mark the queue wait
++ * busy if only 1 jiffy is left in the slice.
++ */
++ if (cfqq->slice_end - jiffies == 1)
++ return true;
- #ifdef CONFIG_X86_64
- #define vdso_enabled sysctl_vsyscall32
-@@ -193,7 +197,8 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr)
- }
+- cfqd->hw_tag_samples = 0;
+- cfqd->rq_in_driver_peak = 0;
++ return false;
}
--static struct page *vdso32_pages[1];
-+struct page *vdso32_pages[1];
-+EXPORT_SYMBOL_GPL(vdso32_pages);
+ static void cfq_completed_request(struct request_queue *q, struct request *rq)
+@@ -2206,7 +3281,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
+ unsigned long now;
- #ifdef CONFIG_X86_64
+ now = jiffies;
+- cfq_log_cfqq(cfqd, cfqq, "complete");
++ cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", !!rq_noidle(rq));
-@@ -309,16 +314,30 @@ int __init sysenter_setup(void)
- return 0;
- }
+ cfq_update_hw_tag(cfqd);
-+EXPORT_SYMBOL_GPL(VDSO32_SYSENTER_RETURN);
-+EXPORT_SYMBOL_GPL(VDSO32_PRELINK);
+@@ -2220,7 +3295,8 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
+
+ if (sync) {
+ RQ_CIC(rq)->last_end_request = now;
+- cfqd->last_end_sync_rq = now;
++ if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now))
++ cfqd->last_delayed_sync = now;
+ }
+
+ /*
+@@ -2234,18 +3310,39 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
+ cfq_set_prio_slice(cfqd, cfqq);
+ cfq_clear_cfqq_slice_new(cfqq);
+ }
+
- /* Setup a VMA at program startup for the vsyscall page */
--int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
-+int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp,
-+ unsigned long map_address)
- {
- struct mm_struct *mm = current->mm;
-- unsigned long addr;
-+ unsigned long addr = map_address;
- int ret = 0;
- bool compat;
-+ unsigned long flags;
++ /*
++ * Should we wait for next request to come in before we expire
++ * the queue.
++ */
++ if (cfq_should_wait_busy(cfqd, cfqq)) {
++ cfqq->slice_end = jiffies + cfqd->cfq_slice_idle;
++ cfq_mark_cfqq_wait_busy(cfqq);
++ }
++
+ /*
+- * If there are no requests waiting in this queue, and
+- * there are other queues ready to issue requests, AND
+- * those other queues are issuing requests within our
+- * mean seek distance, give them a chance to run instead
+- * of idling.
++ * Idling is not enabled on:
++ * - expired queues
++ * - idle-priority queues
++ * - async queues
++ * - queues with still some requests queued
++ * - when there is a close cooperator
+ */
+ if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
+ cfq_slice_expired(cfqd, 1);
+- else if (cfqq_empty && !cfq_close_cooperator(cfqd, cfqq, 1) &&
+- sync && !rq_noidle(rq))
+- cfq_arm_slice_timer(cfqd);
++ else if (sync && cfqq_empty &&
++ !cfq_close_cooperator(cfqd, cfqq)) {
++ cfqd->noidle_tree_requires_idle |= !rq_noidle(rq);
++ /*
++ * Idling is enabled for SYNC_WORKLOAD.
++ * SYNC_NOIDLE_WORKLOAD idles at the end of the tree
++ * only if we processed at least one !rq_noidle request
++ */
++ if (cfqd->serving_type == SYNC_WORKLOAD
++ || cfqd->noidle_tree_requires_idle
++ || cfqq->cfqg->nr_cfqq == 1)
++ cfq_arm_slice_timer(cfqd);
++ }
+ }
-- if (vdso_enabled == VDSO_DISABLED)
-+ if (vdso_enabled == VDSO_DISABLED && map_address == 0) {
-+ current->mm->context.vdso = NULL;
- return 0;
+ if (!rq_in_driver(cfqd))
+@@ -2269,12 +3366,10 @@ static void cfq_prio_boost(struct cfq_queue *cfqq)
+ cfqq->ioprio = IOPRIO_NORM;
+ } else {
+ /*
+- * check if we need to unboost the queue
++ * unboost the queue (if needed)
+ */
+- if (cfqq->ioprio_class != cfqq->org_ioprio_class)
+- cfqq->ioprio_class = cfqq->org_ioprio_class;
+- if (cfqq->ioprio != cfqq->org_ioprio)
+- cfqq->ioprio = cfqq->org_ioprio;
++ cfqq->ioprio_class = cfqq->org_ioprio_class;
++ cfqq->ioprio = cfqq->org_ioprio;
+ }
+ }
+
+@@ -2338,6 +3433,35 @@ static void cfq_put_request(struct request *rq)
+ }
+ }
+
++static struct cfq_queue *
++cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic,
++ struct cfq_queue *cfqq)
++{
++ cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
++ cic_set_cfqq(cic, cfqq->new_cfqq, 1);
++ cfq_mark_cfqq_coop(cfqq->new_cfqq);
++ cfq_put_queue(cfqq);
++ return cic_to_cfqq(cic, 1);
++}
++
++/*
++ * Returns NULL if a new cfqq should be allocated, or the old cfqq if this
++ * was the last process referring to said cfqq.
++ */
++static struct cfq_queue *
++split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq)
++{
++ if (cfqq_process_refs(cfqq) == 1) {
++ cfqq->pid = current->pid;
++ cfq_clear_cfqq_coop(cfqq);
++ cfq_clear_cfqq_split_coop(cfqq);
++ return cfqq;
+ }
+
-+ flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC | VM_MAYWRITE |
-+ mm->def_flags;
++ cic_set_cfqq(cic, NULL, 1);
++ cfq_put_queue(cfqq);
++ return NULL;
++}
+ /*
+ * Allocate cfq data structures associated with this request.
+ */
+@@ -2360,10 +3484,30 @@ cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
+ if (!cic)
+ goto queue_fail;
+
++new_queue:
+ cfqq = cic_to_cfqq(cic, is_sync);
+ if (!cfqq || cfqq == &cfqd->oom_cfqq) {
+ cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask);
+ cic_set_cfqq(cic, cfqq, is_sync);
++ } else {
++ /*
++ * If the queue was seeky for too long, break it apart.
++ */
++ if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) {
++ cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq");
++ cfqq = split_cfqq(cic, cfqq);
++ if (!cfqq)
++ goto new_queue;
++ }
+
-+ ret = -ENOMEM;
-+ if (ub_memory_charge(mm, PAGE_SIZE, flags, NULL, UB_SOFT))
-+ goto err_charge;
++ /*
++ * Check to see if this queue is scheduled to merge with
++ * another, closely cooperating queue. The merging of
++ * queues happens here as it must be done in process context.
++ * The reference on new_cfqq was taken in merge_cfqqs.
++ */
++ if (cfqq->new_cfqq)
++ cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq);
+ }
- down_write(&mm->mmap_sem);
+ cfqq->allocated[rw]++;
+@@ -2438,6 +3582,11 @@ static void cfq_idle_slice_timer(unsigned long data)
+ */
+ if (!RB_EMPTY_ROOT(&cfqq->sort_list))
+ goto out_kick;
++
++ /*
++ * Queue depth flag is reset only when the idle didn't succeed
++ */
++ cfq_clear_cfqq_deep(cfqq);
+ }
+ expire:
+ cfq_slice_expired(cfqd, timed_out);
+@@ -2468,6 +3617,11 @@ static void cfq_put_async_queues(struct cfq_data *cfqd)
+ cfq_put_queue(cfqd->async_idle_cfqq);
+ }
-@@ -328,19 +347,18 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
++static void cfq_cfqd_free(struct rcu_head *head)
++{
++ kfree(container_of(head, struct cfq_data, rcu));
++}
++
+ static void cfq_exit_queue(struct elevator_queue *e)
+ {
+ struct cfq_data *cfqd = e->elevator_data;
+@@ -2489,25 +3643,49 @@ static void cfq_exit_queue(struct elevator_queue *e)
+ }
- map_compat_vdso(compat);
+ cfq_put_async_queues(cfqd);
++ cfq_release_cfq_groups(cfqd);
++ blkiocg_del_blkio_group(&cfqd->root_group.blkg);
-- if (compat)
-- addr = VDSO_HIGH_BASE;
-- else {
-- addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
-+ if (!compat || map_address) {
-+ addr = get_unmapped_area(NULL, addr, PAGE_SIZE, 0, 0);
- if (IS_ERR_VALUE(addr)) {
- ret = addr;
- goto up_fail;
- }
-- }
-+ } else
-+ addr = VDSO_HIGH_BASE;
+ spin_unlock_irq(q->queue_lock);
- current->mm->context.vdso = (void *)addr;
+ cfq_shutdown_timer_wq(cfqd);
-- if (compat_uses_vma || !compat) {
-+ if (compat_uses_vma || !compat || map_address) {
- /*
- * MAYWRITE to allow gdb to COW and set breakpoints
- *
-@@ -368,9 +386,13 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
- current->mm->context.vdso = NULL;
+- kfree(cfqd);
++ /* Wait for cfqg->blkg->key accessors to exit their grace periods. */
++ call_rcu(&cfqd->rcu, cfq_cfqd_free);
+ }
- up_write(&mm->mmap_sem);
-+ if (ret < 0)
-+ ub_memory_uncharge(mm, PAGE_SIZE, flags, NULL);
-+err_charge:
+ static void *cfq_init_queue(struct request_queue *q)
+ {
+ struct cfq_data *cfqd;
+- int i;
++ int i, j;
++ struct cfq_group *cfqg;
++ struct cfq_rb_root *st;
- return ret;
- }
-+EXPORT_SYMBOL_GPL(arch_setup_additional_pages);
+ cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
+ if (!cfqd)
+ return NULL;
- #ifdef CONFIG_X86_64
+- cfqd->service_tree = CFQ_RB_ROOT;
++ /* Init root service tree */
++ cfqd->grp_service_tree = CFQ_RB_ROOT;
++
++ /* Init root group */
++ cfqg = &cfqd->root_group;
++ for_each_cfqg_st(cfqg, i, j, st)
++ *st = CFQ_RB_ROOT;
++ RB_CLEAR_NODE(&cfqg->rb_node);
-diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
-index 21e1aeb..507ba17 100644
---- a/arch/x86/vdso/vma.c
-+++ b/arch/x86/vdso/vma.c
-@@ -4,6 +4,7 @@
- * Subject to the GPL, v.2
- */
- #include <linux/mm.h>
-+#include <linux/module.h>
- #include <linux/err.h>
- #include <linux/sched.h>
- #include <linux/init.h>
-@@ -99,17 +100,23 @@ static unsigned long vdso_addr(unsigned long start, unsigned len)
++ /* Give preference to root group over other groups */
++ cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT;
++
++#ifdef CONFIG_CFQ_GROUP_IOSCHED
++ /*
++ * Take a reference to root group which we never drop. This is just
++ * to make sure that cfq_put_cfqg() does not try to kfree root group
++ */
++ atomic_set(&cfqg->ref, 1);
++ blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, (void *)cfqd,
++ 0);
++#endif
+ /*
+ * Not strictly needed (since RB_ROOT just clears the node and we
+ * zeroed cfqd on alloc), but better be safe in case someone decides
+@@ -2523,6 +3701,7 @@ static void *cfq_init_queue(struct request_queue *q)
+ */
+ cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
+ atomic_inc(&cfqd->oom_cfqq.ref);
++ cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group);
+
+ INIT_LIST_HEAD(&cfqd->cic_list);
+
+@@ -2544,8 +3723,14 @@ static void *cfq_init_queue(struct request_queue *q)
+ cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
+ cfqd->cfq_slice_idle = cfq_slice_idle;
+ cfqd->cfq_latency = 1;
+- cfqd->hw_tag = 1;
+- cfqd->last_end_sync_rq = jiffies;
++ cfqd->cfq_group_isolation = 0;
++ cfqd->hw_tag = -1;
++ /*
++ * we optimistically start assuming sync ops weren't delayed in last
++ * second, in order to have larger depth for async operations.
++ */
++ cfqd->last_delayed_sync = jiffies - HZ;
++ INIT_RCU_HEAD(&cfqd->rcu);
+ return cfqd;
+ }
+
+@@ -2614,6 +3799,7 @@ SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
+ SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
+ SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
+ SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
++SHOW_FUNCTION(cfq_group_isolation_show, cfqd->cfq_group_isolation, 0);
+ #undef SHOW_FUNCTION
+
+ #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
+@@ -2646,6 +3832,7 @@ STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
+ STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
+ UINT_MAX, 0);
+ STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
++STORE_FUNCTION(cfq_group_isolation_store, &cfqd->cfq_group_isolation, 0, 1, 0);
+ #undef STORE_FUNCTION
+
+ #define CFQ_ATTR(name) \
+@@ -2662,6 +3849,7 @@ static struct elv_fs_entry cfq_attrs[] = {
+ CFQ_ATTR(slice_async_rq),
+ CFQ_ATTR(slice_idle),
+ CFQ_ATTR(low_latency),
++ CFQ_ATTR(group_isolation),
+ __ATTR_NULL
+ };
- /* Setup a VMA at program startup for the vsyscall page.
- Not called for compat tasks */
--int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
-+int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp,
-+ unsigned long map_address)
+@@ -2691,6 +3879,17 @@ static struct elevator_type iosched_cfq = {
+ .elevator_owner = THIS_MODULE,
+ };
+
++#ifdef CONFIG_CFQ_GROUP_IOSCHED
++static struct blkio_policy_type blkio_policy_cfq = {
++ .ops = {
++ .blkio_unlink_group_fn = cfq_unlink_blkio_group,
++ .blkio_update_group_weight_fn = cfq_update_blkio_group_weight,
++ },
++};
++#else
++static struct blkio_policy_type blkio_policy_cfq;
++#endif
++
+ static int __init cfq_init(void)
{
- struct mm_struct *mm = current->mm;
- unsigned long addr;
- int ret;
+ /*
+@@ -2705,6 +3904,7 @@ static int __init cfq_init(void)
+ return -ENOMEM;
-- if (!vdso_enabled)
-+ if (!vdso_enabled && map_address == 0) {
-+ current->mm->context.vdso = NULL;
- return 0;
-+ }
+ elv_register(&iosched_cfq);
++ blkio_policy_register(&blkio_policy_cfq);
- down_write(&mm->mmap_sem);
-- addr = vdso_addr(mm->start_stack, vdso_size);
-+ if (map_address)
-+ addr = map_address;
-+ else
-+ addr = vdso_addr(mm->start_stack, vdso_size);
- addr = get_unmapped_area(NULL, addr, vdso_size, 0, 0);
- if (IS_ERR_VALUE(addr)) {
- ret = addr;
-@@ -132,6 +139,7 @@ up_fail:
- up_write(&mm->mmap_sem);
- return ret;
+ return 0;
}
-+EXPORT_SYMBOL_GPL(arch_setup_additional_pages);
-
- static __init int vdso_setup(char *s)
+@@ -2712,6 +3912,7 @@ static int __init cfq_init(void)
+ static void __exit cfq_exit(void)
{
+ DECLARE_COMPLETION_ONSTACK(all_gone);
++ blkio_policy_unregister(&blkio_policy_cfq);
+ elv_unregister(&iosched_cfq);
+ ioc_gone = &all_gone;
+ /* ioc_gone's update must be visible before reading ioc_count */
diff --git a/block/elevator.c b/block/elevator.c
index a847046..7e0fe67 100644
--- a/block/elevator.c
@@ -7477,7 +12017,7 @@
if (!sk)
goto out;
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
-index 4fdfa2a..37d414d 100644
+index 4fdfa2a..a052759 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -61,6 +61,7 @@
@@ -7655,7 +12195,21 @@
tun_net_init(dev);
if (strchr(dev->name, '%')) {
-@@ -1316,6 +1340,7 @@ static int tun_chr_open(struct inode *inode, struct file * file)
+@@ -1006,9 +1030,10 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
+ if (err < 0)
+ goto err_free_sk;
+
+- if (device_create_file(&tun->dev->dev, &dev_attr_tun_flags) ||
+- device_create_file(&tun->dev->dev, &dev_attr_owner) ||
+- device_create_file(&tun->dev->dev, &dev_attr_group))
++ if ((dev_net(tun->dev) == &init_net) &&
++ (device_create_file(&tun->dev->dev, &dev_attr_tun_flags) ||
++ device_create_file(&tun->dev->dev, &dev_attr_owner) ||
++ device_create_file(&tun->dev->dev, &dev_attr_group)))
+ printk(KERN_ERR "Failed to create tun sysfs files\n");
+
+ sk->sk_destruct = tun_sock_destruct;
+@@ -1316,6 +1341,7 @@ static int tun_chr_open(struct inode *inode, struct file * file)
tfile->tun = NULL;
tfile->net = get_net(current->nsproxy->net_ns);
file->private_data = tfile;
@@ -7663,7 +12217,7 @@
return 0;
}
-@@ -1457,6 +1482,226 @@ static const struct ethtool_ops tun_ethtool_ops = {
+@@ -1457,6 +1483,226 @@ static const struct ethtool_ops tun_ethtool_ops = {
.set_rx_csum = tun_set_rx_csum
};
@@ -7890,7 +12444,7 @@
static int __init tun_init(void)
{
-@@ -1476,6 +1721,8 @@ static int __init tun_init(void)
+@@ -1476,6 +1722,8 @@ static int __init tun_init(void)
printk(KERN_ERR "tun: Can't register misc device %d\n", TUN_MINOR);
goto err_misc;
}
@@ -7899,7 +12453,7 @@
return 0;
err_misc:
rtnl_link_unregister(&tun_link_ops);
-@@ -1485,6 +1732,7 @@ err_linkops:
+@@ -1485,6 +1733,7 @@ err_linkops:
static void tun_cleanup(void)
{
@@ -11747,7 +16301,7 @@
EXPORT_SYMBOL(get_empty_filp);
diff --git a/fs/filesystems.c b/fs/filesystems.c
-index a24c58e..2723c3e 100644
+index a24c58e..bd5c213 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -14,6 +14,9 @@
@@ -11971,7 +16525,20 @@
tmp = tmp->next;
}
read_unlock(&file_systems_lock);
-@@ -247,7 +356,7 @@ static const struct file_operations filesystems_proc_fops = {
+@@ -224,9 +333,12 @@ static int filesystems_proc_show(struct seq_file *m, void *v)
+ read_lock(&file_systems_lock);
+ tmp = file_systems;
+ while (tmp) {
++ if (!check_ve_fstype(tmp, get_exec_env()))
++ goto next; /* skip in VE */
+ seq_printf(m, "%s\t%s\n",
+ (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
+ tmp->name);
++next:
+ tmp = tmp->next;
+ }
+ read_unlock(&file_systems_lock);
+@@ -247,7 +359,7 @@ static const struct file_operations filesystems_proc_fops = {
static int __init proc_filesystems_init(void)
{
@@ -11980,7 +16547,7 @@
return 0;
}
module_init(proc_filesystems_init);
-@@ -258,8 +367,8 @@ static struct file_system_type *__get_fs_type(const char *name, int len)
+@@ -258,8 +370,8 @@ static struct file_system_type *__get_fs_type(const char *name, int len)
struct file_system_type *fs;
read_lock(&file_systems_lock);
@@ -12434,18 +17001,19 @@
return 0;
}
diff --git a/fs/ioprio.c b/fs/ioprio.c
-index c7c0b28..2a7e8ae 100644
+index c7c0b28..25f7275 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
-@@ -26,6 +26,7 @@
+@@ -26,6 +26,8 @@
#include <linux/syscalls.h>
#include <linux/security.h>
#include <linux/pid_namespace.h>
+#include <linux/nsproxy.h>
++#include <linux/ve_proto.h>
int set_task_ioprio(struct task_struct *task, int ioprio)
{
-@@ -78,8 +79,11 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
+@@ -78,8 +80,11 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
int data = IOPRIO_PRIO_DATA(ioprio);
struct task_struct *p, *g;
struct user_struct *user;
@@ -12458,7 +17026,7 @@
switch (class) {
case IOPRIO_CLASS_RT:
-@@ -137,17 +141,25 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
+@@ -137,17 +142,25 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
if (!user)
break;
@@ -12481,12 +17049,12 @@
+ break;
+ }
+
-+ ret = 0; /* bc_set_ioprio(who, data); */
++ ret = ve_set_ioprio(who, data);
+ break;
default:
ret = -EINVAL;
}
-@@ -192,9 +204,9 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
+@@ -192,9 +205,9 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
{
struct task_struct *g, *p;
struct user_struct *user;
@@ -12497,7 +17065,7 @@
read_lock(&tasklist_lock);
switch (which) {
-@@ -230,7 +242,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
+@@ -230,7 +243,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
if (!user)
break;
@@ -12506,7 +17074,7 @@
if (__task_cred(p)->uid != user->uid)
continue;
tmpio = get_task_ioprio(p);
-@@ -240,7 +252,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
+@@ -240,7 +253,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
ret = tmpio;
else
ret = ioprio_best(ret, tmpio);
@@ -13540,7 +18108,7 @@
mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC);
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
-index 69d6a46..b9a8f89 100644
+index 127ed5c..95a31c8 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -125,6 +125,7 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
@@ -14554,7 +19122,7 @@
mmput(mm);
return 0;
diff --git a/fs/proc/base.c b/fs/proc/base.c
-index 13b0378..eb8a70f 100644
+index a1bb0f6..ef6ee19 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -49,6 +49,7 @@
@@ -25476,6 +30044,41 @@
/**
* has_capability - Determine if a task has a superior capability available
* @t: The task in question
+diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
+index 0008dee..9665343 100644
+--- a/include/linux/cgroup.h
++++ b/include/linux/cgroup.h
+@@ -220,6 +220,8 @@ struct cgroup {
+
+ /* For RCU-protected deletion */
+ struct rcu_head rcu_head;
++
++ int cgroup_lite_id;
+ };
+
+ /*
+@@ -525,6 +527,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
+ void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it);
+ int cgroup_scan_tasks(struct cgroup_scanner *scan);
+ int cgroup_attach_task(struct cgroup *, struct task_struct *);
++int cgroup_set_task_css(struct task_struct *tsk, struct css_set *css);
+
+ /*
+ * CSS ID is ID for cgroup_subsys_state structs under subsys. This only works
+diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
+index 9c8d31b..ccefff0 100644
+--- a/include/linux/cgroup_subsys.h
++++ b/include/linux/cgroup_subsys.h
+@@ -60,3 +60,9 @@ SUBSYS(net_cls)
+ #endif
+
+ /* */
++
++#ifdef CONFIG_BLK_CGROUP
++SUBSYS(blkio)
++#endif
++
++/* */
diff --git a/include/linux/compat.h b/include/linux/compat.h
index af931ee..499d84a 100644
--- a/include/linux/compat.h
@@ -28285,6 +32888,38 @@
#ifdef CONFIG_INOTIFY
/* Kernel API for producing events */
+diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
+index 4da4a75..d61b0b8 100644
+--- a/include/linux/iocontext.h
++++ b/include/linux/iocontext.h
+@@ -40,16 +40,11 @@ struct cfq_io_context {
+ struct io_context *ioc;
+
+ unsigned long last_end_request;
+- sector_t last_request_pos;
+
+ unsigned long ttime_total;
+ unsigned long ttime_samples;
+ unsigned long ttime_mean;
+
+- unsigned int seek_samples;
+- u64 seek_total;
+- sector_t seek_mean;
+-
+ struct list_head queue_list;
+ struct hlist_node cic_list;
+
+@@ -73,6 +68,10 @@ struct io_context {
+ unsigned short ioprio;
+ unsigned short ioprio_changed;
+
++#ifdef CONFIG_BLK_CGROUP
++ unsigned short cgroup_changed;
++#endif
++
+ /*
+ * For request batching
+ */
diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h
index 76dad48..c699950 100644
--- a/include/linux/ioprio.h
@@ -28646,6 +33281,19 @@
extern void put_mnt_ns(struct mnt_namespace *ns);
static inline void get_mnt_ns(struct mnt_namespace *ns)
{
+diff --git a/include/linux/module.h b/include/linux/module.h
+index 460df15..482efc8 100644
+--- a/include/linux/module.h
++++ b/include/linux/module.h
+@@ -455,7 +455,7 @@ void symbol_put_addr(void *addr);
+ static inline local_t *__module_ref_addr(struct module *mod, int cpu)
+ {
+ #ifdef CONFIG_SMP
+- return (local_t *) per_cpu_ptr(mod->refptr, cpu);
++ return (local_t *) (mod->refptr + per_cpu_offset(cpu));
+ #else
+ return &mod->ref;
+ #endif
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 5d52753..f4bf358 100644
--- a/include/linux/mount.h
@@ -31211,10 +35859,10 @@
+#endif
diff --git a/include/linux/ve_proto.h b/include/linux/ve_proto.h
new file mode 100644
-index 0000000..8bc4e01
+index 0000000..5bb93e8
--- /dev/null
+++ b/include/linux/ve_proto.h
-@@ -0,0 +1,96 @@
+@@ -0,0 +1,100 @@
+/*
+ * include/linux/ve_proto.h
+ *
@@ -31246,6 +35894,10 @@
+#endif
+#endif
+
++#define VE_IOPRIO_MIN 0
++#define VE_IOPRIO_MAX 8
++extern int ve_set_ioprio(int veid, int ioprio);
++
+extern struct list_head ve_list_head;
+#define for_each_ve(ve) list_for_each_entry((ve), &ve_list_head, ve_list)
+extern rwlock_t ve_list_lock;
@@ -41040,10 +45692,10 @@
diff --git a/kernel/cgroup_lite.c b/kernel/cgroup_lite.c
new file mode 100644
-index 0000000..0de6d16
+index 0000000..d299cf6
--- /dev/null
+++ b/kernel/cgroup_lite.c
-@@ -0,0 +1,226 @@
+@@ -0,0 +1,342 @@
+/*
+ * lite cgroups engine
+ */
@@ -41065,6 +45717,78 @@
+static struct cgroup init_cgroup;
+static struct cftype *subsys_cftypes[CGROUP_SUBSYS_COUNT];
+
++static struct idr cgroup_idr;
++static DEFINE_SPINLOCK(cgroup_idr_lock);
++
++unsigned short css_id(struct cgroup_subsys_state *css)
++{
++ return css->cgroup->cgroup_lite_id;
++}
++
++unsigned short css_depth(struct cgroup_subsys_state *css)
++{
++ return (css->cgroup == &init_cgroup) ? 0 : 1;
++}
++
++int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
++{
++ snprintf(buf, buflen, "/%d", cgrp->cgroup_lite_id);
++ return 0;
++}
++
++struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
++{
++ struct cgroup *g;
++
++ BUG_ON(!ss->use_id);
++ g = idr_find(&cgroup_idr, id);
++ if (!g)
++ return NULL;
++ return g->subsys[ss->subsys_id];
++}
++
++void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
++{
++}
++
++static int init_cgroup_id(struct cgroup *g)
++{
++ int err, id;
++
++ if (unlikely(!idr_pre_get(&cgroup_idr, GFP_KERNEL)))
++ return -ENOMEM;
++
++ spin_lock(&cgroup_idr_lock);
++ err = idr_get_new_above(&cgroup_idr, g, 1, &id);
++ spin_unlock(&cgroup_idr_lock);
++
++ if (err)
++ return err;
++
++ if (id > USHORT_MAX) {
++ spin_lock(&cgroup_idr_lock);
++ idr_remove(&cgroup_idr, id);
++ spin_unlock(&cgroup_idr_lock);
++ return -ENOSPC;
++ }
++
++ g->cgroup_lite_id = id;
++
++ return 0;
++}
++
++static void fini_cgroup_id(struct cgroup *g)
++{
++ spin_lock(&cgroup_idr_lock);
++ idr_remove(&cgroup_idr, g->cgroup_lite_id);
++ spin_unlock(&cgroup_idr_lock);
++}
++
++void __css_put(struct cgroup_subsys_state *css)
++{
++ atomic_dec(&css->refcnt);
++}
++
+static int init_css_set_subsystems(struct cgroup *g, struct css_set *set)
+{
+ int i;
@@ -41079,7 +45803,7 @@
+
+ g->subsys[i] = ss;
+ set->subsys[i] = ss;
-+ atomic_set(&ss->refcnt, 0);
++ atomic_set(&ss->refcnt, 1);
+ ss->cgroup = g;
+ }
+ return 0;
@@ -41108,6 +45832,10 @@
+ if (cs == NULL)
+ goto err_calloc;
+
++ err = init_cgroup_id(g);
++ if (err)
++ goto err_id;
++
+ g->parent = &init_cgroup;
+ err = init_css_set_subsystems(g, cs);
+ if (err)
@@ -41119,6 +45847,8 @@
+ return 0;
+
+err_subsys:
++ fini_cgroup_id(g);
++err_id:
+ kfree(cs);
+err_calloc:
+ kfree(g);
@@ -41142,13 +45872,14 @@
+ if (cs->pre_destroy)
+ cs->pre_destroy(cs, g);
+
-+ if (atomic_read(&ss->refcnt))
++ if (atomic_read(&ss->refcnt) != 1)
+ printk(KERN_ERR "CG: leaking %d/%s subsys\n",
+ ve->veid, subsys[i]->name);
+ else
+ cs->destroy(cs, g);
+ }
+
++ fini_cgroup_id(g);
+ kfree(g);
+ kfree(css);
+ ve->ve_cgroup = NULL;
@@ -41183,6 +45914,40 @@
+ return -ENODATA;
+}
+
++int cgroup_set_task_css(struct task_struct *tsk, struct css_set *css)
++{
++ int i, err;
++ struct cgroup_subsys *cs;
++ struct css_set *old_css;
++
++ old_css = tsk->cgroups;
++
++ if (old_css == css)
++ return 0;
++
++ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
++ cs = subsys[i];
++ if (!cs->can_attach)
++ continue;
++ err = cs->can_attach(cs, css->subsys[i]->cgroup, tsk, false);
++ if (err)
++ return err;
++ }
++
++ tsk->cgroups = css;
++
++ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
++ cs = subsys[i];
++ if (!cs->attach)
++ continue;
++ cs->attach(cs, css->subsys[i]->cgroup,
++ old_css->subsys[i]->cgroup, tsk, false);
++ }
++
++ return 0;
++}
++EXPORT_SYMBOL(cgroup_set_task_css);
++
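For illustration only, a minimal sketch (not from the patch) of how a caller might use the helper above to move a task into a container's lite css_set; the patch itself calls it this way when a process enters a VE. The function name and the assumption of a valid ve pointer are the editor's, not the patch author's:

	/* Sketch: attach the current task to a VE's css_set via cgroup_set_task_css(). */
	static int example_enter_ve_cgroups(struct ve_struct *ve)
	{
		int err;

		err = cgroup_set_task_css(current, ve->ve_css_set);
		if (err)
			return err;	/* some subsystem's can_attach() refused the move */
		return 0;
	}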
+/*
+ * proc struts
+ */
@@ -41266,6 +46031,9 @@
+{
+ get_ve0()->ve_cgroup = &init_cgroup;
+ get_ve0()->ve_css_set = &init_css_set;
++ idr_init(&cgroup_idr);
++ if (init_cgroup_id(&init_cgroup))
++ panic("CG: Can't init initial cgroup id\n");
+ if (init_css_set_subsystems(&init_cgroup, &init_css_set) != 0)
+ panic("CG: Can't init initial set\n");
+ return 0;
@@ -43690,10 +48458,10 @@
+EXPORT_SYMBOL(lookup_cpt_obj_bypos);
diff --git a/kernel/cpt/cpt_files.c b/kernel/cpt/cpt_files.c
new file mode 100644
-index 0000000..3ada205
+index 0000000..927a4e3
--- /dev/null
+++ b/kernel/cpt/cpt_files.c
-@@ -0,0 +1,1783 @@
+@@ -0,0 +1,1782 @@
+/*
+ *
+ * kernel/cpt/cpt_files.c
@@ -44357,20 +49125,19 @@
+
+ if (file->f_op == &shm_file_operations ||
+ file->f_op == &shmem_file_operations) {
-+ struct file *shm_file = file;
+
+ /* shmget uses shm ops */
+ if (file->f_op == &shm_file_operations) {
+ struct shm_file_data *sfd = file->private_data;
-+ shm_file = sfd->file;
++ file = sfd->file;
+ }
+
-+ cpt_dump_content_sysvshm(shm_file, ctx);
++ cpt_dump_content_sysvshm(file, ctx);
+
-+ do_read = shm_file->f_dentry->d_inode->i_fop->read;
++ do_read = file->f_dentry->d_inode->i_fop->read;
+ if (!do_read) {
+ wprintk_ctx("TMPFS is not configured?\n");
-+ return dump_content_shm(shm_file, ctx);
++ return dump_content_shm(file, ctx);
+ }
+ }
+
@@ -67177,7 +71944,7 @@
+cond_syscall(sys_fairsched_chwt);
+cond_syscall(sys_fairsched_rate);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
-index b8bd058..5ef2188 100644
+index b8bd058..5b754e4 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -50,6 +50,7 @@
@@ -67210,17 +71977,20 @@
extern int latencytop_enabled;
extern int sysctl_nr_open_min, sysctl_nr_open_max;
#ifndef CONFIG_MMU
-@@ -169,6 +185,9 @@ static int proc_taint(struct ctl_table *table, int write,
+@@ -169,6 +185,12 @@ static int proc_taint(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
#endif
+static int proc_dointvec_ve(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos);
++static int sysctl_data_ve(struct ctl_table *table,
++ void __user *oldval, size_t __user *oldlenp,
++ void __user *newval, size_t newlen);
+
static struct ctl_table root_table[];
static struct ctl_table_root sysctl_table_root;
static struct ctl_table_header root_table_header = {
-@@ -178,9 +197,31 @@ static struct ctl_table_header root_table_header = {
+@@ -178,9 +200,31 @@ static struct ctl_table_header root_table_header = {
.root = &sysctl_table_root,
.set = &sysctl_table_root.default_set,
};
@@ -67253,7 +72023,7 @@
};
static struct ctl_table kern_table[];
-@@ -504,6 +545,20 @@ static struct ctl_table kern_table[] = {
+@@ -504,6 +548,20 @@ static struct ctl_table kern_table[] = {
.proc_handler = &proc_dointvec,
},
#endif
@@ -67274,7 +72044,7 @@
#ifdef __hppa__
{
.ctl_name = KERN_HPPA_PWRSW,
-@@ -699,6 +754,24 @@ static struct ctl_table kern_table[] = {
+@@ -699,6 +757,24 @@ static struct ctl_table kern_table[] = {
.extra1 = &pid_max_min,
.extra2 = &pid_max_max,
},
@@ -67299,7 +72069,7 @@
{
.ctl_name = KERN_PANIC_ON_OOPS,
.procname = "panic_on_oops",
-@@ -824,10 +897,12 @@ static struct ctl_table kern_table[] = {
+@@ -824,10 +900,13 @@ static struct ctl_table kern_table[] = {
{
.ctl_name = KERN_RANDOMIZE,
.procname = "randomize_va_space",
@@ -67311,10 +72081,11 @@
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = &proc_dointvec_ve,
++ .strategy = &sysctl_data_ve,
},
#endif
#if defined(CONFIG_S390) && defined(CONFIG_SMP)
-@@ -1424,6 +1499,21 @@ static struct ctl_table vm_table[] = {
+@@ -1424,6 +1503,21 @@ static struct ctl_table vm_table[] = {
.extra2 = &one,
},
#endif
@@ -67336,7 +72107,7 @@
/*
* NOTE: do not add new entries to this table unless you have read
-@@ -1600,6 +1690,13 @@ static struct ctl_table fs_table[] = {
+@@ -1600,6 +1694,13 @@ static struct ctl_table fs_table[] = {
};
static struct ctl_table debug_table[] = {
@@ -67350,7 +72121,7 @@
#if defined(CONFIG_X86) || defined(CONFIG_PPC)
{
.ctl_name = CTL_UNNUMBERED,
-@@ -2150,10 +2247,27 @@ struct ctl_table_header *__register_sysctl_paths(
+@@ -2150,10 +2251,27 @@ struct ctl_table_header *__register_sysctl_paths(
struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
struct ctl_table *table)
{
@@ -67378,7 +72149,7 @@
/**
* register_sysctl_table - register a sysctl table hierarchy
* @table: the top-level table structure
-@@ -2170,6 +2284,14 @@ struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
+@@ -2170,6 +2288,14 @@ struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
return register_sysctl_paths(null_path, table);
}
@@ -67393,7 +72164,7 @@
/**
* unregister_sysctl_table - unregister a sysctl table hierarchy
* @header: the header returned from register_sysctl_table
-@@ -2231,6 +2353,18 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
+@@ -2231,6 +2357,18 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
return NULL;
}
@@ -67412,7 +72183,7 @@
void unregister_sysctl_table(struct ctl_table_header * table)
{
}
-@@ -2902,6 +3036,25 @@ static int proc_do_cad_pid(struct ctl_table *table, int write,
+@@ -2902,6 +3040,25 @@ static int proc_do_cad_pid(struct ctl_table *table, int write,
return 0;
}
@@ -67438,7 +72209,49 @@
#else /* CONFIG_PROC_FS */
int proc_dostring(struct ctl_table *table, int write,
-@@ -3236,6 +3389,56 @@ static int deprecated_sysctl_warning(struct __sysctl_args *args)
+@@ -2996,6 +3153,27 @@ int sysctl_data(struct ctl_table *table,
+ return 1;
+ }
+
++#ifdef CONFIG_VE
++static int sysctl_data_ve(struct ctl_table *table,
++ void __user *oldval, size_t __user *oldlenp,
++ void __user *newval, size_t newlen)
++{
++ struct ctl_table tmp_table;
++
++ tmp_table = *table;
++ tmp_table.data = (char *)get_exec_env() + (unsigned long)table->extra1;
++
++ return sysctl_data(&tmp_table, oldval, oldlenp, newval, newlen);
++}
++#else
++static int sysctl_data_ve(struct ctl_table *table,
++ void __user *oldval, size_t __user *oldlenp,
++ void __user *newval, size_t newlen)
++{
++ return sysctl_data(table, oldval, oldlenp, newval, newlen);
++}
++#endif
++
+ /* The generic string strategy routine: */
+ int sysctl_string(struct ctl_table *table,
+ void __user *oldval, size_t __user *oldlenp,
+@@ -3175,6 +3353,13 @@ int sysctl_data(struct ctl_table *table,
+ return -ENOSYS;
+ }
+
++static int sysctl_data_ve(struct ctl_table *table,
++ void __user *oldval, size_t __user *oldlenp,
++ void __user *newval, size_t newlen)
++{
++ return -ENOSYS;
++}
++
+ int sysctl_string(struct ctl_table *table,
+ void __user *oldval, size_t __user *oldlenp,
+ void __user *newval, size_t newlen)
+@@ -3236,6 +3421,56 @@ static int deprecated_sysctl_warning(struct __sysctl_args *args)
return 0;
}
@@ -67495,7 +72308,7 @@
/*
* No sense putting this after each symbol definition, twice,
* exception granted :-)
-@@ -3249,7 +3452,9 @@ EXPORT_SYMBOL(proc_dostring);
+@@ -3249,7 +3484,9 @@ EXPORT_SYMBOL(proc_dostring);
EXPORT_SYMBOL(proc_doulongvec_minmax);
EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
EXPORT_SYMBOL(register_sysctl_table);
@@ -67924,10 +72737,10 @@
+
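For illustration only, a sketch (not from the patch) of a per-VE sysctl entry wired to the two handlers added above: .extra1 carries the field's offset inside struct ve_struct (as sysctl_data_ve computes its data pointer from get_exec_env() plus that offset), while .data keeps a plain fallback address for !CONFIG_VE builds. The field and variable names are hypothetical, and since both handlers are static to kernel/sysctl.c such an entry would have to live in that file:

	static int some_knob_default;	/* hypothetical fallback used when CONFIG_VE is off */

	static struct ctl_table example_ve_table[] = {
		{
			.ctl_name	= CTL_UNNUMBERED,
			.procname	= "some_knob",
			.data		= &some_knob_default,
			.maxlen		= sizeof(int),
			.mode		= 0644,
			.proc_handler	= &proc_dointvec_ve,
			.strategy	= &sysctl_data_ve,
			.extra1		= (void *)offsetof(struct ve_struct, some_knob),
		},
		{ .ctl_name = 0 }
	};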
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
new file mode 100644
-index 0000000..85c42c3
+index 0000000..907d944
--- /dev/null
+++ b/kernel/ve/ve.c
-@@ -0,0 +1,129 @@
+@@ -0,0 +1,161 @@
+/*
+ * linux/kernel/ve/ve.c
+ *
@@ -68057,9 +72870,41 @@
+
+ wake_up_process(ve_cleanup_thread);
+}
++
++#ifdef CONFIG_BLK_CGROUP
++extern int blkiocg_set_weight(struct cgroup *cgroup, u64 val);
++
++static u64 ioprio_weight[VE_IOPRIO_MAX] = {200, 275, 350, 425, 500, 575, 650, 725};
++
++int ve_set_ioprio(int veid, int ioprio)
++{
++ struct ve_struct *ve;
++ int ret;
++
++ if (ioprio < VE_IOPRIO_MIN || ioprio >= VE_IOPRIO_MAX)
++ return -ERANGE;
++
++ ret = -ESRCH;
++ read_lock(&ve_list_lock);
++ for_each_ve(ve) {
++ if (ve->veid != veid)
++ continue;
++ ret = blkiocg_set_weight(ve->ve_cgroup, ioprio_weight[ioprio]);
++ break;
++ }
++ read_unlock(&ve_list_lock);
++
++ return ret;
++}
++#else
++int ve_set_ioprio(int veid, int ioprio)
++{
++ return -EINVAL;
++}
++#endif /* CONFIG_BLK_CGROUP */
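For illustration only (not part of the patch): the ioprio_weight[] table above is a linear ramp, equivalent to weight = 200 + 75 * ioprio for ioprio in [VE_IOPRIO_MIN, VE_IOPRIO_MAX). A throwaway sketch of the same mapping:

	/* Sketch: identical mapping to the ioprio_weight[] table above. */
	static inline u64 example_ve_ioprio_to_weight(int ioprio)
	{
		return 200 + 75 * (u64)ioprio;	/* 0 -> 200, ..., 7 -> 725 */
	}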
diff --git a/kernel/ve/vecalls.c b/kernel/ve/vecalls.c
new file mode 100644
-index 0000000..cc27878
+index 0000000..9947b57
--- /dev/null
+++ b/kernel/ve/vecalls.c
@@ -0,0 +1,2335 @@
@@ -68905,7 +73750,7 @@
+ atomic_inc(&new->pcounter);
+ get_ve(new);
+
-+ tsk->cgroups = new->ve_css_set;
++ cgroup_set_task_css(tsk, new->ve_css_set);
+
+ new->user_ns = get_user_ns(new_creds->user->user_ns);
+}
@@ -70400,7 +75245,7 @@
+module_exit(vecalls_exit)
diff --git a/kernel/ve/veowner.c b/kernel/ve/veowner.c
new file mode 100644
-index 0000000..50f4d9a
+index 0000000..0726e44
--- /dev/null
+++ b/kernel/ve/veowner.c
@@ -0,0 +1,160 @@
@@ -70514,7 +75359,7 @@
+ .proc_handler = proc_dointvec,
+ },
+ {
-+ .ctl_name = 228,
++ .ctl_name = CTL_UNNUMBERED,
+ .procname = "ve-xattr-policy",
+ .data = &ve_xattr_policy,
+ .maxlen = sizeof(int),
@@ -72075,7 +76920,7 @@
static int do_mlockall(int flags)
{
diff --git a/mm/mmap.c b/mm/mmap.c
-index ae19746..991a1ac 100644
+index ae19746..a5dd0bf 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -29,6 +29,7 @@
@@ -72138,15 +76983,6 @@
goto out;
set_brk:
mm->brk = brk;
-@@ -927,7 +946,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
- prot |= PROT_EXEC;
-
- if (!len)
-- return -EINVAL;
-+ return strncmp(current->comm, "rpm", 3) ? -EINVAL : addr;
-
- if (!(flags & MAP_FIXED))
- addr = round_hint_to_min(addr);
@@ -1106,6 +1125,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
struct rb_node **rb_link, *rb_parent;
unsigned long charged = 0;