diff --git a/.abf.yml b/.abf.yml
index 3fe367d..8c7c525 100644
--- a/.abf.yml
+++ b/.abf.yml
@@ -1,3 +1,3 @@
 sources:
- linux-4.11.tar.xz: 0d2594b7aa3e79521f229569f9e14dc56bdcbd78
- patch-4.11.9.xz: 42627a156cb3815a3b2be144dee93d79f7d9829f
+ linux-4.12.tar.xz: f5dda0344401c436aa47685fbf345f4b7975fcf5
+ patch-4.12.2.xz: 97521561145d5b79dd07d6c128042d1c471e7694
diff --git a/0001-block-cgroups-kconfig-build-bits-for-BFQ-v7r11-4.11..patch b/0001-block-cgroups-kconfig-build-bits-for-BFQ-v7r11-4.11..patch
deleted file mode 100644
index 4555daa..0000000
--- a/0001-block-cgroups-kconfig-build-bits-for-BFQ-v7r11-4.11..patch
+++ /dev/null
@@ -1,103 +0,0 @@
-From a0bd3c561ad7ec10c22a5ca345c6e4c5df117e41 Mon Sep 17 00:00:00 2001
-From: Paolo Valente
-Date: Tue, 7 Apr 2015 13:39:12 +0200
-Subject: [PATCH 1/4] block: cgroups, kconfig, build bits for BFQ-v7r11-4.11.0
-
-Update Kconfig.iosched and do the related Makefile changes to include
-kernel configuration options for BFQ. Also increase the number of
-policies supported by the blkio controller so that BFQ can add its
-own.
-
-Signed-off-by: Paolo Valente
-Signed-off-by: Arianna Avanzini
----
- block/Kconfig.iosched | 32 ++++++++++++++++++++++++++++++++
- block/Makefile | 1 +
- include/linux/blkdev.h | 2 +-
- 3 files changed, 34 insertions(+), 1 deletion(-)
-
-diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
-index 58fc868..bb47b1a 100644
---- a/block/Kconfig.iosched
-+++ b/block/Kconfig.iosched
-@@ -39,6 +39,27 @@ config CFQ_GROUP_IOSCHED
- ---help---
- Enable group IO scheduling in CFQ.
-
-+config IOSCHED_BFQ
-+ tristate "BFQ I/O scheduler"
-+ default n
-+ ---help---
-+ The BFQ I/O scheduler tries to distribute bandwidth among
-+ all processes according to their weights.
-+ It aims at distributing the bandwidth as desired, independently of
-+ the disk parameters and with any workload. It also tries to
-+ guarantee low latency to interactive and soft real-time
-+ applications. If compiled built-in (saying Y here), BFQ can
-+ be configured to support hierarchical scheduling.
-+
-+config CGROUP_BFQIO
-+ bool "BFQ hierarchical scheduling support"
-+ depends on CGROUPS && IOSCHED_BFQ=y
-+ default n
-+ ---help---
-+ Enable hierarchical scheduling in BFQ, using the cgroups
-+ filesystem interface. The name of the subsystem will be
-+ bfqio.
-+
- choice
- prompt "Default I/O scheduler"
- default DEFAULT_CFQ
-@@ -52,6 +73,16 @@ choice
- config DEFAULT_CFQ
- bool "CFQ" if IOSCHED_CFQ=y
-
-+ config DEFAULT_BFQ
-+ bool "BFQ" if IOSCHED_BFQ=y
-+ help
-+ Selects BFQ as the default I/O scheduler which will be
-+ used by default for all block devices.
-+ The BFQ I/O scheduler aims at distributing the bandwidth
-+ as desired, independently of the disk parameters and with
-+ any workload. It also tries to guarantee low latency to
-+ interactive and soft real-time applications.
-+ - config DEFAULT_NOOP - bool "No-op" - -@@ -61,6 +92,7 @@ config DEFAULT_IOSCHED - string - default "deadline" if DEFAULT_DEADLINE - default "cfq" if DEFAULT_CFQ -+ default "bfq" if DEFAULT_BFQ - default "noop" if DEFAULT_NOOP - - config MQ_IOSCHED_DEADLINE -diff --git a/block/Makefile b/block/Makefile -index 081bb68..91869f2 100644 ---- a/block/Makefile -+++ b/block/Makefile -@@ -20,6 +20,7 @@ obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o - obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o - obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o - obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o -+obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o - - obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o - obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o -diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h -index 01a696b..29d537d 100644 ---- a/include/linux/blkdev.h -+++ b/include/linux/blkdev.h -@@ -48,7 +48,7 @@ struct rq_wb; - * Maximum number of blkcg policies allowed to be registered concurrently. - * Defined here to simplify include dependency. - */ --#define BLKCG_MAX_POLS 2 -+#define BLKCG_MAX_POLS 3 - - typedef void (rq_end_io_fn)(struct request *, int); - --- -2.10.0 - diff --git a/0002-block-introduce-the-BFQ-v7r11-I-O-sched-for-4.11.0.patch b/0002-block-introduce-the-BFQ-v7r11-I-O-sched-for-4.11.0.patch deleted file mode 100644 index 0c46a2e..0000000 --- a/0002-block-introduce-the-BFQ-v7r11-I-O-sched-for-4.11.0.patch +++ /dev/null @@ -1,7109 +0,0 @@ -From ce617fdef48078f52afeec078dacbe7ac9d74588 Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Thu, 9 May 2013 19:10:02 +0200 -Subject: [PATCH 2/4] block: introduce the BFQ-v7r11 I/O sched for 4.11.0 - -The general structure is borrowed from CFQ, as much of the code for -handling I/O contexts. Over time, several useful features have been -ported from CFQ as well (details in the changelog in README.BFQ). A -(bfq_)queue is associated to each task doing I/O on a device, and each -time a scheduling decision has to be made a queue is selected and served -until it expires. - - - Slices are given in the service domain: tasks are assigned - budgets, measured in number of sectors. Once got the disk, a task - must however consume its assigned budget within a configurable - maximum time (by default, the maximum possible value of the - budgets is automatically computed to comply with this timeout). - This allows the desired latency vs "throughput boosting" tradeoff - to be set. - - - Budgets are scheduled according to a variant of WF2Q+, implemented - using an augmented rb-tree to take eligibility into account while - preserving an O(log N) overall complexity. - - - A low-latency tunable is provided; if enabled, both interactive - and soft real-time applications are guaranteed a very low latency. - - - Latency guarantees are preserved also in the presence of NCQ. - - - Also with flash-based devices, a high throughput is achieved - while still preserving latency guarantees. - - - BFQ features Early Queue Merge (EQM), a sort of fusion of the - cooperating-queue-merging and the preemption mechanisms present - in CFQ. EQM is in fact a unified mechanism that tries to get a - sequential read pattern, and hence a high throughput, with any - set of processes performing interleaved I/O over a contiguous - sequence of sectors. - - - BFQ supports full hierarchical scheduling, exporting a cgroups - interface. Since each node has a full scheduler, each group can - be assigned its own weight. 
- - - If the cgroups interface is not used, only I/O priorities can be - assigned to processes, with ioprio values mapped to weights - with the relation weight = IOPRIO_BE_NR - ioprio. - - - ioprio classes are served in strict priority order, i.e., lower - priority queues are not served as long as there are higher - priority queues. Among queues in the same class the bandwidth is - distributed in proportion to the weight of each queue. A very - thin extra bandwidth is however guaranteed to the Idle class, to - prevent it from starving. - -Signed-off-by: Paolo Valente -Signed-off-by: Arianna Avanzini ---- - block/Kconfig.iosched | 6 +- - block/bfq-cgroup.c | 1186 ++++++++++++++++ - block/bfq-ioc.c | 36 + - block/bfq-iosched.c | 3763 +++++++++++++++++++++++++++++++++++++++++++++++++ - block/bfq-sched.c | 1199 ++++++++++++++++ - block/bfq.h | 801 +++++++++++ - 6 files changed, 6987 insertions(+), 4 deletions(-) - create mode 100644 block/bfq-cgroup.c - create mode 100644 block/bfq-ioc.c - create mode 100644 block/bfq-iosched.c - create mode 100644 block/bfq-sched.c - create mode 100644 block/bfq.h - -diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched -index bb47b1a..b1ab0ca 100644 ---- a/block/Kconfig.iosched -+++ b/block/Kconfig.iosched -@@ -51,14 +51,12 @@ config IOSCHED_BFQ - applications. If compiled built-in (saying Y here), BFQ can - be configured to support hierarchical scheduling. - --config CGROUP_BFQIO -+config BFQ_GROUP_IOSCHED - bool "BFQ hierarchical scheduling support" - depends on CGROUPS && IOSCHED_BFQ=y - default n - ---help--- -- Enable hierarchical scheduling in BFQ, using the cgroups -- filesystem interface. The name of the subsystem will be -- bfqio. -+ Enable hierarchical scheduling in BFQ, using the blkio controller. - - choice - prompt "Default I/O scheduler" -diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c -new file mode 100644 -index 0000000..8b08a57 ---- /dev/null -+++ b/block/bfq-cgroup.c -@@ -0,0 +1,1186 @@ -+/* -+ * BFQ: CGROUPS support. -+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe -+ * -+ * Copyright (C) 2008 Fabio Checconi -+ * Paolo Valente -+ * -+ * Copyright (C) 2010 Paolo Valente -+ * -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ -+ * file. -+ */ -+ -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ -+/* bfqg stats flags */ -+enum bfqg_stats_flags { -+ BFQG_stats_waiting = 0, -+ BFQG_stats_idling, -+ BFQG_stats_empty, -+}; -+ -+#define BFQG_FLAG_FNS(name) \ -+static void bfqg_stats_mark_##name(struct bfqg_stats *stats) \ -+{ \ -+ stats->flags |= (1 << BFQG_stats_##name); \ -+} \ -+static void bfqg_stats_clear_##name(struct bfqg_stats *stats) \ -+{ \ -+ stats->flags &= ~(1 << BFQG_stats_##name); \ -+} \ -+static int bfqg_stats_##name(struct bfqg_stats *stats) \ -+{ \ -+ return (stats->flags & (1 << BFQG_stats_##name)) != 0; \ -+} \ -+ -+BFQG_FLAG_FNS(waiting) -+BFQG_FLAG_FNS(idling) -+BFQG_FLAG_FNS(empty) -+#undef BFQG_FLAG_FNS -+ -+/* This should be called with the queue_lock held. */ -+static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) -+{ -+ unsigned long long now; -+ -+ if (!bfqg_stats_waiting(stats)) -+ return; -+ -+ now = sched_clock(); -+ if (time_after64(now, stats->start_group_wait_time)) -+ blkg_stat_add(&stats->group_wait_time, -+ now - stats->start_group_wait_time); -+ bfqg_stats_clear_waiting(stats); -+} -+ -+/* This should be called with the queue_lock held. 
*/ -+static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, -+ struct bfq_group *curr_bfqg) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ -+ if (bfqg_stats_waiting(stats)) -+ return; -+ if (bfqg == curr_bfqg) -+ return; -+ stats->start_group_wait_time = sched_clock(); -+ bfqg_stats_mark_waiting(stats); -+} -+ -+/* This should be called with the queue_lock held. */ -+static void bfqg_stats_end_empty_time(struct bfqg_stats *stats) -+{ -+ unsigned long long now; -+ -+ if (!bfqg_stats_empty(stats)) -+ return; -+ -+ now = sched_clock(); -+ if (time_after64(now, stats->start_empty_time)) -+ blkg_stat_add(&stats->empty_time, -+ now - stats->start_empty_time); -+ bfqg_stats_clear_empty(stats); -+} -+ -+static void bfqg_stats_update_dequeue(struct bfq_group *bfqg) -+{ -+ blkg_stat_add(&bfqg->stats.dequeue, 1); -+} -+ -+static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ -+ if (blkg_rwstat_total(&stats->queued)) -+ return; -+ -+ /* -+ * group is already marked empty. This can happen if bfqq got new -+ * request in parent group and moved to this group while being added -+ * to service tree. Just ignore the event and move on. -+ */ -+ if (bfqg_stats_empty(stats)) -+ return; -+ -+ stats->start_empty_time = sched_clock(); -+ bfqg_stats_mark_empty(stats); -+} -+ -+static void bfqg_stats_update_idle_time(struct bfq_group *bfqg) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ -+ if (bfqg_stats_idling(stats)) { -+ unsigned long long now = sched_clock(); -+ -+ if (time_after64(now, stats->start_idle_time)) -+ blkg_stat_add(&stats->idle_time, -+ now - stats->start_idle_time); -+ bfqg_stats_clear_idling(stats); -+ } -+} -+ -+static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ -+ stats->start_idle_time = sched_clock(); -+ bfqg_stats_mark_idling(stats); -+} -+ -+static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ -+ blkg_stat_add(&stats->avg_queue_size_sum, -+ blkg_rwstat_total(&stats->queued)); -+ blkg_stat_add(&stats->avg_queue_size_samples, 1); -+ bfqg_stats_update_group_wait_time(stats); -+} -+ -+static struct blkcg_policy blkcg_policy_bfq; -+ -+/* -+ * blk-cgroup policy-related handlers -+ * The following functions help in converting between blk-cgroup -+ * internal structures and BFQ-specific structures. -+ */ -+ -+static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd) -+{ -+ return pd ? container_of(pd, struct bfq_group, pd) : NULL; -+} -+ -+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg) -+{ -+ return pd_to_blkg(&bfqg->pd); -+} -+ -+static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg) -+{ -+ struct blkg_policy_data *pd = blkg_to_pd(blkg, &blkcg_policy_bfq); -+ -+ BUG_ON(!pd); -+ -+ return pd_to_bfqg(pd); -+} -+ -+/* -+ * bfq_group handlers -+ * The following functions help in navigating the bfq_group hierarchy -+ * by allowing to find the parent of a bfq_group or the bfq_group -+ * associated to a bfq_queue. -+ */ -+ -+static struct bfq_group *bfqg_parent(struct bfq_group *bfqg) -+{ -+ struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent; -+ -+ return pblkg ? blkg_to_bfqg(pblkg) : NULL; -+} -+ -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *group_entity = bfqq->entity.parent; -+ -+ return group_entity ? 
container_of(group_entity, struct bfq_group, -+ entity) : -+ bfqq->bfqd->root_group; -+} -+ -+/* -+ * The following two functions handle get and put of a bfq_group by -+ * wrapping the related blk-cgroup hooks. -+ */ -+ -+static void bfqg_get(struct bfq_group *bfqg) -+{ -+ return blkg_get(bfqg_to_blkg(bfqg)); -+} -+ -+static void bfqg_put(struct bfq_group *bfqg) -+{ -+ return blkg_put(bfqg_to_blkg(bfqg)); -+} -+ -+static void bfqg_stats_update_io_add(struct bfq_group *bfqg, -+ struct bfq_queue *bfqq, -+ int rw) -+{ -+ blkg_rwstat_add(&bfqg->stats.queued, rw, 1); -+ bfqg_stats_end_empty_time(&bfqg->stats); -+ if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue)) -+ bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); -+} -+ -+static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, int rw) -+{ -+ blkg_rwstat_add(&bfqg->stats.queued, rw, -1); -+} -+ -+static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, int rw) -+{ -+ blkg_rwstat_add(&bfqg->stats.merged, rw, 1); -+} -+ -+static void bfqg_stats_update_dispatch(struct bfq_group *bfqg, -+ uint64_t bytes, int rw) -+{ -+ blkg_stat_add(&bfqg->stats.sectors, bytes >> 9); -+ blkg_rwstat_add(&bfqg->stats.serviced, rw, 1); -+ blkg_rwstat_add(&bfqg->stats.service_bytes, rw, bytes); -+} -+ -+static void bfqg_stats_update_completion(struct bfq_group *bfqg, -+ uint64_t start_time, uint64_t io_start_time, int rw) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ unsigned long long now = sched_clock(); -+ -+ if (time_after64(now, io_start_time)) -+ blkg_rwstat_add(&stats->service_time, rw, now - io_start_time); -+ if (time_after64(io_start_time, start_time)) -+ blkg_rwstat_add(&stats->wait_time, rw, -+ io_start_time - start_time); -+} -+ -+/* @stats = 0 */ -+static void bfqg_stats_reset(struct bfqg_stats *stats) -+{ -+ if (!stats) -+ return; -+ -+ /* queued stats shouldn't be cleared */ -+ blkg_rwstat_reset(&stats->service_bytes); -+ blkg_rwstat_reset(&stats->serviced); -+ blkg_rwstat_reset(&stats->merged); -+ blkg_rwstat_reset(&stats->service_time); -+ blkg_rwstat_reset(&stats->wait_time); -+ blkg_stat_reset(&stats->time); -+ blkg_stat_reset(&stats->unaccounted_time); -+ blkg_stat_reset(&stats->avg_queue_size_sum); -+ blkg_stat_reset(&stats->avg_queue_size_samples); -+ blkg_stat_reset(&stats->dequeue); -+ blkg_stat_reset(&stats->group_wait_time); -+ blkg_stat_reset(&stats->idle_time); -+ blkg_stat_reset(&stats->empty_time); -+} -+ -+/* @to += @from */ -+static void bfqg_stats_merge(struct bfqg_stats *to, struct bfqg_stats *from) -+{ -+ if (!to || !from) -+ return; -+ -+ /* queued stats shouldn't be cleared */ -+ blkg_rwstat_add_aux(&to->service_bytes, &from->service_bytes); -+ blkg_rwstat_add_aux(&to->serviced, &from->serviced); -+ blkg_rwstat_add_aux(&to->merged, &from->merged); -+ blkg_rwstat_add_aux(&to->service_time, &from->service_time); -+ blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); -+ blkg_stat_add_aux(&from->time, &from->time); -+ blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time); -+ blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); -+ blkg_stat_add_aux(&to->avg_queue_size_samples, -+ &from->avg_queue_size_samples); -+ blkg_stat_add_aux(&to->dequeue, &from->dequeue); -+ blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time); -+ blkg_stat_add_aux(&to->idle_time, &from->idle_time); -+ blkg_stat_add_aux(&to->empty_time, &from->empty_time); -+} -+ -+/* -+ * Transfer @bfqg's stats to its parent's dead_stats so that the ancestors' -+ * recursive stats can 
still account for the amount used by this bfqg after -+ * it's gone. -+ */ -+static void bfqg_stats_xfer_dead(struct bfq_group *bfqg) -+{ -+ struct bfq_group *parent; -+ -+ if (!bfqg) /* root_group */ -+ return; -+ -+ parent = bfqg_parent(bfqg); -+ -+ lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock); -+ -+ if (unlikely(!parent)) -+ return; -+ -+ bfqg_stats_merge(&parent->dead_stats, &bfqg->stats); -+ bfqg_stats_merge(&parent->dead_stats, &bfqg->dead_stats); -+ bfqg_stats_reset(&bfqg->stats); -+ bfqg_stats_reset(&bfqg->dead_stats); -+} -+ -+static void bfq_init_entity(struct bfq_entity *entity, -+ struct bfq_group *bfqg) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ entity->weight = entity->new_weight; -+ entity->orig_weight = entity->new_weight; -+ if (bfqq) { -+ bfqq->ioprio = bfqq->new_ioprio; -+ bfqq->ioprio_class = bfqq->new_ioprio_class; -+ bfqg_get(bfqg); -+ } -+ entity->parent = bfqg->my_entity; -+ entity->sched_data = &bfqg->sched_data; -+} -+ -+static void bfqg_stats_exit(struct bfqg_stats *stats) -+{ -+ blkg_rwstat_exit(&stats->service_bytes); -+ blkg_rwstat_exit(&stats->serviced); -+ blkg_rwstat_exit(&stats->merged); -+ blkg_rwstat_exit(&stats->service_time); -+ blkg_rwstat_exit(&stats->wait_time); -+ blkg_rwstat_exit(&stats->queued); -+ blkg_stat_exit(&stats->sectors); -+ blkg_stat_exit(&stats->time); -+ blkg_stat_exit(&stats->unaccounted_time); -+ blkg_stat_exit(&stats->avg_queue_size_sum); -+ blkg_stat_exit(&stats->avg_queue_size_samples); -+ blkg_stat_exit(&stats->dequeue); -+ blkg_stat_exit(&stats->group_wait_time); -+ blkg_stat_exit(&stats->idle_time); -+ blkg_stat_exit(&stats->empty_time); -+} -+ -+static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) -+{ -+ if (blkg_rwstat_init(&stats->service_bytes, gfp) || -+ blkg_rwstat_init(&stats->serviced, gfp) || -+ blkg_rwstat_init(&stats->merged, gfp) || -+ blkg_rwstat_init(&stats->service_time, gfp) || -+ blkg_rwstat_init(&stats->wait_time, gfp) || -+ blkg_rwstat_init(&stats->queued, gfp) || -+ blkg_stat_init(&stats->sectors, gfp) || -+ blkg_stat_init(&stats->time, gfp) || -+ blkg_stat_init(&stats->unaccounted_time, gfp) || -+ blkg_stat_init(&stats->avg_queue_size_sum, gfp) || -+ blkg_stat_init(&stats->avg_queue_size_samples, gfp) || -+ blkg_stat_init(&stats->dequeue, gfp) || -+ blkg_stat_init(&stats->group_wait_time, gfp) || -+ blkg_stat_init(&stats->idle_time, gfp) || -+ blkg_stat_init(&stats->empty_time, gfp)) { -+ bfqg_stats_exit(stats); -+ return -ENOMEM; -+ } -+ -+ return 0; -+} -+ -+static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd) -+{ -+ return cpd ? 
container_of(cpd, struct bfq_group_data, pd) : NULL; -+} -+ -+static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg) -+{ -+ return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq)); -+} -+ -+static void bfq_cpd_init(struct blkcg_policy_data *cpd) -+{ -+ struct bfq_group_data *d = cpd_to_bfqgd(cpd); -+ -+ d->weight = BFQ_DEFAULT_GRP_WEIGHT; -+} -+ -+static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) -+{ -+ struct bfq_group *bfqg; -+ -+ bfqg = kzalloc_node(sizeof(*bfqg), gfp, node); -+ if (!bfqg) -+ return NULL; -+ -+ if (bfqg_stats_init(&bfqg->stats, gfp) || -+ bfqg_stats_init(&bfqg->dead_stats, gfp)) { -+ kfree(bfqg); -+ return NULL; -+ } -+ -+ return &bfqg->pd; -+} -+ -+static void bfq_group_set_parent(struct bfq_group *bfqg, -+ struct bfq_group *parent) -+{ -+ struct bfq_entity *entity; -+ -+ BUG_ON(!parent); -+ BUG_ON(!bfqg); -+ BUG_ON(bfqg == parent); -+ -+ entity = &bfqg->entity; -+ entity->parent = parent->my_entity; -+ entity->sched_data = &parent->sched_data; -+} -+ -+static void bfq_pd_init(struct blkg_policy_data *pd) -+{ -+ struct blkcg_gq *blkg = pd_to_blkg(pd); -+ struct bfq_group *bfqg = blkg_to_bfqg(blkg); -+ struct bfq_data *bfqd = blkg->q->elevator->elevator_data; -+ struct bfq_entity *entity = &bfqg->entity; -+ struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg); -+ -+ entity->orig_weight = entity->weight = entity->new_weight = d->weight; -+ entity->my_sched_data = &bfqg->sched_data; -+ bfqg->my_entity = entity; /* -+ * the root_group's will be set to NULL -+ * in bfq_init_queue() -+ */ -+ bfqg->bfqd = bfqd; -+ bfqg->active_entities = 0; -+} -+ -+static void bfq_pd_free(struct blkg_policy_data *pd) -+{ -+ struct bfq_group *bfqg = pd_to_bfqg(pd); -+ -+ bfqg_stats_exit(&bfqg->stats); -+ bfqg_stats_exit(&bfqg->dead_stats); -+ -+ return kfree(bfqg); -+} -+ -+/* offset delta from bfqg->stats to bfqg->dead_stats */ -+static const int dead_stats_off_delta = offsetof(struct bfq_group, dead_stats) - -+ offsetof(struct bfq_group, stats); -+ -+/* to be used by recursive prfill, sums live and dead stats recursively */ -+static u64 bfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off) -+{ -+ u64 sum = 0; -+ -+ sum += blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off); -+ sum += blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, -+ off + dead_stats_off_delta); -+ return sum; -+} -+ -+/* to be used by recursive prfill, sums live and dead rwstats recursively */ -+static struct blkg_rwstat -+bfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd, int off) -+{ -+ struct blkg_rwstat a, b; -+ -+ a = blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off); -+ b = blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, -+ off + dead_stats_off_delta); -+ blkg_rwstat_add_aux(&a, &b); -+ return a; -+} -+ -+static void bfq_pd_reset_stats(struct blkg_policy_data *pd) -+{ -+ struct bfq_group *bfqg = pd_to_bfqg(pd); -+ -+ bfqg_stats_reset(&bfqg->stats); -+ bfqg_stats_reset(&bfqg->dead_stats); -+} -+ -+static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, -+ struct blkcg *blkcg) -+{ -+ struct request_queue *q = bfqd->queue; -+ struct bfq_group *bfqg = NULL, *parent; -+ struct bfq_entity *entity = NULL; -+ -+ assert_spin_locked(bfqd->queue->queue_lock); -+ -+ /* avoid lookup for the common case where there's no blkcg */ -+ if (blkcg == &blkcg_root) { -+ bfqg = bfqd->root_group; -+ } else { -+ struct blkcg_gq *blkg; -+ -+ blkg = blkg_lookup_create(blkcg, q); -+ if (!IS_ERR(blkg)) -+ bfqg = blkg_to_bfqg(blkg); 
-+ else /* fallback to root_group */ -+ bfqg = bfqd->root_group; -+ } -+ -+ BUG_ON(!bfqg); -+ -+ /* -+ * Update chain of bfq_groups as we might be handling a leaf group -+ * which, along with some of its relatives, has not been hooked yet -+ * to the private hierarchy of BFQ. -+ */ -+ entity = &bfqg->entity; -+ for_each_entity(entity) { -+ bfqg = container_of(entity, struct bfq_group, entity); -+ BUG_ON(!bfqg); -+ if (bfqg != bfqd->root_group) { -+ parent = bfqg_parent(bfqg); -+ if (!parent) -+ parent = bfqd->root_group; -+ BUG_ON(!parent); -+ bfq_group_set_parent(bfqg, parent); -+ } -+ } -+ -+ return bfqg; -+} -+ -+/** -+ * bfq_bfqq_move - migrate @bfqq to @bfqg. -+ * @bfqd: queue descriptor. -+ * @bfqq: the queue to move. -+ * @entity: @bfqq's entity. -+ * @bfqg: the group to move to. -+ * -+ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating -+ * it on the new one. Avoid putting the entity on the old group idle tree. -+ * -+ * Must be called under the queue lock; the cgroup owning @bfqg must -+ * not disappear (by now this just means that we are called under -+ * rcu_read_lock()). -+ */ -+static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct bfq_entity *entity, struct bfq_group *bfqg) -+{ -+ int busy, resume; -+ -+ busy = bfq_bfqq_busy(bfqq); -+ resume = !RB_EMPTY_ROOT(&bfqq->sort_list); -+ -+ BUG_ON(resume && !entity->on_st); -+ BUG_ON(busy && !resume && entity->on_st && -+ bfqq != bfqd->in_service_queue); -+ -+ if (busy) { -+ BUG_ON(atomic_read(&bfqq->ref) < 2); -+ -+ if (!resume) -+ bfq_del_bfqq_busy(bfqd, bfqq, 0); -+ else -+ bfq_deactivate_bfqq(bfqd, bfqq, 0); -+ } else if (entity->on_st) -+ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); -+ bfqg_put(bfqq_group(bfqq)); -+ -+ /* -+ * Here we use a reference to bfqg. We don't need a refcounter -+ * as the cgroup reference will not be dropped, so that its -+ * destroy() callback will not be invoked. -+ */ -+ entity->parent = bfqg->my_entity; -+ entity->sched_data = &bfqg->sched_data; -+ bfqg_get(bfqg); -+ -+ if (busy) { -+ if (resume) -+ bfq_activate_bfqq(bfqd, bfqq); -+ } -+ -+ if (!bfqd->in_service_queue && !bfqd->rq_in_driver) -+ bfq_schedule_dispatch(bfqd); -+} -+ -+/** -+ * __bfq_bic_change_cgroup - move @bic to @cgroup. -+ * @bfqd: the queue descriptor. -+ * @bic: the bic to move. -+ * @blkcg: the blk-cgroup to move to. -+ * -+ * Move bic to blkcg, assuming that bfqd->queue is locked; the caller -+ * has to make sure that the reference to cgroup is valid across the call. -+ * -+ * NOTE: an alternative approach might have been to store the current -+ * cgroup in bfqq and getting a reference to it, reducing the lookup -+ * time here, at the price of slightly more complex code. 
-+ */ -+static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, -+ struct bfq_io_cq *bic, -+ struct blkcg *blkcg) -+{ -+ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0); -+ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1); -+ struct bfq_group *bfqg; -+ struct bfq_entity *entity; -+ -+ lockdep_assert_held(bfqd->queue->queue_lock); -+ -+ bfqg = bfq_find_alloc_group(bfqd, blkcg); -+ if (async_bfqq) { -+ entity = &async_bfqq->entity; -+ -+ if (entity->sched_data != &bfqg->sched_data) { -+ bic_set_bfqq(bic, NULL, 0); -+ bfq_log_bfqq(bfqd, async_bfqq, -+ "bic_change_group: %p %d", -+ async_bfqq, atomic_read(&async_bfqq->ref)); -+ bfq_put_queue(async_bfqq); -+ } -+ } -+ -+ if (sync_bfqq) { -+ entity = &sync_bfqq->entity; -+ if (entity->sched_data != &bfqg->sched_data) -+ bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg); -+ } -+ -+ return bfqg; -+} -+ -+static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) -+{ -+ struct bfq_data *bfqd = bic_to_bfqd(bic); -+ struct blkcg *blkcg; -+ struct bfq_group *bfqg = NULL; -+ uint64_t id; -+ -+ rcu_read_lock(); -+ blkcg = bio_blkcg(bio); -+ id = blkcg->css.serial_nr; -+ rcu_read_unlock(); -+ -+ /* -+ * Check whether blkcg has changed. The condition may trigger -+ * spuriously on a newly created cic but there's no harm. -+ */ -+ if (unlikely(!bfqd) || likely(bic->blkcg_id == id)) -+ return; -+ -+ bfqg = __bfq_bic_change_cgroup(bfqd, bic, blkcg); -+ BUG_ON(!bfqg); -+ bic->blkcg_id = id; -+} -+ -+/** -+ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st. -+ * @st: the service tree being flushed. -+ */ -+static void bfq_flush_idle_tree(struct bfq_service_tree *st) -+{ -+ struct bfq_entity *entity = st->first_idle; -+ -+ for (; entity ; entity = st->first_idle) -+ __bfq_deactivate_entity(entity, 0); -+} -+ -+/** -+ * bfq_reparent_leaf_entity - move leaf entity to the root_group. -+ * @bfqd: the device data structure with the root group. -+ * @entity: the entity to move. -+ */ -+static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, -+ struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ BUG_ON(!bfqq); -+ bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group); -+} -+ -+/** -+ * bfq_reparent_active_entities - move to the root group all active -+ * entities. -+ * @bfqd: the device data structure with the root group. -+ * @bfqg: the group to move from. -+ * @st: the service tree with the entities. -+ * -+ * Needs queue_lock to be taken and reference to be valid over the call. -+ */ -+static void bfq_reparent_active_entities(struct bfq_data *bfqd, -+ struct bfq_group *bfqg, -+ struct bfq_service_tree *st) -+{ -+ struct rb_root *active = &st->active; -+ struct bfq_entity *entity = NULL; -+ -+ if (!RB_EMPTY_ROOT(&st->active)) -+ entity = bfq_entity_of(rb_first(active)); -+ -+ for (; entity ; entity = bfq_entity_of(rb_first(active))) -+ bfq_reparent_leaf_entity(bfqd, entity); -+ -+ if (bfqg->sched_data.in_service_entity) -+ bfq_reparent_leaf_entity(bfqd, -+ bfqg->sched_data.in_service_entity); -+} -+ -+/** -+ * bfq_destroy_group - destroy @bfqg. -+ * @bfqg: the group being destroyed. -+ * -+ * Destroy @bfqg, making sure that it is not referenced from its parent. 
-+ * blkio already grabs the queue_lock for us, so no need to use RCU-based magic -+ */ -+static void bfq_pd_offline(struct blkg_policy_data *pd) -+{ -+ struct bfq_service_tree *st; -+ struct bfq_group *bfqg; -+ struct bfq_data *bfqd; -+ struct bfq_entity *entity; -+ int i; -+ -+ BUG_ON(!pd); -+ bfqg = pd_to_bfqg(pd); -+ BUG_ON(!bfqg); -+ bfqd = bfqg->bfqd; -+ BUG_ON(bfqd && !bfqd->root_group); -+ -+ entity = bfqg->my_entity; -+ -+ if (!entity) /* root group */ -+ return; -+ -+ /* -+ * Empty all service_trees belonging to this group before -+ * deactivating the group itself. -+ */ -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { -+ BUG_ON(!bfqg->sched_data.service_tree); -+ st = bfqg->sched_data.service_tree + i; -+ /* -+ * The idle tree may still contain bfq_queues belonging -+ * to exited task because they never migrated to a different -+ * cgroup from the one being destroyed now. No one else -+ * can access them so it's safe to act without any lock. -+ */ -+ bfq_flush_idle_tree(st); -+ -+ /* -+ * It may happen that some queues are still active -+ * (busy) upon group destruction (if the corresponding -+ * processes have been forced to terminate). We move -+ * all the leaf entities corresponding to these queues -+ * to the root_group. -+ * Also, it may happen that the group has an entity -+ * in service, which is disconnected from the active -+ * tree: it must be moved, too. -+ * There is no need to put the sync queues, as the -+ * scheduler has taken no reference. -+ */ -+ bfq_reparent_active_entities(bfqd, bfqg, st); -+ BUG_ON(!RB_EMPTY_ROOT(&st->active)); -+ BUG_ON(!RB_EMPTY_ROOT(&st->idle)); -+ } -+ BUG_ON(bfqg->sched_data.next_in_service); -+ BUG_ON(bfqg->sched_data.in_service_entity); -+ -+ __bfq_deactivate_entity(entity, 0); -+ bfq_put_async_queues(bfqd, bfqg); -+ BUG_ON(entity->tree); -+ -+ bfqg_stats_xfer_dead(bfqg); -+} -+ -+static void bfq_end_wr_async(struct bfq_data *bfqd) -+{ -+ struct blkcg_gq *blkg; -+ -+ list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) { -+ struct bfq_group *bfqg = blkg_to_bfqg(blkg); -+ -+ bfq_end_wr_async_queues(bfqd, bfqg); -+ } -+ bfq_end_wr_async_queues(bfqd, bfqd->root_group); -+} -+ -+static u64 bfqio_cgroup_weight_read(struct cgroup_subsys_state *css, -+ struct cftype *cftype) -+{ -+ struct blkcg *blkcg = css_to_blkcg(css); -+ struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); -+ int ret = -EINVAL; -+ -+ spin_lock_irq(&blkcg->lock); -+ ret = bfqgd->weight; -+ spin_unlock_irq(&blkcg->lock); -+ -+ return ret; -+} -+ -+static int bfqio_cgroup_weight_read_dfl(struct seq_file *sf, void *v) -+{ -+ struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); -+ struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); -+ -+ spin_lock_irq(&blkcg->lock); -+ seq_printf(sf, "%u\n", bfqgd->weight); -+ spin_unlock_irq(&blkcg->lock); -+ -+ return 0; -+} -+ -+static int bfqio_cgroup_weight_write(struct cgroup_subsys_state *css, -+ struct cftype *cftype, -+ u64 val) -+{ -+ struct blkcg *blkcg = css_to_blkcg(css); -+ struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); -+ struct blkcg_gq *blkg; -+ int ret = -EINVAL; -+ -+ if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT) -+ return ret; -+ -+ ret = 0; -+ spin_lock_irq(&blkcg->lock); -+ bfqgd->weight = (unsigned short)val; -+ hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { -+ struct bfq_group *bfqg = blkg_to_bfqg(blkg); -+ -+ if (!bfqg) -+ continue; -+ /* -+ * Setting the prio_changed flag of the entity -+ * to 1 with new_weight == weight would re-set -+ * the value of the weight to its ioprio mapping. 
-+ * Set the flag only if necessary. -+ */ -+ if ((unsigned short)val != bfqg->entity.new_weight) { -+ bfqg->entity.new_weight = (unsigned short)val; -+ /* -+ * Make sure that the above new value has been -+ * stored in bfqg->entity.new_weight before -+ * setting the prio_changed flag. In fact, -+ * this flag may be read asynchronously (in -+ * critical sections protected by a different -+ * lock than that held here), and finding this -+ * flag set may cause the execution of the code -+ * for updating parameters whose value may -+ * depend also on bfqg->entity.new_weight (in -+ * __bfq_entity_update_weight_prio). -+ * This barrier makes sure that the new value -+ * of bfqg->entity.new_weight is correctly -+ * seen in that code. -+ */ -+ smp_wmb(); -+ bfqg->entity.prio_changed = 1; -+ } -+ } -+ spin_unlock_irq(&blkcg->lock); -+ -+ return ret; -+} -+ -+static ssize_t bfqio_cgroup_weight_write_dfl(struct kernfs_open_file *of, -+ char *buf, size_t nbytes, -+ loff_t off) -+{ -+ /* First unsigned long found in the file is used */ -+ return bfqio_cgroup_weight_write(of_css(of), NULL, -+ simple_strtoull(strim(buf), NULL, 0)); -+} -+ -+static int bfqg_print_stat(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, -+ &blkcg_policy_bfq, seq_cft(sf)->private, false); -+ return 0; -+} -+ -+static int bfqg_print_rwstat(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, -+ &blkcg_policy_bfq, seq_cft(sf)->private, true); -+ return 0; -+} -+ -+static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, -+ struct blkg_policy_data *pd, int off) -+{ -+ u64 sum = bfqg_stat_pd_recursive_sum(pd, off); -+ -+ return __blkg_prfill_u64(sf, pd, sum); -+} -+ -+static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, -+ struct blkg_policy_data *pd, int off) -+{ -+ struct blkg_rwstat sum = bfqg_rwstat_pd_recursive_sum(pd, off); -+ -+ return __blkg_prfill_rwstat(sf, pd, &sum); -+} -+ -+static int bfqg_print_stat_recursive(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), -+ bfqg_prfill_stat_recursive, &blkcg_policy_bfq, -+ seq_cft(sf)->private, false); -+ return 0; -+} -+ -+static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), -+ bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq, -+ seq_cft(sf)->private, true); -+ return 0; -+} -+ -+static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf, -+ struct blkg_policy_data *pd, int off) -+{ -+ struct bfq_group *bfqg = pd_to_bfqg(pd); -+ u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples); -+ u64 v = 0; -+ -+ if (samples) { -+ v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum); -+ v = div64_u64(v, samples); -+ } -+ __blkg_prfill_u64(sf, pd, v); -+ return 0; -+} -+ -+/* print avg_queue_size */ -+static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), -+ bfqg_prfill_avg_queue_size, &blkcg_policy_bfq, -+ 0, false); -+ return 0; -+} -+ -+static struct bfq_group * -+bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) -+{ -+ int ret; -+ -+ ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq); -+ if (ret) -+ return NULL; -+ -+ return blkg_to_bfqg(bfqd->queue->root_blkg); -+} -+ -+static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) -+{ -+ struct bfq_group_data *bgd; -+ -+ bgd = kzalloc(sizeof(*bgd), GFP_KERNEL); -+ if (!bgd) -+ return NULL; -+ return &bgd->pd; -+} -+ 
-+static void bfq_cpd_free(struct blkcg_policy_data *cpd) -+{ -+ kfree(cpd_to_bfqgd(cpd)); -+} -+ -+static struct cftype bfqio_files_dfl[] = { -+ { -+ .name = "weight", -+ .flags = CFTYPE_NOT_ON_ROOT, -+ .seq_show = bfqio_cgroup_weight_read_dfl, -+ .write = bfqio_cgroup_weight_write_dfl, -+ }, -+ {} /* terminate */ -+}; -+ -+static struct cftype bfqio_files[] = { -+ { -+ .name = "bfq.weight", -+ .read_u64 = bfqio_cgroup_weight_read, -+ .write_u64 = bfqio_cgroup_weight_write, -+ }, -+ /* statistics, cover only the tasks in the bfqg */ -+ { -+ .name = "bfq.time", -+ .private = offsetof(struct bfq_group, stats.time), -+ .seq_show = bfqg_print_stat, -+ }, -+ { -+ .name = "bfq.sectors", -+ .private = offsetof(struct bfq_group, stats.sectors), -+ .seq_show = bfqg_print_stat, -+ }, -+ { -+ .name = "bfq.io_service_bytes", -+ .private = offsetof(struct bfq_group, stats.service_bytes), -+ .seq_show = bfqg_print_rwstat, -+ }, -+ { -+ .name = "bfq.io_serviced", -+ .private = offsetof(struct bfq_group, stats.serviced), -+ .seq_show = bfqg_print_rwstat, -+ }, -+ { -+ .name = "bfq.io_service_time", -+ .private = offsetof(struct bfq_group, stats.service_time), -+ .seq_show = bfqg_print_rwstat, -+ }, -+ { -+ .name = "bfq.io_wait_time", -+ .private = offsetof(struct bfq_group, stats.wait_time), -+ .seq_show = bfqg_print_rwstat, -+ }, -+ { -+ .name = "bfq.io_merged", -+ .private = offsetof(struct bfq_group, stats.merged), -+ .seq_show = bfqg_print_rwstat, -+ }, -+ { -+ .name = "bfq.io_queued", -+ .private = offsetof(struct bfq_group, stats.queued), -+ .seq_show = bfqg_print_rwstat, -+ }, -+ -+ /* the same statictics which cover the bfqg and its descendants */ -+ { -+ .name = "bfq.time_recursive", -+ .private = offsetof(struct bfq_group, stats.time), -+ .seq_show = bfqg_print_stat_recursive, -+ }, -+ { -+ .name = "bfq.sectors_recursive", -+ .private = offsetof(struct bfq_group, stats.sectors), -+ .seq_show = bfqg_print_stat_recursive, -+ }, -+ { -+ .name = "bfq.io_service_bytes_recursive", -+ .private = offsetof(struct bfq_group, stats.service_bytes), -+ .seq_show = bfqg_print_rwstat_recursive, -+ }, -+ { -+ .name = "bfq.io_serviced_recursive", -+ .private = offsetof(struct bfq_group, stats.serviced), -+ .seq_show = bfqg_print_rwstat_recursive, -+ }, -+ { -+ .name = "bfq.io_service_time_recursive", -+ .private = offsetof(struct bfq_group, stats.service_time), -+ .seq_show = bfqg_print_rwstat_recursive, -+ }, -+ { -+ .name = "bfq.io_wait_time_recursive", -+ .private = offsetof(struct bfq_group, stats.wait_time), -+ .seq_show = bfqg_print_rwstat_recursive, -+ }, -+ { -+ .name = "bfq.io_merged_recursive", -+ .private = offsetof(struct bfq_group, stats.merged), -+ .seq_show = bfqg_print_rwstat_recursive, -+ }, -+ { -+ .name = "bfq.io_queued_recursive", -+ .private = offsetof(struct bfq_group, stats.queued), -+ .seq_show = bfqg_print_rwstat_recursive, -+ }, -+ { -+ .name = "bfq.avg_queue_size", -+ .seq_show = bfqg_print_avg_queue_size, -+ }, -+ { -+ .name = "bfq.group_wait_time", -+ .private = offsetof(struct bfq_group, stats.group_wait_time), -+ .seq_show = bfqg_print_stat, -+ }, -+ { -+ .name = "bfq.idle_time", -+ .private = offsetof(struct bfq_group, stats.idle_time), -+ .seq_show = bfqg_print_stat, -+ }, -+ { -+ .name = "bfq.empty_time", -+ .private = offsetof(struct bfq_group, stats.empty_time), -+ .seq_show = bfqg_print_stat, -+ }, -+ { -+ .name = "bfq.dequeue", -+ .private = offsetof(struct bfq_group, stats.dequeue), -+ .seq_show = bfqg_print_stat, -+ }, -+ { -+ .name = "bfq.unaccounted_time", -+ .private = 
offsetof(struct bfq_group, stats.unaccounted_time), -+ .seq_show = bfqg_print_stat, -+ }, -+ { } /* terminate */ -+}; -+ -+static struct blkcg_policy blkcg_policy_bfq = { -+ .dfl_cftypes = bfqio_files_dfl, -+ .legacy_cftypes = bfqio_files, -+ -+ .pd_alloc_fn = bfq_pd_alloc, -+ .pd_init_fn = bfq_pd_init, -+ .pd_offline_fn = bfq_pd_offline, -+ .pd_free_fn = bfq_pd_free, -+ .pd_reset_stats_fn = bfq_pd_reset_stats, -+ -+ .cpd_alloc_fn = bfq_cpd_alloc, -+ .cpd_init_fn = bfq_cpd_init, -+ .cpd_bind_fn = bfq_cpd_init, -+ .cpd_free_fn = bfq_cpd_free, -+}; -+ -+#else -+ -+static void bfq_init_entity(struct bfq_entity *entity, -+ struct bfq_group *bfqg) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ entity->weight = entity->new_weight; -+ entity->orig_weight = entity->new_weight; -+ if (bfqq) { -+ bfqq->ioprio = bfqq->new_ioprio; -+ bfqq->ioprio_class = bfqq->new_ioprio_class; -+ } -+ entity->sched_data = &bfqg->sched_data; -+} -+ -+static struct bfq_group * -+bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) -+{ -+ struct bfq_data *bfqd = bic_to_bfqd(bic); -+ -+ return bfqd->root_group; -+} -+ -+static void bfq_bfqq_move(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ struct bfq_entity *entity, -+ struct bfq_group *bfqg) -+{ -+} -+ -+static void bfq_end_wr_async(struct bfq_data *bfqd) -+{ -+ bfq_end_wr_async_queues(bfqd, bfqd->root_group); -+} -+ -+static void bfq_disconnect_groups(struct bfq_data *bfqd) -+{ -+ bfq_put_async_queues(bfqd, bfqd->root_group); -+} -+ -+static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, -+ struct blkcg *blkcg) -+{ -+ return bfqd->root_group; -+} -+ -+static struct bfq_group * -+bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) -+{ -+ struct bfq_group *bfqg; -+ int i; -+ -+ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); -+ if (!bfqg) -+ return NULL; -+ -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) -+ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; -+ -+ return bfqg; -+} -+#endif -diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c -new file mode 100644 -index 0000000..fb7bb8f ---- /dev/null -+++ b/block/bfq-ioc.c -@@ -0,0 +1,36 @@ -+/* -+ * BFQ: I/O context handling. -+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe -+ * -+ * Copyright (C) 2008 Fabio Checconi -+ * Paolo Valente -+ * -+ * Copyright (C) 2010 Paolo Valente -+ */ -+ -+/** -+ * icq_to_bic - convert iocontext queue structure to bfq_io_cq. -+ * @icq: the iocontext queue. -+ */ -+static struct bfq_io_cq *icq_to_bic(struct io_cq *icq) -+{ -+ /* bic->icq is the first member, %NULL will convert to %NULL */ -+ return container_of(icq, struct bfq_io_cq, icq); -+} -+ -+/** -+ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd. -+ * @bfqd: the lookup key. -+ * @ioc: the io_context of the process doing I/O. -+ * -+ * Queue lock must be held. -+ */ -+static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, -+ struct io_context *ioc) -+{ -+ if (ioc) -+ return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue)); -+ return NULL; -+} -diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c -new file mode 100644 -index 0000000..85e2169 ---- /dev/null -+++ b/block/bfq-iosched.c -@@ -0,0 +1,3763 @@ -+/* -+ * Budget Fair Queueing (BFQ) disk scheduler. 
-+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe -+ * -+ * Copyright (C) 2008 Fabio Checconi -+ * Paolo Valente -+ * -+ * Copyright (C) 2010 Paolo Valente -+ * -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ -+ * file. -+ * -+ * BFQ is a proportional-share storage-I/O scheduling algorithm based on -+ * the slice-by-slice service scheme of CFQ. But BFQ assigns budgets, -+ * measured in number of sectors, to processes instead of time slices. The -+ * device is not granted to the in-service process for a given time slice, -+ * but until it has exhausted its assigned budget. This change from the time -+ * to the service domain allows BFQ to distribute the device throughput -+ * among processes as desired, without any distortion due to ZBR, workload -+ * fluctuations or other factors. BFQ uses an ad hoc internal scheduler, -+ * called B-WF2Q+, to schedule processes according to their budgets. More -+ * precisely, BFQ schedules queues associated to processes. Thanks to the -+ * accurate policy of B-WF2Q+, BFQ can afford to assign high budgets to -+ * I/O-bound processes issuing sequential requests (to boost the -+ * throughput), and yet guarantee a low latency to interactive and soft -+ * real-time applications. -+ * -+ * BFQ is described in [1], where also a reference to the initial, more -+ * theoretical paper on BFQ can be found. The interested reader can find -+ * in the latter paper full details on the main algorithm, as well as -+ * formulas of the guarantees and formal proofs of all the properties. -+ * With respect to the version of BFQ presented in these papers, this -+ * implementation adds a few more heuristics, such as the one that -+ * guarantees a low latency to soft real-time applications, and a -+ * hierarchical extension based on H-WF2Q+. -+ * -+ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with -+ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N) -+ * complexity derives from the one introduced with EEVDF in [3]. -+ * -+ * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness -+ * with the BFQ Disk I/O Scheduler'', -+ * Proceedings of the 5th Annual International Systems and Storage -+ * Conference (SYSTOR '12), June 2012. -+ * -+ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf -+ * -+ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing -+ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689, -+ * Oct 1997. -+ * -+ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz -+ * -+ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline -+ * First: A Flexible and Accurate Mechanism for Proportional Share -+ * Resource Allocation,'' technical report. -+ * -+ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf -+ */ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "bfq.h" -+#include "blk.h" -+ -+/* Expiration time of sync (0) and async (1) requests, in jiffies. */ -+static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; -+ -+/* Maximum backwards seek, in KiB. */ -+static const int bfq_back_max = 16 * 1024; -+ -+/* Penalty of a backwards seek, in number of sectors. */ -+static const int bfq_back_penalty = 2; -+ -+/* Idling period duration, in jiffies. */ -+static int bfq_slice_idle = HZ / 125; -+ -+/* Minimum number of assigned budgets for which stats are safe to compute. 
*/ -+static const int bfq_stats_min_budgets = 194; -+ -+/* Default maximum budget values, in sectors and number of requests. */ -+static const int bfq_default_max_budget = 16 * 1024; -+static const int bfq_max_budget_async_rq = 4; -+ -+/* -+ * Async to sync throughput distribution is controlled as follows: -+ * when an async request is served, the entity is charged the number -+ * of sectors of the request, multiplied by the factor below -+ */ -+static const int bfq_async_charge_factor = 10; -+ -+/* Default timeout values, in jiffies, approximating CFQ defaults. */ -+static const int bfq_timeout_sync = HZ / 8; -+static int bfq_timeout_async = HZ / 25; -+ -+struct kmem_cache *bfq_pool; -+ -+/* Below this threshold (in ms), we consider thinktime immediate. */ -+#define BFQ_MIN_TT 2 -+ -+/* hw_tag detection: parallel requests threshold and min samples needed. */ -+#define BFQ_HW_QUEUE_THRESHOLD 4 -+#define BFQ_HW_QUEUE_SAMPLES 32 -+ -+#define BFQQ_SEEK_THR (sector_t)(8 * 1024) -+#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR) -+ -+/* Min samples used for peak rate estimation (for autotuning). */ -+#define BFQ_PEAK_RATE_SAMPLES 32 -+ -+/* Shift used for peak rate fixed precision calculations. */ -+#define BFQ_RATE_SHIFT 16 -+ -+/* -+ * By default, BFQ computes the duration of the weight raising for -+ * interactive applications automatically, using the following formula: -+ * duration = (R / r) * T, where r is the peak rate of the device, and -+ * R and T are two reference parameters. -+ * In particular, R is the peak rate of the reference device (see below), -+ * and T is a reference time: given the systems that are likely to be -+ * installed on the reference device according to its speed class, T is -+ * about the maximum time needed, under BFQ and while reading two files in -+ * parallel, to load typical large applications on these systems. -+ * In practice, the slower/faster the device at hand is, the more/less it -+ * takes to load applications with respect to the reference device. -+ * Accordingly, the longer/shorter BFQ grants weight raising to interactive -+ * applications. -+ * -+ * BFQ uses four different reference pairs (R, T), depending on: -+ * . whether the device is rotational or non-rotational; -+ * . whether the device is slow, such as old or portable HDDs, as well as -+ * SD cards, or fast, such as newer HDDs and SSDs. -+ * -+ * The device's speed class is dynamically (re)detected in -+ * bfq_update_peak_rate() every time the estimated peak rate is updated. -+ * -+ * In the following definitions, R_slow[0]/R_fast[0] and T_slow[0]/T_fast[0] -+ * are the reference values for a slow/fast rotational device, whereas -+ * R_slow[1]/R_fast[1] and T_slow[1]/T_fast[1] are the reference values for -+ * a slow/fast non-rotational device. Finally, device_speed_thresh are the -+ * thresholds used to switch between speed classes. -+ * Both the reference peak rates and the thresholds are measured in -+ * sectors/usec, left-shifted by BFQ_RATE_SHIFT. -+ */ -+static int R_slow[2] = {1536, 10752}; -+static int R_fast[2] = {17415, 34791}; -+/* -+ * To improve readability, a conversion function is used to initialize the -+ * following arrays, which entails that they can be initialized only in a -+ * function. 
-+ */ -+static int T_slow[2]; -+static int T_fast[2]; -+static int device_speed_thresh[2]; -+ -+#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ -+ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) -+ -+#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0]) -+#define RQ_BFQQ(rq) ((rq)->elv.priv[1]) -+ -+static void bfq_schedule_dispatch(struct bfq_data *bfqd); -+ -+#include "bfq-ioc.c" -+#include "bfq-sched.c" -+#include "bfq-cgroup.c" -+ -+#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) -+#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT) -+ -+#define bfq_sample_valid(samples) ((samples) > 80) -+ -+/* -+ * We regard a request as SYNC, if either it's a read or has the SYNC bit -+ * set (in which case it could also be a direct WRITE). -+ */ -+static int bfq_bio_sync(struct bio *bio) -+{ -+ if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC)) -+ return 1; -+ -+ return 0; -+} -+ -+/* -+ * Scheduler run of queue, if there are requests pending and no one in the -+ * driver that will restart queueing. -+ */ -+static void bfq_schedule_dispatch(struct bfq_data *bfqd) -+{ -+ if (bfqd->queued != 0) { -+ bfq_log(bfqd, "schedule dispatch"); -+ kblockd_schedule_work(&bfqd->unplug_work); -+ } -+} -+ -+/* -+ * Lifted from AS - choose which of rq1 and rq2 that is best served now. -+ * We choose the request that is closesr to the head right now. Distance -+ * behind the head is penalized and only allowed to a certain extent. -+ */ -+static struct request *bfq_choose_req(struct bfq_data *bfqd, -+ struct request *rq1, -+ struct request *rq2, -+ sector_t last) -+{ -+ sector_t s1, s2, d1 = 0, d2 = 0; -+ unsigned long back_max; -+#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */ -+#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ -+ unsigned int wrap = 0; /* bit mask: requests behind the disk head? */ -+ -+ if (!rq1 || rq1 == rq2) -+ return rq2; -+ if (!rq2) -+ return rq1; -+ -+ if (rq_is_sync(rq1) && !rq_is_sync(rq2)) -+ return rq1; -+ else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) -+ return rq2; -+ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) -+ return rq1; -+ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META)) -+ return rq2; -+ -+ s1 = blk_rq_pos(rq1); -+ s2 = blk_rq_pos(rq2); -+ -+ /* -+ * By definition, 1KiB is 2 sectors. -+ */ -+ back_max = bfqd->bfq_back_max * 2; -+ -+ /* -+ * Strict one way elevator _except_ in the case where we allow -+ * short backward seeks which are biased as twice the cost of a -+ * similar forward seek. -+ */ -+ if (s1 >= last) -+ d1 = s1 - last; -+ else if (s1 + back_max >= last) -+ d1 = (last - s1) * bfqd->bfq_back_penalty; -+ else -+ wrap |= BFQ_RQ1_WRAP; -+ -+ if (s2 >= last) -+ d2 = s2 - last; -+ else if (s2 + back_max >= last) -+ d2 = (last - s2) * bfqd->bfq_back_penalty; -+ else -+ wrap |= BFQ_RQ2_WRAP; -+ -+ /* Found required data */ -+ -+ /* -+ * By doing switch() on the bit mask "wrap" we avoid having to -+ * check two variables for all permutations: --> faster! 
-+ */ -+ switch (wrap) { -+ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ -+ if (d1 < d2) -+ return rq1; -+ else if (d2 < d1) -+ return rq2; -+ -+ if (s1 >= s2) -+ return rq1; -+ else -+ return rq2; -+ -+ case BFQ_RQ2_WRAP: -+ return rq1; -+ case BFQ_RQ1_WRAP: -+ return rq2; -+ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */ -+ default: -+ /* -+ * Since both rqs are wrapped, -+ * start with the one that's further behind head -+ * (--> only *one* back seek required), -+ * since back seek takes more time than forward. -+ */ -+ if (s1 <= s2) -+ return rq1; -+ else -+ return rq2; -+ } -+} -+ -+/* -+ * Tell whether there are active queues or groups with differentiated weights. -+ */ -+static bool bfq_differentiated_weights(struct bfq_data *bfqd) -+{ -+ /* -+ * For weights to differ, at least one of the trees must contain -+ * at least two nodes. -+ */ -+ return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && -+ (bfqd->queue_weights_tree.rb_node->rb_left || -+ bfqd->queue_weights_tree.rb_node->rb_right) -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ ) || -+ (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) && -+ (bfqd->group_weights_tree.rb_node->rb_left || -+ bfqd->group_weights_tree.rb_node->rb_right) -+#endif -+ ); -+} -+ -+/* -+ * The following function returns true if every queue must receive the -+ * same share of the throughput (this condition is used when deciding -+ * whether idling may be disabled, see the comments in the function -+ * bfq_bfqq_may_idle()). -+ * -+ * Such a scenario occurs when: -+ * 1) all active queues have the same weight, -+ * 2) all active groups at the same level in the groups tree have the same -+ * weight, -+ * 3) all active groups at the same level in the groups tree have the same -+ * number of children. -+ * -+ * Unfortunately, keeping the necessary state for evaluating exactly the -+ * above symmetry conditions would be quite complex and time-consuming. -+ * Therefore this function evaluates, instead, the following stronger -+ * sub-conditions, for which it is much easier to maintain the needed -+ * state: -+ * 1) all active queues have the same weight, -+ * 2) all active groups have the same weight, -+ * 3) all active groups have at most one active child each. -+ * In particular, the last two conditions are always true if hierarchical -+ * support and the cgroups interface are not enabled, thus no state needs -+ * to be maintained in this case. -+ */ -+static bool bfq_symmetric_scenario(struct bfq_data *bfqd) -+{ -+ return -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ !bfqd->active_numerous_groups && -+#endif -+ !bfq_differentiated_weights(bfqd); -+} -+ -+/* -+ * If the weight-counter tree passed as input contains no counter for -+ * the weight of the input entity, then add that counter; otherwise just -+ * increment the existing counter. -+ * -+ * Note that weight-counter trees contain few nodes in mostly symmetric -+ * scenarios. For example, if all queues have the same weight, then the -+ * weight-counter tree for the queues may contain at most one node. -+ * This holds even if low_latency is on, because weight-raised queues -+ * are not inserted in the tree. -+ * In most scenarios, the rate at which nodes are created/destroyed -+ * should be low too. 
-+ */ -+static void bfq_weights_tree_add(struct bfq_data *bfqd, -+ struct bfq_entity *entity, -+ struct rb_root *root) -+{ -+ struct rb_node **new = &(root->rb_node), *parent = NULL; -+ -+ /* -+ * Do not insert if the entity is already associated with a -+ * counter, which happens if: -+ * 1) the entity is associated with a queue, -+ * 2) a request arrival has caused the queue to become both -+ * non-weight-raised, and hence change its weight, and -+ * backlogged; in this respect, each of the two events -+ * causes an invocation of this function, -+ * 3) this is the invocation of this function caused by the -+ * second event. This second invocation is actually useless, -+ * and we handle this fact by exiting immediately. More -+ * efficient or clearer solutions might possibly be adopted. -+ */ -+ if (entity->weight_counter) -+ return; -+ -+ while (*new) { -+ struct bfq_weight_counter *__counter = container_of(*new, -+ struct bfq_weight_counter, -+ weights_node); -+ parent = *new; -+ -+ if (entity->weight == __counter->weight) { -+ entity->weight_counter = __counter; -+ goto inc_counter; -+ } -+ if (entity->weight < __counter->weight) -+ new = &((*new)->rb_left); -+ else -+ new = &((*new)->rb_right); -+ } -+ -+ entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter), -+ GFP_ATOMIC); -+ entity->weight_counter->weight = entity->weight; -+ rb_link_node(&entity->weight_counter->weights_node, parent, new); -+ rb_insert_color(&entity->weight_counter->weights_node, root); -+ -+inc_counter: -+ entity->weight_counter->num_active++; -+} -+ -+/* -+ * Decrement the weight counter associated with the entity, and, if the -+ * counter reaches 0, remove the counter from the tree. -+ * See the comments to the function bfq_weights_tree_add() for considerations -+ * about overhead. -+ */ -+static void bfq_weights_tree_remove(struct bfq_data *bfqd, -+ struct bfq_entity *entity, -+ struct rb_root *root) -+{ -+ if (!entity->weight_counter) -+ return; -+ -+ BUG_ON(RB_EMPTY_ROOT(root)); -+ BUG_ON(entity->weight_counter->weight != entity->weight); -+ -+ BUG_ON(!entity->weight_counter->num_active); -+ entity->weight_counter->num_active--; -+ if (entity->weight_counter->num_active > 0) -+ goto reset_entity_pointer; -+ -+ rb_erase(&entity->weight_counter->weights_node, root); -+ kfree(entity->weight_counter); -+ -+reset_entity_pointer: -+ entity->weight_counter = NULL; -+} -+ -+static struct request *bfq_find_next_rq(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ struct request *last) -+{ -+ struct rb_node *rbnext = rb_next(&last->rb_node); -+ struct rb_node *rbprev = rb_prev(&last->rb_node); -+ struct request *next = NULL, *prev = NULL; -+ -+ BUG_ON(RB_EMPTY_NODE(&last->rb_node)); -+ -+ if (rbprev) -+ prev = rb_entry_rq(rbprev); -+ -+ if (rbnext) -+ next = rb_entry_rq(rbnext); -+ else { -+ rbnext = rb_first(&bfqq->sort_list); -+ if (rbnext && rbnext != &last->rb_node) -+ next = rb_entry_rq(rbnext); -+ } -+ -+ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last)); -+} -+ -+/* see the definition of bfq_async_charge_factor for details */ -+static unsigned long bfq_serv_to_charge(struct request *rq, -+ struct bfq_queue *bfqq) -+{ -+ return blk_rq_sectors(rq) * -+ (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->wr_coeff == 1) * -+ bfq_async_charge_factor)); -+} -+ -+/** -+ * bfq_updated_next_req - update the queue after a new next_rq selection. -+ * @bfqd: the device data the queue belongs to. -+ * @bfqq: the queue to update. 
-+ * -+ * If the first request of a queue changes we make sure that the queue -+ * has enough budget to serve at least its first request (if the -+ * request has grown). We do this because if the queue has not enough -+ * budget for its first request, it has to go through two dispatch -+ * rounds to actually get it dispatched. -+ */ -+static void bfq_updated_next_req(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); -+ struct request *next_rq = bfqq->next_rq; -+ unsigned long new_budget; -+ -+ if (!next_rq) -+ return; -+ -+ if (bfqq == bfqd->in_service_queue) -+ /* -+ * In order not to break guarantees, budgets cannot be -+ * changed after an entity has been selected. -+ */ -+ return; -+ -+ BUG_ON(entity->tree != &st->active); -+ BUG_ON(entity == entity->sched_data->in_service_entity); -+ -+ new_budget = max_t(unsigned long, bfqq->max_budget, -+ bfq_serv_to_charge(next_rq, bfqq)); -+ if (entity->budget != new_budget) { -+ entity->budget = new_budget; -+ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", -+ new_budget); -+ bfq_activate_bfqq(bfqd, bfqq); -+ } -+} -+ -+static unsigned int bfq_wr_duration(struct bfq_data *bfqd) -+{ -+ u64 dur; -+ -+ if (bfqd->bfq_wr_max_time > 0) -+ return bfqd->bfq_wr_max_time; -+ -+ dur = bfqd->RT_prod; -+ do_div(dur, bfqd->peak_rate); -+ -+ return dur; -+} -+ -+/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */ -+static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct bfq_queue *item; -+ struct hlist_node *n; -+ -+ hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node) -+ hlist_del_init(&item->burst_list_node); -+ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); -+ bfqd->burst_size = 1; -+} -+ -+/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */ -+static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ /* Increment burst size to take into account also bfqq */ -+ bfqd->burst_size++; -+ -+ if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) { -+ struct bfq_queue *pos, *bfqq_item; -+ struct hlist_node *n; -+ -+ /* -+ * Enough queues have been activated shortly after each -+ * other to consider this burst as large. -+ */ -+ bfqd->large_burst = true; -+ -+ /* -+ * We can now mark all queues in the burst list as -+ * belonging to a large burst. -+ */ -+ hlist_for_each_entry(bfqq_item, &bfqd->burst_list, -+ burst_list_node) -+ bfq_mark_bfqq_in_large_burst(bfqq_item); -+ bfq_mark_bfqq_in_large_burst(bfqq); -+ -+ /* -+ * From now on, and until the current burst finishes, any -+ * new queue being activated shortly after the last queue -+ * was inserted in the burst can be immediately marked as -+ * belonging to a large burst. So the burst list is not -+ * needed any more. Remove it. -+ */ -+ hlist_for_each_entry_safe(pos, n, &bfqd->burst_list, -+ burst_list_node) -+ hlist_del_init(&pos->burst_list_node); -+ } else /* burst not yet large: add bfqq to the burst list */ -+ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); -+} -+ -+/* -+ * If many queues happen to become active shortly after each other, then, -+ * to help the processes associated to these queues get their job done as -+ * soon as possible, it is usually better to not grant either weight-raising -+ * or device idling to these queues. 
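A short aside on bfq_wr_duration() above before the burst logic continues: unless the user pinned bfq_wr_max_time, the weight-raising period is the reference rate-times-duration product divided by the measured peak rate, so slower devices raise weights for longer. An illustrative sketch with made-up numbers:

#include <stdint.h>
#include <stdio.h>

/*
 * Illustrative numbers only: rt_prod plays the role of bfqd->RT_prod
 * (reference rate times reference duration, in the driver's units) and
 * peak_rate the role of the estimated device peak rate.
 */
static uint64_t wr_duration(uint64_t rt_prod, uint64_t peak_rate,
                            uint64_t user_max /* 0 = autotune */)
{
        if (user_max > 0)
                return user_max;
        return rt_prod / peak_rate;   /* do_div() in the kernel code */
}

int main(void)
{
        uint64_t rt_prod = 8000000;   /* made-up product of rate and time */

        /* The faster the device, the shorter the weight-raising period. */
        printf("slow device: wr lasts %llu ticks\n",
               (unsigned long long)wr_duration(rt_prod, 1000, 0));
        printf("fast device: wr lasts %llu ticks\n",
               (unsigned long long)wr_duration(rt_prod, 8000, 0));
        printf("user cap:    wr lasts %llu ticks\n",
               (unsigned long long)wr_duration(rt_prod, 8000, 500));
        return 0;
}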
In this comment we describe, firstly,
-+ * the reasons why this fact holds, and, secondly, the next function, which
-+ * implements the main steps needed to properly mark these queues so that
-+ * they can then be treated in a different way.
-+ *
-+ * As for the terminology, we say that a queue becomes active, i.e.,
-+ * switches from idle to backlogged, either when it is created (as a
-+ * consequence of the arrival of an I/O request), or, if already existing,
-+ * when a new request for the queue arrives while the queue is idle.
-+ * Bursts of activations, i.e., activations of different queues occurring
-+ * shortly after each other, are typically caused by services or applications
-+ * that spawn or reactivate many parallel threads/processes. Examples are
-+ * systemd during boot or git grep.
-+ *
-+ * These services or applications benefit mostly from a high throughput:
-+ * the quicker the requests of the activated queues are cumulatively served,
-+ * the sooner the target job of these queues gets completed. As a consequence,
-+ * weight-raising any of these queues, which also implies idling the device
-+ * for it, is almost always counterproductive: in most cases it just lowers
-+ * throughput.
-+ *
-+ * On the other hand, a burst of activations may also be caused by the start
-+ * of an application that does not consist of a lot of parallel I/O-bound
-+ * threads. In fact, with a complex application, the burst may be just a
-+ * consequence of the fact that several processes need to be executed to
-+ * start up the application. To start an application as quickly as possible,
-+ * the best thing to do is to privilege the I/O related to the application
-+ * with respect to all other I/O. Therefore, the best strategy to start an
-+ * application that causes a burst of activations as quickly as possible is
-+ * to weight-raise all the queues activated during the burst. This is the
-+ * exact opposite of the best strategy for the other type of bursts.
-+ *
-+ * In the end, to take the best action for each of the two cases, the two
-+ * types of bursts need to be distinguished. Fortunately, this seems
-+ * relatively easy to do, by looking at the sizes of the bursts. In
-+ * particular, we found a threshold such that bursts with a larger size
-+ * than that threshold are apparently caused only by services or commands
-+ * such as systemd or git grep. For brevity, hereafter we just call these
-+ * bursts 'large'. BFQ *does not* weight-raise queues whose activations occur
-+ * in a large burst. In addition, for each of these queues BFQ performs or
-+ * does not perform idling depending on which choice boosts the throughput
-+ * most. The exact choice depends on the device and request pattern at
-+ * hand.
-+ *
-+ * Turning back to the next function, it implements all the steps needed
-+ * to detect the occurrence of a large burst and to properly mark all the
-+ * queues belonging to it (so that they can then be treated in a different
-+ * way). This goal is achieved by maintaining a special "burst list" that
-+ * holds, temporarily, the queues that belong to the burst in progress. The
-+ * list is then used to mark these queues as belonging to a large burst if
-+ * the burst does become large. The main steps are the following.
-+ *
-+ * . when the very first queue is activated, the queue is inserted into the
-+ *   list (as it could be the first queue in a possible burst)
-+ *
-+ * . 
if the current burst has not yet become large, and a queue Q that does -+ * not yet belong to the burst is activated shortly after the last time -+ * at which a new queue entered the burst list, then the function appends -+ * Q to the burst list -+ * -+ * . if, as a consequence of the previous step, the burst size reaches -+ * the large-burst threshold, then -+ * -+ * . all the queues in the burst list are marked as belonging to a -+ * large burst -+ * -+ * . the burst list is deleted; in fact, the burst list already served -+ * its purpose (keeping temporarily track of the queues in a burst, -+ * so as to be able to mark them as belonging to a large burst in the -+ * previous sub-step), and now is not needed any more -+ * -+ * . the device enters a large-burst mode -+ * -+ * . if a queue Q that does not belong to the burst is activated while -+ * the device is in large-burst mode and shortly after the last time -+ * at which a queue either entered the burst list or was marked as -+ * belonging to the current large burst, then Q is immediately marked -+ * as belonging to a large burst. -+ * -+ * . if a queue Q that does not belong to the burst is activated a while -+ * later, i.e., not shortly after, than the last time at which a queue -+ * either entered the burst list or was marked as belonging to the -+ * current large burst, then the current burst is deemed as finished and: -+ * -+ * . the large-burst mode is reset if set -+ * -+ * . the burst list is emptied -+ * -+ * . Q is inserted in the burst list, as Q may be the first queue -+ * in a possible new burst (then the burst list contains just Q -+ * after this step). -+ */ -+static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ bool idle_for_long_time) -+{ -+ /* -+ * If bfqq happened to be activated in a burst, but has been idle -+ * for at least as long as an interactive queue, then we assume -+ * that, in the overall I/O initiated in the burst, the I/O -+ * associated to bfqq is finished. So bfqq does not need to be -+ * treated as a queue belonging to a burst anymore. Accordingly, -+ * we reset bfqq's in_large_burst flag if set, and remove bfqq -+ * from the burst list if it's there. We do not decrement instead -+ * burst_size, because the fact that bfqq does not need to belong -+ * to the burst list any more does not invalidate the fact that -+ * bfqq may have been activated during the current burst. -+ */ -+ if (idle_for_long_time) { -+ hlist_del_init(&bfqq->burst_list_node); -+ bfq_clear_bfqq_in_large_burst(bfqq); -+ } -+ -+ /* -+ * If bfqq is already in the burst list or is part of a large -+ * burst, then there is nothing else to do. -+ */ -+ if (!hlist_unhashed(&bfqq->burst_list_node) || -+ bfq_bfqq_in_large_burst(bfqq)) -+ return; -+ -+ /* -+ * If bfqq's activation happens late enough, then the current -+ * burst is finished, and related data structures must be reset. -+ * -+ * In this respect, consider the special case where bfqq is the very -+ * first queue being activated. In this case, last_ins_in_burst is -+ * not yet significant when we get here. But it is easy to verify -+ * that, whether or not the following condition is true, bfqq will -+ * end up being inserted into the burst list. In particular the -+ * list will happen to contain only bfqq. And this is exactly what -+ * has to happen, as bfqq may be the first queue in a possible -+ * burst. 
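The bookkeeping described above, and completed in the code that follows, amounts to a small state machine: a burst size, a large-burst flag and the time of the last insertion. A userspace sketch under those assumptions (jiffies become a plain counter, thresholds are made up, and the per-queue list and flag handling is omitted):

#include <stdbool.h>
#include <stdio.h>

#define LARGE_BURST_THRESH 4   /* stand-in for bfqd->bfq_large_burst_thresh */
#define BURST_INTERVAL     2   /* stand-in for bfqd->bfq_burst_interval     */

struct toy_sched {
        long last_ins;      /* "time" of the last insertion in the burst */
        int  burst_size;
        bool large_burst;
};

/* Returns true if the newly activated queue must be treated as part of
 * a large burst (i.e. neither weight-raised nor idled for). */
static bool handle_activation(struct toy_sched *s, long now)
{
        if (now - s->last_ins > BURST_INTERVAL) {
                /* Burst over: restart with a list containing only this queue. */
                s->large_burst = false;
                s->burst_size = 1;
                s->last_ins = now;
                return false;
        }

        s->last_ins = now;
        if (s->large_burst)
                return true;           /* mark immediately, list already gone */

        if (++s->burst_size >= LARGE_BURST_THRESH) {
                s->large_burst = true; /* mark everything in the list */
                return true;
        }
        return false;                  /* still a small burst */
}

int main(void)
{
        struct toy_sched s = { .last_ins = -100 };
        long times[] = { 0, 1, 2, 3, 4, 20 };

        for (unsigned i = 0; i < sizeof(times) / sizeof(times[0]); i++)
                printf("activation at t=%ld -> in large burst: %d\n",
                       times[i], handle_activation(&s, times[i]));
        return 0;
}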
-+ */ -+ if (time_is_before_jiffies(bfqd->last_ins_in_burst + -+ bfqd->bfq_burst_interval)) { -+ bfqd->large_burst = false; -+ bfq_reset_burst_list(bfqd, bfqq); -+ return; -+ } -+ -+ /* -+ * If we get here, then bfqq is being activated shortly after the -+ * last queue. So, if the current burst is also large, we can mark -+ * bfqq as belonging to this large burst immediately. -+ */ -+ if (bfqd->large_burst) { -+ bfq_mark_bfqq_in_large_burst(bfqq); -+ return; -+ } -+ -+ /* -+ * If we get here, then a large-burst state has not yet been -+ * reached, but bfqq is being activated shortly after the last -+ * queue. Then we add bfqq to the burst. -+ */ -+ bfq_add_to_burst(bfqd, bfqq); -+} -+ -+static void bfq_add_request(struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ struct bfq_entity *entity = &bfqq->entity; -+ struct bfq_data *bfqd = bfqq->bfqd; -+ struct request *next_rq, *prev; -+ unsigned long old_wr_coeff = bfqq->wr_coeff; -+ bool interactive = false; -+ -+ bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq)); -+ bfqq->queued[rq_is_sync(rq)]++; -+ bfqd->queued++; -+ -+ elv_rb_add(&bfqq->sort_list, rq); -+ -+ /* -+ * Check if this request is a better next-serve candidate. -+ */ -+ prev = bfqq->next_rq; -+ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); -+ BUG_ON(!next_rq); -+ bfqq->next_rq = next_rq; -+ -+ if (!bfq_bfqq_busy(bfqq)) { -+ bool soft_rt, in_burst, -+ idle_for_long_time = time_is_before_jiffies( -+ bfqq->budget_timeout + -+ bfqd->bfq_wr_min_idle_time); -+ -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, -+ rq->cmd_flags); -+#endif -+ if (bfq_bfqq_sync(bfqq)) { -+ bool already_in_burst = -+ !hlist_unhashed(&bfqq->burst_list_node) || -+ bfq_bfqq_in_large_burst(bfqq); -+ bfq_handle_burst(bfqd, bfqq, idle_for_long_time); -+ /* -+ * If bfqq was not already in the current burst, -+ * then, at this point, bfqq either has been -+ * added to the current burst or has caused the -+ * current burst to terminate. In particular, in -+ * the second case, bfqq has become the first -+ * queue in a possible new burst. -+ * In both cases last_ins_in_burst needs to be -+ * moved forward. -+ */ -+ if (!already_in_burst) -+ bfqd->last_ins_in_burst = jiffies; -+ } -+ -+ in_burst = bfq_bfqq_in_large_burst(bfqq); -+ soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && -+ !in_burst && -+ time_is_before_jiffies(bfqq->soft_rt_next_start); -+ interactive = !in_burst && idle_for_long_time; -+ entity->budget = max_t(unsigned long, bfqq->max_budget, -+ bfq_serv_to_charge(next_rq, bfqq)); -+ -+ if (!bfq_bfqq_IO_bound(bfqq)) { -+ if (time_before(jiffies, -+ RQ_BIC(rq)->ttime.last_end_request + -+ bfqd->bfq_slice_idle)) { -+ bfqq->requests_within_timer++; -+ if (bfqq->requests_within_timer >= -+ bfqd->bfq_requests_within_timer) -+ bfq_mark_bfqq_IO_bound(bfqq); -+ } else -+ bfqq->requests_within_timer = 0; -+ } -+ -+ if (!bfqd->low_latency) -+ goto add_bfqq_busy; -+ -+ /* -+ * If the queue: -+ * - is not being boosted, -+ * - has been idle for enough time, -+ * - is not a sync queue or is linked to a bfq_io_cq (it is -+ * shared "for its nature" or it is not shared and its -+ * requests have not been redirected to a shared queue) -+ * start a weight-raising period. 
-+ */ -+ if (old_wr_coeff == 1 && (interactive || soft_rt) && -+ (!bfq_bfqq_sync(bfqq) || bfqq->bic)) { -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ if (interactive) -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ else -+ bfqq->wr_cur_max_time = -+ bfqd->bfq_wr_rt_max_time; -+ bfq_log_bfqq(bfqd, bfqq, -+ "wrais starting at %lu, rais_max_time %u", -+ jiffies, -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } else if (old_wr_coeff > 1) { -+ if (interactive) -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ else if (in_burst || -+ (bfqq->wr_cur_max_time == -+ bfqd->bfq_wr_rt_max_time && -+ !soft_rt)) { -+ bfqq->wr_coeff = 1; -+ bfq_log_bfqq(bfqd, bfqq, -+ "wrais ending at %lu, rais_max_time %u", -+ jiffies, -+ jiffies_to_msecs(bfqq-> -+ wr_cur_max_time)); -+ } else if (time_before( -+ bfqq->last_wr_start_finish + -+ bfqq->wr_cur_max_time, -+ jiffies + -+ bfqd->bfq_wr_rt_max_time) && -+ soft_rt) { -+ /* -+ * -+ * The remaining weight-raising time is lower -+ * than bfqd->bfq_wr_rt_max_time, which means -+ * that the application is enjoying weight -+ * raising either because deemed soft-rt in -+ * the near past, or because deemed interactive -+ * a long ago. -+ * In both cases, resetting now the current -+ * remaining weight-raising time for the -+ * application to the weight-raising duration -+ * for soft rt applications would not cause any -+ * latency increase for the application (as the -+ * new duration would be higher than the -+ * remaining time). -+ * -+ * In addition, the application is now meeting -+ * the requirements for being deemed soft rt. -+ * In the end we can correctly and safely -+ * (re)charge the weight-raising duration for -+ * the application with the weight-raising -+ * duration for soft rt applications. -+ * -+ * In particular, doing this recharge now, i.e., -+ * before the weight-raising period for the -+ * application finishes, reduces the probability -+ * of the following negative scenario: -+ * 1) the weight of a soft rt application is -+ * raised at startup (as for any newly -+ * created application), -+ * 2) since the application is not interactive, -+ * at a certain time weight-raising is -+ * stopped for the application, -+ * 3) at that time the application happens to -+ * still have pending requests, and hence -+ * is destined to not have a chance to be -+ * deemed soft rt before these requests are -+ * completed (see the comments to the -+ * function bfq_bfqq_softrt_next_start() -+ * for details on soft rt detection), -+ * 4) these pending requests experience a high -+ * latency because the application is not -+ * weight-raised while they are pending. 
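The branch that follows applies a small set of weight-raising transitions on request arrival. A condensed sketch of those transitions; the sync/bic test and every jiffies-based check are collapsed into boolean inputs, and the constants are stand-ins for the corresponding tunables:

#include <stdbool.h>
#include <stdio.h>

#define WR_COEFF        30   /* stand-in for bfqd->bfq_wr_coeff        */
#define WR_INTERACTIVE  100  /* stand-in for bfq_wr_duration()         */
#define WR_SOFT_RT      30   /* stand-in for bfqd->bfq_wr_rt_max_time  */

struct toy_queue {
        unsigned int wr_coeff;     /* 1 = not weight-raised */
        unsigned int wr_max_time;  /* current raising period */
};

/* Same transitions as the code above, with the timing tests collapsed
 * into the 'interactive', 'soft_rt' and 'in_burst' inputs. */
static void update_wr(struct toy_queue *q, bool interactive, bool soft_rt,
                      bool in_burst)
{
        if (q->wr_coeff == 1 && (interactive || soft_rt)) {
                /* start raising: longer period for interactive queues */
                q->wr_coeff = WR_COEFF;
                q->wr_max_time = interactive ? WR_INTERACTIVE : WR_SOFT_RT;
        } else if (q->wr_coeff > 1) {
                if (interactive)                    /* refresh long period   */
                        q->wr_max_time = WR_INTERACTIVE;
                else if (in_burst ||
                         (q->wr_max_time == WR_SOFT_RT && !soft_rt))
                        q->wr_coeff = 1;            /* end weight raising    */
                else if (soft_rt)                   /* recharge soft-rt time */
                        q->wr_max_time = WR_SOFT_RT;
        }
}

int main(void)
{
        struct toy_queue q = { .wr_coeff = 1 };

        update_wr(&q, true, false, false);   /* becomes interactive       */
        printf("coeff %u, period %u\n", q.wr_coeff, q.wr_max_time);
        update_wr(&q, false, true, false);   /* later detected as soft rt */
        printf("coeff %u, period %u\n", q.wr_coeff, q.wr_max_time);
        update_wr(&q, false, false, true);   /* caught in a large burst   */
        printf("coeff %u, period %u\n", q.wr_coeff, q.wr_max_time);
        return 0;
}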
-+ */ -+ bfqq->last_wr_start_finish = jiffies; -+ bfqq->wr_cur_max_time = -+ bfqd->bfq_wr_rt_max_time; -+ } -+ } -+ if (old_wr_coeff != bfqq->wr_coeff) -+ entity->prio_changed = 1; -+add_bfqq_busy: -+ bfqq->last_idle_bklogged = jiffies; -+ bfqq->service_from_backlogged = 0; -+ bfq_clear_bfqq_softrt_update(bfqq); -+ bfq_add_bfqq_busy(bfqd, bfqq); -+ } else { -+ if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) && -+ time_is_before_jiffies( -+ bfqq->last_wr_start_finish + -+ bfqd->bfq_wr_min_inter_arr_async)) { -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ -+ bfqd->wr_busy_queues++; -+ entity->prio_changed = 1; -+ bfq_log_bfqq(bfqd, bfqq, -+ "non-idle wrais starting at %lu, rais_max_time %u", -+ jiffies, -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } -+ if (prev != bfqq->next_rq) -+ bfq_updated_next_req(bfqd, bfqq); -+ } -+ -+ if (bfqd->low_latency && -+ (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive)) -+ bfqq->last_wr_start_finish = jiffies; -+} -+ -+static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, -+ struct bio *bio) -+{ -+ struct task_struct *tsk = current; -+ struct bfq_io_cq *bic; -+ struct bfq_queue *bfqq; -+ -+ bic = bfq_bic_lookup(bfqd, tsk->io_context); -+ if (!bic) -+ return NULL; -+ -+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); -+ if (bfqq) -+ return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); -+ -+ return NULL; -+} -+ -+static void bfq_activate_request(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ -+ bfqd->rq_in_driver++; -+ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); -+ bfq_log(bfqd, "activate_request: new bfqd->last_position %llu", -+ (unsigned long long) bfqd->last_position); -+} -+ -+static void bfq_deactivate_request(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ -+ BUG_ON(bfqd->rq_in_driver == 0); -+ bfqd->rq_in_driver--; -+} -+ -+static void bfq_remove_request(struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ struct bfq_data *bfqd = bfqq->bfqd; -+ const int sync = rq_is_sync(rq); -+ -+ if (bfqq->next_rq == rq) { -+ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); -+ bfq_updated_next_req(bfqd, bfqq); -+ } -+ -+ if (rq->queuelist.prev != &rq->queuelist) -+ list_del_init(&rq->queuelist); -+ BUG_ON(bfqq->queued[sync] == 0); -+ bfqq->queued[sync]--; -+ bfqd->queued--; -+ elv_rb_del(&bfqq->sort_list, rq); -+ -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { -+ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) -+ bfq_del_bfqq_busy(bfqd, bfqq, 1); -+ /* -+ * Remove queue from request-position tree as it is empty. 
-+ */ -+ if (bfqq->pos_root) { -+ rb_erase(&bfqq->pos_node, bfqq->pos_root); -+ bfqq->pos_root = NULL; -+ } -+ } -+ -+ if (rq->cmd_flags & REQ_META) { -+ BUG_ON(bfqq->meta_pending == 0); -+ bfqq->meta_pending--; -+ } -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); -+#endif -+} -+ -+static int bfq_merge(struct request_queue *q, struct request **req, -+ struct bio *bio) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct request *__rq; -+ -+ __rq = bfq_find_rq_fmerge(bfqd, bio); -+ if (__rq && elv_rq_merge_ok(__rq, bio)) { -+ *req = __rq; -+ return ELEVATOR_FRONT_MERGE; -+ } -+ -+ return ELEVATOR_NO_MERGE; -+} -+ -+static void bfq_merged_request(struct request_queue *q, struct request *req, -+ int type) -+{ -+ if (type == ELEVATOR_FRONT_MERGE && -+ rb_prev(&req->rb_node) && -+ blk_rq_pos(req) < -+ blk_rq_pos(container_of(rb_prev(&req->rb_node), -+ struct request, rb_node))) { -+ struct bfq_queue *bfqq = RQ_BFQQ(req); -+ struct bfq_data *bfqd = bfqq->bfqd; -+ struct request *prev, *next_rq; -+ -+ /* Reposition request in its sort_list */ -+ elv_rb_del(&bfqq->sort_list, req); -+ elv_rb_add(&bfqq->sort_list, req); -+ /* Choose next request to be served for bfqq */ -+ prev = bfqq->next_rq; -+ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, -+ bfqd->last_position); -+ BUG_ON(!next_rq); -+ bfqq->next_rq = next_rq; -+ } -+} -+ -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+static void bfq_bio_merged(struct request_queue *q, struct request *req, -+ struct bio *bio) -+{ -+ bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio->bi_rw); -+} -+#endif -+ -+static void bfq_merged_requests(struct request_queue *q, struct request *rq, -+ struct request *next) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next); -+ -+ /* -+ * If next and rq belong to the same bfq_queue and next is older -+ * than rq, then reposition rq in the fifo (by substituting next -+ * with rq). Otherwise, if next and rq belong to different -+ * bfq_queues, never reposition rq: in fact, we would have to -+ * reposition it with respect to next's position in its own fifo, -+ * which would most certainly be too expensive with respect to -+ * the benefits. 
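The rule just described, in isolation: when two requests of the same queue are merged and the absorbed one is older, the survivor inherits its FIFO deadline (the list splice done by the real code is omitted here):

#include <stdbool.h>
#include <stdio.h>

struct toy_rq {
        int queue_id;       /* which bfq_queue the request belongs to */
        long fifo_time;     /* FIFO expiration "time"                 */
};

/* If both requests belong to the same queue and 'next' (about to be
 * dropped) is older, the surviving 'rq' inherits its deadline so the
 * merged request is not served later than the absorbed one would be. */
static bool inherit_fifo_time(struct toy_rq *rq, const struct toy_rq *next)
{
        if (rq->queue_id == next->queue_id && next->fifo_time < rq->fifo_time) {
                rq->fifo_time = next->fifo_time;
                return true;
        }
        return false;
}

int main(void)
{
        struct toy_rq rq   = { .queue_id = 1, .fifo_time = 250 };
        struct toy_rq next = { .queue_id = 1, .fifo_time = 200 };

        printf("repositioned: %d, rq deadline now %ld\n",
               inherit_fifo_time(&rq, &next), rq.fifo_time);
        return 0;
}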
-+ */ -+ if (bfqq == next_bfqq && -+ !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && -+ time_before(next->fifo_time, rq->fifo_time)) { -+ list_del_init(&rq->queuelist); -+ list_replace_init(&next->queuelist, &rq->queuelist); -+ rq->fifo_time = next->fifo_time; -+ } -+ -+ if (bfqq->next_rq == next) -+ bfqq->next_rq = rq; -+ -+ bfq_remove_request(next); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); -+#endif -+} -+ -+/* Must be called with bfqq != NULL */ -+static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) -+{ -+ BUG_ON(!bfqq); -+ if (bfq_bfqq_busy(bfqq)) -+ bfqq->bfqd->wr_busy_queues--; -+ bfqq->wr_coeff = 1; -+ bfqq->wr_cur_max_time = 0; -+ /* Trigger a weight change on the next activation of the queue */ -+ bfqq->entity.prio_changed = 1; -+} -+ -+static void bfq_end_wr_async_queues(struct bfq_data *bfqd, -+ struct bfq_group *bfqg) -+{ -+ int i, j; -+ -+ for (i = 0; i < 2; i++) -+ for (j = 0; j < IOPRIO_BE_NR; j++) -+ if (bfqg->async_bfqq[i][j]) -+ bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); -+ if (bfqg->async_idle_bfqq) -+ bfq_bfqq_end_wr(bfqg->async_idle_bfqq); -+} -+ -+static void bfq_end_wr(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq; -+ -+ spin_lock_irq(bfqd->queue->queue_lock); -+ -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) -+ bfq_bfqq_end_wr(bfqq); -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) -+ bfq_bfqq_end_wr(bfqq); -+ bfq_end_wr_async(bfqd); -+ -+ spin_unlock_irq(bfqd->queue->queue_lock); -+} -+ -+static int bfq_allow_merge(struct request_queue *q, struct request *rq, -+ struct bio *bio) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct bfq_io_cq *bic; -+ -+ /* -+ * Disallow merge of a sync bio into an async request. -+ */ -+ if (bfq_bio_sync(bio) && !rq_is_sync(rq)) -+ return 0; -+ -+ /* -+ * Lookup the bfqq that this bio will be queued with. Allow -+ * merge only if rq is queued there. -+ * Queue lock is held here. -+ */ -+ bic = bfq_bic_lookup(bfqd, current->io_context); -+ if (!bic) -+ return 0; -+ -+ return bic_to_bfqq(bic, bfq_bio_sync(bio)) == RQ_BFQQ(rq); -+} -+ -+static void __bfq_set_in_service_queue(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ if (bfqq) { -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); -+#endif -+ bfq_mark_bfqq_must_alloc(bfqq); -+ bfq_mark_bfqq_budget_new(bfqq); -+ bfq_clear_bfqq_fifo_expire(bfqq); -+ -+ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "set_in_service_queue, cur-budget = %d", -+ bfqq->entity.budget); -+ } -+ -+ bfqd->in_service_queue = bfqq; -+} -+ -+/* -+ * Get and set a new queue for service. 
-+ */ -+static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd); -+ -+ __bfq_set_in_service_queue(bfqd, bfqq); -+ return bfqq; -+} -+ -+/* -+ * If enough samples have been computed, return the current max budget -+ * stored in bfqd, which is dynamically updated according to the -+ * estimated disk peak rate; otherwise return the default max budget -+ */ -+static int bfq_max_budget(struct bfq_data *bfqd) -+{ -+ if (bfqd->budgets_assigned < bfq_stats_min_budgets) -+ return bfq_default_max_budget; -+ else -+ return bfqd->bfq_max_budget; -+} -+ -+/* -+ * Return min budget, which is a fraction of the current or default -+ * max budget (trying with 1/32) -+ */ -+static int bfq_min_budget(struct bfq_data *bfqd) -+{ -+ if (bfqd->budgets_assigned < bfq_stats_min_budgets) -+ return bfq_default_max_budget / 32; -+ else -+ return bfqd->bfq_max_budget / 32; -+} -+ -+static void bfq_arm_slice_timer(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq = bfqd->in_service_queue; -+ struct bfq_io_cq *bic; -+ unsigned long sl; -+ -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); -+ -+ /* Processes have exited, don't wait. */ -+ bic = bfqd->in_service_bic; -+ if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0) -+ return; -+ -+ bfq_mark_bfqq_wait_request(bfqq); -+ -+ /* -+ * We don't want to idle for seeks, but we do want to allow -+ * fair distribution of slice time for a process doing back-to-back -+ * seeks. So allow a little bit of time for him to submit a new rq. -+ * -+ * To prevent processes with (partly) seeky workloads from -+ * being too ill-treated, grant them a small fraction of the -+ * assigned budget before reducing the waiting time to -+ * BFQ_MIN_TT. This happened to help reduce latency. -+ */ -+ sl = bfqd->bfq_slice_idle; -+ /* -+ * Unless the queue is being weight-raised or the scenario is -+ * asymmetric, grant only minimum idle time if the queue either -+ * has been seeky for long enough or has already proved to be -+ * constantly seeky. -+ */ -+ if (bfq_sample_valid(bfqq->seek_samples) && -+ ((BFQQ_SEEKY(bfqq) && bfqq->entity.service > -+ bfq_max_budget(bfqq->bfqd) / 8) || -+ bfq_bfqq_constantly_seeky(bfqq)) && bfqq->wr_coeff == 1 && -+ bfq_symmetric_scenario(bfqd)) -+ sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT)); -+ else if (bfqq->wr_coeff > 1) -+ sl = sl * 3; -+ bfqd->last_idling_start = ktime_get(); -+ mod_timer(&bfqd->idle_slice_timer, jiffies + sl); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ bfqg_stats_set_start_idle_time(bfqq_group(bfqq)); -+#endif -+ bfq_log(bfqd, "arm idle: %u/%u ms", -+ jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle)); -+} -+ -+/* -+ * Set the maximum time for the in-service queue to consume its -+ * budget. This prevents seeky processes from lowering the disk -+ * throughput (always guaranteed with a time slice scheme as in CFQ). 
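As for the idling window armed by bfq_arm_slice_timer() above, the choice boils down to: shrink the window to a minimum for a proven-seeky, non-weight-raised queue in a symmetric scenario, stretch it for weight-raised queues, keep the default otherwise. A sketch with the seekiness tests collapsed into one flag and illustrative millisecond values:

#include <stdbool.h>
#include <stdio.h>

#define SLICE_IDLE_MS 8   /* stand-in for bfqd->bfq_slice_idle */
#define MIN_TT_MS     2   /* stand-in for BFQ_MIN_TT           */

/* Pick the idling window for the in-service queue.  The timing/seekiness
 * tests of the original are collapsed into the 'seeky' input. */
static unsigned int idle_slice_ms(bool seeky, bool symmetric,
                                  unsigned int wr_coeff)
{
        unsigned int sl = SLICE_IDLE_MS;

        if (seeky && wr_coeff == 1 && symmetric)
                sl = sl < MIN_TT_MS ? sl : MIN_TT_MS;  /* min(sl, MIN_TT) */
        else if (wr_coeff > 1)
                sl *= 3;                               /* protect raised queues */
        return sl;
}

int main(void)
{
        printf("sequential queue:        %u ms\n", idle_slice_ms(false, true, 1));
        printf("seeky, symmetric:        %u ms\n", idle_slice_ms(true, true, 1));
        printf("seeky but weight-raised: %u ms\n", idle_slice_ms(true, true, 30));
        return 0;
}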
-+ */ -+static void bfq_set_budget_timeout(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq = bfqd->in_service_queue; -+ unsigned int timeout_coeff; -+ -+ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) -+ timeout_coeff = 1; -+ else -+ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; -+ -+ bfqd->last_budget_start = ktime_get(); -+ -+ bfq_clear_bfqq_budget_new(bfqq); -+ bfqq->budget_timeout = jiffies + -+ bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff; -+ -+ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", -+ jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * -+ timeout_coeff)); -+} -+ -+/* -+ * Move request from internal lists to the request queue dispatch list. -+ */ -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ -+ /* -+ * For consistency, the next instruction should have been executed -+ * after removing the request from the queue and dispatching it. -+ * We execute instead this instruction before bfq_remove_request() -+ * (and hence introduce a temporary inconsistency), for efficiency. -+ * In fact, in a forced_dispatch, this prevents two counters related -+ * to bfqq->dispatched to risk to be uselessly decremented if bfqq -+ * is not in service, and then to be incremented again after -+ * incrementing bfqq->dispatched. -+ */ -+ bfqq->dispatched++; -+ bfq_remove_request(rq); -+ elv_dispatch_sort(q, rq); -+ -+ if (bfq_bfqq_sync(bfqq)) -+ bfqd->sync_flight++; -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ bfqg_stats_update_dispatch(bfqq_group(bfqq), blk_rq_bytes(rq), -+ rq->cmd_flags); -+#endif -+} -+ -+/* -+ * Return expired entry, or NULL to just start from scratch in rbtree. -+ */ -+static struct request *bfq_check_fifo(struct bfq_queue *bfqq) -+{ -+ struct request *rq = NULL; -+ -+ if (bfq_bfqq_fifo_expire(bfqq)) -+ return NULL; -+ -+ bfq_mark_bfqq_fifo_expire(bfqq); -+ -+ if (list_empty(&bfqq->fifo)) -+ return NULL; -+ -+ rq = rq_entry_fifo(bfqq->fifo.next); -+ -+ if (time_before(jiffies, rq->fifo_time)) -+ return NULL; -+ -+ return rq; -+} -+ -+static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ return entity->budget - entity->service; -+} -+ -+static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ BUG_ON(bfqq != bfqd->in_service_queue); -+ -+ __bfq_bfqd_reset_in_service(bfqd); -+ -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { -+ /* -+ * Overloading budget_timeout field to store the time -+ * at which the queue remains with no backlog; used by -+ * the weight-raising mechanism. -+ */ -+ bfqq->budget_timeout = jiffies; -+ bfq_del_bfqq_busy(bfqd, bfqq, 1); -+ } else -+ bfq_activate_bfqq(bfqd, bfqq); -+} -+ -+/** -+ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior. -+ * @bfqd: device data. -+ * @bfqq: queue to update. -+ * @reason: reason for expiration. -+ * -+ * Handle the feedback on @bfqq budget at queue expiration. -+ * See the body for detailed comments. 
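The feedback implemented by __bfq_bfqq_recalc_budget() below reduces to a handful of rules keyed on the expiration reason. A standalone restatement with arbitrary numbers (the async branch and the user-set cap are left out):

#include <stdio.h>

enum expiration { TOO_IDLE, BUDGET_TIMEOUT, BUDGET_EXHAUSTED, NO_MORE_REQUESTS };

#define MAX_BUDGET 16384
#define MIN_BUDGET (MAX_BUDGET / 32)

static int min_i(int a, int b) { return a < b ? a : b; }

/* Mirror of the feedback: shrink on a "too idle" expiration with no
 * outstanding requests, grow on timeouts, grow faster when the budget
 * was cleanly exhausted, leave it alone otherwise. */
static int next_budget(int budget, enum expiration reason, int dispatched)
{
        switch (reason) {
        case TOO_IDLE:
                if (dispatched > 0)                       /* still in flight */
                        return min_i(budget * 2, MAX_BUDGET);
                return budget > 5 * MIN_BUDGET ? budget - 4 * MIN_BUDGET
                                               : MIN_BUDGET;
        case BUDGET_TIMEOUT:
                return min_i(budget * 2, MAX_BUDGET);
        case BUDGET_EXHAUSTED:
                return min_i(budget * 4, MAX_BUDGET);
        case NO_MORE_REQUESTS:
        default:
                return budget;
        }
}

int main(void)
{
        int b = 4096;

        b = next_budget(b, BUDGET_EXHAUSTED, 0);
        printf("after clean exhaustion: %d\n", b);   /* grown and capped */
        b = next_budget(b, TOO_IDLE, 0);
        printf("after idling too long:  %d\n", b);   /* shrunk           */
        return 0;
}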
-+ */ -+static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ enum bfqq_expiration reason) -+{ -+ struct request *next_rq; -+ int budget, min_budget; -+ -+ budget = bfqq->max_budget; -+ min_budget = bfq_min_budget(bfqd); -+ -+ BUG_ON(bfqq != bfqd->in_service_queue); -+ -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d", -+ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d", -+ budget, bfq_min_budget(bfqd)); -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", -+ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); -+ -+ if (bfq_bfqq_sync(bfqq)) { -+ switch (reason) { -+ /* -+ * Caveat: in all the following cases we trade latency -+ * for throughput. -+ */ -+ case BFQ_BFQQ_TOO_IDLE: -+ /* -+ * This is the only case where we may reduce -+ * the budget: if there is no request of the -+ * process still waiting for completion, then -+ * we assume (tentatively) that the timer has -+ * expired because the batch of requests of -+ * the process could have been served with a -+ * smaller budget. Hence, betting that -+ * process will behave in the same way when it -+ * becomes backlogged again, we reduce its -+ * next budget. As long as we guess right, -+ * this budget cut reduces the latency -+ * experienced by the process. -+ * -+ * However, if there are still outstanding -+ * requests, then the process may have not yet -+ * issued its next request just because it is -+ * still waiting for the completion of some of -+ * the still outstanding ones. So in this -+ * subcase we do not reduce its budget, on the -+ * contrary we increase it to possibly boost -+ * the throughput, as discussed in the -+ * comments to the BUDGET_TIMEOUT case. -+ */ -+ if (bfqq->dispatched > 0) /* still outstanding reqs */ -+ budget = min(budget * 2, bfqd->bfq_max_budget); -+ else { -+ if (budget > 5 * min_budget) -+ budget -= 4 * min_budget; -+ else -+ budget = min_budget; -+ } -+ break; -+ case BFQ_BFQQ_BUDGET_TIMEOUT: -+ /* -+ * We double the budget here because: 1) it -+ * gives the chance to boost the throughput if -+ * this is not a seeky process (which may have -+ * bumped into this timeout because of, e.g., -+ * ZBR), 2) together with charge_full_budget -+ * it helps give seeky processes higher -+ * timestamps, and hence be served less -+ * frequently. -+ */ -+ budget = min(budget * 2, bfqd->bfq_max_budget); -+ break; -+ case BFQ_BFQQ_BUDGET_EXHAUSTED: -+ /* -+ * The process still has backlog, and did not -+ * let either the budget timeout or the disk -+ * idling timeout expire. Hence it is not -+ * seeky, has a short thinktime and may be -+ * happy with a higher budget too. So -+ * definitely increase the budget of this good -+ * candidate to boost the disk throughput. -+ */ -+ budget = min(budget * 4, bfqd->bfq_max_budget); -+ break; -+ case BFQ_BFQQ_NO_MORE_REQUESTS: -+ /* -+ * Leave the budget unchanged. -+ */ -+ default: -+ return; -+ } -+ } else -+ /* -+ * Async queues get always the maximum possible budget -+ * (their ability to dispatch is limited by -+ * @bfqd->bfq_max_budget_async_rq). -+ */ -+ budget = bfqd->bfq_max_budget; -+ -+ bfqq->max_budget = budget; -+ -+ if (bfqd->budgets_assigned >= bfq_stats_min_budgets && -+ !bfqd->bfq_user_max_budget) -+ bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget); -+ -+ /* -+ * Make sure that we have enough budget for the next request. 
-+ * Since the finish time of the bfqq must be kept in sync with -+ * the budget, be sure to call __bfq_bfqq_expire() after the -+ * update. -+ */ -+ next_rq = bfqq->next_rq; -+ if (next_rq) -+ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, -+ bfq_serv_to_charge(next_rq, bfqq)); -+ else -+ bfqq->entity.budget = bfqq->max_budget; -+ -+ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d", -+ next_rq ? blk_rq_sectors(next_rq) : 0, -+ bfqq->entity.budget); -+} -+ -+static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout) -+{ -+ unsigned long max_budget; -+ -+ /* -+ * The max_budget calculated when autotuning is equal to the -+ * amount of sectors transfered in timeout_sync at the -+ * estimated peak rate. -+ */ -+ max_budget = (unsigned long)(peak_rate * 1000 * -+ timeout >> BFQ_RATE_SHIFT); -+ -+ return max_budget; -+} -+ -+/* -+ * In addition to updating the peak rate, checks whether the process -+ * is "slow", and returns 1 if so. This slow flag is used, in addition -+ * to the budget timeout, to reduce the amount of service provided to -+ * seeky processes, and hence reduce their chances to lower the -+ * throughput. See the code for more details. -+ */ -+static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ bool compensate, enum bfqq_expiration reason) -+{ -+ u64 bw, usecs, expected, timeout; -+ ktime_t delta; -+ int update = 0; -+ -+ if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq)) -+ return false; -+ -+ if (compensate) -+ delta = bfqd->last_idling_start; -+ else -+ delta = ktime_get(); -+ delta = ktime_sub(delta, bfqd->last_budget_start); -+ usecs = ktime_to_us(delta); -+ -+ /* Don't trust short/unrealistic values. */ -+ if (usecs < 100 || usecs >= LONG_MAX) -+ return false; -+ -+ /* -+ * Calculate the bandwidth for the last slice. We use a 64 bit -+ * value to store the peak rate, in sectors per usec in fixed -+ * point math. We do so to have enough precision in the estimate -+ * and to avoid overflows. -+ */ -+ bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT; -+ do_div(bw, (unsigned long)usecs); -+ -+ timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); -+ -+ /* -+ * Use only long (> 20ms) intervals to filter out spikes for -+ * the peak rate estimation. 
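The block that follows smooths the measured bandwidth with a 7/8 low-pass filter and, once enough samples are in, may re-classify the device as fast or slow, which in turn selects the reference rate/time product used for weight-raising. Both pieces in isolation, with the kernel's fixed-point shifts dropped and an invented threshold:

#include <stdio.h>

/* Illustrative threshold standing in for device_speed_thresh[]. */
#define SPEED_THRESH 1000.0

enum device_speed { DEV_SLOW, DEV_FAST };

/* new_rate = (7/8) * old_rate + (1/8) * sample, the same low-pass filter
 * used to smooth out per-slice bandwidth oscillations. */
static double filter_peak_rate(double old_rate, double sample)
{
        return old_rate * 7.0 / 8.0 + sample / 8.0;
}

/* Re-classify the device: crossing the threshold switches the reference
 * rate/time product used to size weight-raising periods. */
static enum device_speed classify(double peak_rate, enum device_speed cur)
{
        if (cur == DEV_FAST && peak_rate < SPEED_THRESH)
                return DEV_SLOW;
        if (cur == DEV_SLOW && peak_rate > SPEED_THRESH)
                return DEV_FAST;
        return cur;
}

int main(void)
{
        double rate = 1400.0;
        enum device_speed speed = DEV_FAST;

        /* A run of low per-slice measurements slowly drags the estimate down. */
        for (int i = 0; i < 10; i++)
                rate = filter_peak_rate(rate, 600.0);

        speed = classify(rate, speed);
        printf("filtered rate %.0f, device now %s\n", rate,
               speed == DEV_FAST ? "fast" : "slow");
        return 0;
}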
-+ */ -+ if (usecs > 20000) { -+ if (bw > bfqd->peak_rate || -+ (!BFQQ_SEEKY(bfqq) && -+ reason == BFQ_BFQQ_BUDGET_TIMEOUT)) { -+ bfq_log(bfqd, "measured bw =%llu", bw); -+ /* -+ * To smooth oscillations use a low-pass filter with -+ * alpha=7/8, i.e., -+ * new_rate = (7/8) * old_rate + (1/8) * bw -+ */ -+ do_div(bw, 8); -+ if (bw == 0) -+ return 0; -+ bfqd->peak_rate *= 7; -+ do_div(bfqd->peak_rate, 8); -+ bfqd->peak_rate += bw; -+ update = 1; -+ bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate); -+ } -+ -+ update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1; -+ -+ if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES) -+ bfqd->peak_rate_samples++; -+ -+ if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES && -+ update) { -+ int dev_type = blk_queue_nonrot(bfqd->queue); -+ -+ if (bfqd->bfq_user_max_budget == 0) { -+ bfqd->bfq_max_budget = -+ bfq_calc_max_budget(bfqd->peak_rate, -+ timeout); -+ bfq_log(bfqd, "new max_budget=%d", -+ bfqd->bfq_max_budget); -+ } -+ if (bfqd->device_speed == BFQ_BFQD_FAST && -+ bfqd->peak_rate < device_speed_thresh[dev_type]) { -+ bfqd->device_speed = BFQ_BFQD_SLOW; -+ bfqd->RT_prod = R_slow[dev_type] * -+ T_slow[dev_type]; -+ } else if (bfqd->device_speed == BFQ_BFQD_SLOW && -+ bfqd->peak_rate > device_speed_thresh[dev_type]) { -+ bfqd->device_speed = BFQ_BFQD_FAST; -+ bfqd->RT_prod = R_fast[dev_type] * -+ T_fast[dev_type]; -+ } -+ } -+ } -+ -+ /* -+ * If the process has been served for a too short time -+ * interval to let its possible sequential accesses prevail on -+ * the initial seek time needed to move the disk head on the -+ * first sector it requested, then give the process a chance -+ * and for the moment return false. -+ */ -+ if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8) -+ return false; -+ -+ /* -+ * A process is considered ``slow'' (i.e., seeky, so that we -+ * cannot treat it fairly in the service domain, as it would -+ * slow down too much the other processes) if, when a slice -+ * ends for whatever reason, it has received service at a -+ * rate that would not be high enough to complete the budget -+ * before the budget timeout expiration. -+ */ -+ expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT; -+ -+ /* -+ * Caveat: processes doing IO in the slower disk zones will -+ * tend to be slow(er) even if not seeky. And the estimated -+ * peak rate will actually be an average over the disk -+ * surface. Hence, to not be too harsh with unlucky processes, -+ * we keep a budget/3 margin of safety before declaring a -+ * process slow. -+ */ -+ return expected > (4 * bfqq->entity.budget) / 3; -+} -+ -+/* -+ * To be deemed as soft real-time, an application must meet two -+ * requirements. First, the application must not require an average -+ * bandwidth higher than the approximate bandwidth required to playback or -+ * record a compressed high-definition video. -+ * The next function is invoked on the completion of the last request of a -+ * batch, to compute the next-start time instant, soft_rt_next_start, such -+ * that, if the next request of the application does not arrive before -+ * soft_rt_next_start, then the above requirement on the bandwidth is met. -+ * -+ * The second requirement is that the request pattern of the application is -+ * isochronous, i.e., that, after issuing a request or a batch of requests, -+ * the application stops issuing new requests until all its pending requests -+ * have been completed. After that, the application may issue a new batch, -+ * and so on. 
-+ * For this reason the next function is invoked to compute -+ * soft_rt_next_start only for applications that meet this requirement, -+ * whereas soft_rt_next_start is set to infinity for applications that do -+ * not. -+ * -+ * Unfortunately, even a greedy application may happen to behave in an -+ * isochronous way if the CPU load is high. In fact, the application may -+ * stop issuing requests while the CPUs are busy serving other processes, -+ * then restart, then stop again for a while, and so on. In addition, if -+ * the disk achieves a low enough throughput with the request pattern -+ * issued by the application (e.g., because the request pattern is random -+ * and/or the device is slow), then the application may meet the above -+ * bandwidth requirement too. To prevent such a greedy application to be -+ * deemed as soft real-time, a further rule is used in the computation of -+ * soft_rt_next_start: soft_rt_next_start must be higher than the current -+ * time plus the maximum time for which the arrival of a request is waited -+ * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. -+ * This filters out greedy applications, as the latter issue instead their -+ * next request as soon as possible after the last one has been completed -+ * (in contrast, when a batch of requests is completed, a soft real-time -+ * application spends some time processing data). -+ * -+ * Unfortunately, the last filter may easily generate false positives if -+ * only bfqd->bfq_slice_idle is used as a reference time interval and one -+ * or both the following cases occur: -+ * 1) HZ is so low that the duration of a jiffy is comparable to or higher -+ * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with -+ * HZ=100. -+ * 2) jiffies, instead of increasing at a constant rate, may stop increasing -+ * for a while, then suddenly 'jump' by several units to recover the lost -+ * increments. This seems to happen, e.g., inside virtual machines. -+ * To address this issue, we do not use as a reference time interval just -+ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In -+ * particular we add the minimum number of jiffies for which the filter -+ * seems to be quite precise also in embedded systems and KVM/QEMU virtual -+ * machines. -+ */ -+static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ return max(bfqq->last_idle_bklogged + -+ HZ * bfqq->service_from_backlogged / -+ bfqd->bfq_wr_max_softrt_rate, -+ jiffies + bfqq->bfqd->bfq_slice_idle + 4); -+} -+ -+/* -+ * Return the largest-possible time instant such that, for as long as possible, -+ * the current time will be lower than this time instant according to the macro -+ * time_is_before_jiffies(). -+ */ -+static unsigned long bfq_infinity_from_now(unsigned long now) -+{ -+ return now + ULONG_MAX / 2; -+} -+ -+/** -+ * bfq_bfqq_expire - expire a queue. -+ * @bfqd: device owning the queue. -+ * @bfqq: the queue to expire. -+ * @compensate: if true, compensate for the time spent idling. -+ * @reason: the reason causing the expiration. -+ * -+ * -+ * If the process associated to the queue is slow (i.e., seeky), or in -+ * case of budget timeout, or, finally, if it is async, we -+ * artificially charge it an entire budget (independently of the -+ * actual service it received). 
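bfq_bfqq_softrt_next_start() above encodes the bandwidth cap as a time instant: the service received since the queue last became backlogged, replayed at the soft real-time reference rate, plus a small guard of slice_idle + 4 jiffies past now. A sketch with assumed values for HZ and the tunables:

#include <stdio.h>

#define HZ              250   /* assumed tick rate, for illustration   */
#define SLICE_IDLE      2     /* stand-in for bfqd->bfq_slice_idle     */
#define MAX_SOFTRT_RATE 7000  /* sectors/sec, stand-in for the tunable */

static unsigned long max_ul(unsigned long a, unsigned long b)
{
        return a > b ? a : b;
}

/*
 * Earliest instant (in jiffies) at which the next request may arrive for
 * the queue to still qualify as soft real-time: the service received
 * since the queue last became backlogged, replayed at the reference
 * rate, never earlier than now + slice_idle + 4.
 */
static unsigned long softrt_next_start(unsigned long now,
                                       unsigned long last_idle_bklogged,
                                       unsigned long service_sectors)
{
        return max_ul(last_idle_bklogged +
                      (unsigned long)HZ * service_sectors / MAX_SOFTRT_RATE,
                      now + SLICE_IDLE + 4);
}

int main(void)
{
        /* Small batch: the guard term dominates. */
        printf("next start: %lu\n", softrt_next_start(1000, 990, 70));
        /* Large batch: the bandwidth term pushes the instant further out. */
        printf("next start: %lu\n", softrt_next_start(1000, 990, 7000));
        return 0;
}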
As a consequence, the queue will get -+ * higher timestamps than the correct ones upon reactivation, and -+ * hence it will be rescheduled as if it had received more service -+ * than what it actually received. In the end, this class of processes -+ * will receive less service in proportion to how slowly they consume -+ * their budgets (and hence how seriously they tend to lower the -+ * throughput). -+ * -+ * In contrast, when a queue expires because it has been idling for -+ * too much or because it exhausted its budget, we do not touch the -+ * amount of service it has received. Hence when the queue will be -+ * reactivated and its timestamps updated, the latter will be in sync -+ * with the actual service received by the queue until expiration. -+ * -+ * Charging a full budget to the first type of queues and the exact -+ * service to the others has the effect of using the WF2Q+ policy to -+ * schedule the former on a timeslice basis, without violating the -+ * service domain guarantees of the latter. -+ */ -+static void bfq_bfqq_expire(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ bool compensate, -+ enum bfqq_expiration reason) -+{ -+ bool slow; -+ -+ BUG_ON(bfqq != bfqd->in_service_queue); -+ -+ /* -+ * Update disk peak rate for autotuning and check whether the -+ * process is slow (see bfq_update_peak_rate). -+ */ -+ slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason); -+ -+ /* -+ * As above explained, 'punish' slow (i.e., seeky), timed-out -+ * and async queues, to favor sequential sync workloads. -+ * -+ * Processes doing I/O in the slower disk zones will tend to be -+ * slow(er) even if not seeky. Hence, since the estimated peak -+ * rate is actually an average over the disk surface, these -+ * processes may timeout just for bad luck. To avoid punishing -+ * them we do not charge a full budget to a process that -+ * succeeded in consuming at least 2/3 of its budget. -+ */ -+ if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT && -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)) -+ bfq_bfqq_charge_full_budget(bfqq); -+ -+ bfqq->service_from_backlogged += bfqq->entity.service; -+ -+ if (BFQQ_SEEKY(bfqq) && reason == BFQ_BFQQ_BUDGET_TIMEOUT && -+ !bfq_bfqq_constantly_seeky(bfqq)) { -+ bfq_mark_bfqq_constantly_seeky(bfqq); -+ if (!blk_queue_nonrot(bfqd->queue)) -+ bfqd->const_seeky_busy_in_flight_queues++; -+ } -+ -+ if (reason == BFQ_BFQQ_TOO_IDLE && -+ bfqq->entity.service <= 2 * bfqq->entity.budget / 10) -+ bfq_clear_bfqq_IO_bound(bfqq); -+ -+ if (bfqd->low_latency && bfqq->wr_coeff == 1) -+ bfqq->last_wr_start_finish = jiffies; -+ -+ if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 && -+ RB_EMPTY_ROOT(&bfqq->sort_list)) { -+ /* -+ * If we get here, and there are no outstanding requests, -+ * then the request pattern is isochronous (see the comments -+ * to the function bfq_bfqq_softrt_next_start()). Hence we -+ * can compute soft_rt_next_start. If, instead, the queue -+ * still has outstanding requests, then we have to wait -+ * for the completion of all the outstanding requests to -+ * discover whether the request pattern is actually -+ * isochronous. -+ */ -+ if (bfqq->dispatched == 0) -+ bfqq->soft_rt_next_start = -+ bfq_bfqq_softrt_next_start(bfqd, bfqq); -+ else { -+ /* -+ * The application is still waiting for the -+ * completion of one or more requests: -+ * prevent it from possibly being incorrectly -+ * deemed as soft real-time by setting its -+ * soft_rt_next_start to infinity. 
In fact, -+ * without this assignment, the application -+ * would be incorrectly deemed as soft -+ * real-time if: -+ * 1) it issued a new request before the -+ * completion of all its in-flight -+ * requests, and -+ * 2) at that time, its soft_rt_next_start -+ * happened to be in the past. -+ */ -+ bfqq->soft_rt_next_start = -+ bfq_infinity_from_now(jiffies); -+ /* -+ * Schedule an update of soft_rt_next_start to when -+ * the task may be discovered to be isochronous. -+ */ -+ bfq_mark_bfqq_softrt_update(bfqq); -+ } -+ } -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, -+ slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq)); -+ -+ /* -+ * Increase, decrease or leave budget unchanged according to -+ * reason. -+ */ -+ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); -+ __bfq_bfqq_expire(bfqd, bfqq); -+} -+ -+/* -+ * Budget timeout is not implemented through a dedicated timer, but -+ * just checked on request arrivals and completions, as well as on -+ * idle timer expirations. -+ */ -+static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) -+{ -+ if (bfq_bfqq_budget_new(bfqq) || -+ time_before(jiffies, bfqq->budget_timeout)) -+ return false; -+ return true; -+} -+ -+/* -+ * If we expire a queue that is waiting for the arrival of a new -+ * request, we may prevent the fictitious timestamp back-shifting that -+ * allows the guarantees of the queue to be preserved (see [1] for -+ * this tricky aspect). Hence we return true only if this condition -+ * does not hold, or if the queue is slow enough to deserve only to be -+ * kicked off for preserving a high throughput. -+*/ -+static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) -+{ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "may_budget_timeout: wait_request %d left %d timeout %d", -+ bfq_bfqq_wait_request(bfqq), -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, -+ bfq_bfqq_budget_timeout(bfqq)); -+ -+ return (!bfq_bfqq_wait_request(bfqq) || -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3) -+ && -+ bfq_bfqq_budget_timeout(bfqq); -+} -+ -+/* -+ * For a queue that becomes empty, device idling is allowed only if -+ * this function returns true for that queue. As a consequence, since -+ * device idling plays a critical role for both throughput boosting -+ * and service guarantees, the return value of this function plays a -+ * critical role as well. -+ * -+ * In a nutshell, this function returns true only if idling is -+ * beneficial for throughput or, even if detrimental for throughput, -+ * idling is however necessary to preserve service guarantees (low -+ * latency, desired throughput distribution, ...). In particular, on -+ * NCQ-capable devices, this function tries to return false, so as to -+ * help keep the drives' internal queues full, whenever this helps the -+ * device boost the throughput without causing any service-guarantee -+ * issue. -+ * -+ * In more detail, the return value of this function is obtained by, -+ * first, computing a number of boolean variables that take into -+ * account throughput and service-guarantee issues, and, then, -+ * combining these variables in a logical expression. Most of the -+ * issues taken into account are not trivial. We discuss these issues -+ * while introducing the variables. 
-+ */ -+static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) -+{ -+ struct bfq_data *bfqd = bfqq->bfqd; -+ bool idling_boosts_thr, idling_boosts_thr_without_issues, -+ all_queues_seeky, on_hdd_and_not_all_queues_seeky, -+ idling_needed_for_service_guarantees, -+ asymmetric_scenario; -+ -+ /* -+ * The next variable takes into account the cases where idling -+ * boosts the throughput. -+ * -+ * The value of the variable is computed considering, first, that -+ * idling is virtually always beneficial for the throughput if: -+ * (a) the device is not NCQ-capable, or -+ * (b) regardless of the presence of NCQ, the device is rotational -+ * and the request pattern for bfqq is I/O-bound and sequential. -+ * -+ * Secondly, and in contrast to the above item (b), idling an -+ * NCQ-capable flash-based device would not boost the -+ * throughput even with sequential I/O; rather it would lower -+ * the throughput in proportion to how fast the device -+ * is. Accordingly, the next variable is true if any of the -+ * above conditions (a) and (b) is true, and, in particular, -+ * happens to be false if bfqd is an NCQ-capable flash-based -+ * device. -+ */ -+ idling_boosts_thr = !bfqd->hw_tag || -+ (!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq) && -+ bfq_bfqq_idle_window(bfqq)); -+ -+ /* -+ * The value of the next variable, -+ * idling_boosts_thr_without_issues, is equal to that of -+ * idling_boosts_thr, unless a special case holds. In this -+ * special case, described below, idling may cause problems to -+ * weight-raised queues. -+ * -+ * When the request pool is saturated (e.g., in the presence -+ * of write hogs), if the processes associated with -+ * non-weight-raised queues ask for requests at a lower rate, -+ * then processes associated with weight-raised queues have a -+ * higher probability to get a request from the pool -+ * immediately (or at least soon) when they need one. Thus -+ * they have a higher probability to actually get a fraction -+ * of the device throughput proportional to their high -+ * weight. This is especially true with NCQ-capable drives, -+ * which enqueue several requests in advance, and further -+ * reorder internally-queued requests. -+ * -+ * For this reason, we force to false the value of -+ * idling_boosts_thr_without_issues if there are weight-raised -+ * busy queues. In this case, and if bfqq is not weight-raised, -+ * this guarantees that the device is not idled for bfqq (if, -+ * instead, bfqq is weight-raised, then idling will be -+ * guaranteed by another variable, see below). Combined with -+ * the timestamping rules of BFQ (see [1] for details), this -+ * behavior causes bfqq, and hence any sync non-weight-raised -+ * queue, to get a lower number of requests served, and thus -+ * to ask for a lower number of requests from the request -+ * pool, before the busy weight-raised queues get served -+ * again. This often mitigates starvation problems in the -+ * presence of heavy write workloads and NCQ, thereby -+ * guaranteeing a higher application and system responsiveness -+ * in these hostile scenarios. -+ */ -+ idling_boosts_thr_without_issues = idling_boosts_thr && -+ bfqd->wr_busy_queues == 0; -+ -+ /* -+ * There are then two cases where idling must be performed not -+ * for throughput concerns, but to preserve service -+ * guarantees. 
In the description of these cases, we say, for -+ * short, that a queue is sequential/random if the process -+ * associated to the queue issues sequential/random requests -+ * (in the second case the queue may be tagged as seeky or -+ * even constantly_seeky). -+ * -+ * To introduce the first case, we note that, since -+ * bfq_bfqq_idle_window(bfqq) is false if the device is -+ * NCQ-capable and bfqq is random (see -+ * bfq_update_idle_window()), then, from the above two -+ * assignments it follows that -+ * idling_boosts_thr_without_issues is false if the device is -+ * NCQ-capable and bfqq is random. Therefore, for this case, -+ * device idling would never be allowed if we used just -+ * idling_boosts_thr_without_issues to decide whether to allow -+ * it. And, beneficially, this would imply that throughput -+ * would always be boosted also with random I/O on NCQ-capable -+ * HDDs. -+ * -+ * But we must be careful on this point, to avoid an unfair -+ * treatment for bfqq. In fact, because of the same above -+ * assignments, idling_boosts_thr_without_issues is, on the -+ * other hand, true if 1) the device is an HDD and bfqq is -+ * sequential, and 2) there are no busy weight-raised -+ * queues. As a consequence, if we used just -+ * idling_boosts_thr_without_issues to decide whether to idle -+ * the device, then with an HDD we might easily bump into a -+ * scenario where queues that are sequential and I/O-bound -+ * would enjoy idling, whereas random queues would not. The -+ * latter might then get a low share of the device throughput, -+ * simply because the former would get many requests served -+ * after being set as in service, while the latter would not. -+ * -+ * To address this issue, we start by setting to true a -+ * sentinel variable, on_hdd_and_not_all_queues_seeky, if the -+ * device is rotational and not all queues with pending or -+ * in-flight requests are constantly seeky (i.e., there are -+ * active sequential queues, and bfqq might then be mistreated -+ * if it does not enjoy idling because it is random). -+ */ -+ all_queues_seeky = bfq_bfqq_constantly_seeky(bfqq) && -+ bfqd->busy_in_flight_queues == -+ bfqd->const_seeky_busy_in_flight_queues; -+ -+ on_hdd_and_not_all_queues_seeky = -+ !blk_queue_nonrot(bfqd->queue) && !all_queues_seeky; -+ -+ /* -+ * To introduce the second case where idling needs to be -+ * performed to preserve service guarantees, we can note that -+ * allowing the drive to enqueue more than one request at a -+ * time, and hence delegating de facto final scheduling -+ * decisions to the drive's internal scheduler, causes loss of -+ * control on the actual request service order. In particular, -+ * the critical situation is when requests from different -+ * processes happens to be present, at the same time, in the -+ * internal queue(s) of the drive. In such a situation, the -+ * drive, by deciding the service order of the -+ * internally-queued requests, does determine also the actual -+ * throughput distribution among these processes. But the -+ * drive typically has no notion or concern about per-process -+ * throughput distribution, and makes its decisions only on a -+ * per-request basis. 
Therefore, the service distribution
-+ * enforced by the drive's internal scheduler is likely to
-+ * coincide with the desired device-throughput distribution
-+ * only in a completely symmetric scenario where:
-+ * (i) each of these processes must get the same throughput as
-+ * the others;
-+ * (ii) all these processes have the same I/O pattern
-+ * (either sequential or random).
-+ * In fact, in such a scenario, the drive will tend to treat
-+ * the requests of each of these processes in about the same
-+ * way as the requests of the others, and thus to provide
-+ * each of these processes with about the same throughput
-+ * (which is exactly the desired throughput distribution). In
-+ * contrast, in any asymmetric scenario, device idling is
-+ * certainly needed to guarantee that bfqq receives its
-+ * assigned fraction of the device throughput (see [1] for
-+ * details).
-+ *
-+ * We address this issue by actually controlling only the
-+ * symmetry sub-condition (i), i.e., provided that
-+ * sub-condition (i) holds, idling is not performed,
-+ * regardless of whether sub-condition (ii) holds. In other
-+ * words, only if sub-condition (i) holds is idling
-+ * allowed, and the device tends to be prevented from queueing
-+ * many requests, possibly of several processes. The reason
-+ * for not also controlling sub-condition (ii) is that, first,
-+ * in the case of an HDD, the asymmetry in terms of types of
-+ * I/O patterns is already taken into account in the above
-+ * sentinel variable
-+ * on_hdd_and_not_all_queues_seeky. Secondly, in the case of a
-+ * flash-based device, we prefer however to privilege
-+ * throughput (and idling lowers throughput for this type of
-+ * device), for the following reasons:
-+ * 1) differently from HDDs, the service time of random
-+ * requests is not orders of magnitude lower than the service
-+ * time of sequential requests; thus, even if processes doing
-+ * sequential I/O get a preferential treatment with respect to
-+ * others doing random I/O, the consequences are not as
-+ * dramatic as with HDDs;
-+ * 2) if a process doing random I/O does need strong
-+ * throughput guarantees, it is hopefully already being
-+ * weight-raised, or the user is likely to have assigned it a
-+ * higher weight than the other processes (and thus
-+ * sub-condition (i) is likely to be false, which triggers
-+ * idling).
-+ *
-+ * According to the above considerations, the next variable is
-+ * true (only) if sub-condition (i) holds. To compute the
-+ * value of this variable, we not only use the return value of
-+ * the function bfq_symmetric_scenario(), but also check
-+ * whether bfqq is being weight-raised, because
-+ * bfq_symmetric_scenario() does not also take into account
-+ * weight-raised queues (see comments to
-+ * bfq_weights_tree_add()).
-+ *
-+ * As a side note, it is worth considering that the above
-+ * device-idling countermeasures may however fail in the
-+ * following unlucky scenario: if idling is (correctly)
-+ * disabled in a time period during which all symmetry
-+ * sub-conditions hold, and hence the device is allowed to
-+ * enqueue many requests, but at some later point in time some
-+ * sub-condition ceases to hold, then it may become impossible
-+ * to let requests be served in the desired order until all
-+ * the requests already queued in the device have been served.
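After the discussion above, the decision taken by the code that follows is just a combination of a few booleans. A compact restatement (the inputs are assumed to have been computed as described):

#include <stdbool.h>
#include <stdio.h>

/* Final combination used to decide whether to idle for a queue that has
 * just become empty: idle only for sync queues, and only if idling either
 * boosts throughput without side effects or is needed for fairness. */
static bool may_idle(bool sync, bool idling_boosts_thr_without_issues,
                     bool on_hdd_and_not_all_queues_seeky,
                     bool asymmetric_scenario, bool in_large_burst)
{
        bool needed_for_guarantees =
                (on_hdd_and_not_all_queues_seeky || asymmetric_scenario) &&
                !in_large_burst;

        return sync && (idling_boosts_thr_without_issues ||
                        needed_for_guarantees);
}

int main(void)
{
        /* NCQ SSD, symmetric weights, no burst: don't idle, keep it busy. */
        printf("%d\n", may_idle(true, false, false, false, false));
        /* Same device, but one queue is weight-raised (asymmetric): idle. */
        printf("%d\n", may_idle(true, false, false, true, false));
        return 0;
}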
-+ */ -+ asymmetric_scenario = bfqq->wr_coeff > 1 || -+ !bfq_symmetric_scenario(bfqd); -+ -+ /* -+ * Finally, there is a case where maximizing throughput is the -+ * best choice even if it may cause unfairness toward -+ * bfqq. Such a case is when bfqq became active in a burst of -+ * queue activations. Queues that became active during a large -+ * burst benefit only from throughput, as discussed in the -+ * comments to bfq_handle_burst. Thus, if bfqq became active -+ * in a burst and not idling the device maximizes throughput, -+ * then the device must no be idled, because not idling the -+ * device provides bfqq and all other queues in the burst with -+ * maximum benefit. Combining this and the two cases above, we -+ * can now establish when idling is actually needed to -+ * preserve service guarantees. -+ */ -+ idling_needed_for_service_guarantees = -+ (on_hdd_and_not_all_queues_seeky || asymmetric_scenario) && -+ !bfq_bfqq_in_large_burst(bfqq); -+ -+ /* -+ * We have now all the components we need to compute the return -+ * value of the function, which is true only if both the following -+ * conditions hold: -+ * 1) bfqq is sync, because idling make sense only for sync queues; -+ * 2) idling either boosts the throughput (without issues), or -+ * is necessary to preserve service guarantees. -+ */ -+ return bfq_bfqq_sync(bfqq) && -+ (idling_boosts_thr_without_issues || -+ idling_needed_for_service_guarantees); -+} -+ -+/* -+ * If the in-service queue is empty but the function bfq_bfqq_may_idle -+ * returns true, then: -+ * 1) the queue must remain in service and cannot be expired, and -+ * 2) the device must be idled to wait for the possible arrival of a new -+ * request for the queue. -+ * See the comments to the function bfq_bfqq_may_idle for the reasons -+ * why performing device idling is the best choice to boost the throughput -+ * and preserve service guarantees when bfq_bfqq_may_idle itself -+ * returns true. -+ */ -+static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) -+{ -+ struct bfq_data *bfqd = bfqq->bfqd; -+ -+ return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 && -+ bfq_bfqq_may_idle(bfqq); -+} -+ -+/* -+ * Select a queue for service. If we have a current queue in service, -+ * check whether to continue servicing it, or retrieve and set a new one. -+ */ -+static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq; -+ struct request *next_rq; -+ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; -+ -+ bfqq = bfqd->in_service_queue; -+ if (!bfqq) -+ goto new_queue; -+ -+ bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); -+ -+ if (bfq_may_expire_for_budg_timeout(bfqq) && -+ !timer_pending(&bfqd->idle_slice_timer) && -+ !bfq_bfqq_must_idle(bfqq)) -+ goto expire; -+ -+ next_rq = bfqq->next_rq; -+ /* -+ * If bfqq has requests queued and it has enough budget left to -+ * serve them, keep the queue, otherwise expire it. -+ */ -+ if (next_rq) { -+ if (bfq_serv_to_charge(next_rq, bfqq) > -+ bfq_bfqq_budget_left(bfqq)) { -+ reason = BFQ_BFQQ_BUDGET_EXHAUSTED; -+ goto expire; -+ } else { -+ /* -+ * The idle timer may be pending because we may -+ * not disable disk idling even when a new request -+ * arrives. -+ */ -+ if (timer_pending(&bfqd->idle_slice_timer)) { -+ /* -+ * If we get here: 1) at least a new request -+ * has arrived but we have not disabled the -+ * timer because the request was too small, -+ * 2) then the block layer has unplugged -+ * the device, causing the dispatch to be -+ * invoked. 
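/*
 * Editor's aside, not part of the original patch: the three comment
 * blocks above condense into the boolean combination below. This is a
 * hedged, standalone restatement of the decision that bfq_bfqq_may_idle()
 * takes; the parameter names are descriptive stand-ins for the bfqq/bfqd
 * fields used in the surrounding code.
 */
#include <stdbool.h>
#include <stdio.h>

static bool may_idle_sketch(bool queue_is_sync,
			    bool idling_boosts_thr_without_issues,
			    bool on_hdd_and_not_all_queues_seeky,
			    bool queue_is_weight_raised,   /* wr_coeff > 1 */
			    bool scenario_is_symmetric,    /* bfq_symmetric_scenario() */
			    bool queue_in_large_burst)
{
	bool asymmetric_scenario = queue_is_weight_raised ||
				   !scenario_is_symmetric;
	bool idling_needed_for_service_guarantees =
		(on_hdd_and_not_all_queues_seeky || asymmetric_scenario) &&
		!queue_in_large_burst;

	/* Idle only for sync queues, and only if idling either boosts
	 * throughput without side effects or is needed for guarantees. */
	return queue_is_sync &&
	       (idling_boosts_thr_without_issues ||
		idling_needed_for_service_guarantees);
}

int main(void)
{
	/* Sync queue, idling neither boosts throughput nor is needed for
	 * service guarantees: the device is left free to keep queueing. */
	printf("may idle: %d\n",
	       may_idle_sketch(true, false, false, false, true, false));
	return 0;
}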
-+ * -+ * Since the device is unplugged, now the -+ * requests are probably large enough to -+ * provide a reasonable throughput. -+ * So we disable idling. -+ */ -+ bfq_clear_bfqq_wait_request(bfqq); -+ del_timer(&bfqd->idle_slice_timer); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ bfqg_stats_update_idle_time(bfqq_group(bfqq)); -+#endif -+ } -+ goto keep_queue; -+ } -+ } -+ -+ /* -+ * No requests pending. However, if the in-service queue is idling -+ * for a new request, or has requests waiting for a completion and -+ * may idle after their completion, then keep it anyway. -+ */ -+ if (timer_pending(&bfqd->idle_slice_timer) || -+ (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) { -+ bfqq = NULL; -+ goto keep_queue; -+ } -+ -+ reason = BFQ_BFQQ_NO_MORE_REQUESTS; -+expire: -+ bfq_bfqq_expire(bfqd, bfqq, false, reason); -+new_queue: -+ bfqq = bfq_set_in_service_queue(bfqd); -+ bfq_log(bfqd, "select_queue: new queue %d returned", -+ bfqq ? bfqq->pid : 0); -+keep_queue: -+ return bfqq; -+} -+ -+static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */ -+ bfq_log_bfqq(bfqd, bfqq, -+ "raising period dur %u/%u msec, old coeff %u, w %d(%d)", -+ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), -+ jiffies_to_msecs(bfqq->wr_cur_max_time), -+ bfqq->wr_coeff, -+ bfqq->entity.weight, bfqq->entity.orig_weight); -+ -+ BUG_ON(bfqq != bfqd->in_service_queue && entity->weight != -+ entity->orig_weight * bfqq->wr_coeff); -+ if (entity->prio_changed) -+ bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change"); -+ -+ /* -+ * If the queue was activated in a burst, or -+ * too much time has elapsed from the beginning -+ * of this weight-raising period, then end weight -+ * raising. -+ */ -+ if (bfq_bfqq_in_large_burst(bfqq) || -+ time_is_before_jiffies(bfqq->last_wr_start_finish + -+ bfqq->wr_cur_max_time)) { -+ bfqq->last_wr_start_finish = jiffies; -+ bfq_log_bfqq(bfqd, bfqq, -+ "wrais ending at %lu, rais_max_time %u", -+ bfqq->last_wr_start_finish, -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ bfq_bfqq_end_wr(bfqq); -+ } -+ } -+ /* Update weight both if it must be raised and if it must be lowered */ -+ if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1)) -+ __bfq_entity_update_weight_prio( -+ bfq_entity_service_tree(entity), -+ entity); -+} -+ -+/* -+ * Dispatch one request from bfqq, moving it to the request queue -+ * dispatch list. -+ */ -+static int bfq_dispatch_request(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ int dispatched = 0; -+ struct request *rq; -+ unsigned long service_to_charge; -+ -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ -+ /* Follow expired path, else get first next available. */ -+ rq = bfq_check_fifo(bfqq); -+ if (!rq) -+ rq = bfqq->next_rq; -+ service_to_charge = bfq_serv_to_charge(rq, bfqq); -+ -+ if (service_to_charge > bfq_bfqq_budget_left(bfqq)) { -+ /* -+ * This may happen if the next rq is chosen in fifo order -+ * instead of sector order. The budget is properly -+ * dimensioned to be always sufficient to serve the next -+ * request only if it is chosen in sector order. The reason -+ * is that it would be quite inefficient and little useful -+ * to always make sure that the budget is large enough to -+ * serve even the possible next rq in fifo order. -+ * In fact, requests are seldom served in fifo order. 
-+ * -+ * Expire the queue for budget exhaustion, and make sure -+ * that the next act_budget is enough to serve the next -+ * request, even if it comes from the fifo expired path. -+ */ -+ bfqq->next_rq = rq; -+ /* -+ * Since this dispatch is failed, make sure that -+ * a new one will be performed -+ */ -+ if (!bfqd->rq_in_driver) -+ bfq_schedule_dispatch(bfqd); -+ goto expire; -+ } -+ -+ /* Finally, insert request into driver dispatch list. */ -+ bfq_bfqq_served(bfqq, service_to_charge); -+ bfq_dispatch_insert(bfqd->queue, rq); -+ -+ bfq_update_wr_data(bfqd, bfqq); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "dispatched %u sec req (%llu), budg left %d", -+ blk_rq_sectors(rq), -+ (unsigned long long) blk_rq_pos(rq), -+ bfq_bfqq_budget_left(bfqq)); -+ -+ dispatched++; -+ -+ if (!bfqd->in_service_bic) { -+ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); -+ bfqd->in_service_bic = RQ_BIC(rq); -+ } -+ -+ if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) && -+ dispatched >= bfqd->bfq_max_budget_async_rq) || -+ bfq_class_idle(bfqq))) -+ goto expire; -+ -+ return dispatched; -+ -+expire: -+ bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_EXHAUSTED); -+ return dispatched; -+} -+ -+static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq) -+{ -+ int dispatched = 0; -+ -+ while (bfqq->next_rq) { -+ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq); -+ dispatched++; -+ } -+ -+ BUG_ON(!list_empty(&bfqq->fifo)); -+ return dispatched; -+} -+ -+/* -+ * Drain our current requests. -+ * Used for barriers and when switching io schedulers on-the-fly. -+ */ -+static int bfq_forced_dispatch(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq, *n; -+ struct bfq_service_tree *st; -+ int dispatched = 0; -+ -+ bfqq = bfqd->in_service_queue; -+ if (bfqq) -+ __bfq_bfqq_expire(bfqd, bfqq); -+ -+ /* -+ * Loop through classes, and be careful to leave the scheduler -+ * in a consistent state, as feedback mechanisms and vtime -+ * updates cannot be disabled during the process. -+ */ -+ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) { -+ st = bfq_entity_service_tree(&bfqq->entity); -+ -+ dispatched += __bfq_forced_dispatch_bfqq(bfqq); -+ bfqq->max_budget = bfq_max_budget(bfqd); -+ -+ bfq_forget_idle(st); -+ } -+ -+ BUG_ON(bfqd->busy_queues != 0); -+ -+ return dispatched; -+} -+ -+static int bfq_dispatch_requests(struct request_queue *q, int force) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct bfq_queue *bfqq; -+ int max_dispatch; -+ -+ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); -+ if (bfqd->busy_queues == 0) -+ return 0; -+ -+ if (unlikely(force)) -+ return bfq_forced_dispatch(bfqd); -+ -+ bfqq = bfq_select_queue(bfqd); -+ if (!bfqq) -+ return 0; -+ -+ if (bfq_class_idle(bfqq)) -+ max_dispatch = 1; -+ -+ if (!bfq_bfqq_sync(bfqq)) -+ max_dispatch = bfqd->bfq_max_budget_async_rq; -+ -+ if (!bfq_bfqq_sync(bfqq) && bfqq->dispatched >= max_dispatch) { -+ if (bfqd->busy_queues > 1) -+ return 0; -+ if (bfqq->dispatched >= 4 * max_dispatch) -+ return 0; -+ } -+ -+ if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq)) -+ return 0; -+ -+ bfq_clear_bfqq_wait_request(bfqq); -+ BUG_ON(timer_pending(&bfqd->idle_slice_timer)); -+ -+ if (!bfq_dispatch_request(bfqd, bfqq)) -+ return 0; -+ -+ bfq_log_bfqq(bfqd, bfqq, "dispatched %s request", -+ bfq_bfqq_sync(bfqq) ? "sync" : "async"); -+ -+ return 1; -+} -+ -+/* -+ * Task holds one reference to the queue, dropped when task exits. Each rq -+ * in-flight on this queue also holds a reference, dropped when rq is freed. 
-+ * -+ * Queue lock must be held here. -+ */ -+static void bfq_put_queue(struct bfq_queue *bfqq) -+{ -+ struct bfq_data *bfqd = bfqq->bfqd; -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ struct bfq_group *bfqg = bfqq_group(bfqq); -+#endif -+ -+ BUG_ON(atomic_read(&bfqq->ref) <= 0); -+ -+ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq, -+ atomic_read(&bfqq->ref)); -+ if (!atomic_dec_and_test(&bfqq->ref)) -+ return; -+ -+ BUG_ON(rb_first(&bfqq->sort_list)); -+ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); -+ BUG_ON(bfqq->entity.tree); -+ BUG_ON(bfq_bfqq_busy(bfqq)); -+ BUG_ON(bfqd->in_service_queue == bfqq); -+ -+ if (bfq_bfqq_sync(bfqq)) -+ /* -+ * The fact that this queue is being destroyed does not -+ * invalidate the fact that this queue may have been -+ * activated during the current burst. As a consequence, -+ * although the queue does not exist anymore, and hence -+ * needs to be removed from the burst list if there, -+ * the burst size has not to be decremented. -+ */ -+ hlist_del_init(&bfqq->burst_list_node); -+ -+ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq); -+ -+ kmem_cache_free(bfq_pool, bfqq); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ bfqg_put(bfqg); -+#endif -+} -+ -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ if (bfqq == bfqd->in_service_queue) { -+ __bfq_bfqq_expire(bfqd, bfqq); -+ bfq_schedule_dispatch(bfqd); -+ } -+ -+ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, -+ atomic_read(&bfqq->ref)); -+ -+ bfq_put_queue(bfqq); -+} -+ -+static void bfq_init_icq(struct io_cq *icq) -+{ -+ struct bfq_io_cq *bic = icq_to_bic(icq); -+ -+ bic->ttime.last_end_request = jiffies; -+} -+ -+static void bfq_exit_icq(struct io_cq *icq) -+{ -+ struct bfq_io_cq *bic = icq_to_bic(icq); -+ struct bfq_data *bfqd = bic_to_bfqd(bic); -+ -+ if (bic->bfqq[BLK_RW_ASYNC]) { -+ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]); -+ bic->bfqq[BLK_RW_ASYNC] = NULL; -+ } -+ -+ if (bic->bfqq[BLK_RW_SYNC]) { -+ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]); -+ bic->bfqq[BLK_RW_SYNC] = NULL; -+ } -+} -+ -+/* -+ * Update the entity prio values; note that the new values will not -+ * be used until the next (re)activation. -+ */ -+static void -+bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) -+{ -+ struct task_struct *tsk = current; -+ int ioprio_class; -+ -+ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); -+ switch (ioprio_class) { -+ default: -+ dev_err(bfqq->bfqd->queue->backing_dev_info.dev, -+ "bfq: bad prio class %d\n", ioprio_class); -+ case IOPRIO_CLASS_NONE: -+ /* -+ * No prio set, inherit CPU scheduling settings. 
-+ */ -+ bfqq->new_ioprio = task_nice_ioprio(tsk); -+ bfqq->new_ioprio_class = task_nice_ioclass(tsk); -+ break; -+ case IOPRIO_CLASS_RT: -+ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); -+ bfqq->new_ioprio_class = IOPRIO_CLASS_RT; -+ break; -+ case IOPRIO_CLASS_BE: -+ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); -+ bfqq->new_ioprio_class = IOPRIO_CLASS_BE; -+ break; -+ case IOPRIO_CLASS_IDLE: -+ bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE; -+ bfqq->new_ioprio = 7; -+ bfq_clear_bfqq_idle_window(bfqq); -+ break; -+ } -+ -+ if (bfqq->new_ioprio < 0 || bfqq->new_ioprio >= IOPRIO_BE_NR) { -+ pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n", -+ bfqq->new_ioprio); -+ BUG(); -+ } -+ -+ bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); -+ bfqq->entity.prio_changed = 1; -+} -+ -+static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) -+{ -+ struct bfq_data *bfqd; -+ struct bfq_queue *bfqq, *new_bfqq; -+ unsigned long uninitialized_var(flags); -+ int ioprio = bic->icq.ioc->ioprio; -+ -+ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), -+ &flags); -+ /* -+ * This condition may trigger on a newly created bic, be sure to -+ * drop the lock before returning. -+ */ -+ if (unlikely(!bfqd) || likely(bic->ioprio == ioprio)) -+ goto out; -+ -+ bic->ioprio = ioprio; -+ -+ bfqq = bic->bfqq[BLK_RW_ASYNC]; -+ if (bfqq) { -+ new_bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic, -+ GFP_ATOMIC); -+ if (new_bfqq) { -+ bic->bfqq[BLK_RW_ASYNC] = new_bfqq; -+ bfq_log_bfqq(bfqd, bfqq, -+ "check_ioprio_change: bfqq %p %d", -+ bfqq, atomic_read(&bfqq->ref)); -+ bfq_put_queue(bfqq); -+ } -+ } -+ -+ bfqq = bic->bfqq[BLK_RW_SYNC]; -+ if (bfqq) -+ bfq_set_next_ioprio_data(bfqq, bic); -+ -+out: -+ bfq_put_bfqd_unlock(bfqd, &flags); -+} -+ -+static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct bfq_io_cq *bic, pid_t pid, int is_sync) -+{ -+ RB_CLEAR_NODE(&bfqq->entity.rb_node); -+ INIT_LIST_HEAD(&bfqq->fifo); -+ INIT_HLIST_NODE(&bfqq->burst_list_node); -+ -+ atomic_set(&bfqq->ref, 0); -+ bfqq->bfqd = bfqd; -+ -+ if (bic) -+ bfq_set_next_ioprio_data(bfqq, bic); -+ -+ if (is_sync) { -+ if (!bfq_class_idle(bfqq)) -+ bfq_mark_bfqq_idle_window(bfqq); -+ bfq_mark_bfqq_sync(bfqq); -+ } else -+ bfq_clear_bfqq_sync(bfqq); -+ bfq_mark_bfqq_IO_bound(bfqq); -+ -+ /* Tentative initial value to trade off between thr and lat */ -+ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3; -+ bfqq->pid = pid; -+ -+ bfqq->wr_coeff = 1; -+ bfqq->last_wr_start_finish = 0; -+ /* -+ * Set to the value for which bfqq will not be deemed as -+ * soft rt when it becomes backlogged. -+ */ -+ bfqq->soft_rt_next_start = bfq_infinity_from_now(jiffies); -+} -+ -+static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd, -+ struct bio *bio, int is_sync, -+ struct bfq_io_cq *bic, -+ gfp_t gfp_mask) -+{ -+ struct bfq_group *bfqg; -+ struct bfq_queue *bfqq, *new_bfqq = NULL; -+ struct blkcg *blkcg; -+ -+retry: -+ rcu_read_lock(); -+ -+ blkcg = bio_blkcg(bio); -+ bfqg = bfq_find_alloc_group(bfqd, blkcg); -+ /* bic always exists here */ -+ bfqq = bic_to_bfqq(bic, is_sync); -+ -+ /* -+ * Always try a new alloc if we fall back to the OOM bfqq -+ * originally, since it should just be a temporary situation. 
-+ */ -+ if (!bfqq || bfqq == &bfqd->oom_bfqq) { -+ bfqq = NULL; -+ if (new_bfqq) { -+ bfqq = new_bfqq; -+ new_bfqq = NULL; -+ } else if (gfpflags_allow_blocking(gfp_mask)) { -+ rcu_read_unlock(); -+ spin_unlock_irq(bfqd->queue->queue_lock); -+ new_bfqq = kmem_cache_alloc_node(bfq_pool, -+ gfp_mask | __GFP_ZERO, -+ bfqd->queue->node); -+ spin_lock_irq(bfqd->queue->queue_lock); -+ if (new_bfqq) -+ goto retry; -+ } else { -+ bfqq = kmem_cache_alloc_node(bfq_pool, -+ gfp_mask | __GFP_ZERO, -+ bfqd->queue->node); -+ } -+ -+ if (bfqq) { -+ bfq_init_bfqq(bfqd, bfqq, bic, current->pid, -+ is_sync); -+ bfq_init_entity(&bfqq->entity, bfqg); -+ bfq_log_bfqq(bfqd, bfqq, "allocated"); -+ } else { -+ bfqq = &bfqd->oom_bfqq; -+ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); -+ } -+ } -+ -+ if (new_bfqq) -+ kmem_cache_free(bfq_pool, new_bfqq); -+ -+ rcu_read_unlock(); -+ -+ return bfqq; -+} -+ -+static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, -+ struct bfq_group *bfqg, -+ int ioprio_class, int ioprio) -+{ -+ switch (ioprio_class) { -+ case IOPRIO_CLASS_RT: -+ return &bfqg->async_bfqq[0][ioprio]; -+ case IOPRIO_CLASS_NONE: -+ ioprio = IOPRIO_NORM; -+ /* fall through */ -+ case IOPRIO_CLASS_BE: -+ return &bfqg->async_bfqq[1][ioprio]; -+ case IOPRIO_CLASS_IDLE: -+ return &bfqg->async_idle_bfqq; -+ default: -+ BUG(); -+ } -+} -+ -+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, -+ struct bio *bio, int is_sync, -+ struct bfq_io_cq *bic, gfp_t gfp_mask) -+{ -+ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); -+ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); -+ struct bfq_queue **async_bfqq = NULL; -+ struct bfq_queue *bfqq = NULL; -+ -+ if (!is_sync) { -+ struct blkcg *blkcg; -+ struct bfq_group *bfqg; -+ -+ rcu_read_lock(); -+ blkcg = bio_blkcg(bio); -+ rcu_read_unlock(); -+ bfqg = bfq_find_alloc_group(bfqd, blkcg); -+ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, -+ ioprio); -+ bfqq = *async_bfqq; -+ } -+ -+ if (!bfqq) -+ bfqq = bfq_find_alloc_queue(bfqd, bio, is_sync, bic, gfp_mask); -+ -+ /* -+ * Pin the queue now that it's allocated, scheduler exit will -+ * prune it. -+ */ -+ if (!is_sync && !(*async_bfqq)) { -+ atomic_inc(&bfqq->ref); -+ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", -+ bfqq, atomic_read(&bfqq->ref)); -+ *async_bfqq = bfqq; -+ } -+ -+ atomic_inc(&bfqq->ref); -+ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, -+ atomic_read(&bfqq->ref)); -+ return bfqq; -+} -+ -+static void bfq_update_io_thinktime(struct bfq_data *bfqd, -+ struct bfq_io_cq *bic) -+{ -+ unsigned long elapsed = jiffies - bic->ttime.last_end_request; -+ unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle); -+ -+ bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; -+ bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8; -+ bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) / -+ bic->ttime.ttime_samples; -+} -+ -+static void bfq_update_io_seektime(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ struct request *rq) -+{ -+ sector_t sdist; -+ u64 total; -+ -+ if (bfqq->last_request_pos < blk_rq_pos(rq)) -+ sdist = blk_rq_pos(rq) - bfqq->last_request_pos; -+ else -+ sdist = bfqq->last_request_pos - blk_rq_pos(rq); -+ -+ /* -+ * Don't allow the seek distance to get too large from the -+ * odd fragment, pagein, etc. 
-+ */ -+ if (bfqq->seek_samples == 0) /* first request, not really a seek */ -+ sdist = 0; -+ else if (bfqq->seek_samples <= 60) /* second & third seek */ -+ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024); -+ else -+ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64); -+ -+ bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8; -+ bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8; -+ total = bfqq->seek_total + (bfqq->seek_samples/2); -+ do_div(total, bfqq->seek_samples); -+ bfqq->seek_mean = (sector_t)total; -+ -+ bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist, -+ (u64)bfqq->seek_mean); -+} -+ -+/* -+ * Disable idle window if the process thinks too long or seeks so much that -+ * it doesn't matter. -+ */ -+static void bfq_update_idle_window(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ struct bfq_io_cq *bic) -+{ -+ int enable_idle; -+ -+ /* Don't idle for async or idle io prio class. */ -+ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) -+ return; -+ -+ enable_idle = bfq_bfqq_idle_window(bfqq); -+ -+ if (atomic_read(&bic->icq.ioc->active_ref) == 0 || -+ bfqd->bfq_slice_idle == 0 || -+ (bfqd->hw_tag && BFQQ_SEEKY(bfqq) && -+ bfqq->wr_coeff == 1)) -+ enable_idle = 0; -+ else if (bfq_sample_valid(bic->ttime.ttime_samples)) { -+ if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle && -+ bfqq->wr_coeff == 1) -+ enable_idle = 0; -+ else -+ enable_idle = 1; -+ } -+ bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d", -+ enable_idle); -+ -+ if (enable_idle) -+ bfq_mark_bfqq_idle_window(bfqq); -+ else -+ bfq_clear_bfqq_idle_window(bfqq); -+} -+ -+/* -+ * Called when a new fs request (rq) is added to bfqq. Check if there's -+ * something we should do about it. -+ */ -+static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct request *rq) -+{ -+ struct bfq_io_cq *bic = RQ_BIC(rq); -+ -+ if (rq->cmd_flags & REQ_META) -+ bfqq->meta_pending++; -+ -+ bfq_update_io_thinktime(bfqd, bic); -+ bfq_update_io_seektime(bfqd, bfqq, rq); -+ if (!BFQQ_SEEKY(bfqq) && bfq_bfqq_constantly_seeky(bfqq)) { -+ bfq_clear_bfqq_constantly_seeky(bfqq); -+ if (!blk_queue_nonrot(bfqd->queue)) { -+ BUG_ON(!bfqd->const_seeky_busy_in_flight_queues); -+ bfqd->const_seeky_busy_in_flight_queues--; -+ } -+ } -+ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || -+ !BFQQ_SEEKY(bfqq)) -+ bfq_update_idle_window(bfqd, bfqq, bic); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "rq_enqueued: idle_window=%d (seeky %d, mean %llu)", -+ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq), -+ (unsigned long long) bfqq->seek_mean); -+ -+ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); -+ -+ if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) { -+ bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 && -+ blk_rq_sectors(rq) < 32; -+ bool budget_timeout = bfq_bfqq_budget_timeout(bfqq); -+ -+ /* -+ * There is just this request queued: if the request -+ * is small and the queue is not to be expired, then -+ * just exit. -+ * -+ * In this way, if the disk is being idled to wait for -+ * a new request from the in-service queue, we avoid -+ * unplugging the device and committing the disk to serve -+ * just a small request. On the contrary, we wait for -+ * the block layer to decide when to unplug the device: -+ * hopefully, new requests will be merged to this one -+ * quickly, then the device will be unplugged and -+ * larger requests will be dispatched. 
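/*
 * Editor's aside, not part of the original patch. bfq_update_io_thinktime()
 * and bfq_update_io_seektime() above share the same integer low-pass
 * filter: on every update the sample counter and the running total keep
 * 7/8 of their old value, with everything scaled by 256 so the math stays
 * integral (the counter converges to the fixed point 256), and the mean is
 * total/samples with rounding. A minimal userspace sketch of that filter,
 * following the seek-mean variant and omitting the input clamping done by
 * the real code:
 */
#include <stdio.h>

struct fp_ewma {
	unsigned long samples;		/* converges toward 256 */
	unsigned long long total;	/* decayed sum, scaled by 256 */
	unsigned long long mean;
};

static void fp_ewma_update(struct fp_ewma *e, unsigned long long sample)
{
	e->samples = (7 * e->samples + 256) / 8;
	e->total = (7 * e->total + 256 * sample) / 8;
	e->mean = (e->total + e->samples / 2) / e->samples;
}

int main(void)
{
	unsigned long long in[] = { 100, 100, 100, 100, 2000, 100, 100, 100 };
	struct fp_ewma e = { 0, 0, 0 };

	for (size_t i = 0; i < sizeof(in) / sizeof(in[0]); i++) {
		fp_ewma_update(&e, in[i]);
		/* the mean jumps on the 2000 outlier, then decays back */
		printf("sample %4llu -> mean %llu\n", in[i], e.mean);
	}
	return 0;
}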
-+ */ -+ if (small_req && !budget_timeout) -+ return; -+ -+ /* -+ * A large enough request arrived, or the queue is to -+ * be expired: in both cases disk idling is to be -+ * stopped, so clear wait_request flag and reset -+ * timer. -+ */ -+ bfq_clear_bfqq_wait_request(bfqq); -+ del_timer(&bfqd->idle_slice_timer); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ bfqg_stats_update_idle_time(bfqq_group(bfqq)); -+#endif -+ -+ /* -+ * The queue is not empty, because a new request just -+ * arrived. Hence we can safely expire the queue, in -+ * case of budget timeout, without risking that the -+ * timestamps of the queue are not updated correctly. -+ * See [1] for more details. -+ */ -+ if (budget_timeout) -+ bfq_bfqq_expire(bfqd, bfqq, false, -+ BFQ_BFQQ_BUDGET_TIMEOUT); -+ -+ /* -+ * Let the request rip immediately, or let a new queue be -+ * selected if bfqq has just been expired. -+ */ -+ __blk_run_queue(bfqd->queue); -+ } -+} -+ -+static void bfq_insert_request(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ -+ assert_spin_locked(bfqd->queue->queue_lock); -+ -+ bfq_add_request(rq); -+ -+ rq->fifo_time = jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; -+ list_add_tail(&rq->queuelist, &bfqq->fifo); -+ -+ bfq_rq_enqueued(bfqd, bfqq, rq); -+} -+ -+static void bfq_update_hw_tag(struct bfq_data *bfqd) -+{ -+ bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver, -+ bfqd->rq_in_driver); -+ -+ if (bfqd->hw_tag == 1) -+ return; -+ -+ /* -+ * This sample is valid if the number of outstanding requests -+ * is large enough to allow a queueing behavior. Note that the -+ * sum is not exact, as it's not taking into account deactivated -+ * requests. -+ */ -+ if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD) -+ return; -+ -+ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) -+ return; -+ -+ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD; -+ bfqd->max_rq_in_driver = 0; -+ bfqd->hw_tag_samples = 0; -+} -+ -+static void bfq_completed_request(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ struct bfq_data *bfqd = bfqq->bfqd; -+ bool sync = bfq_bfqq_sync(bfqq); -+ -+ bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left (%d)", -+ blk_rq_sectors(rq), sync); -+ -+ bfq_update_hw_tag(bfqd); -+ -+ BUG_ON(!bfqd->rq_in_driver); -+ BUG_ON(!bfqq->dispatched); -+ bfqd->rq_in_driver--; -+ bfqq->dispatched--; -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ bfqg_stats_update_completion(bfqq_group(bfqq), -+ rq_start_time_ns(rq), -+ rq_io_start_time_ns(rq), rq->cmd_flags); -+#endif -+ -+ if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { -+ bfq_weights_tree_remove(bfqd, &bfqq->entity, -+ &bfqd->queue_weights_tree); -+ if (!blk_queue_nonrot(bfqd->queue)) { -+ BUG_ON(!bfqd->busy_in_flight_queues); -+ bfqd->busy_in_flight_queues--; -+ if (bfq_bfqq_constantly_seeky(bfqq)) { -+ BUG_ON(!bfqd-> -+ const_seeky_busy_in_flight_queues); -+ bfqd->const_seeky_busy_in_flight_queues--; -+ } -+ } -+ } -+ -+ if (sync) { -+ bfqd->sync_flight--; -+ RQ_BIC(rq)->ttime.last_end_request = jiffies; -+ } -+ -+ /* -+ * If we are waiting to discover whether the request pattern of the -+ * task associated with the queue is actually isochronous, and -+ * both requisites for this condition to hold are satisfied, then -+ * compute soft_rt_next_start (see the comments to the function -+ * bfq_bfqq_softrt_next_start()). 
-+ */ -+ if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 && -+ RB_EMPTY_ROOT(&bfqq->sort_list)) -+ bfqq->soft_rt_next_start = -+ bfq_bfqq_softrt_next_start(bfqd, bfqq); -+ -+ /* -+ * If this is the in-service queue, check if it needs to be expired, -+ * or if we want to idle in case it has no pending requests. -+ */ -+ if (bfqd->in_service_queue == bfqq) { -+ if (bfq_bfqq_budget_new(bfqq)) -+ bfq_set_budget_timeout(bfqd); -+ -+ if (bfq_bfqq_must_idle(bfqq)) { -+ bfq_arm_slice_timer(bfqd); -+ goto out; -+ } else if (bfq_may_expire_for_budg_timeout(bfqq)) -+ bfq_bfqq_expire(bfqd, bfqq, false, -+ BFQ_BFQQ_BUDGET_TIMEOUT); -+ else if (RB_EMPTY_ROOT(&bfqq->sort_list) && -+ (bfqq->dispatched == 0 || -+ !bfq_bfqq_may_idle(bfqq))) -+ bfq_bfqq_expire(bfqd, bfqq, false, -+ BFQ_BFQQ_NO_MORE_REQUESTS); -+ } -+ -+ if (!bfqd->rq_in_driver) -+ bfq_schedule_dispatch(bfqd); -+ -+out: -+ return; -+} -+ -+static int __bfq_may_queue(struct bfq_queue *bfqq) -+{ -+ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) { -+ bfq_clear_bfqq_must_alloc(bfqq); -+ return ELV_MQUEUE_MUST; -+ } -+ -+ return ELV_MQUEUE_MAY; -+} -+ -+static int bfq_may_queue(struct request_queue *q, int rw) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct task_struct *tsk = current; -+ struct bfq_io_cq *bic; -+ struct bfq_queue *bfqq; -+ -+ /* -+ * Don't force setup of a queue from here, as a call to may_queue -+ * does not necessarily imply that a request actually will be -+ * queued. So just lookup a possibly existing queue, or return -+ * 'may queue' if that fails. -+ */ -+ bic = bfq_bic_lookup(bfqd, tsk->io_context); -+ if (!bic) -+ return ELV_MQUEUE_MAY; -+ -+ bfqq = bic_to_bfqq(bic, rw_is_sync(rw)); -+ if (bfqq) -+ return __bfq_may_queue(bfqq); -+ -+ return ELV_MQUEUE_MAY; -+} -+ -+/* -+ * Queue lock held here. -+ */ -+static void bfq_put_request(struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ -+ if (bfqq) { -+ const int rw = rq_data_dir(rq); -+ -+ BUG_ON(!bfqq->allocated[rw]); -+ bfqq->allocated[rw]--; -+ -+ rq->elv.priv[0] = NULL; -+ rq->elv.priv[1] = NULL; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", -+ bfqq, atomic_read(&bfqq->ref)); -+ bfq_put_queue(bfqq); -+ } -+} -+ -+/* -+ * Allocate bfq data structures associated with this request. 
-+ */ -+static int bfq_set_request(struct request_queue *q, struct request *rq, -+ struct bio *bio, gfp_t gfp_mask) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); -+ const int rw = rq_data_dir(rq); -+ const int is_sync = rq_is_sync(rq); -+ struct bfq_queue *bfqq; -+ unsigned long flags; -+ -+ might_sleep_if(gfpflags_allow_blocking(gfp_mask)); -+ -+ bfq_check_ioprio_change(bic, bio); -+ -+ spin_lock_irqsave(q->queue_lock, flags); -+ -+ if (!bic) -+ goto queue_fail; -+ -+ bfq_bic_update_cgroup(bic, bio); -+ -+ bfqq = bic_to_bfqq(bic, is_sync); -+ if (!bfqq || bfqq == &bfqd->oom_bfqq) { -+ bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, gfp_mask); -+ bic_set_bfqq(bic, bfqq, is_sync); -+ if (is_sync) { -+ if (bfqd->large_burst) -+ bfq_mark_bfqq_in_large_burst(bfqq); -+ else -+ bfq_clear_bfqq_in_large_burst(bfqq); -+ } -+ } -+ -+ bfqq->allocated[rw]++; -+ atomic_inc(&bfqq->ref); -+ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, -+ atomic_read(&bfqq->ref)); -+ -+ rq->elv.priv[0] = bic; -+ rq->elv.priv[1] = bfqq; -+ -+ spin_unlock_irqrestore(q->queue_lock, flags); -+ -+ return 0; -+ -+queue_fail: -+ bfq_schedule_dispatch(bfqd); -+ spin_unlock_irqrestore(q->queue_lock, flags); -+ -+ return 1; -+} -+ -+static void bfq_kick_queue(struct work_struct *work) -+{ -+ struct bfq_data *bfqd = -+ container_of(work, struct bfq_data, unplug_work); -+ struct request_queue *q = bfqd->queue; -+ -+ spin_lock_irq(q->queue_lock); -+ __blk_run_queue(q); -+ spin_unlock_irq(q->queue_lock); -+} -+ -+/* -+ * Handler of the expiration of the timer running if the in-service queue -+ * is idling inside its time slice. -+ */ -+static void bfq_idle_slice_timer(unsigned long data) -+{ -+ struct bfq_data *bfqd = (struct bfq_data *)data; -+ struct bfq_queue *bfqq; -+ unsigned long flags; -+ enum bfqq_expiration reason; -+ -+ spin_lock_irqsave(bfqd->queue->queue_lock, flags); -+ -+ bfqq = bfqd->in_service_queue; -+ /* -+ * Theoretical race here: the in-service queue can be NULL or -+ * different from the queue that was idling if the timer handler -+ * spins on the queue_lock and a new request arrives for the -+ * current queue and there is a full dispatch cycle that changes -+ * the in-service queue. This can hardly happen, but in the worst -+ * case we just expire a queue too early. -+ */ -+ if (bfqq) { -+ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); -+ if (bfq_bfqq_budget_timeout(bfqq)) -+ /* -+ * Also here the queue can be safely expired -+ * for budget timeout without wasting -+ * guarantees -+ */ -+ reason = BFQ_BFQQ_BUDGET_TIMEOUT; -+ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) -+ /* -+ * The queue may not be empty upon timer expiration, -+ * because we may not disable the timer when the -+ * first request of the in-service queue arrives -+ * during disk idling. 
-+ */ -+ reason = BFQ_BFQQ_TOO_IDLE; -+ else -+ goto schedule_dispatch; -+ -+ bfq_bfqq_expire(bfqd, bfqq, true, reason); -+ } -+ -+schedule_dispatch: -+ bfq_schedule_dispatch(bfqd); -+ -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); -+} -+ -+static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) -+{ -+ del_timer_sync(&bfqd->idle_slice_timer); -+ cancel_work_sync(&bfqd->unplug_work); -+} -+ -+static void __bfq_put_async_bfqq(struct bfq_data *bfqd, -+ struct bfq_queue **bfqq_ptr) -+{ -+ struct bfq_group *root_group = bfqd->root_group; -+ struct bfq_queue *bfqq = *bfqq_ptr; -+ -+ bfq_log(bfqd, "put_async_bfqq: %p", bfqq); -+ if (bfqq) { -+ bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group); -+ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", -+ bfqq, atomic_read(&bfqq->ref)); -+ bfq_put_queue(bfqq); -+ *bfqq_ptr = NULL; -+ } -+} -+ -+/* -+ * Release all the bfqg references to its async queues. If we are -+ * deallocating the group these queues may still contain requests, so -+ * we reparent them to the root cgroup (i.e., the only one that will -+ * exist for sure until all the requests on a device are gone). -+ */ -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) -+{ -+ int i, j; -+ -+ for (i = 0; i < 2; i++) -+ for (j = 0; j < IOPRIO_BE_NR; j++) -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); -+ -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); -+} -+ -+static void bfq_exit_queue(struct elevator_queue *e) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ struct request_queue *q = bfqd->queue; -+ struct bfq_queue *bfqq, *n; -+ -+ bfq_shutdown_timer_wq(bfqd); -+ -+ spin_lock_irq(q->queue_lock); -+ -+ BUG_ON(bfqd->in_service_queue); -+ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) -+ bfq_deactivate_bfqq(bfqd, bfqq, 0); -+ -+ spin_unlock_irq(q->queue_lock); -+ -+ bfq_shutdown_timer_wq(bfqd); -+ -+ synchronize_rcu(); -+ -+ BUG_ON(timer_pending(&bfqd->idle_slice_timer)); -+ -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ blkcg_deactivate_policy(q, &blkcg_policy_bfq); -+#else -+ kfree(bfqd->root_group); -+#endif -+ -+ kfree(bfqd); -+} -+ -+static void bfq_init_root_group(struct bfq_group *root_group, -+ struct bfq_data *bfqd) -+{ -+ int i; -+ -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ root_group->entity.parent = NULL; -+ root_group->my_entity = NULL; -+ root_group->bfqd = bfqd; -+#endif -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) -+ root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; -+} -+ -+static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) -+{ -+ struct bfq_data *bfqd; -+ struct elevator_queue *eq; -+ -+ eq = elevator_alloc(q, e); -+ if (!eq) -+ return -ENOMEM; -+ -+ bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node); -+ if (!bfqd) { -+ kobject_put(&eq->kobj); -+ return -ENOMEM; -+ } -+ eq->elevator_data = bfqd; -+ -+ /* -+ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. -+ * Grab a permanent reference to it, so that the normal code flow -+ * will not attempt to free it. -+ */ -+ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); -+ atomic_inc(&bfqd->oom_bfqq.ref); -+ bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; -+ bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE; -+ bfqd->oom_bfqq.entity.new_weight = -+ bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio); -+ /* -+ * Trigger weight initialization, according to ioprio, at the -+ * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio -+ * class won't be changed any more. 
-+ */ -+ bfqd->oom_bfqq.entity.prio_changed = 1; -+ -+ bfqd->queue = q; -+ -+ spin_lock_irq(q->queue_lock); -+ q->elevator = eq; -+ spin_unlock_irq(q->queue_lock); -+ -+ bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node); -+ if (!bfqd->root_group) -+ goto out_free; -+ bfq_init_root_group(bfqd->root_group, bfqd); -+ bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ bfqd->active_numerous_groups = 0; -+#endif -+ -+ init_timer(&bfqd->idle_slice_timer); -+ bfqd->idle_slice_timer.function = bfq_idle_slice_timer; -+ bfqd->idle_slice_timer.data = (unsigned long)bfqd; -+ -+ bfqd->queue_weights_tree = RB_ROOT; -+ bfqd->group_weights_tree = RB_ROOT; -+ -+ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue); -+ -+ INIT_LIST_HEAD(&bfqd->active_list); -+ INIT_LIST_HEAD(&bfqd->idle_list); -+ INIT_HLIST_HEAD(&bfqd->burst_list); -+ -+ bfqd->hw_tag = -1; -+ -+ bfqd->bfq_max_budget = bfq_default_max_budget; -+ -+ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0]; -+ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1]; -+ bfqd->bfq_back_max = bfq_back_max; -+ bfqd->bfq_back_penalty = bfq_back_penalty; -+ bfqd->bfq_slice_idle = bfq_slice_idle; -+ bfqd->bfq_class_idle_last_service = 0; -+ bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq; -+ bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async; -+ bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync; -+ -+ bfqd->bfq_requests_within_timer = 120; -+ -+ bfqd->bfq_large_burst_thresh = 11; -+ bfqd->bfq_burst_interval = msecs_to_jiffies(500); -+ -+ bfqd->low_latency = true; -+ -+ bfqd->bfq_wr_coeff = 20; -+ bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); -+ bfqd->bfq_wr_max_time = 0; -+ bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); -+ bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500); -+ bfqd->bfq_wr_max_softrt_rate = 7000; /* -+ * Approximate rate required -+ * to playback or record a -+ * high-definition compressed -+ * video. -+ */ -+ bfqd->wr_busy_queues = 0; -+ bfqd->busy_in_flight_queues = 0; -+ bfqd->const_seeky_busy_in_flight_queues = 0; -+ -+ /* -+ * Begin by assuming, optimistically, that the device peak rate is -+ * equal to the highest reference rate. -+ */ -+ bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] * -+ T_fast[blk_queue_nonrot(bfqd->queue)]; -+ bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)]; -+ bfqd->device_speed = BFQ_BFQD_FAST; -+ -+ return 0; -+ -+out_free: -+ kfree(bfqd); -+ kobject_put(&eq->kobj); -+ return -ENOMEM; -+} -+ -+static void bfq_slab_kill(void) -+{ -+ kmem_cache_destroy(bfq_pool); -+} -+ -+static int __init bfq_slab_setup(void) -+{ -+ bfq_pool = KMEM_CACHE(bfq_queue, 0); -+ if (!bfq_pool) -+ return -ENOMEM; -+ return 0; -+} -+ -+static ssize_t bfq_var_show(unsigned int var, char *page) -+{ -+ return sprintf(page, "%d\n", var); -+} -+ -+static ssize_t bfq_var_store(unsigned long *var, const char *page, -+ size_t count) -+{ -+ unsigned long new_val; -+ int ret = kstrtoul(page, 10, &new_val); -+ -+ if (ret == 0) -+ *var = new_val; -+ -+ return count; -+} -+ -+static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ -+ return sprintf(page, "%d\n", bfqd->bfq_wr_max_time > 0 ? 
-+ jiffies_to_msecs(bfqd->bfq_wr_max_time) : -+ jiffies_to_msecs(bfq_wr_duration(bfqd))); -+} -+ -+static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) -+{ -+ struct bfq_queue *bfqq; -+ struct bfq_data *bfqd = e->elevator_data; -+ ssize_t num_char = 0; -+ -+ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n", -+ bfqd->queued); -+ -+ spin_lock_irq(bfqd->queue->queue_lock); -+ -+ num_char += sprintf(page + num_char, "Active:\n"); -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) { -+ num_char += sprintf(page + num_char, -+ "pid%d: weight %hu, nr_queued %d %d, ", -+ bfqq->pid, -+ bfqq->entity.weight, -+ bfqq->queued[0], -+ bfqq->queued[1]); -+ num_char += sprintf(page + num_char, -+ "dur %d/%u\n", -+ jiffies_to_msecs( -+ jiffies - -+ bfqq->last_wr_start_finish), -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } -+ -+ num_char += sprintf(page + num_char, "Idle:\n"); -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) { -+ num_char += sprintf(page + num_char, -+ "pid%d: weight %hu, dur %d/%u\n", -+ bfqq->pid, -+ bfqq->entity.weight, -+ jiffies_to_msecs(jiffies - -+ bfqq->last_wr_start_finish), -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } -+ -+ spin_unlock_irq(bfqd->queue->queue_lock); -+ -+ return num_char; -+} -+ -+#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ -+static ssize_t __FUNC(struct elevator_queue *e, char *page) \ -+{ \ -+ struct bfq_data *bfqd = e->elevator_data; \ -+ unsigned int __data = __VAR; \ -+ if (__CONV) \ -+ __data = jiffies_to_msecs(__data); \ -+ return bfq_var_show(__data, (page)); \ -+} -+SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1); -+SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1); -+SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); -+SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); -+SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1); -+SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); -+SHOW_FUNCTION(bfq_max_budget_async_rq_show, -+ bfqd->bfq_max_budget_async_rq, 0); -+SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1); -+SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1); -+SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); -+SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0); -+SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1); -+SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1); -+SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async, -+ 1); -+SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0); -+#undef SHOW_FUNCTION -+ -+#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ -+static ssize_t \ -+__FUNC(struct elevator_queue *e, const char *page, size_t count) \ -+{ \ -+ struct bfq_data *bfqd = e->elevator_data; \ -+ unsigned long uninitialized_var(__data); \ -+ int ret = bfq_var_store(&__data, (page), count); \ -+ if (__data < (MIN)) \ -+ __data = (MIN); \ -+ else if (__data > (MAX)) \ -+ __data = (MAX); \ -+ if (__CONV) \ -+ *(__PTR) = msecs_to_jiffies(__data); \ -+ else \ -+ *(__PTR) = __data; \ -+ return ret; \ -+} -+STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, -+ INT_MAX, 1); -+STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, -+ INT_MAX, 1); -+STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); -+STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, -+ INT_MAX, 
0); -+STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1); -+STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq, -+ 1, INT_MAX, 0); -+STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0, -+ INT_MAX, 1); -+STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0); -+STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1); -+STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX, -+ 1); -+STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0, -+ INT_MAX, 1); -+STORE_FUNCTION(bfq_wr_min_inter_arr_async_store, -+ &bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1); -+STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0, -+ INT_MAX, 0); -+#undef STORE_FUNCTION -+ -+/* do nothing for the moment */ -+static ssize_t bfq_weights_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ return count; -+} -+ -+static unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd) -+{ -+ u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); -+ -+ if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES) -+ return bfq_calc_max_budget(bfqd->peak_rate, timeout); -+ else -+ return bfq_default_max_budget; -+} -+ -+static ssize_t bfq_max_budget_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ unsigned long uninitialized_var(__data); -+ int ret = bfq_var_store(&__data, (page), count); -+ -+ if (__data == 0) -+ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); -+ else { -+ if (__data > INT_MAX) -+ __data = INT_MAX; -+ bfqd->bfq_max_budget = __data; -+ } -+ -+ bfqd->bfq_user_max_budget = __data; -+ -+ return ret; -+} -+ -+static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ unsigned long uninitialized_var(__data); -+ int ret = bfq_var_store(&__data, (page), count); -+ -+ if (__data < 1) -+ __data = 1; -+ else if (__data > INT_MAX) -+ __data = INT_MAX; -+ -+ bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data); -+ if (bfqd->bfq_user_max_budget == 0) -+ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); -+ -+ return ret; -+} -+ -+static ssize_t bfq_low_latency_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ unsigned long uninitialized_var(__data); -+ int ret = bfq_var_store(&__data, (page), count); -+ -+ if (__data > 1) -+ __data = 1; -+ if (__data == 0 && bfqd->low_latency != 0) -+ bfq_end_wr(bfqd); -+ bfqd->low_latency = __data; -+ -+ return ret; -+} -+ -+#define BFQ_ATTR(name) \ -+ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store) -+ -+static struct elv_fs_entry bfq_attrs[] = { -+ BFQ_ATTR(fifo_expire_sync), -+ BFQ_ATTR(fifo_expire_async), -+ BFQ_ATTR(back_seek_max), -+ BFQ_ATTR(back_seek_penalty), -+ BFQ_ATTR(slice_idle), -+ BFQ_ATTR(max_budget), -+ BFQ_ATTR(max_budget_async_rq), -+ BFQ_ATTR(timeout_sync), -+ BFQ_ATTR(timeout_async), -+ BFQ_ATTR(low_latency), -+ BFQ_ATTR(wr_coeff), -+ BFQ_ATTR(wr_max_time), -+ BFQ_ATTR(wr_rt_max_time), -+ BFQ_ATTR(wr_min_idle_time), -+ BFQ_ATTR(wr_min_inter_arr_async), -+ BFQ_ATTR(wr_max_softrt_rate), -+ BFQ_ATTR(weights), -+ __ATTR_NULL -+}; -+ -+static struct elevator_type iosched_bfq = { -+ .ops = { -+ .elevator_merge_fn = bfq_merge, -+ .elevator_merged_fn = bfq_merged_request, -+ .elevator_merge_req_fn = 
bfq_merged_requests, -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ .elevator_bio_merged_fn = bfq_bio_merged, -+#endif -+ .elevator_allow_merge_fn = bfq_allow_merge, -+ .elevator_dispatch_fn = bfq_dispatch_requests, -+ .elevator_add_req_fn = bfq_insert_request, -+ .elevator_activate_req_fn = bfq_activate_request, -+ .elevator_deactivate_req_fn = bfq_deactivate_request, -+ .elevator_completed_req_fn = bfq_completed_request, -+ .elevator_former_req_fn = elv_rb_former_request, -+ .elevator_latter_req_fn = elv_rb_latter_request, -+ .elevator_init_icq_fn = bfq_init_icq, -+ .elevator_exit_icq_fn = bfq_exit_icq, -+ .elevator_set_req_fn = bfq_set_request, -+ .elevator_put_req_fn = bfq_put_request, -+ .elevator_may_queue_fn = bfq_may_queue, -+ .elevator_init_fn = bfq_init_queue, -+ .elevator_exit_fn = bfq_exit_queue, -+ }, -+ .icq_size = sizeof(struct bfq_io_cq), -+ .icq_align = __alignof__(struct bfq_io_cq), -+ .elevator_attrs = bfq_attrs, -+ .elevator_name = "bfq", -+ .elevator_owner = THIS_MODULE, -+}; -+ -+static int __init bfq_init(void) -+{ -+ int ret; -+ -+ /* -+ * Can be 0 on HZ < 1000 setups. -+ */ -+ if (bfq_slice_idle == 0) -+ bfq_slice_idle = 1; -+ -+ if (bfq_timeout_async == 0) -+ bfq_timeout_async = 1; -+ -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ ret = blkcg_policy_register(&blkcg_policy_bfq); -+ if (ret) -+ return ret; -+#endif -+ -+ ret = -ENOMEM; -+ if (bfq_slab_setup()) -+ goto err_pol_unreg; -+ -+ /* -+ * Times to load large popular applications for the typical systems -+ * installed on the reference devices (see the comments before the -+ * definitions of the two arrays). -+ */ -+ T_slow[0] = msecs_to_jiffies(2600); -+ T_slow[1] = msecs_to_jiffies(1000); -+ T_fast[0] = msecs_to_jiffies(5500); -+ T_fast[1] = msecs_to_jiffies(2000); -+ -+ /* -+ * Thresholds that determine the switch between speed classes (see -+ * the comments before the definition of the array). -+ */ -+ device_speed_thresh[0] = (R_fast[0] + R_slow[0]) / 2; -+ device_speed_thresh[1] = (R_fast[1] + R_slow[1]) / 2; -+ -+ ret = elv_register(&iosched_bfq); -+ if (ret) -+ goto err_pol_unreg; -+ -+ pr_info("BFQ I/O-scheduler: v7r11"); -+ -+ return 0; -+ -+err_pol_unreg: -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ blkcg_policy_unregister(&blkcg_policy_bfq); -+#endif -+ return ret; -+} -+ -+static void __exit bfq_exit(void) -+{ -+ elv_unregister(&iosched_bfq); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ blkcg_policy_unregister(&blkcg_policy_bfq); -+#endif -+ bfq_slab_kill(); -+} -+ -+module_init(bfq_init); -+module_exit(bfq_exit); -+ -+MODULE_AUTHOR("Arianna Avanzini, Fabio Checconi, Paolo Valente"); -+MODULE_LICENSE("GPL"); -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -new file mode 100644 -index 0000000..a5ed694 ---- /dev/null -+++ b/block/bfq-sched.c -@@ -0,0 +1,1199 @@ -+/* -+ * BFQ: Hierarchical B-WF2Q+ scheduler. 
-+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe -+ * -+ * Copyright (C) 2008 Fabio Checconi -+ * Paolo Valente -+ * -+ * Copyright (C) 2010 Paolo Valente -+ */ -+ -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+#define for_each_entity(entity) \ -+ for (; entity ; entity = entity->parent) -+ -+#define for_each_entity_safe(entity, parent) \ -+ for (; entity && ({ parent = entity->parent; 1; }); entity = parent) -+ -+ -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, -+ int extract, -+ struct bfq_data *bfqd); -+ -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); -+ -+static void bfq_update_budget(struct bfq_entity *next_in_service) -+{ -+ struct bfq_entity *bfqg_entity; -+ struct bfq_group *bfqg; -+ struct bfq_sched_data *group_sd; -+ -+ BUG_ON(!next_in_service); -+ -+ group_sd = next_in_service->sched_data; -+ -+ bfqg = container_of(group_sd, struct bfq_group, sched_data); -+ /* -+ * bfq_group's my_entity field is not NULL only if the group -+ * is not the root group. We must not touch the root entity -+ * as it must never become an in-service entity. -+ */ -+ bfqg_entity = bfqg->my_entity; -+ if (bfqg_entity) -+ bfqg_entity->budget = next_in_service->budget; -+} -+ -+static int bfq_update_next_in_service(struct bfq_sched_data *sd) -+{ -+ struct bfq_entity *next_in_service; -+ -+ if (sd->in_service_entity) -+ /* will update/requeue at the end of service */ -+ return 0; -+ -+ /* -+ * NOTE: this can be improved in many ways, such as returning -+ * 1 (and thus propagating upwards the update) only when the -+ * budget changes, or caching the bfqq that will be scheduled -+ * next from this subtree. By now we worry more about -+ * correctness than about performance... -+ */ -+ next_in_service = bfq_lookup_next_entity(sd, 0, NULL); -+ sd->next_in_service = next_in_service; -+ -+ if (next_in_service) -+ bfq_update_budget(next_in_service); -+ -+ return 1; -+} -+ -+static void bfq_check_next_in_service(struct bfq_sched_data *sd, -+ struct bfq_entity *entity) -+{ -+ BUG_ON(sd->next_in_service != entity); -+} -+#else -+#define for_each_entity(entity) \ -+ for (; entity ; entity = NULL) -+ -+#define for_each_entity_safe(entity, parent) \ -+ for (parent = NULL; entity ; entity = parent) -+ -+static int bfq_update_next_in_service(struct bfq_sched_data *sd) -+{ -+ return 0; -+} -+ -+static void bfq_check_next_in_service(struct bfq_sched_data *sd, -+ struct bfq_entity *entity) -+{ -+} -+ -+static void bfq_update_budget(struct bfq_entity *next_in_service) -+{ -+} -+#endif -+ -+/* -+ * Shift for timestamp calculations. This actually limits the maximum -+ * service allowed in one timestamp delta (small shift values increase it), -+ * the maximum total weight that can be used for the queues in the system -+ * (big shift values increase it), and the period of virtual time -+ * wraparounds. -+ */ -+#define WFQ_SERVICE_SHIFT 22 -+ -+/** -+ * bfq_gt - compare two timestamps. -+ * @a: first ts. -+ * @b: second ts. -+ * -+ * Return @a > @b, dealing with wrapping correctly. -+ */ -+static int bfq_gt(u64 a, u64 b) -+{ -+ return (s64)(a - b) > 0; -+} -+ -+static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = NULL; -+ -+ BUG_ON(!entity); -+ -+ if (!entity->my_sched_data) -+ bfqq = container_of(entity, struct bfq_queue, entity); -+ -+ return bfqq; -+} -+ -+ -+/** -+ * bfq_delta - map service into the virtual time domain. -+ * @service: amount of service. -+ * @weight: scale factor (weight of an entity or weight sum). 
-+ */ -+static u64 bfq_delta(unsigned long service, unsigned long weight) -+{ -+ u64 d = (u64)service << WFQ_SERVICE_SHIFT; -+ -+ do_div(d, weight); -+ return d; -+} -+ -+/** -+ * bfq_calc_finish - assign the finish time to an entity. -+ * @entity: the entity to act upon. -+ * @service: the service to be charged to the entity. -+ */ -+static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ BUG_ON(entity->weight == 0); -+ -+ entity->finish = entity->start + -+ bfq_delta(service, entity->weight); -+ -+ if (bfqq) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "calc_finish: serv %lu, w %d", -+ service, entity->weight); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "calc_finish: start %llu, finish %llu, delta %llu", -+ entity->start, entity->finish, -+ bfq_delta(service, entity->weight)); -+ } -+} -+ -+/** -+ * bfq_entity_of - get an entity from a node. -+ * @node: the node field of the entity. -+ * -+ * Convert a node pointer to the relative entity. This is used only -+ * to simplify the logic of some functions and not as the generic -+ * conversion mechanism because, e.g., in the tree walking functions, -+ * the check for a %NULL value would be redundant. -+ */ -+static struct bfq_entity *bfq_entity_of(struct rb_node *node) -+{ -+ struct bfq_entity *entity = NULL; -+ -+ if (node) -+ entity = rb_entry(node, struct bfq_entity, rb_node); -+ -+ return entity; -+} -+ -+/** -+ * bfq_extract - remove an entity from a tree. -+ * @root: the tree root. -+ * @entity: the entity to remove. -+ */ -+static void bfq_extract(struct rb_root *root, struct bfq_entity *entity) -+{ -+ BUG_ON(entity->tree != root); -+ -+ entity->tree = NULL; -+ rb_erase(&entity->rb_node, root); -+} -+ -+/** -+ * bfq_idle_extract - extract an entity from the idle tree. -+ * @st: the service tree of the owning @entity. -+ * @entity: the entity being removed. -+ */ -+static void bfq_idle_extract(struct bfq_service_tree *st, -+ struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ struct rb_node *next; -+ -+ BUG_ON(entity->tree != &st->idle); -+ -+ if (entity == st->first_idle) { -+ next = rb_next(&entity->rb_node); -+ st->first_idle = bfq_entity_of(next); -+ } -+ -+ if (entity == st->last_idle) { -+ next = rb_prev(&entity->rb_node); -+ st->last_idle = bfq_entity_of(next); -+ } -+ -+ bfq_extract(&st->idle, entity); -+ -+ if (bfqq) -+ list_del(&bfqq->bfqq_list); -+} -+ -+/** -+ * bfq_insert - generic tree insertion. -+ * @root: tree root. -+ * @entity: entity to insert. -+ * -+ * This is used for the idle and the active tree, since they are both -+ * ordered by finish time. -+ */ -+static void bfq_insert(struct rb_root *root, struct bfq_entity *entity) -+{ -+ struct bfq_entity *entry; -+ struct rb_node **node = &root->rb_node; -+ struct rb_node *parent = NULL; -+ -+ BUG_ON(entity->tree); -+ -+ while (*node) { -+ parent = *node; -+ entry = rb_entry(parent, struct bfq_entity, rb_node); -+ -+ if (bfq_gt(entry->finish, entity->finish)) -+ node = &parent->rb_left; -+ else -+ node = &parent->rb_right; -+ } -+ -+ rb_link_node(&entity->rb_node, parent, node); -+ rb_insert_color(&entity->rb_node, root); -+ -+ entity->tree = root; -+} -+ -+/** -+ * bfq_update_min - update the min_start field of a entity. -+ * @entity: the entity to update. -+ * @node: one of its children. -+ * -+ * This function is called when @entity may store an invalid value for -+ * min_start due to updates to the active tree. 
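/*
 * Editor's aside, not part of the original patch. The timestamp helpers
 * above are self-contained arithmetic: bfq_delta() maps an amount of
 * service into the virtual-time domain by scaling it up by
 * 2^WFQ_SERVICE_SHIFT and dividing by the entity weight, bfq_calc_finish()
 * sets finish = start + delta, and bfq_gt() compares timestamps through a
 * signed subtraction so that wraparound is handled. A userspace sketch of
 * the same arithmetic:
 */
#include <stdio.h>
#include <stdint.h>

#define WFQ_SERVICE_SHIFT 22

/* a > b, tolerating wraparound, as bfq_gt() does */
static int vtime_gt(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) > 0;
}

/* service -> virtual-time delta, as bfq_delta() does */
static uint64_t vtime_delta(unsigned long service, unsigned long weight)
{
	return ((uint64_t)service << WFQ_SERVICE_SHIFT) / weight;
}

int main(void)
{
	uint64_t start = 0;
	unsigned long service = 8192;	/* sectors charged to the entity */

	/*
	 * For the same service, a weight-200 entity accumulates virtual
	 * time half as fast as a weight-100 one, so its finish timestamp
	 * is earlier and it gets scheduled proportionally more often.
	 */
	uint64_t f100 = start + vtime_delta(service, 100);
	uint64_t f200 = start + vtime_delta(service, 200);

	printf("finish(w=100)=%llu finish(w=200)=%llu, w=200 earlier: %d\n",
	       (unsigned long long)f100, (unsigned long long)f200,
	       vtime_gt(f100, f200));
	return 0;
}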
The function assumes -+ * that the subtree rooted at @node (which may be its left or its right -+ * child) has a valid min_start value. -+ */ -+static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node) -+{ -+ struct bfq_entity *child; -+ -+ if (node) { -+ child = rb_entry(node, struct bfq_entity, rb_node); -+ if (bfq_gt(entity->min_start, child->min_start)) -+ entity->min_start = child->min_start; -+ } -+} -+ -+/** -+ * bfq_update_active_node - recalculate min_start. -+ * @node: the node to update. -+ * -+ * @node may have changed position or one of its children may have moved, -+ * this function updates its min_start value. The left and right subtrees -+ * are assumed to hold a correct min_start value. -+ */ -+static void bfq_update_active_node(struct rb_node *node) -+{ -+ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node); -+ -+ entity->min_start = entity->start; -+ bfq_update_min(entity, node->rb_right); -+ bfq_update_min(entity, node->rb_left); -+} -+ -+/** -+ * bfq_update_active_tree - update min_start for the whole active tree. -+ * @node: the starting node. -+ * -+ * @node must be the deepest modified node after an update. This function -+ * updates its min_start using the values held by its children, assuming -+ * that they did not change, and then updates all the nodes that may have -+ * changed in the path to the root. The only nodes that may have changed -+ * are the ones in the path or their siblings. -+ */ -+static void bfq_update_active_tree(struct rb_node *node) -+{ -+ struct rb_node *parent; -+ -+up: -+ bfq_update_active_node(node); -+ -+ parent = rb_parent(node); -+ if (!parent) -+ return; -+ -+ if (node == parent->rb_left && parent->rb_right) -+ bfq_update_active_node(parent->rb_right); -+ else if (parent->rb_left) -+ bfq_update_active_node(parent->rb_left); -+ -+ node = parent; -+ goto up; -+} -+ -+static void bfq_weights_tree_add(struct bfq_data *bfqd, -+ struct bfq_entity *entity, -+ struct rb_root *root); -+ -+static void bfq_weights_tree_remove(struct bfq_data *bfqd, -+ struct bfq_entity *entity, -+ struct rb_root *root); -+ -+ -+/** -+ * bfq_active_insert - insert an entity in the active tree of its -+ * group/device. -+ * @st: the service tree of the entity. -+ * @entity: the entity being inserted. -+ * -+ * The active tree is ordered by finish time, but an extra key is kept -+ * per each node, containing the minimum value for the start times of -+ * its children (and the node itself), so it's possible to search for -+ * the eligible node with the lowest finish time in logarithmic time. 
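/*
 * Editor's aside, not part of the original patch. As the comment just
 * above explains, the active tree is ordered by finish time but augmented
 * so that every node caches the minimum start time of its subtree; that is
 * what makes it possible to find, in O(log N), an eligible entity with the
 * lowest finish time. The sketch below shows only the augmentation, on a
 * plain binary tree: red-black balancing is omitted, a plain < replaces
 * the wraparound-safe bfq_gt() comparison, and the sibling refresh done by
 * bfq_update_active_tree() along the path is dropped.
 */
#include <stdio.h>
#include <stdint.h>

struct tnode {
	uint64_t start;		/* eligibility timestamp */
	uint64_t min_start;	/* minimum start over this subtree */
	struct tnode *left, *right, *parent;
};

/* fold one child's cached minimum into @n, as bfq_update_min() does */
static void tnode_update_min(struct tnode *n, struct tnode *child)
{
	if (child && child->min_start < n->min_start)
		n->min_start = child->min_start;
}

/* recompute one node from its children, as bfq_update_active_node() does */
static void tnode_update(struct tnode *n)
{
	n->min_start = n->start;
	tnode_update_min(n, n->left);
	tnode_update_min(n, n->right);
}

/* propagate from the deepest modified node up to the root */
static void tnode_update_tree(struct tnode *n)
{
	for (; n; n = n->parent)
		tnode_update(n);
}

int main(void)
{
	struct tnode l = { 10, 10 }, r = { 30, 30 }, root = { 20, 20 };

	root.left = &l; root.right = &r;
	l.parent = &root; r.parent = &root;
	tnode_update_tree(&l);

	l.start = 5;			/* the left child becomes eligible earlier */
	tnode_update_tree(&l);		/* the cached minima follow it upward */
	printf("root.min_start = %llu\n",
	       (unsigned long long)root.min_start);	/* prints 5 */
	return 0;
}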
-+ */ -+static void bfq_active_insert(struct bfq_service_tree *st, -+ struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ struct rb_node *node = &entity->rb_node; -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ struct bfq_sched_data *sd = NULL; -+ struct bfq_group *bfqg = NULL; -+ struct bfq_data *bfqd = NULL; -+#endif -+ -+ bfq_insert(&st->active, entity); -+ -+ if (node->rb_left) -+ node = node->rb_left; -+ else if (node->rb_right) -+ node = node->rb_right; -+ -+ bfq_update_active_tree(node); -+ -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ sd = entity->sched_data; -+ bfqg = container_of(sd, struct bfq_group, sched_data); -+ BUG_ON(!bfqg); -+ bfqd = (struct bfq_data *)bfqg->bfqd; -+#endif -+ if (bfqq) -+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ else { /* bfq_group */ -+ BUG_ON(!bfqd); -+ bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree); -+ } -+ if (bfqg != bfqd->root_group) { -+ BUG_ON(!bfqg); -+ BUG_ON(!bfqd); -+ bfqg->active_entities++; -+ if (bfqg->active_entities == 2) -+ bfqd->active_numerous_groups++; -+ } -+#endif -+} -+ -+/** -+ * bfq_ioprio_to_weight - calc a weight from an ioprio. -+ * @ioprio: the ioprio value to convert. -+ */ -+static unsigned short bfq_ioprio_to_weight(int ioprio) -+{ -+ BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); -+ return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - ioprio; -+} -+ -+/** -+ * bfq_weight_to_ioprio - calc an ioprio from a weight. -+ * @weight: the weight value to convert. -+ * -+ * To preserve as much as possible the old only-ioprio user interface, -+ * 0 is used as an escape ioprio value for weights (numerically) equal or -+ * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF. -+ */ -+static unsigned short bfq_weight_to_ioprio(int weight) -+{ -+ BUG_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT); -+ return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight < 0 ? -+ 0 : IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight; -+} -+ -+static void bfq_get_entity(struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ if (bfqq) { -+ atomic_inc(&bfqq->ref); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", -+ bfqq, atomic_read(&bfqq->ref)); -+ } -+} -+ -+/** -+ * bfq_find_deepest - find the deepest node that an extraction can modify. -+ * @node: the node being removed. -+ * -+ * Do the first step of an extraction in an rb tree, looking for the -+ * node that will replace @node, and returning the deepest node that -+ * the following modifications to the tree can touch. If @node is the -+ * last node in the tree return %NULL. -+ */ -+static struct rb_node *bfq_find_deepest(struct rb_node *node) -+{ -+ struct rb_node *deepest; -+ -+ if (!node->rb_right && !node->rb_left) -+ deepest = rb_parent(node); -+ else if (!node->rb_right) -+ deepest = node->rb_left; -+ else if (!node->rb_left) -+ deepest = node->rb_right; -+ else { -+ deepest = rb_next(node); -+ if (deepest->rb_right) -+ deepest = deepest->rb_right; -+ else if (rb_parent(deepest) != node) -+ deepest = rb_parent(deepest); -+ } -+ -+ return deepest; -+} -+ -+/** -+ * bfq_active_extract - remove an entity from the active tree. -+ * @st: the service_tree containing the tree. -+ * @entity: the entity being removed. 
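The two conversions above define a simple affine mapping between best-effort ioprio levels and BFQ weights, with 0 reserved as an escape value for weights too large to map back. A quick user-space check of the round trip, with the constants mirrored from the definitions above:

#include <stdio.h>

#define IOPRIO_BE_NR 8          /* number of best-effort ioprio levels */
#define WEIGHT_COEFF 10         /* mirrors BFQ_WEIGHT_CONVERSION_COEFF */

static unsigned short ioprio_to_weight(int ioprio)
{
    return IOPRIO_BE_NR * WEIGHT_COEFF - ioprio;     /* 80 - ioprio */
}

static unsigned short weight_to_ioprio(int weight)
{
    int ioprio = IOPRIO_BE_NR * WEIGHT_COEFF - weight;
    return ioprio < 0 ? 0 : ioprio;                  /* 0 is the escape value */
}

int main(void)
{
    for (int ioprio = 0; ioprio < IOPRIO_BE_NR; ioprio++)
        printf("ioprio %d -> weight %u -> ioprio %u\n", ioprio,
               ioprio_to_weight(ioprio),
               weight_to_ioprio(ioprio_to_weight(ioprio)));
    return 0;
}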
-+ */ -+static void bfq_active_extract(struct bfq_service_tree *st, -+ struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ struct rb_node *node; -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ struct bfq_sched_data *sd = NULL; -+ struct bfq_group *bfqg = NULL; -+ struct bfq_data *bfqd = NULL; -+#endif -+ -+ node = bfq_find_deepest(&entity->rb_node); -+ bfq_extract(&st->active, entity); -+ -+ if (node) -+ bfq_update_active_tree(node); -+ -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ sd = entity->sched_data; -+ bfqg = container_of(sd, struct bfq_group, sched_data); -+ BUG_ON(!bfqg); -+ bfqd = (struct bfq_data *)bfqg->bfqd; -+#endif -+ if (bfqq) -+ list_del(&bfqq->bfqq_list); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ else { /* bfq_group */ -+ BUG_ON(!bfqd); -+ bfq_weights_tree_remove(bfqd, entity, -+ &bfqd->group_weights_tree); -+ } -+ if (bfqg != bfqd->root_group) { -+ BUG_ON(!bfqg); -+ BUG_ON(!bfqd); -+ BUG_ON(!bfqg->active_entities); -+ bfqg->active_entities--; -+ if (bfqg->active_entities == 1) { -+ BUG_ON(!bfqd->active_numerous_groups); -+ bfqd->active_numerous_groups--; -+ } -+ } -+#endif -+} -+ -+/** -+ * bfq_idle_insert - insert an entity into the idle tree. -+ * @st: the service tree containing the tree. -+ * @entity: the entity to insert. -+ */ -+static void bfq_idle_insert(struct bfq_service_tree *st, -+ struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ struct bfq_entity *first_idle = st->first_idle; -+ struct bfq_entity *last_idle = st->last_idle; -+ -+ if (!first_idle || bfq_gt(first_idle->finish, entity->finish)) -+ st->first_idle = entity; -+ if (!last_idle || bfq_gt(entity->finish, last_idle->finish)) -+ st->last_idle = entity; -+ -+ bfq_insert(&st->idle, entity); -+ -+ if (bfqq) -+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list); -+} -+ -+/** -+ * bfq_forget_entity - remove an entity from the wfq trees. -+ * @st: the service tree. -+ * @entity: the entity being removed. -+ * -+ * Update the device status and forget everything about @entity, putting -+ * the device reference to it, if it is a queue. Entities belonging to -+ * groups are not refcounted. -+ */ -+static void bfq_forget_entity(struct bfq_service_tree *st, -+ struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ struct bfq_sched_data *sd; -+ -+ BUG_ON(!entity->on_st); -+ -+ entity->on_st = 0; -+ st->wsum -= entity->weight; -+ if (bfqq) { -+ sd = entity->sched_data; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d", -+ bfqq, atomic_read(&bfqq->ref)); -+ bfq_put_queue(bfqq); -+ } -+} -+ -+/** -+ * bfq_put_idle_entity - release the idle tree ref of an entity. -+ * @st: service tree for the entity. -+ * @entity: the entity being released. -+ */ -+static void bfq_put_idle_entity(struct bfq_service_tree *st, -+ struct bfq_entity *entity) -+{ -+ bfq_idle_extract(st, entity); -+ bfq_forget_entity(st, entity); -+} -+ -+/** -+ * bfq_forget_idle - update the idle tree if necessary. -+ * @st: the service tree to act upon. -+ * -+ * To preserve the global O(log N) complexity we only remove one entry here; -+ * as the idle tree will not grow indefinitely this can be done safely. 
-+ */ -+static void bfq_forget_idle(struct bfq_service_tree *st) -+{ -+ struct bfq_entity *first_idle = st->first_idle; -+ struct bfq_entity *last_idle = st->last_idle; -+ -+ if (RB_EMPTY_ROOT(&st->active) && last_idle && -+ !bfq_gt(last_idle->finish, st->vtime)) { -+ /* -+ * Forget the whole idle tree, increasing the vtime past -+ * the last finish time of idle entities. -+ */ -+ st->vtime = last_idle->finish; -+ } -+ -+ if (first_idle && !bfq_gt(first_idle->finish, st->vtime)) -+ bfq_put_idle_entity(st, first_idle); -+} -+ -+static struct bfq_service_tree * -+__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, -+ struct bfq_entity *entity) -+{ -+ struct bfq_service_tree *new_st = old_st; -+ -+ if (entity->prio_changed) { -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ unsigned short prev_weight, new_weight; -+ struct bfq_data *bfqd = NULL; -+ struct rb_root *root; -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ struct bfq_sched_data *sd; -+ struct bfq_group *bfqg; -+#endif -+ -+ if (bfqq) -+ bfqd = bfqq->bfqd; -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ else { -+ sd = entity->my_sched_data; -+ bfqg = container_of(sd, struct bfq_group, sched_data); -+ BUG_ON(!bfqg); -+ bfqd = (struct bfq_data *)bfqg->bfqd; -+ BUG_ON(!bfqd); -+ } -+#endif -+ -+ BUG_ON(old_st->wsum < entity->weight); -+ old_st->wsum -= entity->weight; -+ -+ if (entity->new_weight != entity->orig_weight) { -+ if (entity->new_weight < BFQ_MIN_WEIGHT || -+ entity->new_weight > BFQ_MAX_WEIGHT) { -+ pr_crit("update_weight_prio: new_weight %d\n", -+ entity->new_weight); -+ BUG(); -+ } -+ entity->orig_weight = entity->new_weight; -+ if (bfqq) -+ bfqq->ioprio = -+ bfq_weight_to_ioprio(entity->orig_weight); -+ } -+ -+ if (bfqq) -+ bfqq->ioprio_class = bfqq->new_ioprio_class; -+ entity->prio_changed = 0; -+ -+ /* -+ * NOTE: here we may be changing the weight too early, -+ * this will cause unfairness. The correct approach -+ * would have required additional complexity to defer -+ * weight changes to the proper time instants (i.e., -+ * when entity->finish <= old_st->vtime). -+ */ -+ new_st = bfq_entity_service_tree(entity); -+ -+ prev_weight = entity->weight; -+ new_weight = entity->orig_weight * -+ (bfqq ? bfqq->wr_coeff : 1); -+ /* -+ * If the weight of the entity changes, remove the entity -+ * from its old weight counter (if there is a counter -+ * associated with the entity), and add it to the counter -+ * associated with its new weight. -+ */ -+ if (prev_weight != new_weight) { -+ root = bfqq ? &bfqd->queue_weights_tree : -+ &bfqd->group_weights_tree; -+ bfq_weights_tree_remove(bfqd, entity, root); -+ } -+ entity->weight = new_weight; -+ /* -+ * Add the entity to its weights tree only if it is -+ * not associated with a weight-raised queue. -+ */ -+ if (prev_weight != new_weight && -+ (bfqq ? bfqq->wr_coeff == 1 : 1)) -+ /* If we get here, root has been initialized. */ -+ bfq_weights_tree_add(bfqd, entity, root); -+ -+ new_st->wsum += entity->weight; -+ -+ if (new_st != old_st) -+ entity->start = new_st->vtime; -+ } -+ -+ return new_st; -+} -+ -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg); -+#endif -+ -+/** -+ * bfq_bfqq_served - update the scheduler status after selection for -+ * service. -+ * @bfqq: the queue being served. -+ * @served: bytes to transfer. -+ * -+ * NOTE: this can be optimized, as the timestamps of upper level entities -+ * are synchronized every time a new bfqq is selected for service. By now, -+ * we keep it to better check consistency. 
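The weight update above boils down to two moves: recompute the effective weight as the base, ioprio-derived weight times the current weight-raising coefficient, and keep the per-tree weight sums consistent while doing so. A compressed sketch of just that bookkeeping, with every other concern stripped away (names and numbers are illustrative, not the kernel's):

#include <stdio.h>

struct toy_entity {
    unsigned short orig_weight;   /* ioprio-derived base weight */
    unsigned short weight;        /* effective weight currently in use */
    unsigned int wr_coeff;        /* weight-raising multiplier, 1 if not raised */
};

/* Move the entity's contribution from the old weight sum to the new one. */
static void apply_weight_change(struct toy_entity *e, unsigned long *wsum)
{
    *wsum -= e->weight;
    e->weight = e->orig_weight * e->wr_coeff;
    *wsum += e->weight;
}

int main(void)
{
    struct toy_entity e = { .orig_weight = 40, .weight = 40, .wr_coeff = 1 };
    unsigned long wsum = 120;     /* this entity plus an 80-weight neighbour */

    e.wr_coeff = 3;               /* a low-latency boost kicks in */
    apply_weight_change(&e, &wsum);
    printf("weight=%u wsum=%lu\n", e.weight, wsum);   /* prints weight=120 wsum=200 */
    return 0;
}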
-+ */ -+static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ struct bfq_service_tree *st; -+ -+ for_each_entity(entity) { -+ st = bfq_entity_service_tree(entity); -+ -+ entity->service += served; -+ BUG_ON(entity->service > entity->budget); -+ BUG_ON(st->wsum == 0); -+ -+ st->vtime += bfq_delta(served, st->wsum); -+ bfq_forget_idle(st); -+ } -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ bfqg_stats_set_start_empty_time(bfqq_group(bfqq)); -+#endif -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served); -+} -+ -+/** -+ * bfq_bfqq_charge_full_budget - set the service to the entity budget. -+ * @bfqq: the queue that needs a service update. -+ * -+ * When it's not possible to be fair in the service domain, because -+ * a queue is not consuming its budget fast enough (the meaning of -+ * fast depends on the timeout parameter), we charge it a full -+ * budget. In this way we should obtain a sort of time-domain -+ * fairness among all the seeky/slow queues. -+ */ -+static void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget"); -+ -+ bfq_bfqq_served(bfqq, entity->budget - entity->service); -+} -+ -+/** -+ * __bfq_activate_entity - activate an entity. -+ * @entity: the entity being activated. -+ * -+ * Called whenever an entity is activated, i.e., it is not active and one -+ * of its children receives a new request, or has to be reactivated due to -+ * budget exhaustion. It uses the current budget of the entity (and the -+ * service received if @entity is active) of the queue to calculate its -+ * timestamps. -+ */ -+static void __bfq_activate_entity(struct bfq_entity *entity) -+{ -+ struct bfq_sched_data *sd = entity->sched_data; -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); -+ -+ if (entity == sd->in_service_entity) { -+ BUG_ON(entity->tree); -+ /* -+ * If we are requeueing the current entity we have -+ * to take care of not charging to it service it has -+ * not received. -+ */ -+ bfq_calc_finish(entity, entity->service); -+ entity->start = entity->finish; -+ sd->in_service_entity = NULL; -+ } else if (entity->tree == &st->active) { -+ /* -+ * Requeueing an entity due to a change of some -+ * next_in_service entity below it. We reuse the -+ * old start time. -+ */ -+ bfq_active_extract(st, entity); -+ } else if (entity->tree == &st->idle) { -+ /* -+ * Must be on the idle tree, bfq_idle_extract() will -+ * check for that. -+ */ -+ bfq_idle_extract(st, entity); -+ entity->start = bfq_gt(st->vtime, entity->finish) ? -+ st->vtime : entity->finish; -+ } else { -+ /* -+ * The finish time of the entity may be invalid, and -+ * it is in the past for sure, otherwise the queue -+ * would have been on the idle tree. -+ */ -+ entity->start = st->vtime; -+ st->wsum += entity->weight; -+ bfq_get_entity(entity); -+ -+ BUG_ON(entity->on_st); -+ entity->on_st = 1; -+ } -+ -+ st = __bfq_entity_update_weight_prio(st, entity); -+ bfq_calc_finish(entity, entity->budget); -+ bfq_active_insert(st, entity); -+} -+ -+/** -+ * bfq_activate_entity - activate an entity and its ancestors if necessary. -+ * @entity: the entity to activate. -+ * -+ * Activate @entity and all the entities on the path from it to the root. 
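The service accounting above is what makes the weights meaningful: every grant advances the tree's virtual time by served/wsum, while the served entity's own finish timestamp advances by served/weight. A small numeric check, assuming two queues with weights 80 and 40, shows that a 2:1 service split keeps both timestamps aligned with the virtual time:

#include <stdio.h>

int main(void)
{
    double vtime = 0.0;
    const double w_a = 80, w_b = 40;     /* two backlogged queues */
    const double wsum = w_a + w_b;

    /* Grant 1200 sectors to A and 600 to B, a 2:1 split matching the weights.
     * The tree's vtime advances by served/wsum for every grant, while each
     * queue's finish timestamp advances by served/weight. */
    vtime += 1200 / wsum;
    vtime += 600 / wsum;

    printf("vtime = %.1f\n", vtime);                   /* 15.0 */
    printf("A finish advance = %.1f\n", 1200 / w_a);   /* 15.0 */
    printf("B finish advance = %.1f\n", 600 / w_b);    /* 15.0 */
    return 0;
}

With service proportional to weight, all three values advance in lockstep, which is exactly the long-run fairness the scheduler is after.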
-+ */ -+static void bfq_activate_entity(struct bfq_entity *entity) -+{ -+ struct bfq_sched_data *sd; -+ -+ for_each_entity(entity) { -+ __bfq_activate_entity(entity); -+ -+ sd = entity->sched_data; -+ if (!bfq_update_next_in_service(sd)) -+ /* -+ * No need to propagate the activation to the -+ * upper entities, as they will be updated when -+ * the in-service entity is rescheduled. -+ */ -+ break; -+ } -+} -+ -+/** -+ * __bfq_deactivate_entity - deactivate an entity from its service tree. -+ * @entity: the entity to deactivate. -+ * @requeue: if false, the entity will not be put into the idle tree. -+ * -+ * Deactivate an entity, independently from its previous state. If the -+ * entity was not on a service tree just return, otherwise if it is on -+ * any scheduler tree, extract it from that tree, and if necessary -+ * and if the caller did not specify @requeue, put it on the idle tree. -+ * -+ * Return %1 if the caller should update the entity hierarchy, i.e., -+ * if the entity was in service or if it was the next_in_service for -+ * its sched_data; return %0 otherwise. -+ */ -+static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue) -+{ -+ struct bfq_sched_data *sd = entity->sched_data; -+ struct bfq_service_tree *st; -+ int was_in_service; -+ int ret = 0; -+ -+ if (sd == NULL || !entity->on_st) /* never activated, or inactive */ -+ return 0; -+ -+ st = bfq_entity_service_tree(entity); -+ was_in_service = entity == sd->in_service_entity; -+ -+ BUG_ON(was_in_service && entity->tree); -+ -+ if (was_in_service) { -+ bfq_calc_finish(entity, entity->service); -+ sd->in_service_entity = NULL; -+ } else if (entity->tree == &st->active) -+ bfq_active_extract(st, entity); -+ else if (entity->tree == &st->idle) -+ bfq_idle_extract(st, entity); -+ else if (entity->tree) -+ BUG(); -+ -+ if (was_in_service || sd->next_in_service == entity) -+ ret = bfq_update_next_in_service(sd); -+ -+ if (!requeue || !bfq_gt(entity->finish, st->vtime)) -+ bfq_forget_entity(st, entity); -+ else -+ bfq_idle_insert(st, entity); -+ -+ BUG_ON(sd->in_service_entity == entity); -+ BUG_ON(sd->next_in_service == entity); -+ -+ return ret; -+} -+ -+/** -+ * bfq_deactivate_entity - deactivate an entity. -+ * @entity: the entity to deactivate. -+ * @requeue: true if the entity can be put on the idle tree -+ */ -+static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue) -+{ -+ struct bfq_sched_data *sd; -+ struct bfq_entity *parent; -+ -+ for_each_entity_safe(entity, parent) { -+ sd = entity->sched_data; -+ -+ if (!__bfq_deactivate_entity(entity, requeue)) -+ /* -+ * The parent entity is still backlogged, and -+ * we don't need to update it as it is still -+ * in service. -+ */ -+ break; -+ -+ if (sd->next_in_service) -+ /* -+ * The parent entity is still backlogged and -+ * the budgets on the path towards the root -+ * need to be updated. -+ */ -+ goto update; -+ -+ /* -+ * If we reach there the parent is no more backlogged and -+ * we want to propagate the dequeue upwards. -+ */ -+ requeue = 1; -+ } -+ -+ return; -+ -+update: -+ entity = parent; -+ for_each_entity(entity) { -+ __bfq_activate_entity(entity); -+ -+ sd = entity->sched_data; -+ if (!bfq_update_next_in_service(sd)) -+ break; -+ } -+} -+ -+/** -+ * bfq_update_vtime - update vtime if necessary. -+ * @st: the service tree to act upon. -+ * -+ * If necessary update the service tree vtime to have at least one -+ * eligible entity, skipping to its start time. Assumes that the -+ * active tree of the device is not empty. 
-+ * -+ * NOTE: this hierarchical implementation updates vtimes quite often, -+ * we may end up with reactivated processes getting timestamps after a -+ * vtime skip done because we needed a ->first_active entity on some -+ * intermediate node. -+ */ -+static void bfq_update_vtime(struct bfq_service_tree *st) -+{ -+ struct bfq_entity *entry; -+ struct rb_node *node = st->active.rb_node; -+ -+ entry = rb_entry(node, struct bfq_entity, rb_node); -+ if (bfq_gt(entry->min_start, st->vtime)) { -+ st->vtime = entry->min_start; -+ bfq_forget_idle(st); -+ } -+} -+ -+/** -+ * bfq_first_active_entity - find the eligible entity with -+ * the smallest finish time -+ * @st: the service tree to select from. -+ * -+ * This function searches the first schedulable entity, starting from the -+ * root of the tree and going on the left every time on this side there is -+ * a subtree with at least one eligible (start >= vtime) entity. The path on -+ * the right is followed only if a) the left subtree contains no eligible -+ * entities and b) no eligible entity has been found yet. -+ */ -+static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st) -+{ -+ struct bfq_entity *entry, *first = NULL; -+ struct rb_node *node = st->active.rb_node; -+ -+ while (node) { -+ entry = rb_entry(node, struct bfq_entity, rb_node); -+left: -+ if (!bfq_gt(entry->start, st->vtime)) -+ first = entry; -+ -+ BUG_ON(bfq_gt(entry->min_start, st->vtime)); -+ -+ if (node->rb_left) { -+ entry = rb_entry(node->rb_left, -+ struct bfq_entity, rb_node); -+ if (!bfq_gt(entry->min_start, st->vtime)) { -+ node = node->rb_left; -+ goto left; -+ } -+ } -+ if (first) -+ break; -+ node = node->rb_right; -+ } -+ -+ BUG_ON(!first && !RB_EMPTY_ROOT(&st->active)); -+ return first; -+} -+ -+/** -+ * __bfq_lookup_next_entity - return the first eligible entity in @st. -+ * @st: the service tree. -+ * -+ * Update the virtual time in @st and return the first eligible entity -+ * it contains. -+ */ -+static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st, -+ bool force) -+{ -+ struct bfq_entity *entity, *new_next_in_service = NULL; -+ -+ if (RB_EMPTY_ROOT(&st->active)) -+ return NULL; -+ -+ bfq_update_vtime(st); -+ entity = bfq_first_active_entity(st); -+ BUG_ON(bfq_gt(entity->start, st->vtime)); -+ -+ /* -+ * If the chosen entity does not match with the sched_data's -+ * next_in_service and we are forcedly serving the IDLE priority -+ * class tree, bubble up budget update. -+ */ -+ if (unlikely(force && entity != entity->sched_data->next_in_service)) { -+ new_next_in_service = entity; -+ for_each_entity(new_next_in_service) -+ bfq_update_budget(new_next_in_service); -+ } -+ -+ return entity; -+} -+ -+/** -+ * bfq_lookup_next_entity - return the first eligible entity in @sd. -+ * @sd: the sched_data. -+ * @extract: if true the returned entity will be also extracted from @sd. -+ * -+ * NOTE: since we cache the next_in_service entity at each level of the -+ * hierarchy, the complexity of the lookup can be decreased with -+ * absolutely no effort just returning the cached next_in_service value; -+ * we prefer to do full lookups to test the consistency of * the data -+ * structures. 
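Put together, bfq_update_vtime() and bfq_first_active_entity() implement the standard (B-)WF2Q+ selection rule: advance the virtual time if nothing is eligible yet, then serve the eligible entity with the smallest finish time. The same rule written as a plain linear scan, purely for illustration; the code above reaches the same answer in O(log N) through the min_start-augmented tree:

#include <stdio.h>
#include <stddef.h>

struct toy_entity { double start, finish; };

/* Among eligible entities (start <= vtime) pick the smallest finish time;
 * if none is eligible yet, first skip vtime forward to the smallest start. */
static const struct toy_entity *
pick_next(const struct toy_entity *v, size_t n, double *vtime)
{
    const struct toy_entity *best = NULL;
    double min_start;
    size_t i;

    if (n == 0)
        return NULL;

    min_start = v[0].start;
    for (i = 1; i < n; i++)
        if (v[i].start < min_start)
            min_start = v[i].start;
    if (min_start > *vtime)
        *vtime = min_start;           /* cf. bfq_update_vtime() */

    for (i = 0; i < n; i++)
        if (v[i].start <= *vtime && (!best || v[i].finish < best->finish))
            best = &v[i];
    return best;
}

int main(void)
{
    struct toy_entity q[] = { { 10, 25 }, { 12, 18 }, { 30, 31 } };
    double vtime = 0;
    const struct toy_entity *next = pick_next(q, 3, &vtime);

    /* vtime jumps to 10; {12,18} is not yet eligible, so {10,25} is served. */
    printf("vtime=%.0f next finish=%.0f\n", vtime, next->finish);
    return 0;
}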
-+ */ -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, -+ int extract, -+ struct bfq_data *bfqd) -+{ -+ struct bfq_service_tree *st = sd->service_tree; -+ struct bfq_entity *entity; -+ int i = 0; -+ -+ BUG_ON(sd->in_service_entity); -+ -+ if (bfqd && -+ jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) { -+ entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, -+ true); -+ if (entity) { -+ i = BFQ_IOPRIO_CLASSES - 1; -+ bfqd->bfq_class_idle_last_service = jiffies; -+ sd->next_in_service = entity; -+ } -+ } -+ for (; i < BFQ_IOPRIO_CLASSES; i++) { -+ entity = __bfq_lookup_next_entity(st + i, false); -+ if (entity) { -+ if (extract) { -+ bfq_check_next_in_service(sd, entity); -+ bfq_active_extract(st + i, entity); -+ sd->in_service_entity = entity; -+ sd->next_in_service = NULL; -+ } -+ break; -+ } -+ } -+ -+ return entity; -+} -+ -+/* -+ * Get next queue for service. -+ */ -+static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) -+{ -+ struct bfq_entity *entity = NULL; -+ struct bfq_sched_data *sd; -+ struct bfq_queue *bfqq; -+ -+ BUG_ON(bfqd->in_service_queue); -+ -+ if (bfqd->busy_queues == 0) -+ return NULL; -+ -+ sd = &bfqd->root_group->sched_data; -+ for (; sd ; sd = entity->my_sched_data) { -+ entity = bfq_lookup_next_entity(sd, 1, bfqd); -+ BUG_ON(!entity); -+ entity->service = 0; -+ } -+ -+ bfqq = bfq_entity_to_bfqq(entity); -+ BUG_ON(!bfqq); -+ -+ return bfqq; -+} -+ -+static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) -+{ -+ if (bfqd->in_service_bic) { -+ put_io_context(bfqd->in_service_bic->icq.ioc); -+ bfqd->in_service_bic = NULL; -+ } -+ -+ bfqd->in_service_queue = NULL; -+ del_timer(&bfqd->idle_slice_timer); -+} -+ -+static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ int requeue) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ if (bfqq == bfqd->in_service_queue) -+ __bfq_bfqd_reset_in_service(bfqd); -+ -+ bfq_deactivate_entity(entity, requeue); -+} -+ -+static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ bfq_activate_entity(entity); -+} -+ -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+static void bfqg_stats_update_dequeue(struct bfq_group *bfqg); -+#endif -+ -+/* -+ * Called when the bfqq no longer has requests pending, remove it from -+ * the service tree. -+ */ -+static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ int requeue) -+{ -+ BUG_ON(!bfq_bfqq_busy(bfqq)); -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); -+ -+ bfq_log_bfqq(bfqd, bfqq, "del from busy"); -+ -+ bfq_clear_bfqq_busy(bfqq); -+ -+ BUG_ON(bfqd->busy_queues == 0); -+ bfqd->busy_queues--; -+ -+ if (!bfqq->dispatched) { -+ bfq_weights_tree_remove(bfqd, &bfqq->entity, -+ &bfqd->queue_weights_tree); -+ if (!blk_queue_nonrot(bfqd->queue)) { -+ BUG_ON(!bfqd->busy_in_flight_queues); -+ bfqd->busy_in_flight_queues--; -+ if (bfq_bfqq_constantly_seeky(bfqq)) { -+ BUG_ON(!bfqd-> -+ const_seeky_busy_in_flight_queues); -+ bfqd->const_seeky_busy_in_flight_queues--; -+ } -+ } -+ } -+ if (bfqq->wr_coeff > 1) -+ bfqd->wr_busy_queues--; -+ -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ bfqg_stats_update_dequeue(bfqq_group(bfqq)); -+#endif -+ -+ bfq_deactivate_bfqq(bfqd, bfqq, requeue); -+} -+ -+/* -+ * Called when an inactive queue receives a new request. 
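bfq_lookup_next_entity() layers a strict class priority (RT, then BE, then IDLE) on top of the per-class B-WF2Q+ trees, with one twist: the IDLE class is force-served once it has been starved longer than a timeout. Reduced to its control flow, with placeholder constants and types rather than the kernel's:

#include <stdbool.h>

#define NCLASSES 3            /* RT, BE, IDLE, in priority order */
#define IDLE_TIMEOUT 200      /* jiffies-like units, in the spirit of BFQ_CL_IDLE_TIMEOUT */

struct toy_tree { bool has_work; };

/* Serve the highest-priority non-empty class, but force the IDLE class
 * whenever it has waited more than IDLE_TIMEOUT ticks for service. */
static int pick_class(const struct toy_tree st[NCLASSES],
                      unsigned long now, unsigned long *idle_last_service)
{
    if (now - *idle_last_service > IDLE_TIMEOUT && st[NCLASSES - 1].has_work) {
        *idle_last_service = now;
        return NCLASSES - 1;
    }
    for (int i = 0; i < NCLASSES; i++)
        if (st[i].has_work)
            return i;
    return -1;                /* nothing to serve */
}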
-+ */ -+static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ BUG_ON(bfq_bfqq_busy(bfqq)); -+ BUG_ON(bfqq == bfqd->in_service_queue); -+ -+ bfq_log_bfqq(bfqd, bfqq, "add to busy"); -+ -+ bfq_activate_bfqq(bfqd, bfqq); -+ -+ bfq_mark_bfqq_busy(bfqq); -+ bfqd->busy_queues++; -+ -+ if (!bfqq->dispatched) { -+ if (bfqq->wr_coeff == 1) -+ bfq_weights_tree_add(bfqd, &bfqq->entity, -+ &bfqd->queue_weights_tree); -+ if (!blk_queue_nonrot(bfqd->queue)) { -+ bfqd->busy_in_flight_queues++; -+ if (bfq_bfqq_constantly_seeky(bfqq)) -+ bfqd->const_seeky_busy_in_flight_queues++; -+ } -+ } -+ if (bfqq->wr_coeff > 1) -+ bfqd->wr_busy_queues++; -+} -diff --git a/block/bfq.h b/block/bfq.h -new file mode 100644 -index 0000000..2bf54ae ---- /dev/null -+++ b/block/bfq.h -@@ -0,0 +1,801 @@ -+/* -+ * BFQ-v7r11 for 4.5.0: data structures and common functions prototypes. -+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe -+ * -+ * Copyright (C) 2008 Fabio Checconi -+ * Paolo Valente -+ * -+ * Copyright (C) 2010 Paolo Valente -+ */ -+ -+#ifndef _BFQ_H -+#define _BFQ_H -+ -+#include -+#include -+#include -+#include -+#include -+ -+#define BFQ_IOPRIO_CLASSES 3 -+#define BFQ_CL_IDLE_TIMEOUT (HZ/5) -+ -+#define BFQ_MIN_WEIGHT 1 -+#define BFQ_MAX_WEIGHT 1000 -+#define BFQ_WEIGHT_CONVERSION_COEFF 10 -+ -+#define BFQ_DEFAULT_QUEUE_IOPRIO 4 -+ -+#define BFQ_DEFAULT_GRP_WEIGHT 10 -+#define BFQ_DEFAULT_GRP_IOPRIO 0 -+#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE -+ -+struct bfq_entity; -+ -+/** -+ * struct bfq_service_tree - per ioprio_class service tree. -+ * @active: tree for active entities (i.e., those backlogged). -+ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i). -+ * @first_idle: idle entity with minimum F_i. -+ * @last_idle: idle entity with maximum F_i. -+ * @vtime: scheduler virtual time. -+ * @wsum: scheduler weight sum; active and idle entities contribute to it. -+ * -+ * Each service tree represents a B-WF2Q+ scheduler on its own. Each -+ * ioprio_class has its own independent scheduler, and so its own -+ * bfq_service_tree. All the fields are protected by the queue lock -+ * of the containing bfqd. -+ */ -+struct bfq_service_tree { -+ struct rb_root active; -+ struct rb_root idle; -+ -+ struct bfq_entity *first_idle; -+ struct bfq_entity *last_idle; -+ -+ u64 vtime; -+ unsigned long wsum; -+}; -+ -+/** -+ * struct bfq_sched_data - multi-class scheduler. -+ * @in_service_entity: entity in service. -+ * @next_in_service: head-of-the-line entity in the scheduler. -+ * @service_tree: array of service trees, one per ioprio_class. -+ * -+ * bfq_sched_data is the basic scheduler queue. It supports three -+ * ioprio_classes, and can be used either as a toplevel queue or as -+ * an intermediate queue on a hierarchical setup. -+ * @next_in_service points to the active entity of the sched_data -+ * service trees that will be scheduled next. -+ * -+ * The supported ioprio_classes are the same as in CFQ, in descending -+ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. -+ * Requests from higher priority queues are served before all the -+ * requests from lower priority queues; among requests of the same -+ * queue requests are served according to B-WF2Q+. -+ * All the fields are protected by the queue lock of the containing bfqd. 
-+ */ -+struct bfq_sched_data { -+ struct bfq_entity *in_service_entity; -+ struct bfq_entity *next_in_service; -+ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; -+}; -+ -+/** -+ * struct bfq_weight_counter - counter of the number of all active entities -+ * with a given weight. -+ * @weight: weight of the entities that this counter refers to. -+ * @num_active: number of active entities with this weight. -+ * @weights_node: weights tree member (see bfq_data's @queue_weights_tree -+ * and @group_weights_tree). -+ */ -+struct bfq_weight_counter { -+ short int weight; -+ unsigned int num_active; -+ struct rb_node weights_node; -+}; -+ -+/** -+ * struct bfq_entity - schedulable entity. -+ * @rb_node: service_tree member. -+ * @weight_counter: pointer to the weight counter associated with this entity. -+ * @on_st: flag, true if the entity is on a tree (either the active or -+ * the idle one of its service_tree). -+ * @finish: B-WF2Q+ finish timestamp (aka F_i). -+ * @start: B-WF2Q+ start timestamp (aka S_i). -+ * @tree: tree the entity is enqueued into; %NULL if not on a tree. -+ * @min_start: minimum start time of the (active) subtree rooted at -+ * this entity; used for O(log N) lookups into active trees. -+ * @service: service received during the last round of service. -+ * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight. -+ * @weight: weight of the queue -+ * @parent: parent entity, for hierarchical scheduling. -+ * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the -+ * associated scheduler queue, %NULL on leaf nodes. -+ * @sched_data: the scheduler queue this entity belongs to. -+ * @ioprio: the ioprio in use. -+ * @new_weight: when a weight change is requested, the new weight value. -+ * @orig_weight: original weight, used to implement weight boosting -+ * @prio_changed: flag, true when the user requested a weight, ioprio or -+ * ioprio_class change. -+ * -+ * A bfq_entity is used to represent either a bfq_queue (leaf node in the -+ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each -+ * entity belongs to the sched_data of the parent group in the cgroup -+ * hierarchy. Non-leaf entities have also their own sched_data, stored -+ * in @my_sched_data. -+ * -+ * Each entity stores independently its priority values; this would -+ * allow different weights on different devices, but this -+ * functionality is not exported to userspace by now. Priorities and -+ * weights are updated lazily, first storing the new values into the -+ * new_* fields, then setting the @prio_changed flag. As soon as -+ * there is a transition in the entity state that allows the priority -+ * update to take place the effective and the requested priority -+ * values are synchronized. -+ * -+ * Unless cgroups are used, the weight value is calculated from the -+ * ioprio to export the same interface as CFQ. When dealing with -+ * ``well-behaved'' queues (i.e., queues that do not spend too much -+ * time to consume their budget and have true sequential behavior, and -+ * when there are no external factors breaking anticipation) the -+ * relative weights at each level of the cgroups hierarchy should be -+ * guaranteed. All the fields are protected by the queue lock of the -+ * containing bfqd. 
-+ */ -+struct bfq_entity { -+ struct rb_node rb_node; -+ struct bfq_weight_counter *weight_counter; -+ -+ int on_st; -+ -+ u64 finish; -+ u64 start; -+ -+ struct rb_root *tree; -+ -+ u64 min_start; -+ -+ int service, budget; -+ unsigned short weight, new_weight; -+ unsigned short orig_weight; -+ -+ struct bfq_entity *parent; -+ -+ struct bfq_sched_data *my_sched_data; -+ struct bfq_sched_data *sched_data; -+ -+ int prio_changed; -+}; -+ -+struct bfq_group; -+ -+/** -+ * struct bfq_queue - leaf schedulable entity. -+ * @ref: reference counter. -+ * @bfqd: parent bfq_data. -+ * @new_ioprio: when an ioprio change is requested, the new ioprio value. -+ * @ioprio_class: the ioprio_class in use. -+ * @new_ioprio_class: when an ioprio_class change is requested, the new -+ * ioprio_class value. -+ * @new_bfqq: shared bfq_queue if queue is cooperating with -+ * one or more other queues. -+ * @sort_list: sorted list of pending requests. -+ * @next_rq: if fifo isn't expired, next request to serve. -+ * @queued: nr of requests queued in @sort_list. -+ * @allocated: currently allocated requests. -+ * @meta_pending: pending metadata requests. -+ * @fifo: fifo list of requests in sort_list. -+ * @entity: entity representing this queue in the scheduler. -+ * @max_budget: maximum budget allowed from the feedback mechanism. -+ * @budget_timeout: budget expiration (in jiffies). -+ * @dispatched: number of requests on the dispatch list or inside driver. -+ * @flags: status flags. -+ * @bfqq_list: node for active/idle bfqq list inside our bfqd. -+ * @burst_list_node: node for the device's burst list. -+ * @seek_samples: number of seeks sampled -+ * @seek_total: sum of the distances of the seeks sampled -+ * @seek_mean: mean seek distance -+ * @last_request_pos: position of the last request enqueued -+ * @requests_within_timer: number of consecutive pairs of request completion -+ * and arrival, such that the queue becomes idle -+ * after the completion, but the next request arrives -+ * within an idle time slice; used only if the queue's -+ * IO_bound has been cleared. -+ * @pid: pid of the process owning the queue, used for logging purposes. -+ * @last_wr_start_finish: start time of the current weight-raising period if -+ * the @bfq-queue is being weight-raised, otherwise -+ * finish time of the last weight-raising period -+ * @wr_cur_max_time: current max raising time for this queue -+ * @soft_rt_next_start: minimum time instant such that, only if a new -+ * request is enqueued after this time instant in an -+ * idle @bfq_queue with no outstanding requests, then -+ * the task associated with the queue it is deemed as -+ * soft real-time (see the comments to the function -+ * bfq_bfqq_softrt_next_start()) -+ * @last_idle_bklogged: time of the last transition of the @bfq_queue from -+ * idle to backlogged -+ * @service_from_backlogged: cumulative service received from the @bfq_queue -+ * since the last transition from idle to -+ * backlogged -+ * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the -+ * queue is shared -+ * -+ * A bfq_queue is a leaf request queue; it can be associated with an -+ * io_context or more, if it is async or shared between cooperating -+ * processes. @cgroup holds a reference to the cgroup, to be sure that it -+ * does not disappear while a bfqq still references it (mostly to avoid -+ * races between request issuing and task migration followed by cgroup -+ * destruction). -+ * All the fields are protected by the queue lock of the containing bfqd. 
-+ */ -+struct bfq_queue { -+ atomic_t ref; -+ struct bfq_data *bfqd; -+ -+ unsigned short ioprio, new_ioprio; -+ unsigned short ioprio_class, new_ioprio_class; -+ -+ /* fields for cooperating queues handling */ -+ struct bfq_queue *new_bfqq; -+ struct rb_node pos_node; -+ struct rb_root *pos_root; -+ -+ struct rb_root sort_list; -+ struct request *next_rq; -+ int queued[2]; -+ int allocated[2]; -+ int meta_pending; -+ struct list_head fifo; -+ -+ struct bfq_entity entity; -+ -+ int max_budget; -+ unsigned long budget_timeout; -+ -+ int dispatched; -+ -+ unsigned int flags; -+ -+ struct list_head bfqq_list; -+ -+ struct hlist_node burst_list_node; -+ -+ unsigned int seek_samples; -+ u64 seek_total; -+ sector_t seek_mean; -+ sector_t last_request_pos; -+ -+ unsigned int requests_within_timer; -+ -+ pid_t pid; -+ struct bfq_io_cq *bic; -+ -+ /* weight-raising fields */ -+ unsigned long wr_cur_max_time; -+ unsigned long soft_rt_next_start; -+ unsigned long last_wr_start_finish; -+ unsigned int wr_coeff; -+ unsigned long last_idle_bklogged; -+ unsigned long service_from_backlogged; -+}; -+ -+/** -+ * struct bfq_ttime - per process thinktime stats. -+ * @ttime_total: total process thinktime -+ * @ttime_samples: number of thinktime samples -+ * @ttime_mean: average process thinktime -+ */ -+struct bfq_ttime { -+ unsigned long last_end_request; -+ -+ unsigned long ttime_total; -+ unsigned long ttime_samples; -+ unsigned long ttime_mean; -+}; -+ -+/** -+ * struct bfq_io_cq - per (request_queue, io_context) structure. -+ * @icq: associated io_cq structure -+ * @bfqq: array of two process queues, the sync and the async -+ * @ttime: associated @bfq_ttime struct -+ * @ioprio: per (request_queue, blkcg) ioprio. -+ * @blkcg_id: id of the blkcg the related io_cq belongs to. -+ */ -+struct bfq_io_cq { -+ struct io_cq icq; /* must be the first member */ -+ struct bfq_queue *bfqq[2]; -+ struct bfq_ttime ttime; -+ int ioprio; -+ -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ uint64_t blkcg_id; /* the current blkcg ID */ -+#endif -+}; -+ -+enum bfq_device_speed { -+ BFQ_BFQD_FAST, -+ BFQ_BFQD_SLOW, -+}; -+ -+/** -+ * struct bfq_data - per device data structure. -+ * @queue: request queue for the managed device. -+ * @root_group: root bfq_group for the device. -+ * @active_numerous_groups: number of bfq_groups containing more than one -+ * active @bfq_entity. -+ * @queue_weights_tree: rbtree of weight counters of @bfq_queues, sorted by -+ * weight. Used to keep track of whether all @bfq_queues -+ * have the same weight. The tree contains one counter -+ * for each distinct weight associated to some active -+ * and not weight-raised @bfq_queue (see the comments to -+ * the functions bfq_weights_tree_[add|remove] for -+ * further details). -+ * @group_weights_tree: rbtree of non-queue @bfq_entity weight counters, sorted -+ * by weight. Used to keep track of whether all -+ * @bfq_groups have the same weight. The tree contains -+ * one counter for each distinct weight associated to -+ * some active @bfq_group (see the comments to the -+ * functions bfq_weights_tree_[add|remove] for further -+ * details). -+ * @busy_queues: number of bfq_queues containing requests (including the -+ * queue in service, even if it is idling). -+ * @busy_in_flight_queues: number of @bfq_queues containing pending or -+ * in-flight requests, plus the @bfq_queue in -+ * service, even if idle but waiting for the -+ * possible arrival of its next sync request. 
This -+ * field is updated only if the device is rotational, -+ * but used only if the device is also NCQ-capable. -+ * The reason why the field is updated also for non- -+ * NCQ-capable rotational devices is related to the -+ * fact that the value of @hw_tag may be set also -+ * later than when busy_in_flight_queues may need to -+ * be incremented for the first time(s). Taking also -+ * this possibility into account, to avoid unbalanced -+ * increments/decrements, would imply more overhead -+ * than just updating busy_in_flight_queues -+ * regardless of the value of @hw_tag. -+ * @const_seeky_busy_in_flight_queues: number of constantly-seeky @bfq_queues -+ * (that is, seeky queues that expired -+ * for budget timeout at least once) -+ * containing pending or in-flight -+ * requests, including the in-service -+ * @bfq_queue if constantly seeky. This -+ * field is updated only if the device -+ * is rotational, but used only if the -+ * device is also NCQ-capable (see the -+ * comments to @busy_in_flight_queues). -+ * @wr_busy_queues: number of weight-raised busy @bfq_queues. -+ * @queued: number of queued requests. -+ * @rq_in_driver: number of requests dispatched and waiting for completion. -+ * @sync_flight: number of sync requests in the driver. -+ * @max_rq_in_driver: max number of reqs in driver in the last -+ * @hw_tag_samples completed requests. -+ * @hw_tag_samples: nr of samples used to calculate hw_tag. -+ * @hw_tag: flag set to one if the driver is showing a queueing behavior. -+ * @budgets_assigned: number of budgets assigned. -+ * @idle_slice_timer: timer set when idling for the next sequential request -+ * from the queue in service. -+ * @unplug_work: delayed work to restart dispatching on the request queue. -+ * @in_service_queue: bfq_queue in service. -+ * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue. -+ * @last_position: on-disk position of the last served request. -+ * @last_budget_start: beginning of the last budget. -+ * @last_idling_start: beginning of the last idle slice. -+ * @peak_rate: peak transfer rate observed for a budget. -+ * @peak_rate_samples: number of samples used to calculate @peak_rate. -+ * @bfq_max_budget: maximum budget allotted to a bfq_queue before -+ * rescheduling. -+ * @active_list: list of all the bfq_queues active on the device. -+ * @idle_list: list of all the bfq_queues idle on the device. -+ * @bfq_fifo_expire: timeout for async/sync requests; when it expires -+ * requests are served in fifo order. -+ * @bfq_back_penalty: weight of backward seeks wrt forward ones. -+ * @bfq_back_max: maximum allowed backward seek. -+ * @bfq_slice_idle: maximum idling time. -+ * @bfq_user_max_budget: user-configured max budget value -+ * (0 for auto-tuning). -+ * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to -+ * async queues. -+ * @bfq_timeout: timeout for bfq_queues to consume their budget; used to -+ * to prevent seeky queues to impose long latencies to well -+ * behaved ones (this also implies that seeky queues cannot -+ * receive guarantees in the service domain; after a timeout -+ * they are charged for the whole allocated budget, to try -+ * to preserve a behavior reasonably fair among them, but -+ * without service-domain guarantees). -+ * @bfq_coop_thresh: number of queue merges after which a @bfq_queue is -+ * no more granted any weight-raising. 
-+ * @bfq_failed_cooperations: number of consecutive failed cooperation -+ * chances after which weight-raising is restored -+ * to a queue subject to more than bfq_coop_thresh -+ * queue merges. -+ * @bfq_requests_within_timer: number of consecutive requests that must be -+ * issued within the idle time slice to set -+ * again idling to a queue which was marked as -+ * non-I/O-bound (see the definition of the -+ * IO_bound flag for further details). -+ * @last_ins_in_burst: last time at which a queue entered the current -+ * burst of queues being activated shortly after -+ * each other; for more details about this and the -+ * following parameters related to a burst of -+ * activations, see the comments to the function -+ * @bfq_handle_burst. -+ * @bfq_burst_interval: reference time interval used to decide whether a -+ * queue has been activated shortly after -+ * @last_ins_in_burst. -+ * @burst_size: number of queues in the current burst of queue activations. -+ * @bfq_large_burst_thresh: maximum burst size above which the current -+ * queue-activation burst is deemed as 'large'. -+ * @large_burst: true if a large queue-activation burst is in progress. -+ * @burst_list: head of the burst list (as for the above fields, more details -+ * in the comments to the function bfq_handle_burst). -+ * @low_latency: if set to true, low-latency heuristics are enabled. -+ * @bfq_wr_coeff: maximum factor by which the weight of a weight-raised -+ * queue is multiplied. -+ * @bfq_wr_max_time: maximum duration of a weight-raising period (jiffies). -+ * @bfq_wr_rt_max_time: maximum duration for soft real-time processes. -+ * @bfq_wr_min_idle_time: minimum idle period after which weight-raising -+ * may be reactivated for a queue (in jiffies). -+ * @bfq_wr_min_inter_arr_async: minimum period between request arrivals -+ * after which weight-raising may be -+ * reactivated for an already busy queue -+ * (in jiffies). -+ * @bfq_wr_max_softrt_rate: max service-rate for a soft real-time queue, -+ * sectors per seconds. -+ * @RT_prod: cached value of the product R*T used for computing the maximum -+ * duration of the weight raising automatically. -+ * @device_speed: device-speed class for the low-latency heuristic. -+ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions. -+ * -+ * All the fields are protected by the @queue lock. 
-+ */ -+struct bfq_data { -+ struct request_queue *queue; -+ -+ struct bfq_group *root_group; -+ -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ int active_numerous_groups; -+#endif -+ -+ struct rb_root queue_weights_tree; -+ struct rb_root group_weights_tree; -+ -+ int busy_queues; -+ int busy_in_flight_queues; -+ int const_seeky_busy_in_flight_queues; -+ int wr_busy_queues; -+ int queued; -+ int rq_in_driver; -+ int sync_flight; -+ -+ int max_rq_in_driver; -+ int hw_tag_samples; -+ int hw_tag; -+ -+ int budgets_assigned; -+ -+ struct timer_list idle_slice_timer; -+ struct work_struct unplug_work; -+ -+ struct bfq_queue *in_service_queue; -+ struct bfq_io_cq *in_service_bic; -+ -+ sector_t last_position; -+ -+ ktime_t last_budget_start; -+ ktime_t last_idling_start; -+ int peak_rate_samples; -+ u64 peak_rate; -+ int bfq_max_budget; -+ -+ struct list_head active_list; -+ struct list_head idle_list; -+ -+ unsigned int bfq_fifo_expire[2]; -+ unsigned int bfq_back_penalty; -+ unsigned int bfq_back_max; -+ unsigned int bfq_slice_idle; -+ u64 bfq_class_idle_last_service; -+ -+ int bfq_user_max_budget; -+ int bfq_max_budget_async_rq; -+ unsigned int bfq_timeout[2]; -+ -+ unsigned int bfq_coop_thresh; -+ unsigned int bfq_failed_cooperations; -+ unsigned int bfq_requests_within_timer; -+ -+ unsigned long last_ins_in_burst; -+ unsigned long bfq_burst_interval; -+ int burst_size; -+ unsigned long bfq_large_burst_thresh; -+ bool large_burst; -+ struct hlist_head burst_list; -+ -+ bool low_latency; -+ -+ /* parameters of the low_latency heuristics */ -+ unsigned int bfq_wr_coeff; -+ unsigned int bfq_wr_max_time; -+ unsigned int bfq_wr_rt_max_time; -+ unsigned int bfq_wr_min_idle_time; -+ unsigned long bfq_wr_min_inter_arr_async; -+ unsigned int bfq_wr_max_softrt_rate; -+ u64 RT_prod; -+ enum bfq_device_speed device_speed; -+ -+ struct bfq_queue oom_bfqq; -+}; -+ -+enum bfqq_state_flags { -+ BFQ_BFQQ_FLAG_busy = 0, /* has requests or is in service */ -+ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ -+ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ -+ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ -+ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */ -+ BFQ_BFQQ_FLAG_sync, /* synchronous queue */ -+ BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */ -+ BFQ_BFQQ_FLAG_IO_bound, /* -+ * bfqq has timed-out at least once -+ * having consumed at most 2/10 of -+ * its budget -+ */ -+ BFQ_BFQQ_FLAG_in_large_burst, /* -+ * bfqq activated in a large burst, -+ * see comments to bfq_handle_burst. -+ */ -+ BFQ_BFQQ_FLAG_constantly_seeky, /* -+ * bfqq has proved to be slow and -+ * seeky until budget timeout -+ */ -+ BFQ_BFQQ_FLAG_softrt_update, /* -+ * may need softrt-next-start -+ * update -+ */ -+}; -+ -+#define BFQ_BFQQ_FNS(name) \ -+static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ -+{ \ -+ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \ -+} \ -+static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ -+{ \ -+ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \ -+} \ -+static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ -+{ \ -+ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ -+} -+ -+BFQ_BFQQ_FNS(busy); -+BFQ_BFQQ_FNS(wait_request); -+BFQ_BFQQ_FNS(must_alloc); -+BFQ_BFQQ_FNS(fifo_expire); -+BFQ_BFQQ_FNS(idle_window); -+BFQ_BFQQ_FNS(sync); -+BFQ_BFQQ_FNS(budget_new); -+BFQ_BFQQ_FNS(IO_bound); -+BFQ_BFQQ_FNS(in_large_burst); -+BFQ_BFQQ_FNS(constantly_seeky); -+BFQ_BFQQ_FNS(softrt_update); -+#undef BFQ_BFQQ_FNS -+ -+/* Logging facilities. 
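BFQ_BFQQ_FNS() above is a small code generator: one invocation per flag stamps out a mark/clear/test triple over the per-queue flags word. The same pattern in a self-contained user-space form, to show what the generated helpers look like and how callers use them:

#include <stdio.h>

struct toy_queue { unsigned int flags; };

enum toy_flags { TOY_FLAG_busy = 0, TOY_FLAG_sync };

#define TOY_FNS(name)                                                   \
static void toy_mark_##name(struct toy_queue *q)                        \
{ q->flags |= (1U << TOY_FLAG_##name); }                                \
static void toy_clear_##name(struct toy_queue *q)                       \
{ q->flags &= ~(1U << TOY_FLAG_##name); }                               \
static int toy_##name(const struct toy_queue *q)                        \
{ return (q->flags & (1U << TOY_FLAG_##name)) != 0; }

TOY_FNS(busy)
TOY_FNS(sync)

int main(void)
{
    struct toy_queue q = { 0 };

    toy_mark_busy(&q);
    printf("busy=%d sync=%d\n", toy_busy(&q), toy_sync(&q));
    toy_clear_busy(&q);
    printf("busy=%d\n", toy_busy(&q));
    return 0;
}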
*/ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ -+ blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args) -+ -+#define bfq_log(bfqd, fmt, args...) \ -+ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) -+ -+/* Expiration reasons. */ -+enum bfqq_expiration { -+ BFQ_BFQQ_TOO_IDLE = 0, /* -+ * queue has been idling for -+ * too long -+ */ -+ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ -+ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ -+ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ -+}; -+ -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ -+struct bfqg_stats { -+ /* total bytes transferred */ -+ struct blkg_rwstat service_bytes; -+ /* total IOs serviced, post merge */ -+ struct blkg_rwstat serviced; -+ /* number of ios merged */ -+ struct blkg_rwstat merged; -+ /* total time spent on device in ns, may not be accurate w/ queueing */ -+ struct blkg_rwstat service_time; -+ /* total time spent waiting in scheduler queue in ns */ -+ struct blkg_rwstat wait_time; -+ /* number of IOs queued up */ -+ struct blkg_rwstat queued; -+ /* total sectors transferred */ -+ struct blkg_stat sectors; -+ /* total disk time and nr sectors dispatched by this group */ -+ struct blkg_stat time; -+ /* time not charged to this cgroup */ -+ struct blkg_stat unaccounted_time; -+ /* sum of number of ios queued across all samples */ -+ struct blkg_stat avg_queue_size_sum; -+ /* count of samples taken for average */ -+ struct blkg_stat avg_queue_size_samples; -+ /* how many times this group has been removed from service tree */ -+ struct blkg_stat dequeue; -+ /* total time spent waiting for it to be assigned a timeslice. */ -+ struct blkg_stat group_wait_time; -+ /* time spent idling for this blkcg_gq */ -+ struct blkg_stat idle_time; -+ /* total time with empty current active q with other requests queued */ -+ struct blkg_stat empty_time; -+ /* fields after this shouldn't be cleared on stat reset */ -+ uint64_t start_group_wait_time; -+ uint64_t start_idle_time; -+ uint64_t start_empty_time; -+ uint16_t flags; -+}; -+ -+/* -+ * struct bfq_group_data - per-blkcg storage for the blkio subsystem. -+ * -+ * @ps: @blkcg_policy_storage that this structure inherits -+ * @weight: weight of the bfq_group -+ */ -+struct bfq_group_data { -+ /* must be the first member */ -+ struct blkcg_policy_data pd; -+ -+ unsigned short weight; -+}; -+ -+/** -+ * struct bfq_group - per (device, cgroup) data structure. -+ * @entity: schedulable entity to insert into the parent group sched_data. -+ * @sched_data: own sched_data, to contain child entities (they may be -+ * both bfq_queues and bfq_groups). -+ * @bfqd: the bfq_data for the device this group acts upon. -+ * @async_bfqq: array of async queues for all the tasks belonging to -+ * the group, one queue per ioprio value per ioprio_class, -+ * except for the idle class that has only one queue. -+ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). -+ * @my_entity: pointer to @entity, %NULL for the toplevel group; used -+ * to avoid too many special cases during group creation/ -+ * migration. -+ * @active_entities: number of active entities belonging to the group; -+ * unused for the root group. Used to know whether there -+ * are groups with more than one active @bfq_entity -+ * (see the comments to the function -+ * bfq_bfqq_must_not_expire()). 
-+ * -+ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup -+ * there is a set of bfq_groups, each one collecting the lower-level -+ * entities belonging to the group that are acting on the same device. -+ * -+ * Locking works as follows: -+ * o @bfqd is protected by the queue lock, RCU is used to access it -+ * from the readers. -+ * o All the other fields are protected by the @bfqd queue lock. -+ */ -+struct bfq_group { -+ /* must be the first member */ -+ struct blkg_policy_data pd; -+ -+ struct bfq_entity entity; -+ struct bfq_sched_data sched_data; -+ -+ void *bfqd; -+ -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; -+ struct bfq_queue *async_idle_bfqq; -+ -+ struct bfq_entity *my_entity; -+ -+ int active_entities; -+ -+ struct bfqg_stats stats; -+ struct bfqg_stats dead_stats; /* stats pushed from dead children */ -+}; -+ -+#else -+struct bfq_group { -+ struct bfq_sched_data sched_data; -+ -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; -+ struct bfq_queue *async_idle_bfqq; -+}; -+#endif -+ -+static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); -+ -+static struct bfq_service_tree * -+bfq_entity_service_tree(struct bfq_entity *entity) -+{ -+ struct bfq_sched_data *sched_data = entity->sched_data; -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ unsigned int idx = bfqq ? bfqq->ioprio_class - 1 : -+ BFQ_DEFAULT_GRP_CLASS; -+ -+ BUG_ON(idx >= BFQ_IOPRIO_CLASSES); -+ BUG_ON(sched_data == NULL); -+ -+ return sched_data->service_tree + idx; -+} -+ -+static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) -+{ -+ return bic->bfqq[is_sync]; -+} -+ -+static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, -+ bool is_sync) -+{ -+ bic->bfqq[is_sync] = bfqq; -+} -+ -+static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) -+{ -+ return bic->icq.q->elevator->elevator_data; -+} -+ -+/** -+ * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer. -+ * @ptr: a pointer to a bfqd. -+ * @flags: storage for the flags to be saved. -+ * -+ * This function allows bfqg->bfqd to be protected by the -+ * queue lock of the bfqd they reference; the pointer is dereferenced -+ * under RCU, so the storage for bfqd is assured to be safe as long -+ * as the RCU read side critical section does not end. After the -+ * bfqd->queue->queue_lock is taken the pointer is rechecked, to be -+ * sure that no other writer accessed it. If we raced with a writer, -+ * the function returns NULL, with the queue unlocked, otherwise it -+ * returns the dereferenced pointer, with the queue locked. 
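bfq_get_bfqd_locked() is an instance of a common "read the pointer, take the lock, re-validate" pattern. A user-space restatement with a mutex and a C11 atomic pointer; this deliberately ignores the object-lifetime problem that RCU solves in the kernel version and assumes the pointed-to object outlives the call:

#include <pthread.h>
#include <stdatomic.h>
#include <stddef.h>

struct dev_data { pthread_mutex_t lock; };

/* Published pointer; a writer clears it before tearing the object down. */
static _Atomic(struct dev_data *) current_dev;

static struct dev_data *get_dev_locked(void)
{
    struct dev_data *d = atomic_load(&current_dev);   /* unlocked snapshot */

    if (!d)
        return NULL;
    pthread_mutex_lock(&d->lock);
    if (atomic_load(&current_dev) == d)
        return d;                      /* still current: hand it back locked */
    pthread_mutex_unlock(&d->lock);    /* lost a race with a writer */
    return NULL;
}

int main(void)
{
    struct dev_data d;

    pthread_mutex_init(&d.lock, NULL);
    atomic_store(&current_dev, &d);

    struct dev_data *held = get_dev_locked();
    if (held)
        pthread_mutex_unlock(&held->lock);
    return 0;
}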
-+ */ -+static struct bfq_data *bfq_get_bfqd_locked(void **ptr, unsigned long *flags) -+{ -+ struct bfq_data *bfqd; -+ -+ rcu_read_lock(); -+ bfqd = rcu_dereference(*(struct bfq_data **)ptr); -+ -+ if (bfqd != NULL) { -+ spin_lock_irqsave(bfqd->queue->queue_lock, *flags); -+ if (ptr == NULL) -+ printk(KERN_CRIT "get_bfqd_locked pointer NULL\n"); -+ else if (*ptr == bfqd) -+ goto out; -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); -+ } -+ -+ bfqd = NULL; -+out: -+ rcu_read_unlock(); -+ return bfqd; -+} -+ -+static void bfq_put_bfqd_unlock(struct bfq_data *bfqd, unsigned long *flags) -+{ -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); -+} -+ -+static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); -+static void bfq_put_queue(struct bfq_queue *bfqq); -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); -+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, -+ struct bio *bio, int is_sync, -+ struct bfq_io_cq *bic, gfp_t gfp_mask); -+static void bfq_end_wr_async_queues(struct bfq_data *bfqd, -+ struct bfq_group *bfqg); -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); -+ -+#endif /* _BFQ_H */ --- -2.10.0 - diff --git a/0003-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r11-for.patch b/0003-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r11-for.patch deleted file mode 100644 index 8a03246..0000000 --- a/0003-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r11-for.patch +++ /dev/null @@ -1,1101 +0,0 @@ -From d1d10983cb4b593e7970e541c4c2721bbbdc21c8 Mon Sep 17 00:00:00 2001 -From: Mauro Andreolini -Date: Sun, 6 Sep 2015 16:09:05 +0200 -Subject: [PATCH 3/4] block, bfq: add Early Queue Merge (EQM) to BFQ-v7r11 for - 4.11.0 - -A set of processes may happen to perform interleaved reads, i.e.,requests -whose union would give rise to a sequential read pattern. There are two -typical cases: in the first case, processes read fixed-size chunks of -data at a fixed distance from each other, while in the second case processes -may read variable-size chunks at variable distances. The latter case occurs -for example with QEMU, which splits the I/O generated by the guest into -multiple chunks, and lets these chunks be served by a pool of cooperating -processes, iteratively assigning the next chunk of I/O to the first -available process. CFQ uses actual queue merging for the first type of -rocesses, whereas it uses preemption to get a sequential read pattern out -of the read requests performed by the second type of processes. In the end -it uses two different mechanisms to achieve the same goal: boosting the -throughput with interleaved I/O. - -This patch introduces Early Queue Merge (EQM), a unified mechanism to get a -sequential read pattern with both types of processes. The main idea is -checking newly arrived requests against the next request of the active queue -both in case of actual request insert and in case of request merge. By doing -so, both the types of processes can be handled by just merging their queues. -EQM is then simpler and more compact than the pair of mechanisms used in -CFQ. - -Finally, EQM also preserves the typical low-latency properties of BFQ, by -properly restoring the weight-raising state of a queue when it gets back to -a non-merged state. 
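The scenario this commit message describes is easy to picture with numbers: two cooperating readers each fetch every other fixed-size chunk of the same file, so neither stream is sequential on its own, yet one request always starts exactly where the other one ended. A toy proximity check in that spirit; the chunk size and threshold are made up for the example, and this is not the EQM code itself:

#include <stdbool.h>
#include <stdio.h>

#define CHUNK 8     /* sectors per request, arbitrary for the example */

static bool close_to(unsigned long long a, unsigned long long b)
{
    return (a > b ? a - b : b - a) <= CHUNK;   /* sector-proximity test */
}

int main(void)
{
    unsigned long long next_a = 0, next_b = CHUNK;   /* readers A and B */

    for (int i = 0; i < 4; i++) {
        printf("A reads %llu, B reads %llu\n", next_a, next_b);
        /* B's request starts right where A's ends: a merged queue would see
         * 0, 8, 16, 24, ... that is, one sequential stream. */
        if (close_to(next_a + CHUNK, next_b))
            printf("  -> queues look mergeable\n");
        next_a += 2 * CHUNK;
        next_b += 2 * CHUNK;
    }
    return 0;
}

Merging the two queues (or, in CFQ's case, letting one preempt the other) is what turns this interleaved pattern back into sequential I/O and recovers the throughput.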
- -Signed-off-by: Mauro Andreolini -Signed-off-by: Arianna Avanzini -Signed-off-by: Paolo Valente -Signed-off-by: Linus Walleij ---- - block/bfq-cgroup.c | 5 + - block/bfq-iosched.c | 685 +++++++++++++++++++++++++++++++++++++++++++++++++++- - block/bfq.h | 66 +++++ - 3 files changed, 743 insertions(+), 13 deletions(-) - -diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c -index 8b08a57..0367996 100644 ---- a/block/bfq-cgroup.c -+++ b/block/bfq-cgroup.c -@@ -440,6 +440,7 @@ static void bfq_pd_init(struct blkg_policy_data *pd) - */ - bfqg->bfqd = bfqd; - bfqg->active_entities = 0; -+ bfqg->rq_pos_tree = RB_ROOT; - } - - static void bfq_pd_free(struct blkg_policy_data *pd) -@@ -533,6 +534,9 @@ static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, - return bfqg; - } - -+static void bfq_pos_tree_add_move(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq); -+ - /** - * bfq_bfqq_move - migrate @bfqq to @bfqg. - * @bfqd: queue descriptor. -@@ -580,6 +584,7 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bfqg_get(bfqg); - - if (busy) { -+ bfq_pos_tree_add_move(bfqd, bfqq); - if (resume) - bfq_activate_bfqq(bfqd, bfqq); - } -diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c -index 85e2169..cf3e9b1 100644 ---- a/block/bfq-iosched.c -+++ b/block/bfq-iosched.c -@@ -295,6 +295,72 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd, - } - } - -+static struct bfq_queue * -+bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, -+ sector_t sector, struct rb_node **ret_parent, -+ struct rb_node ***rb_link) -+{ -+ struct rb_node **p, *parent; -+ struct bfq_queue *bfqq = NULL; -+ -+ parent = NULL; -+ p = &root->rb_node; -+ while (*p) { -+ struct rb_node **n; -+ -+ parent = *p; -+ bfqq = rb_entry(parent, struct bfq_queue, pos_node); -+ -+ /* -+ * Sort strictly based on sector. Smallest to the left, -+ * largest to the right. -+ */ -+ if (sector > blk_rq_pos(bfqq->next_rq)) -+ n = &(*p)->rb_right; -+ else if (sector < blk_rq_pos(bfqq->next_rq)) -+ n = &(*p)->rb_left; -+ else -+ break; -+ p = n; -+ bfqq = NULL; -+ } -+ -+ *ret_parent = parent; -+ if (rb_link) -+ *rb_link = p; -+ -+ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", -+ (unsigned long long) sector, -+ bfqq ? bfqq->pid : 0); -+ -+ return bfqq; -+} -+ -+static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct rb_node **p, *parent; -+ struct bfq_queue *__bfqq; -+ -+ if (bfqq->pos_root) { -+ rb_erase(&bfqq->pos_node, bfqq->pos_root); -+ bfqq->pos_root = NULL; -+ } -+ -+ if (bfq_class_idle(bfqq)) -+ return; -+ if (!bfqq->next_rq) -+ return; -+ -+ bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, -+ blk_rq_pos(bfqq->next_rq), &parent, &p); -+ if (!__bfqq) { -+ rb_link_node(&bfqq->pos_node, parent, p); -+ rb_insert_color(&bfqq->pos_node, bfqq->pos_root); -+ } else -+ bfqq->pos_root = NULL; -+} -+ - /* - * Tell whether there are active queues or groups with differentiated weights. - */ -@@ -527,6 +593,57 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd) - return dur; - } - -+static unsigned int bfq_bfqq_cooperations(struct bfq_queue *bfqq) -+{ -+ return bfqq->bic ? 
bfqq->bic->cooperations : 0; -+} -+ -+static void -+bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic) -+{ -+ if (bic->saved_idle_window) -+ bfq_mark_bfqq_idle_window(bfqq); -+ else -+ bfq_clear_bfqq_idle_window(bfqq); -+ if (bic->saved_IO_bound) -+ bfq_mark_bfqq_IO_bound(bfqq); -+ else -+ bfq_clear_bfqq_IO_bound(bfqq); -+ /* Assuming that the flag in_large_burst is already correctly set */ -+ if (bic->wr_time_left && bfqq->bfqd->low_latency && -+ !bfq_bfqq_in_large_burst(bfqq) && -+ bic->cooperations < bfqq->bfqd->bfq_coop_thresh) { -+ /* -+ * Start a weight raising period with the duration given by -+ * the raising_time_left snapshot. -+ */ -+ if (bfq_bfqq_busy(bfqq)) -+ bfqq->bfqd->wr_busy_queues++; -+ bfqq->wr_coeff = bfqq->bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bic->wr_time_left; -+ bfqq->last_wr_start_finish = jiffies; -+ bfqq->entity.prio_changed = 1; -+ } -+ /* -+ * Clear wr_time_left to prevent bfq_bfqq_save_state() from -+ * getting confused about the queue's need of a weight-raising -+ * period. -+ */ -+ bic->wr_time_left = 0; -+} -+ -+static int bfqq_process_refs(struct bfq_queue *bfqq) -+{ -+ int process_refs, io_refs; -+ -+ lockdep_assert_held(bfqq->bfqd->queue->queue_lock); -+ -+ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; -+ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st; -+ BUG_ON(process_refs < 0); -+ return process_refs; -+} -+ - /* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */ - static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq) - { -@@ -763,8 +880,14 @@ static void bfq_add_request(struct request *rq) - BUG_ON(!next_rq); - bfqq->next_rq = next_rq; - -+ /* -+ * Adjust priority tree position, if next_rq changes. -+ */ -+ if (prev != bfqq->next_rq) -+ bfq_pos_tree_add_move(bfqd, bfqq); -+ - if (!bfq_bfqq_busy(bfqq)) { -- bool soft_rt, in_burst, -+ bool soft_rt, coop_or_in_burst, - idle_for_long_time = time_is_before_jiffies( - bfqq->budget_timeout + - bfqd->bfq_wr_min_idle_time); -@@ -792,11 +915,12 @@ static void bfq_add_request(struct request *rq) - bfqd->last_ins_in_burst = jiffies; - } - -- in_burst = bfq_bfqq_in_large_burst(bfqq); -+ coop_or_in_burst = bfq_bfqq_in_large_burst(bfqq) || -+ bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh; - soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && -- !in_burst && -+ !coop_or_in_burst && - time_is_before_jiffies(bfqq->soft_rt_next_start); -- interactive = !in_burst && idle_for_long_time; -+ interactive = !coop_or_in_burst && idle_for_long_time; - entity->budget = max_t(unsigned long, bfqq->max_budget, - bfq_serv_to_charge(next_rq, bfqq)); - -@@ -815,6 +939,9 @@ static void bfq_add_request(struct request *rq) - if (!bfqd->low_latency) - goto add_bfqq_busy; - -+ if (bfq_bfqq_just_split(bfqq)) -+ goto set_prio_changed; -+ - /* - * If the queue: - * - is not being boosted, -@@ -839,7 +966,7 @@ static void bfq_add_request(struct request *rq) - } else if (old_wr_coeff > 1) { - if (interactive) - bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -- else if (in_burst || -+ else if (coop_or_in_burst || - (bfqq->wr_cur_max_time == - bfqd->bfq_wr_rt_max_time && - !soft_rt)) { -@@ -904,6 +1031,7 @@ static void bfq_add_request(struct request *rq) - bfqd->bfq_wr_rt_max_time; - } - } -+set_prio_changed: - if (old_wr_coeff != bfqq->wr_coeff) - entity->prio_changed = 1; - add_bfqq_busy: -@@ -1046,6 +1174,15 @@ static void bfq_merged_request(struct request_queue *q, struct request *req, - bfqd->last_position); - BUG_ON(!next_rq); - 
bfqq->next_rq = next_rq; -+ /* -+ * If next_rq changes, update both the queue's budget to -+ * fit the new request and the queue's position in its -+ * rq_pos_tree. -+ */ -+ if (prev != bfqq->next_rq) { -+ bfq_updated_next_req(bfqd, bfqq); -+ bfq_pos_tree_add_move(bfqd, bfqq); -+ } - } - } - -@@ -1128,11 +1265,346 @@ static void bfq_end_wr(struct bfq_data *bfqd) - spin_unlock_irq(bfqd->queue->queue_lock); - } - -+static sector_t bfq_io_struct_pos(void *io_struct, bool request) -+{ -+ if (request) -+ return blk_rq_pos(io_struct); -+ else -+ return ((struct bio *)io_struct)->bi_iter.bi_sector; -+} -+ -+static int bfq_rq_close_to_sector(void *io_struct, bool request, -+ sector_t sector) -+{ -+ return abs(bfq_io_struct_pos(io_struct, request) - sector) <= -+ BFQQ_SEEK_THR; -+} -+ -+static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ sector_t sector) -+{ -+ struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; -+ struct rb_node *parent, *node; -+ struct bfq_queue *__bfqq; -+ -+ if (RB_EMPTY_ROOT(root)) -+ return NULL; -+ -+ /* -+ * First, if we find a request starting at the end of the last -+ * request, choose it. -+ */ -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); -+ if (__bfqq) -+ return __bfqq; -+ -+ /* -+ * If the exact sector wasn't found, the parent of the NULL leaf -+ * will contain the closest sector (rq_pos_tree sorted by -+ * next_request position). -+ */ -+ __bfqq = rb_entry(parent, struct bfq_queue, pos_node); -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) -+ return __bfqq; -+ -+ if (blk_rq_pos(__bfqq->next_rq) < sector) -+ node = rb_next(&__bfqq->pos_node); -+ else -+ node = rb_prev(&__bfqq->pos_node); -+ if (!node) -+ return NULL; -+ -+ __bfqq = rb_entry(node, struct bfq_queue, pos_node); -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) -+ return __bfqq; -+ -+ return NULL; -+} -+ -+static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd, -+ struct bfq_queue *cur_bfqq, -+ sector_t sector) -+{ -+ struct bfq_queue *bfqq; -+ -+ /* -+ * We shall notice if some of the queues are cooperating, -+ * e.g., working closely on the same area of the device. In -+ * that case, we can group them together and: 1) don't waste -+ * time idling, and 2) serve the union of their requests in -+ * the best possible order for throughput. -+ */ -+ bfqq = bfqq_find_close(bfqd, cur_bfqq, sector); -+ if (!bfqq || bfqq == cur_bfqq) -+ return NULL; -+ -+ return bfqq; -+} -+ -+static struct bfq_queue * -+bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) -+{ -+ int process_refs, new_process_refs; -+ struct bfq_queue *__bfqq; -+ -+ /* -+ * If there are no process references on the new_bfqq, then it is -+ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain -+ * may have dropped their last reference (not just their last process -+ * reference). -+ */ -+ if (!bfqq_process_refs(new_bfqq)) -+ return NULL; -+ -+ /* Avoid a circular list and skip interim queue merges. */ -+ while ((__bfqq = new_bfqq->new_bfqq)) { -+ if (__bfqq == bfqq) -+ return NULL; -+ new_bfqq = __bfqq; -+ } -+ -+ process_refs = bfqq_process_refs(bfqq); -+ new_process_refs = bfqq_process_refs(new_bfqq); -+ /* -+ * If the process for the bfqq has gone away, there is no -+ * sense in merging the queues. 
-+ */ -+ if (process_refs == 0 || new_process_refs == 0) -+ return NULL; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", -+ new_bfqq->pid); -+ -+ /* -+ * Merging is just a redirection: the requests of the process -+ * owning one of the two queues are redirected to the other queue. -+ * The latter queue, in its turn, is set as shared if this is the -+ * first time that the requests of some process are redirected to -+ * it. -+ * -+ * We redirect bfqq to new_bfqq and not the opposite, because we -+ * are in the context of the process owning bfqq, hence we have -+ * the io_cq of this process. So we can immediately configure this -+ * io_cq to redirect the requests of the process to new_bfqq. -+ * -+ * NOTE, even if new_bfqq coincides with the in-service queue, the -+ * io_cq of new_bfqq is not available, because, if the in-service -+ * queue is shared, bfqd->in_service_bic may not point to the -+ * io_cq of the in-service queue. -+ * Redirecting the requests of the process owning bfqq to the -+ * currently in-service queue is in any case the best option, as -+ * we feed the in-service queue with new requests close to the -+ * last request served and, by doing so, hopefully increase the -+ * throughput. -+ */ -+ bfqq->new_bfqq = new_bfqq; -+ atomic_add(process_refs, &new_bfqq->ref); -+ return new_bfqq; -+} -+ -+static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, -+ struct bfq_queue *new_bfqq) -+{ -+ if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || -+ (bfqq->ioprio_class != new_bfqq->ioprio_class)) -+ return false; -+ -+ /* -+ * If either of the queues has already been detected as seeky, -+ * then merging it with the other queue is unlikely to lead to -+ * sequential I/O. -+ */ -+ if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq)) -+ return false; -+ -+ /* -+ * Interleaved I/O is known to be done by (some) applications -+ * only for reads, so it does not make sense to merge async -+ * queues. -+ */ -+ if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq)) -+ return false; -+ -+ return true; -+} -+ -+/* -+ * Attempt to schedule a merge of bfqq with the currently in-service queue -+ * or with a close queue among the scheduled queues. -+ * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue -+ * structure otherwise. -+ * -+ * The OOM queue is not allowed to participate to cooperation: in fact, since -+ * the requests temporarily redirected to the OOM queue could be redirected -+ * again to dedicated queues at any time, the state needed to correctly -+ * handle merging with the OOM queue would be quite complex and expensive -+ * to maintain. Besides, in such a critical condition as an out of memory, -+ * the benefits of queue merging may be little relevant, or even negligible. -+ */ -+static struct bfq_queue * -+bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ void *io_struct, bool request) -+{ -+ struct bfq_queue *in_service_bfqq, *new_bfqq; -+ -+ if (bfqq->new_bfqq) -+ return bfqq->new_bfqq; -+ if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) -+ return NULL; -+ /* If device has only one backlogged bfq_queue, don't search. 
*/ -+ if (bfqd->busy_queues == 1) -+ return NULL; -+ -+ in_service_bfqq = bfqd->in_service_queue; -+ -+ if (!in_service_bfqq || in_service_bfqq == bfqq || -+ !bfqd->in_service_bic || -+ unlikely(in_service_bfqq == &bfqd->oom_bfqq)) -+ goto check_scheduled; -+ -+ if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && -+ bfqq->entity.parent == in_service_bfqq->entity.parent && -+ bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { -+ new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); -+ if (new_bfqq) -+ return new_bfqq; -+ } -+ /* -+ * Check whether there is a cooperator among currently scheduled -+ * queues. The only thing we need is that the bio/request is not -+ * NULL, as we need it to establish whether a cooperator exists. -+ */ -+check_scheduled: -+ new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, -+ bfq_io_struct_pos(io_struct, request)); -+ -+ BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent); -+ -+ if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) && -+ bfq_may_be_close_cooperator(bfqq, new_bfqq)) -+ return bfq_setup_merge(bfqq, new_bfqq); -+ -+ return NULL; -+} -+ -+static void bfq_bfqq_save_state(struct bfq_queue *bfqq) -+{ -+ /* -+ * If !bfqq->bic, the queue is already shared or its requests -+ * have already been redirected to a shared queue; both idle window -+ * and weight raising state have already been saved. Do nothing. -+ */ -+ if (!bfqq->bic) -+ return; -+ if (bfqq->bic->wr_time_left) -+ /* -+ * This is the queue of a just-started process, and would -+ * deserve weight raising: we set wr_time_left to the full -+ * weight-raising duration to trigger weight-raising when -+ * and if the queue is split and the first request of the -+ * queue is enqueued. -+ */ -+ bfqq->bic->wr_time_left = bfq_wr_duration(bfqq->bfqd); -+ else if (bfqq->wr_coeff > 1) { -+ unsigned long wr_duration = -+ jiffies - bfqq->last_wr_start_finish; -+ /* -+ * It may happen that a queue's weight raising period lasts -+ * longer than its wr_cur_max_time, as weight raising is -+ * handled only when a request is enqueued or dispatched (it -+ * does not use any timer). If the weight raising period is -+ * about to end, don't save it. -+ */ -+ if (bfqq->wr_cur_max_time <= wr_duration) -+ bfqq->bic->wr_time_left = 0; -+ else -+ bfqq->bic->wr_time_left = -+ bfqq->wr_cur_max_time - wr_duration; -+ /* -+ * The bfq_queue is becoming shared or the requests of the -+ * process owning the queue are being redirected to a shared -+ * queue. Stop the weight raising period of the queue, as in -+ * both cases it should not be owned by an interactive or -+ * soft real-time application. -+ */ -+ bfq_bfqq_end_wr(bfqq); -+ } else -+ bfqq->bic->wr_time_left = 0; -+ bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); -+ bfqq->bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); -+ bfqq->bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); -+ bfqq->bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); -+ bfqq->bic->cooperations++; -+ bfqq->bic->failed_cooperations = 0; -+} -+ -+static void bfq_get_bic_reference(struct bfq_queue *bfqq) -+{ -+ /* -+ * If bfqq->bic has a non-NULL value, the bic to which it belongs -+ * is about to begin using a shared bfq_queue. 
-+ */ -+ if (bfqq->bic) -+ atomic_long_inc(&bfqq->bic->icq.ioc->refcount); -+} -+ -+static void -+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, -+ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) -+{ -+ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", -+ (unsigned long) new_bfqq->pid); -+ /* Save weight raising and idle window of the merged queues */ -+ bfq_bfqq_save_state(bfqq); -+ bfq_bfqq_save_state(new_bfqq); -+ if (bfq_bfqq_IO_bound(bfqq)) -+ bfq_mark_bfqq_IO_bound(new_bfqq); -+ bfq_clear_bfqq_IO_bound(bfqq); -+ /* -+ * Grab a reference to the bic, to prevent it from being destroyed -+ * before being possibly touched by a bfq_split_bfqq(). -+ */ -+ bfq_get_bic_reference(bfqq); -+ bfq_get_bic_reference(new_bfqq); -+ /* -+ * Merge queues (that is, let bic redirect its requests to new_bfqq) -+ */ -+ bic_set_bfqq(bic, new_bfqq, 1); -+ bfq_mark_bfqq_coop(new_bfqq); -+ /* -+ * new_bfqq now belongs to at least two bics (it is a shared queue): -+ * set new_bfqq->bic to NULL. bfqq either: -+ * - does not belong to any bic any more, and hence bfqq->bic must -+ * be set to NULL, or -+ * - is a queue whose owning bics have already been redirected to a -+ * different queue, hence the queue is destined to not belong to -+ * any bic soon and bfqq->bic is already NULL (therefore the next -+ * assignment causes no harm). -+ */ -+ new_bfqq->bic = NULL; -+ bfqq->bic = NULL; -+ bfq_put_queue(bfqq); -+} -+ -+static void bfq_bfqq_increase_failed_cooperations(struct bfq_queue *bfqq) -+{ -+ struct bfq_io_cq *bic = bfqq->bic; -+ struct bfq_data *bfqd = bfqq->bfqd; -+ -+ if (bic && bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh) { -+ bic->failed_cooperations++; -+ if (bic->failed_cooperations >= bfqd->bfq_failed_cooperations) -+ bic->cooperations = 0; -+ } -+} -+ - static int bfq_allow_merge(struct request_queue *q, struct request *rq, - struct bio *bio) - { - struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_io_cq *bic; -+ struct bfq_queue *bfqq, *new_bfqq; - - /* - * Disallow merge of a sync bio into an async request. -@@ -1149,7 +1621,26 @@ static int bfq_allow_merge(struct request_queue *q, struct request *rq, - if (!bic) - return 0; - -- return bic_to_bfqq(bic, bfq_bio_sync(bio)) == RQ_BFQQ(rq); -+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); -+ /* -+ * We take advantage of this function to perform an early merge -+ * of the queues of possible cooperating processes. -+ */ -+ if (bfqq) { -+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); -+ if (new_bfqq) { -+ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); -+ /* -+ * If we get here, the bio will be queued in the -+ * shared queue, i.e., new_bfqq, so use new_bfqq -+ * to decide whether bio and rq can be merged. -+ */ -+ bfqq = new_bfqq; -+ } else -+ bfq_bfqq_increase_failed_cooperations(bfqq); -+ } -+ -+ return bfqq == RQ_BFQQ(rq); - } - - static void __bfq_set_in_service_queue(struct bfq_data *bfqd, -@@ -1350,6 +1841,15 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) - - __bfq_bfqd_reset_in_service(bfqd); - -+ /* -+ * If this bfqq is shared between multiple processes, check -+ * to make sure that those processes are still issuing I/Os -+ * within the mean seek distance. If not, it may be time to -+ * break the queues apart again. 
-+ */ -+ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq)) -+ bfq_mark_bfqq_split_coop(bfqq); -+ - if (RB_EMPTY_ROOT(&bfqq->sort_list)) { - /* - * Overloading budget_timeout field to store the time -@@ -1358,8 +1858,13 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) - */ - bfqq->budget_timeout = jiffies; - bfq_del_bfqq_busy(bfqd, bfqq, 1); -- } else -+ } else { - bfq_activate_bfqq(bfqd, bfqq); -+ /* -+ * Resort priority tree of potential close cooperators. -+ */ -+ bfq_pos_tree_add_move(bfqd, bfqq); -+ } - } - - /** -@@ -2246,10 +2751,12 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) - /* - * If the queue was activated in a burst, or - * too much time has elapsed from the beginning -- * of this weight-raising period, then end weight -- * raising. -+ * of this weight-raising period, or the queue has -+ * exceeded the acceptable number of cooperations, -+ * then end weight raising. - */ - if (bfq_bfqq_in_large_burst(bfqq) || -+ bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh || - time_is_before_jiffies(bfqq->last_wr_start_finish + - bfqq->wr_cur_max_time)) { - bfqq->last_wr_start_finish = jiffies; -@@ -2478,6 +2985,25 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - #endif - } - -+static void bfq_put_cooperator(struct bfq_queue *bfqq) -+{ -+ struct bfq_queue *__bfqq, *next; -+ -+ /* -+ * If this queue was scheduled to merge with another queue, be -+ * sure to drop the reference taken on that queue (and others in -+ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs. -+ */ -+ __bfqq = bfqq->new_bfqq; -+ while (__bfqq) { -+ if (__bfqq == bfqq) -+ break; -+ next = __bfqq->new_bfqq; -+ bfq_put_queue(__bfqq); -+ __bfqq = next; -+ } -+} -+ - static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) - { - if (bfqq == bfqd->in_service_queue) { -@@ -2488,6 +3014,8 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) - bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, - atomic_read(&bfqq->ref)); - -+ bfq_put_cooperator(bfqq); -+ - bfq_put_queue(bfqq); - } - -@@ -2496,6 +3024,25 @@ static void bfq_init_icq(struct io_cq *icq) - struct bfq_io_cq *bic = icq_to_bic(icq); - - bic->ttime.last_end_request = jiffies; -+ /* -+ * A newly created bic indicates that the process has just -+ * started doing I/O, and is probably mapping into memory its -+ * executable and libraries: it definitely needs weight raising. -+ * There is however the possibility that the process performs, -+ * for a while, I/O close to some other process. EQM intercepts -+ * this behavior and may merge the queue corresponding to the -+ * process with some other queue, BEFORE the weight of the queue -+ * is raised. Merged queues are not weight-raised (they are assumed -+ * to belong to processes that benefit only from high throughput). -+ * If the merge is basically the consequence of an accident, then -+ * the queue will be split soon and will get back its old weight. -+ * It is then important to write down somewhere that this queue -+ * does need weight raising, even if it did not make it to get its -+ * weight raised before being merged. To this purpose, we overload -+ * the field raising_time_left and assign 1 to it, to mark the queue -+ * as needing weight raising. 
-+ */ -+ bic->wr_time_left = 1; - } - - static void bfq_exit_icq(struct io_cq *icq) -@@ -2509,6 +3056,13 @@ static void bfq_exit_icq(struct io_cq *icq) - } - - if (bic->bfqq[BLK_RW_SYNC]) { -+ /* -+ * If the bic is using a shared queue, put the reference -+ * taken on the io_context when the bic started using a -+ * shared bfq_queue. -+ */ -+ if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC])) -+ put_io_context(icq->ioc); - bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]); - bic->bfqq[BLK_RW_SYNC] = NULL; - } -@@ -2814,6 +3368,10 @@ static void bfq_update_idle_window(struct bfq_data *bfqd, - if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) - return; - -+ /* Idle window just restored, statistics are meaningless. */ -+ if (bfq_bfqq_just_split(bfqq)) -+ return; -+ - enable_idle = bfq_bfqq_idle_window(bfqq); - - if (atomic_read(&bic->icq.ioc->active_ref) == 0 || -@@ -2861,6 +3419,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, - if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || - !BFQQ_SEEKY(bfqq)) - bfq_update_idle_window(bfqd, bfqq, bic); -+ bfq_clear_bfqq_just_split(bfqq); - - bfq_log_bfqq(bfqd, bfqq, - "rq_enqueued: idle_window=%d (seeky %d, mean %llu)", -@@ -2925,12 +3484,47 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, - static void bfq_insert_request(struct request_queue *q, struct request *rq) - { - struct bfq_data *bfqd = q->elevator->elevator_data; -- struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; - - assert_spin_locked(bfqd->queue->queue_lock); - -+ /* -+ * An unplug may trigger a requeue of a request from the device -+ * driver: make sure we are in process context while trying to -+ * merge two bfq_queues. -+ */ -+ if (!in_interrupt()) { -+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true); -+ if (new_bfqq) { -+ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq) -+ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1); -+ /* -+ * Release the request's reference to the old bfqq -+ * and make sure one is taken to the shared queue. -+ */ -+ new_bfqq->allocated[rq_data_dir(rq)]++; -+ bfqq->allocated[rq_data_dir(rq)]--; -+ atomic_inc(&new_bfqq->ref); -+ bfq_put_queue(bfqq); -+ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) -+ bfq_merge_bfqqs(bfqd, RQ_BIC(rq), -+ bfqq, new_bfqq); -+ rq->elv.priv[1] = new_bfqq; -+ bfqq = new_bfqq; -+ } else -+ bfq_bfqq_increase_failed_cooperations(bfqq); -+ } -+ - bfq_add_request(rq); - -+ /* -+ * Here a newly-created bfq_queue has already started a weight-raising -+ * period: clear raising_time_left to prevent bfq_bfqq_save_state() -+ * from assigning it a full weight-raising period. See the detailed -+ * comments about this field in bfq_init_icq(). -+ */ -+ if (bfqq->bic) -+ bfqq->bic->wr_time_left = 0; - rq->fifo_time = jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; - list_add_tail(&rq->queuelist, &bfqq->fifo); - -@@ -3099,6 +3693,32 @@ static void bfq_put_request(struct request *rq) - } - - /* -+ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this -+ * was the last process referring to said bfqq. 
-+ */ -+static struct bfq_queue * -+bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) -+{ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); -+ -+ put_io_context(bic->icq.ioc); -+ -+ if (bfqq_process_refs(bfqq) == 1) { -+ bfqq->pid = current->pid; -+ bfq_clear_bfqq_coop(bfqq); -+ bfq_clear_bfqq_split_coop(bfqq); -+ return bfqq; -+ } -+ -+ bic_set_bfqq(bic, NULL, 1); -+ -+ bfq_put_cooperator(bfqq); -+ -+ bfq_put_queue(bfqq); -+ return NULL; -+} -+ -+/* - * Allocate bfq data structures associated with this request. - */ - static int bfq_set_request(struct request_queue *q, struct request *rq, -@@ -3110,6 +3730,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, - const int is_sync = rq_is_sync(rq); - struct bfq_queue *bfqq; - unsigned long flags; -+ bool split = false; - - might_sleep_if(gfpflags_allow_blocking(gfp_mask)); - -@@ -3122,15 +3743,30 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, - - bfq_bic_update_cgroup(bic, bio); - -+new_queue: - bfqq = bic_to_bfqq(bic, is_sync); - if (!bfqq || bfqq == &bfqd->oom_bfqq) { - bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, gfp_mask); - bic_set_bfqq(bic, bfqq, is_sync); -- if (is_sync) { -- if (bfqd->large_burst) -+ if (split && is_sync) { -+ if ((bic->was_in_burst_list && bfqd->large_burst) || -+ bic->saved_in_large_burst) - bfq_mark_bfqq_in_large_burst(bfqq); -- else -+ else { - bfq_clear_bfqq_in_large_burst(bfqq); -+ if (bic->was_in_burst_list) -+ hlist_add_head(&bfqq->burst_list_node, -+ &bfqd->burst_list); -+ } -+ } -+ } else { -+ /* If the queue was seeky for too long, break it apart. */ -+ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { -+ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); -+ bfqq = bfq_split_bfqq(bic, bfqq); -+ split = true; -+ if (!bfqq) -+ goto new_queue; - } - } - -@@ -3142,6 +3778,26 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, - rq->elv.priv[0] = bic; - rq->elv.priv[1] = bfqq; - -+ /* -+ * If a bfq_queue has only one process reference, it is owned -+ * by only one bfq_io_cq: we can set the bic field of the -+ * bfq_queue to the address of that structure. Also, if the -+ * queue has just been split, mark a flag so that the -+ * information is available to the other scheduler hooks. -+ */ -+ if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { -+ bfqq->bic = bic; -+ if (split) { -+ bfq_mark_bfqq_just_split(bfqq); -+ /* -+ * If the queue has just been split from a shared -+ * queue, restore the idle window and the possible -+ * weight raising period. -+ */ -+ bfq_bfqq_resume_state(bfqq, bic); -+ } -+ } -+ - spin_unlock_irqrestore(q->queue_lock, flags); - - return 0; -@@ -3295,6 +3951,7 @@ static void bfq_init_root_group(struct bfq_group *root_group, - root_group->my_entity = NULL; - root_group->bfqd = bfqd; - #endif -+ root_group->rq_pos_tree = RB_ROOT; - for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) - root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; - } -@@ -3375,6 +4032,8 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async; - bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync; - -+ bfqd->bfq_coop_thresh = 2; -+ bfqd->bfq_failed_cooperations = 7000; - bfqd->bfq_requests_within_timer = 120; - - bfqd->bfq_large_burst_thresh = 11; -diff --git a/block/bfq.h b/block/bfq.h -index 2bf54ae..fcce855 100644 ---- a/block/bfq.h -+++ b/block/bfq.h -@@ -183,6 +183,8 @@ struct bfq_group; - * ioprio_class value. 
- * @new_bfqq: shared bfq_queue if queue is cooperating with - * one or more other queues. -+ * @pos_node: request-position tree member (see bfq_group's @rq_pos_tree). -+ * @pos_root: request-position tree root (see bfq_group's @rq_pos_tree). - * @sort_list: sorted list of pending requests. - * @next_rq: if fifo isn't expired, next request to serve. - * @queued: nr of requests queued in @sort_list. -@@ -304,6 +306,26 @@ struct bfq_ttime { - * @ttime: associated @bfq_ttime struct - * @ioprio: per (request_queue, blkcg) ioprio. - * @blkcg_id: id of the blkcg the related io_cq belongs to. -+ * @wr_time_left: snapshot of the time left before weight raising ends -+ * for the sync queue associated to this process; this -+ * snapshot is taken to remember this value while the weight -+ * raising is suspended because the queue is merged with a -+ * shared queue, and is used to set @raising_cur_max_time -+ * when the queue is split from the shared queue and its -+ * weight is raised again -+ * @saved_idle_window: same purpose as the previous field for the idle -+ * window -+ * @saved_IO_bound: same purpose as the previous two fields for the I/O -+ * bound classification of a queue -+ * @saved_in_large_burst: same purpose as the previous fields for the -+ * value of the field keeping the queue's belonging -+ * to a large burst -+ * @was_in_burst_list: true if the queue belonged to a burst list -+ * before its merge with another cooperating queue -+ * @cooperations: counter of consecutive successful queue merges underwent -+ * by any of the process' @bfq_queues -+ * @failed_cooperations: counter of consecutive failed queue merges of any -+ * of the process' @bfq_queues - */ - struct bfq_io_cq { - struct io_cq icq; /* must be the first member */ -@@ -314,6 +336,16 @@ struct bfq_io_cq { - #ifdef CONFIG_BFQ_GROUP_IOSCHED - uint64_t blkcg_id; /* the current blkcg ID */ - #endif -+ -+ unsigned int wr_time_left; -+ bool saved_idle_window; -+ bool saved_IO_bound; -+ -+ bool saved_in_large_burst; -+ bool was_in_burst_list; -+ -+ unsigned int cooperations; -+ unsigned int failed_cooperations; - }; - - enum bfq_device_speed { -@@ -557,6 +589,9 @@ enum bfqq_state_flags { - * may need softrt-next-start - * update - */ -+ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ -+ BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */ -+ BFQ_BFQQ_FLAG_just_split, /* queue has just been split */ - }; - - #define BFQ_BFQQ_FNS(name) \ -@@ -583,6 +618,9 @@ BFQ_BFQQ_FNS(budget_new); - BFQ_BFQQ_FNS(IO_bound); - BFQ_BFQQ_FNS(in_large_burst); - BFQ_BFQQ_FNS(constantly_seeky); -+BFQ_BFQQ_FNS(coop); -+BFQ_BFQQ_FNS(split_coop); -+BFQ_BFQQ_FNS(just_split); - BFQ_BFQQ_FNS(softrt_update); - #undef BFQ_BFQQ_FNS - -@@ -675,6 +713,9 @@ struct bfq_group_data { - * are groups with more than one active @bfq_entity - * (see the comments to the function - * bfq_bfqq_must_not_expire()). -+ * @rq_pos_tree: rbtree sorted by next_request position, used when -+ * determining if two or more queues have interleaving -+ * requests (see bfq_find_close_cooperator()). 
- * - * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup - * there is a set of bfq_groups, each one collecting the lower-level -@@ -701,6 +742,8 @@ struct bfq_group { - - int active_entities; - -+ struct rb_root rq_pos_tree; -+ - struct bfqg_stats stats; - struct bfqg_stats dead_stats; /* stats pushed from dead children */ - }; -@@ -711,6 +754,8 @@ struct bfq_group { - - struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; - struct bfq_queue *async_idle_bfqq; -+ -+ struct rb_root rq_pos_tree; - }; - #endif - -@@ -787,6 +832,27 @@ static void bfq_put_bfqd_unlock(struct bfq_data *bfqd, unsigned long *flags) - spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); - } - -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ -+static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *group_entity = bfqq->entity.parent; -+ -+ if (!group_entity) -+ group_entity = &bfqq->bfqd->root_group->entity; -+ -+ return container_of(group_entity, struct bfq_group, entity); -+} -+ -+#else -+ -+static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) -+{ -+ return bfqq->bfqd->root_group; -+} -+ -+#endif -+ - static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); - static void bfq_put_queue(struct bfq_queue *bfqq); - static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); --- -2.10.0 - diff --git a/0004-blk-bfq-turn-BFQ-v7r11-for-4.11.0-into-BFQ-v8r11-for.patch b/0004-blk-bfq-turn-BFQ-v7r11-for-4.11.0-into-BFQ-v8r11-for.patch deleted file mode 100644 index f2a72fa..0000000 --- a/0004-blk-bfq-turn-BFQ-v7r11-for-4.11.0-into-BFQ-v8r11-for.patch +++ /dev/null @@ -1,9419 +0,0 @@ -From f5d5a33bf31028abb87ff8e36b695aa18b284c17 Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Mon, 16 May 2016 11:16:17 +0200 -Subject: [PATCH 4/4] blk, bfq: turn BFQ-v7r11 for 4.11.0 into BFQ-v8r11 for - 4.11.0 - -Signed-off-by: Paolo Valente ---- - Documentation/block/00-INDEX | 2 + - Documentation/block/bfq-iosched.txt | 530 ++++++ - block/Kconfig.iosched | 18 +- - block/bfq-cgroup.c | 511 +++--- - block/bfq-iosched.c | 3468 ++++++++++++++++++++++------------- - block/bfq-sched.c | 1400 +++++++++++--- - block/bfq.h | 804 ++++---- - 7 files changed, 4535 insertions(+), 2198 deletions(-) - create mode 100644 Documentation/block/bfq-iosched.txt - -diff --git a/Documentation/block/00-INDEX b/Documentation/block/00-INDEX -index e55103a..8d55b4b 100644 ---- a/Documentation/block/00-INDEX -+++ b/Documentation/block/00-INDEX -@@ -1,5 +1,7 @@ - 00-INDEX - - This file -+bfq-iosched.txt -+ - BFQ IO scheduler and its tunables - biodoc.txt - - Notes on the Generic Block Layer Rewrite in Linux 2.5 - biovecs.txt -diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt -new file mode 100644 -index 0000000..13b5248 ---- /dev/null -+++ b/Documentation/block/bfq-iosched.txt -@@ -0,0 +1,530 @@ -+BFQ (Budget Fair Queueing) -+========================== -+ -+BFQ is a proportional-share I/O scheduler, with some extra -+low-latency capabilities. In addition to cgroups support (blkio or io -+controllers), BFQ's main features are: -+- BFQ guarantees a high system and application responsiveness, and a -+ low latency for time-sensitive applications, such as audio or video -+ players; -+- BFQ distributes bandwidth, and not just time, among processes or -+ groups (switching back to time distribution when needed to keep -+ throughput high). 
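
To see what distributing "bandwidth, and not just time" means in practice, here is a back-of-the-envelope sketch with two equal-weight readers sharing one disk; the service rates (150 MB/s sequential, 5 MB/s random) are made-up figures used only for the arithmetic.

#include <stdio.h>

int main(void)
{
	const double seq_rate = 150.0;	/* MB/s the sequential reader gets alone */
	const double rnd_rate = 5.0;	/* MB/s the random reader gets alone */

	/* Equal *time* shares: each flow owns the device half the time. */
	printf("time sharing:      seq %5.1f MB/s, rnd %5.1f MB/s\n",
	       seq_rate / 2, rnd_rate / 2);

	/*
	 * Equal *bandwidth* shares: both flows receive the same rate b,
	 * with b/seq_rate + b/rnd_rate = 1 (the device is never idle).
	 */
	double b = 1.0 / (1.0 / seq_rate + 1.0 / rnd_rate);
	printf("bandwidth sharing: seq %5.1f MB/s, rnd %5.1f MB/s\n", b, b);

	return 0;
}

Under time sharing the sequential reader takes almost all of the bandwidth; under strict bandwidth sharing both flows end up at a few MB/s, which is why BFQ switches back to time distribution when that is what it takes to keep throughput high.
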
-+ -+On average CPUs, the current version of BFQ can handle devices -+performing at most ~30K IOPS; at most ~50 KIOPS on faster CPUs. As a -+reference, 30-50 KIOPS correspond to very high bandwidths with -+sequential I/O (e.g., 8-12 GB/s if I/O requests are 256 KB large), and -+to 120-200 MB/s with 4KB random I/O. -+ -+The table of contents follow. Impatients can just jump to Section 3. -+ -+CONTENTS -+ -+1. When may BFQ be useful? -+ 1-1 Personal systems -+ 1-2 Server systems -+2. How does BFQ work? -+3. What are BFQ's tunable? -+4. BFQ group scheduling -+ 4-1 Service guarantees provided -+ 4-2 Interface -+ -+1. When may BFQ be useful? -+========================== -+ -+BFQ provides the following benefits on personal and server systems. -+ -+1-1 Personal systems -+-------------------- -+ -+Low latency for interactive applications -+ -+Regardless of the actual background workload, BFQ guarantees that, for -+interactive tasks, the storage device is virtually as responsive as if -+it was idle. For example, even if one or more of the following -+background workloads are being executed: -+- one or more large files are being read, written or copied, -+- a tree of source files is being compiled, -+- one or more virtual machines are performing I/O, -+- a software update is in progress, -+- indexing daemons are scanning filesystems and updating their -+ databases, -+starting an application or loading a file from within an application -+takes about the same time as if the storage device was idle. As a -+comparison, with CFQ, NOOP or DEADLINE, and in the same conditions, -+applications experience high latencies, or even become unresponsive -+until the background workload terminates (also on SSDs). -+ -+Low latency for soft real-time applications -+ -+Also soft real-time applications, such as audio and video -+players/streamers, enjoy a low latency and a low drop rate, regardless -+of the background I/O workload. As a consequence, these applications -+do not suffer from almost any glitch due to the background workload. -+ -+Higher speed for code-development tasks -+ -+If some additional workload happens to be executed in parallel, then -+BFQ executes the I/O-related components of typical code-development -+tasks (compilation, checkout, merge, ...) much more quickly than CFQ, -+NOOP or DEADLINE. -+ -+High throughput -+ -+On hard disks, BFQ achieves up to 30% higher throughput than CFQ, and -+up to 150% higher throughput than DEADLINE and NOOP, with all the -+sequential workloads considered in our tests. With random workloads, -+and with all the workloads on flash-based devices, BFQ achieves, -+instead, about the same throughput as the other schedulers. -+ -+Strong fairness, bandwidth and delay guarantees -+ -+BFQ distributes the device throughput, and not just the device time, -+among I/O-bound applications in proportion their weights, with any -+workload and regardless of the device parameters. From these bandwidth -+guarantees, it is possible to compute tight per-I/O-request delay -+guarantees by a simple formula. If not configured for strict service -+guarantees, BFQ switches to time-based resource sharing (only) for -+applications that would otherwise cause a throughput loss. -+ -+1-2 Server systems -+------------------ -+ -+Most benefits for server systems follow from the same service -+properties as above. In particular, regardless of whether additional, -+possibly heavy workloads are being served, BFQ guarantees: -+ -+. 
audio and video-streaming with zero or very low jitter and drop -+ rate; -+ -+. fast retrieval of WEB pages and embedded objects; -+ -+. real-time recording of data in live-dumping applications (e.g., -+ packet logging); -+ -+. responsiveness in local and remote access to a server. -+ -+ -+2. How does BFQ work? -+===================== -+ -+BFQ is a proportional-share I/O scheduler, whose general structure, -+plus a lot of code, are borrowed from CFQ. -+ -+- Each process doing I/O on a device is associated with a weight and a -+ (bfq_)queue. -+ -+- BFQ grants exclusive access to the device, for a while, to one queue -+ (process) at a time, and implements this service model by -+ associating every queue with a budget, measured in number of -+ sectors. -+ -+ - After a queue is granted access to the device, the budget of the -+ queue is decremented, on each request dispatch, by the size of the -+ request. -+ -+ - The in-service queue is expired, i.e., its service is suspended, -+ only if one of the following events occurs: 1) the queue finishes -+ its budget, 2) the queue empties, 3) a "budget timeout" fires. -+ -+ - The budget timeout prevents processes doing random I/O from -+ holding the device for too long and dramatically reducing -+ throughput. -+ -+ - Actually, as in CFQ, a queue associated with a process issuing -+ sync requests may not be expired immediately when it empties. In -+ contrast, BFQ may idle the device for a short time interval, -+ giving the process the chance to go on being served if it issues -+ a new request in time. Device idling typically boosts the -+ throughput on rotational devices, if processes do synchronous -+ and sequential I/O. In addition, under BFQ, device idling is -+ also instrumental in guaranteeing the desired throughput -+ fraction to processes issuing sync requests (see the description -+ of the slice_idle tunable in this document, or [1, 2], for more -+ details). -+ -+ - With respect to idling for service guarantees, if several -+ processes are competing for the device at the same time, but -+ all processes (and groups, after the following commit) have -+ the same weight, then BFQ guarantees the expected throughput -+ distribution without ever idling the device. Throughput is -+ thus as high as possible in this common scenario. -+ -+ - If low-latency mode is enabled (default configuration), BFQ -+ executes some special heuristics to detect interactive and soft -+ real-time applications (e.g., video or audio players/streamers), -+ and to reduce their latency. The most important action taken to -+ achieve this goal is to give to the queues associated with these -+ applications more than their fair share of the device -+ throughput. For brevity, we call just "weight-raising" the whole -+ sets of actions taken by BFQ to privilege these queues. In -+ particular, BFQ provides a milder form of weight-raising for -+ interactive applications, and a stronger form for soft real-time -+ applications. -+ -+ - BFQ automatically deactivates idling for queues born in a burst of -+ queue creations. In fact, these queues are usually associated with -+ the processes of applications and services that benefit mostly -+ from a high throughput. Examples are systemd during boot, or git -+ grep. -+ -+ - As CFQ, BFQ merges queues performing interleaved I/O, i.e., -+ performing random I/O that becomes mostly sequential if -+ merged. Differently from CFQ, BFQ achieves this goal with a more -+ reactive mechanism, called Early Queue Merge (EQM). 
EQM is so -+ responsive in detecting interleaved I/O (cooperating processes), -+ that it enables BFQ to achieve a high throughput, by queue -+ merging, even for queues for which CFQ needs a different -+ mechanism, preemption, to get a high throughput. As such EQM is a -+ unified mechanism to achieve a high throughput with interleaved -+ I/O. -+ -+ - Queues are scheduled according to a variant of WF2Q+, named -+ B-WF2Q+, and implemented using an augmented rb-tree to preserve an -+ O(log N) overall complexity. See [2] for more details. B-WF2Q+ is -+ also ready for hierarchical scheduling. However, for a cleaner -+ logical breakdown, the code that enables and completes -+ hierarchical support is provided in the next commit, which focuses -+ exactly on this feature. -+ -+ - B-WF2Q+ guarantees a tight deviation with respect to an ideal, -+ perfectly fair, and smooth service. In particular, B-WF2Q+ -+ guarantees that each queue receives a fraction of the device -+ throughput proportional to its weight, even if the throughput -+ fluctuates, and regardless of: the device parameters, the current -+ workload and the budgets assigned to the queue. -+ -+ - The last, budget-independence, property (although probably -+ counterintuitive in the first place) is definitely beneficial, for -+ the following reasons: -+ -+ - First, with any proportional-share scheduler, the maximum -+ deviation with respect to an ideal service is proportional to -+ the maximum budget (slice) assigned to queues. As a consequence, -+ BFQ can keep this deviation tight not only because of the -+ accurate service of B-WF2Q+, but also because BFQ *does not* -+ need to assign a larger budget to a queue to let the queue -+ receive a higher fraction of the device throughput. -+ -+ - Second, BFQ is free to choose, for every process (queue), the -+ budget that best fits the needs of the process, or best -+ leverages the I/O pattern of the process. In particular, BFQ -+ updates queue budgets with a simple feedback-loop algorithm that -+ allows a high throughput to be achieved, while still providing -+ tight latency guarantees to time-sensitive applications. When -+ the in-service queue expires, this algorithm computes the next -+ budget of the queue so as to: -+ -+ - Let large budgets be eventually assigned to the queues -+ associated with I/O-bound applications performing sequential -+ I/O: in fact, the longer these applications are served once -+ got access to the device, the higher the throughput is. -+ -+ - Let small budgets be eventually assigned to the queues -+ associated with time-sensitive applications (which typically -+ perform sporadic and short I/O), because, the smaller the -+ budget assigned to a queue waiting for service is, the sooner -+ B-WF2Q+ will serve that queue (Subsec 3.3 in [2]). -+ -+- If several processes are competing for the device at the same time, -+ but all processes and groups have the same weight, then BFQ -+ guarantees the expected throughput distribution without ever idling -+ the device. It uses preemption instead. Throughput is then much -+ higher in this common scenario. -+ -+- ioprio classes are served in strict priority order, i.e., -+ lower-priority queues are not served as long as there are -+ higher-priority queues. Among queues in the same class, the -+ bandwidth is distributed in proportion to the weight of each -+ queue. A very thin extra bandwidth is however guaranteed to -+ the Idle class, to prevent it from starving. -+ -+ -+3. What are BFQ's tunable? 
-+========================== -+ -+The tunables back_seek-max, back_seek_penalty, fifo_expire_async and -+fifo_expire_sync below are the same as in CFQ. Their description is -+just copied from that for CFQ. Some considerations in the description -+of slice_idle are copied from CFQ too. -+ -+per-process ioprio and weight -+----------------------------- -+ -+Unless the cgroups interface is used (see "4. BFQ group scheduling"), -+weights can be assigned to processes only indirectly, through I/O -+priorities, and according to the relation: -+weight = (IOPRIO_BE_NR - ioprio) * 10. -+ -+Beware that, if low-latency is set, then BFQ automatically raises the -+weight of the queues associated with interactive and soft real-time -+applications. Unset this tunable if you need/want to control weights. -+ -+slice_idle -+---------- -+ -+This parameter specifies how long BFQ should idle for next I/O -+request, when certain sync BFQ queues become empty. By default -+slice_idle is a non-zero value. Idling has a double purpose: boosting -+throughput and making sure that the desired throughput distribution is -+respected (see the description of how BFQ works, and, if needed, the -+papers referred there). -+ -+As for throughput, idling can be very helpful on highly seeky media -+like single spindle SATA/SAS disks where we can cut down on overall -+number of seeks and see improved throughput. -+ -+Setting slice_idle to 0 will remove all the idling on queues and one -+should see an overall improved throughput on faster storage devices -+like multiple SATA/SAS disks in hardware RAID configuration. -+ -+So depending on storage and workload, it might be useful to set -+slice_idle=0. In general for SATA/SAS disks and software RAID of -+SATA/SAS disks keeping slice_idle enabled should be useful. For any -+configurations where there are multiple spindles behind single LUN -+(Host based hardware RAID controller or for storage arrays), setting -+slice_idle=0 might end up in better throughput and acceptable -+latencies. -+ -+Idling is however necessary to have service guarantees enforced in -+case of differentiated weights or differentiated I/O-request lengths. -+To see why, suppose that a given BFQ queue A must get several I/O -+requests served for each request served for another queue B. Idling -+ensures that, if A makes a new I/O request slightly after becoming -+empty, then no request of B is dispatched in the middle, and thus A -+does not lose the possibility to get more than one request dispatched -+before the next request of B is dispatched. Note that idling -+guarantees the desired differentiated treatment of queues only in -+terms of I/O-request dispatches. To guarantee that the actual service -+order then corresponds to the dispatch order, the strict_guarantees -+tunable must be set too. -+ -+There is an important flipside for idling: apart from the above cases -+where it is beneficial also for throughput, idling can severely impact -+throughput. One important case is random workload. Because of this -+issue, BFQ tends to avoid idling as much as possible, when it is not -+beneficial also for throughput. As a consequence of this behavior, and -+of further issues described for the strict_guarantees tunable, -+short-term service guarantees may be occasionally violated. And, in -+some cases, these guarantees may be more important than guaranteeing -+maximum throughput. For example, in video playing/streaming, a very -+low drop rate may be more important than maximum throughput. 
In these -+cases, consider setting the strict_guarantees parameter. -+ -+strict_guarantees -+----------------- -+ -+If this parameter is set (default: unset), then BFQ -+ -+- always performs idling when the in-service queue becomes empty; -+ -+- forces the device to serve one I/O request at a time, by dispatching a -+ new request only if there is no outstanding request. -+ -+In the presence of differentiated weights or I/O-request sizes, both -+the above conditions are needed to guarantee that every BFQ queue -+receives its allotted share of the bandwidth. The first condition is -+needed for the reasons explained in the description of the slice_idle -+tunable. The second condition is needed because all modern storage -+devices reorder internally-queued requests, which may trivially break -+the service guarantees enforced by the I/O scheduler. -+ -+Setting strict_guarantees may evidently affect throughput. -+ -+back_seek_max -+------------- -+ -+This specifies, given in Kbytes, the maximum "distance" for backward seeking. -+The distance is the amount of space from the current head location to the -+sectors that are backward in terms of distance. -+ -+This parameter allows the scheduler to anticipate requests in the "backward" -+direction and consider them as being the "next" if they are within this -+distance from the current head location. -+ -+back_seek_penalty -+----------------- -+ -+This parameter is used to compute the cost of backward seeking. If the -+backward distance of request is just 1/back_seek_penalty from a "front" -+request, then the seeking cost of two requests is considered equivalent. -+ -+So scheduler will not bias toward one or the other request (otherwise scheduler -+will bias toward front request). Default value of back_seek_penalty is 2. -+ -+fifo_expire_async -+----------------- -+ -+This parameter is used to set the timeout of asynchronous requests. Default -+value of this is 248ms. -+ -+fifo_expire_sync -+---------------- -+ -+This parameter is used to set the timeout of synchronous requests. Default -+value of this is 124ms. In case to favor synchronous requests over asynchronous -+one, this value should be decreased relative to fifo_expire_async. -+ -+low_latency -+----------- -+ -+This parameter is used to enable/disable BFQ's low latency mode. By -+default, low latency mode is enabled. If enabled, interactive and soft -+real-time applications are privileged and experience a lower latency, -+as explained in more detail in the description of how BFQ works. -+ -+DO NOT enable this mode if you need full control on bandwidth -+distribution. In fact, if it is enabled, then BFQ automatically -+increases the bandwidth share of privileged applications, as the main -+means to guarantee a lower latency to them. -+ -+timeout_sync -+------------ -+ -+Maximum amount of device time that can be given to a task (queue) once -+it has been selected for service. On devices with costly seeks, -+increasing this time usually increases maximum throughput. On the -+opposite end, increasing this time coarsens the granularity of the -+short-term bandwidth and latency guarantees, especially if the -+following parameter is set to zero. -+ -+max_budget -+---------- -+ -+Maximum amount of service, measured in sectors, that can be provided -+to a BFQ queue once it is set in service (of course within the limits -+of the above timeout). 
According to what said in the description of -+the algorithm, larger values increase the throughput in proportion to -+the percentage of sequential I/O requests issued. The price of larger -+values is that they coarsen the granularity of short-term bandwidth -+and latency guarantees. -+ -+The default value is 0, which enables auto-tuning: BFQ sets max_budget -+to the maximum number of sectors that can be served during -+timeout_sync, according to the estimated peak rate. -+ -+weights -+------- -+ -+Read-only parameter, used to show the weights of the currently active -+BFQ queues. -+ -+ -+wr_ tunables -+------------ -+ -+BFQ exports a few parameters to control/tune the behavior of -+low-latency heuristics. -+ -+wr_coeff -+ -+Factor by which the weight of a weight-raised queue is multiplied. If -+the queue is deemed soft real-time, then the weight is further -+multiplied by an additional, constant factor. -+ -+wr_max_time -+ -+Maximum duration of a weight-raising period for an interactive task -+(ms). If set to zero (default value), then this value is computed -+automatically, as a function of the peak rate of the device. In any -+case, when the value of this parameter is read, it always reports the -+current duration, regardless of whether it has been set manually or -+computed automatically. -+ -+wr_max_softrt_rate -+ -+Maximum service rate below which a queue is deemed to be associated -+with a soft real-time application, and is then weight-raised -+accordingly (sectors/sec). -+ -+wr_min_idle_time -+ -+Minimum idle period after which interactive weight-raising may be -+reactivated for a queue (in ms). -+ -+wr_rt_max_time -+ -+Maximum weight-raising duration for soft real-time queues (in ms). The -+start time from which this duration is considered is automatically -+moved forward if the queue is detected to be still soft real-time -+before the current soft real-time weight-raising period finishes. -+ -+wr_min_inter_arr_async -+ -+Minimum period between I/O request arrivals after which weight-raising -+may be reactivated for an already busy async queue (in ms). -+ -+ -+4. Group scheduling with BFQ -+============================ -+ -+BFQ supports both cgroups-v1 and cgroups-v2 io controllers, namely -+blkio and io. In particular, BFQ supports weight-based proportional -+share. To activate cgroups support, set BFQ_GROUP_IOSCHED. -+ -+4-1 Service guarantees provided -+------------------------------- -+ -+With BFQ, proportional share means true proportional share of the -+device bandwidth, according to group weights. For example, a group -+with weight 200 gets twice the bandwidth, and not just twice the time, -+of a group with weight 100. -+ -+BFQ supports hierarchies (group trees) of any depth. Bandwidth is -+distributed among groups and processes in the expected way: for each -+group, the children of the group share the whole bandwidth of the -+group in proportion to their weights. In particular, this implies -+that, for each leaf group, every process of the group receives the -+same share of the whole group bandwidth, unless the ioprio of the -+process is modified. -+ -+The resource-sharing guarantee for a group may partially or totally -+switch from bandwidth to time, if providing bandwidth guarantees to -+the group lowers the throughput too much. This switch occurs on a -+per-process basis: if a process of a leaf group causes throughput loss -+if served in such a way to receive its share of the bandwidth, then -+BFQ switches back to just time-based proportional share for that -+process. 
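
The weight/ioprio relation quoted in Section 3 above, weight = (IOPRIO_BE_NR - ioprio) * 10, is easy to tabulate; the small program below does so, using the kernel's value IOPRIO_BE_NR == 8, and is purely illustrative.

#include <stdio.h>

#define IOPRIO_BE_NR 8	/* number of best-effort ioprio levels */

/* Indirect weight assignment described in Section 3. */
static int bfq_weight_from_ioprio(int ioprio)
{
	return (IOPRIO_BE_NR - ioprio) * 10;
}

int main(void)
{
	for (int ioprio = 0; ioprio < IOPRIO_BE_NR; ioprio++)
		printf("ioprio %d -> weight %d\n",
		       ioprio, bfq_weight_from_ioprio(ioprio));
	return 0;
}

The mapping runs from weight 80 (ioprio 0) down to weight 10 (ioprio 7).
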
-+ -+4-2 Interface -+------------- -+ -+To get proportional sharing of bandwidth with BFQ for a given device, -+BFQ must of course be the active scheduler for that device. -+ -+Within each group directory, the names of the files associated with -+BFQ-specific cgroup parameters and stats begin with the "bfq." -+prefix. So, with cgroups-v1 or cgroups-v2, the full prefix for -+BFQ-specific files is "blkio.bfq." or "io.bfq." For example, the group -+parameter to set the weight of a group with BFQ is blkio.bfq.weight -+or io.bfq.weight. -+ -+Parameters to set -+----------------- -+ -+For each group, there is only the following parameter to set. -+ -+weight (namely blkio.bfq.weight or io.bfq-weight): the weight of the -+group inside its parent. Available values: 1..10000 (default 100). The -+linear mapping between ioprio and weights, described at the beginning -+of the tunable section, is still valid, but all weights higher than -+IOPRIO_BE_NR*10 are mapped to ioprio 0. -+ -+Recall that, if low-latency is set, then BFQ automatically raises the -+weight of the queues associated with interactive and soft real-time -+applications. Unset this tunable if you need/want to control weights. -+ -+ -+[1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O -+ Scheduler", Proceedings of the First Workshop on Mobile System -+ Technologies (MST-2015), May 2015. -+ http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf -+ -+[2] P. Valente and M. Andreolini, "Improving Application -+ Responsiveness with the BFQ Disk I/O Scheduler", Proceedings of -+ the 5th Annual International Systems and Storage Conference -+ (SYSTOR '12), June 2012. -+ Slightly extended version: -+ http://algogroup.unimore.it/people/paolo/disk_sched/bfq-v1-suite- -+ results.pdf -diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched -index b1ab0ca..99a4226 100644 ---- a/block/Kconfig.iosched -+++ b/block/Kconfig.iosched -@@ -43,20 +43,20 @@ config IOSCHED_BFQ - tristate "BFQ I/O scheduler" - default n - ---help--- -- The BFQ I/O scheduler tries to distribute bandwidth among -- all processes according to their weights. -- It aims at distributing the bandwidth as desired, independently of -- the disk parameters and with any workload. It also tries to -- guarantee low latency to interactive and soft real-time -- applications. If compiled built-in (saying Y here), BFQ can -- be configured to support hierarchical scheduling. -+ The BFQ I/O scheduler distributes bandwidth among all -+ processes according to their weights, regardless of the -+ device parameters and with any workload. It also guarantees -+ a low latency to interactive and soft real-time applications. -+ Details in Documentation/block/bfq-iosched.txt - - config BFQ_GROUP_IOSCHED - bool "BFQ hierarchical scheduling support" -- depends on CGROUPS && IOSCHED_BFQ=y -+ depends on IOSCHED_BFQ && BLK_CGROUP - default n - ---help--- -- Enable hierarchical scheduling in BFQ, using the blkio controller. -+ -+ Enable hierarchical scheduling in BFQ, using the blkio -+ (cgroups-v1) or io (cgroups-v2) controller. - - choice - prompt "Default I/O scheduler" -diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c -index 0367996..39daaf4 100644 ---- a/block/bfq-cgroup.c -+++ b/block/bfq-cgroup.c -@@ -7,7 +7,9 @@ - * Copyright (C) 2008 Fabio Checconi - * Paolo Valente - * -- * Copyright (C) 2010 Paolo Valente -+ * Copyright (C) 2015 Paolo Valente -+ * -+ * Copyright (C) 2016 Paolo Valente - * - * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ - * file. 
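
Setting a group weight through the cgroup interface described in Section 4-2 of the documentation above is a single write to blkio.bfq.weight (or io.bfq.weight with cgroups-v2). The sketch below assumes a cgroups-v1 blkio hierarchy mounted at /sys/fs/cgroup/blkio and an already-created group named grp_a; both the mount point and the group name are assumptions of the example.

#include <stdio.h>

int main(void)
{
	/* Mount point and group name are assumptions of this sketch. */
	const char *path = "/sys/fs/cgroup/blkio/grp_a/blkio.bfq.weight";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	/* 100 is the default; 200 gives the group twice the bandwidth share. */
	fprintf(f, "200\n");
	fclose(f);
	return 0;
}

Accepted values range from 1 to 10000, as noted in the documentation above.
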
-@@ -163,8 +165,6 @@ static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg) - { - struct blkg_policy_data *pd = blkg_to_pd(blkg, &blkcg_policy_bfq); - -- BUG_ON(!pd); -- - return pd_to_bfqg(pd); - } - -@@ -208,59 +208,47 @@ static void bfqg_put(struct bfq_group *bfqg) - - static void bfqg_stats_update_io_add(struct bfq_group *bfqg, - struct bfq_queue *bfqq, -- int rw) -+ unsigned int op) - { -- blkg_rwstat_add(&bfqg->stats.queued, rw, 1); -+ blkg_rwstat_add(&bfqg->stats.queued, op, 1); - bfqg_stats_end_empty_time(&bfqg->stats); - if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue)) - bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); - } - --static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, int rw) --{ -- blkg_rwstat_add(&bfqg->stats.queued, rw, -1); --} -- --static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, int rw) -+static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) - { -- blkg_rwstat_add(&bfqg->stats.merged, rw, 1); -+ blkg_rwstat_add(&bfqg->stats.queued, op, -1); - } - --static void bfqg_stats_update_dispatch(struct bfq_group *bfqg, -- uint64_t bytes, int rw) -+static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) - { -- blkg_stat_add(&bfqg->stats.sectors, bytes >> 9); -- blkg_rwstat_add(&bfqg->stats.serviced, rw, 1); -- blkg_rwstat_add(&bfqg->stats.service_bytes, rw, bytes); -+ blkg_rwstat_add(&bfqg->stats.merged, op, 1); - } - - static void bfqg_stats_update_completion(struct bfq_group *bfqg, -- uint64_t start_time, uint64_t io_start_time, int rw) -+ uint64_t start_time, uint64_t io_start_time, -+ unsigned int op) - { - struct bfqg_stats *stats = &bfqg->stats; - unsigned long long now = sched_clock(); - - if (time_after64(now, io_start_time)) -- blkg_rwstat_add(&stats->service_time, rw, now - io_start_time); -+ blkg_rwstat_add(&stats->service_time, op, -+ now - io_start_time); - if (time_after64(io_start_time, start_time)) -- blkg_rwstat_add(&stats->wait_time, rw, -+ blkg_rwstat_add(&stats->wait_time, op, - io_start_time - start_time); - } - - /* @stats = 0 */ - static void bfqg_stats_reset(struct bfqg_stats *stats) - { -- if (!stats) -- return; -- - /* queued stats shouldn't be cleared */ -- blkg_rwstat_reset(&stats->service_bytes); -- blkg_rwstat_reset(&stats->serviced); - blkg_rwstat_reset(&stats->merged); - blkg_rwstat_reset(&stats->service_time); - blkg_rwstat_reset(&stats->wait_time); - blkg_stat_reset(&stats->time); -- blkg_stat_reset(&stats->unaccounted_time); - blkg_stat_reset(&stats->avg_queue_size_sum); - blkg_stat_reset(&stats->avg_queue_size_samples); - blkg_stat_reset(&stats->dequeue); -@@ -270,19 +258,16 @@ static void bfqg_stats_reset(struct bfqg_stats *stats) - } - - /* @to += @from */ --static void bfqg_stats_merge(struct bfqg_stats *to, struct bfqg_stats *from) -+static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from) - { - if (!to || !from) - return; - - /* queued stats shouldn't be cleared */ -- blkg_rwstat_add_aux(&to->service_bytes, &from->service_bytes); -- blkg_rwstat_add_aux(&to->serviced, &from->serviced); - blkg_rwstat_add_aux(&to->merged, &from->merged); - blkg_rwstat_add_aux(&to->service_time, &from->service_time); - blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); - blkg_stat_add_aux(&from->time, &from->time); -- blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time); - blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); - blkg_stat_add_aux(&to->avg_queue_size_samples, - 
&from->avg_queue_size_samples); -@@ -311,10 +296,8 @@ static void bfqg_stats_xfer_dead(struct bfq_group *bfqg) - if (unlikely(!parent)) - return; - -- bfqg_stats_merge(&parent->dead_stats, &bfqg->stats); -- bfqg_stats_merge(&parent->dead_stats, &bfqg->dead_stats); -+ bfqg_stats_add_aux(&parent->stats, &bfqg->stats); - bfqg_stats_reset(&bfqg->stats); -- bfqg_stats_reset(&bfqg->dead_stats); - } - - static void bfq_init_entity(struct bfq_entity *entity, -@@ -329,21 +312,17 @@ static void bfq_init_entity(struct bfq_entity *entity, - bfqq->ioprio_class = bfqq->new_ioprio_class; - bfqg_get(bfqg); - } -- entity->parent = bfqg->my_entity; -+ entity->parent = bfqg->my_entity; /* NULL for root group */ - entity->sched_data = &bfqg->sched_data; - } - - static void bfqg_stats_exit(struct bfqg_stats *stats) - { -- blkg_rwstat_exit(&stats->service_bytes); -- blkg_rwstat_exit(&stats->serviced); - blkg_rwstat_exit(&stats->merged); - blkg_rwstat_exit(&stats->service_time); - blkg_rwstat_exit(&stats->wait_time); - blkg_rwstat_exit(&stats->queued); -- blkg_stat_exit(&stats->sectors); - blkg_stat_exit(&stats->time); -- blkg_stat_exit(&stats->unaccounted_time); - blkg_stat_exit(&stats->avg_queue_size_sum); - blkg_stat_exit(&stats->avg_queue_size_samples); - blkg_stat_exit(&stats->dequeue); -@@ -354,15 +333,11 @@ static void bfqg_stats_exit(struct bfqg_stats *stats) - - static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) - { -- if (blkg_rwstat_init(&stats->service_bytes, gfp) || -- blkg_rwstat_init(&stats->serviced, gfp) || -- blkg_rwstat_init(&stats->merged, gfp) || -+ if (blkg_rwstat_init(&stats->merged, gfp) || - blkg_rwstat_init(&stats->service_time, gfp) || - blkg_rwstat_init(&stats->wait_time, gfp) || - blkg_rwstat_init(&stats->queued, gfp) || -- blkg_stat_init(&stats->sectors, gfp) || - blkg_stat_init(&stats->time, gfp) || -- blkg_stat_init(&stats->unaccounted_time, gfp) || - blkg_stat_init(&stats->avg_queue_size_sum, gfp) || - blkg_stat_init(&stats->avg_queue_size_samples, gfp) || - blkg_stat_init(&stats->dequeue, gfp) || -@@ -386,11 +361,27 @@ static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg) - return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq)); - } - -+static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) -+{ -+ struct bfq_group_data *bgd; -+ -+ bgd = kzalloc(sizeof(*bgd), gfp); -+ if (!bgd) -+ return NULL; -+ return &bgd->pd; -+} -+ - static void bfq_cpd_init(struct blkcg_policy_data *cpd) - { - struct bfq_group_data *d = cpd_to_bfqgd(cpd); - -- d->weight = BFQ_DEFAULT_GRP_WEIGHT; -+ d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ? 
-+ CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL; -+} -+ -+static void bfq_cpd_free(struct blkcg_policy_data *cpd) -+{ -+ kfree(cpd_to_bfqgd(cpd)); - } - - static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) -@@ -401,8 +392,7 @@ static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) - if (!bfqg) - return NULL; - -- if (bfqg_stats_init(&bfqg->stats, gfp) || -- bfqg_stats_init(&bfqg->dead_stats, gfp)) { -+ if (bfqg_stats_init(&bfqg->stats, gfp)) { - kfree(bfqg); - return NULL; - } -@@ -410,27 +400,20 @@ static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) - return &bfqg->pd; - } - --static void bfq_group_set_parent(struct bfq_group *bfqg, -- struct bfq_group *parent) -+static void bfq_pd_init(struct blkg_policy_data *pd) - { -+ struct blkcg_gq *blkg; -+ struct bfq_group *bfqg; -+ struct bfq_data *bfqd; - struct bfq_entity *entity; -+ struct bfq_group_data *d; - -- BUG_ON(!parent); -- BUG_ON(!bfqg); -- BUG_ON(bfqg == parent); -- -+ blkg = pd_to_blkg(pd); -+ BUG_ON(!blkg); -+ bfqg = blkg_to_bfqg(blkg); -+ bfqd = blkg->q->elevator->elevator_data; - entity = &bfqg->entity; -- entity->parent = parent->my_entity; -- entity->sched_data = &parent->sched_data; --} -- --static void bfq_pd_init(struct blkg_policy_data *pd) --{ -- struct blkcg_gq *blkg = pd_to_blkg(pd); -- struct bfq_group *bfqg = blkg_to_bfqg(blkg); -- struct bfq_data *bfqd = blkg->q->elevator->elevator_data; -- struct bfq_entity *entity = &bfqg->entity; -- struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg); -+ d = blkcg_to_bfqgd(blkg->blkcg); - - entity->orig_weight = entity->weight = entity->new_weight = d->weight; - entity->my_sched_data = &bfqg->sched_data; -@@ -448,70 +431,53 @@ static void bfq_pd_free(struct blkg_policy_data *pd) - struct bfq_group *bfqg = pd_to_bfqg(pd); - - bfqg_stats_exit(&bfqg->stats); -- bfqg_stats_exit(&bfqg->dead_stats); -- - return kfree(bfqg); - } - --/* offset delta from bfqg->stats to bfqg->dead_stats */ --static const int dead_stats_off_delta = offsetof(struct bfq_group, dead_stats) - -- offsetof(struct bfq_group, stats); -- --/* to be used by recursive prfill, sums live and dead stats recursively */ --static u64 bfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off) -+static void bfq_pd_reset_stats(struct blkg_policy_data *pd) - { -- u64 sum = 0; -+ struct bfq_group *bfqg = pd_to_bfqg(pd); - -- sum += blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off); -- sum += blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, -- off + dead_stats_off_delta); -- return sum; -+ bfqg_stats_reset(&bfqg->stats); - } - --/* to be used by recursive prfill, sums live and dead rwstats recursively */ --static struct blkg_rwstat --bfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd, int off) -+static void bfq_group_set_parent(struct bfq_group *bfqg, -+ struct bfq_group *parent) - { -- struct blkg_rwstat a, b; -+ struct bfq_entity *entity; - -- a = blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off); -- b = blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, -- off + dead_stats_off_delta); -- blkg_rwstat_add_aux(&a, &b); -- return a; -+ BUG_ON(!parent); -+ BUG_ON(!bfqg); -+ BUG_ON(bfqg == parent); -+ -+ entity = &bfqg->entity; -+ entity->parent = parent->my_entity; -+ entity->sched_data = &parent->sched_data; - } - --static void bfq_pd_reset_stats(struct blkg_policy_data *pd) -+static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd, -+ struct blkcg *blkcg) - { -- struct bfq_group *bfqg = pd_to_bfqg(pd); -+ struct blkcg_gq 
*blkg; - -- bfqg_stats_reset(&bfqg->stats); -- bfqg_stats_reset(&bfqg->dead_stats); -+ blkg = blkg_lookup(blkcg, bfqd->queue); -+ if (likely(blkg)) -+ return blkg_to_bfqg(blkg); -+ return NULL; - } - --static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, -- struct blkcg *blkcg) -+static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, -+ struct blkcg *blkcg) - { -- struct request_queue *q = bfqd->queue; -- struct bfq_group *bfqg = NULL, *parent; -- struct bfq_entity *entity = NULL; -+ struct bfq_group *bfqg, *parent; -+ struct bfq_entity *entity; - - assert_spin_locked(bfqd->queue->queue_lock); - -- /* avoid lookup for the common case where there's no blkcg */ -- if (blkcg == &blkcg_root) { -- bfqg = bfqd->root_group; -- } else { -- struct blkcg_gq *blkg; -- -- blkg = blkg_lookup_create(blkcg, q); -- if (!IS_ERR(blkg)) -- bfqg = blkg_to_bfqg(blkg); -- else /* fallback to root_group */ -- bfqg = bfqd->root_group; -- } -+ bfqg = bfq_lookup_bfqg(bfqd, blkcg); - -- BUG_ON(!bfqg); -+ if (unlikely(!bfqg)) -+ return NULL; - - /* - * Update chain of bfq_groups as we might be handling a leaf group -@@ -537,11 +503,15 @@ static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, - static void bfq_pos_tree_add_move(struct bfq_data *bfqd, - struct bfq_queue *bfqq); - -+static void bfq_bfqq_expire(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ bool compensate, -+ enum bfqq_expiration reason); -+ - /** - * bfq_bfqq_move - migrate @bfqq to @bfqg. - * @bfqd: queue descriptor. - * @bfqq: the queue to move. -- * @entity: @bfqq's entity. - * @bfqg: the group to move to. - * - * Move @bfqq to @bfqg, deactivating it from its old group and reactivating -@@ -552,26 +522,40 @@ static void bfq_pos_tree_add_move(struct bfq_data *bfqd, - * rcu_read_lock()). - */ - static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, -- struct bfq_entity *entity, struct bfq_group *bfqg) -+ struct bfq_group *bfqg) - { -- int busy, resume; -- -- busy = bfq_bfqq_busy(bfqq); -- resume = !RB_EMPTY_ROOT(&bfqq->sort_list); -+ struct bfq_entity *entity = &bfqq->entity; - -- BUG_ON(resume && !entity->on_st); -- BUG_ON(busy && !resume && entity->on_st && -+ BUG_ON(!bfq_bfqq_busy(bfqq) && !RB_EMPTY_ROOT(&bfqq->sort_list)); -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list) && !entity->on_st); -+ BUG_ON(bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) -+ && entity->on_st && - bfqq != bfqd->in_service_queue); -+ BUG_ON(!bfq_bfqq_busy(bfqq) && bfqq == bfqd->in_service_queue); -+ -+ /* If bfqq is empty, then bfq_bfqq_expire also invokes -+ * bfq_del_bfqq_busy, thereby removing bfqq and its entity -+ * from data structures related to current group. Otherwise we -+ * need to remove bfqq explicitly with bfq_deactivate_bfqq, as -+ * we do below. 
-+ */ -+ if (bfqq == bfqd->in_service_queue) -+ bfq_bfqq_expire(bfqd, bfqd->in_service_queue, -+ false, BFQ_BFQQ_PREEMPTED); -+ -+ BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq) -+ && &bfq_entity_service_tree(entity)->idle != -+ entity->tree); - -- if (busy) { -- BUG_ON(atomic_read(&bfqq->ref) < 2); -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq)); - -- if (!resume) -- bfq_del_bfqq_busy(bfqd, bfqq, 0); -- else -- bfq_deactivate_bfqq(bfqd, bfqq, 0); -- } else if (entity->on_st) -+ if (bfq_bfqq_busy(bfqq)) -+ bfq_deactivate_bfqq(bfqd, bfqq, false, false); -+ else if (entity->on_st) { -+ BUG_ON(&bfq_entity_service_tree(entity)->idle != -+ entity->tree); - bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); -+ } - bfqg_put(bfqq_group(bfqq)); - - /* -@@ -583,14 +567,17 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, - entity->sched_data = &bfqg->sched_data; - bfqg_get(bfqg); - -- if (busy) { -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq)); -+ if (bfq_bfqq_busy(bfqq)) { - bfq_pos_tree_add_move(bfqd, bfqq); -- if (resume) -- bfq_activate_bfqq(bfqd, bfqq); -+ bfq_activate_bfqq(bfqd, bfqq); - } - - if (!bfqd->in_service_queue && !bfqd->rq_in_driver) - bfq_schedule_dispatch(bfqd); -+ BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq) -+ && &bfq_entity_service_tree(entity)->idle != -+ entity->tree); - } - - /** -@@ -617,7 +604,11 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, - - lockdep_assert_held(bfqd->queue->queue_lock); - -- bfqg = bfq_find_alloc_group(bfqd, blkcg); -+ bfqg = bfq_find_set_group(bfqd, blkcg); -+ -+ if (unlikely(!bfqg)) -+ bfqg = bfqd->root_group; -+ - if (async_bfqq) { - entity = &async_bfqq->entity; - -@@ -625,7 +616,8 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, - bic_set_bfqq(bic, NULL, 0); - bfq_log_bfqq(bfqd, async_bfqq, - "bic_change_group: %p %d", -- async_bfqq, atomic_read(&async_bfqq->ref)); -+ async_bfqq, -+ async_bfqq->ref); - bfq_put_queue(async_bfqq); - } - } -@@ -633,7 +625,7 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, - if (sync_bfqq) { - entity = &sync_bfqq->entity; - if (entity->sched_data != &bfqg->sched_data) -- bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg); -+ bfq_bfqq_move(bfqd, sync_bfqq, bfqg); - } - - return bfqg; -@@ -642,25 +634,23 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, - static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) - { - struct bfq_data *bfqd = bic_to_bfqd(bic); -- struct blkcg *blkcg; - struct bfq_group *bfqg = NULL; -- uint64_t id; -+ uint64_t serial_nr; - - rcu_read_lock(); -- blkcg = bio_blkcg(bio); -- id = blkcg->css.serial_nr; -- rcu_read_unlock(); -+ serial_nr = bio_blkcg(bio)->css.serial_nr; - - /* - * Check whether blkcg has changed. The condition may trigger - * spuriously on a newly created cic but there's no harm. 
- */ -- if (unlikely(!bfqd) || likely(bic->blkcg_id == id)) -- return; -+ if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr)) -+ goto out; - -- bfqg = __bfq_bic_change_cgroup(bfqd, bic, blkcg); -- BUG_ON(!bfqg); -- bic->blkcg_id = id; -+ bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio)); -+ bic->blkcg_serial_nr = serial_nr; -+out: -+ rcu_read_unlock(); - } - - /** -@@ -672,7 +662,7 @@ static void bfq_flush_idle_tree(struct bfq_service_tree *st) - struct bfq_entity *entity = st->first_idle; - - for (; entity ; entity = st->first_idle) -- __bfq_deactivate_entity(entity, 0); -+ __bfq_deactivate_entity(entity, false); - } - - /** -@@ -686,7 +676,7 @@ static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - - BUG_ON(!bfqq); -- bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group); -+ bfq_bfqq_move(bfqd, bfqq, bfqd->root_group); - } - - /** -@@ -717,11 +707,12 @@ static void bfq_reparent_active_entities(struct bfq_data *bfqd, - } - - /** -- * bfq_destroy_group - destroy @bfqg. -- * @bfqg: the group being destroyed. -+ * bfq_pd_offline - deactivate the entity associated with @pd, -+ * and reparent its children entities. -+ * @pd: descriptor of the policy going offline. - * -- * Destroy @bfqg, making sure that it is not referenced from its parent. -- * blkio already grabs the queue_lock for us, so no need to use RCU-based magic -+ * blkio already grabs the queue_lock for us, so no need to use -+ * RCU-based magic - */ - static void bfq_pd_offline(struct blkg_policy_data *pd) - { -@@ -776,10 +767,15 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) - BUG_ON(bfqg->sched_data.next_in_service); - BUG_ON(bfqg->sched_data.in_service_entity); - -- __bfq_deactivate_entity(entity, 0); -+ __bfq_deactivate_entity(entity, false); - bfq_put_async_queues(bfqd, bfqg); -- BUG_ON(entity->tree); - -+ /* -+ * @blkg is going offline and will be ignored by -+ * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so -+ * that they don't get lost. If IOs complete after this point, the -+ * stats for them will be lost. Oh well... 
-+ */ - bfqg_stats_xfer_dead(bfqg); - } - -@@ -789,46 +785,35 @@ static void bfq_end_wr_async(struct bfq_data *bfqd) - - list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) { - struct bfq_group *bfqg = blkg_to_bfqg(blkg); -+ BUG_ON(!bfqg); - - bfq_end_wr_async_queues(bfqd, bfqg); - } - bfq_end_wr_async_queues(bfqd, bfqd->root_group); - } - --static u64 bfqio_cgroup_weight_read(struct cgroup_subsys_state *css, -- struct cftype *cftype) --{ -- struct blkcg *blkcg = css_to_blkcg(css); -- struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); -- int ret = -EINVAL; -- -- spin_lock_irq(&blkcg->lock); -- ret = bfqgd->weight; -- spin_unlock_irq(&blkcg->lock); -- -- return ret; --} -- --static int bfqio_cgroup_weight_read_dfl(struct seq_file *sf, void *v) -+static int bfq_io_show_weight(struct seq_file *sf, void *v) - { - struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); - struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); -+ unsigned int val = 0; - -- spin_lock_irq(&blkcg->lock); -- seq_printf(sf, "%u\n", bfqgd->weight); -- spin_unlock_irq(&blkcg->lock); -+ if (bfqgd) -+ val = bfqgd->weight; -+ -+ seq_printf(sf, "%u\n", val); - - return 0; - } - --static int bfqio_cgroup_weight_write(struct cgroup_subsys_state *css, -- struct cftype *cftype, -- u64 val) -+static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css, -+ struct cftype *cftype, -+ u64 val) - { - struct blkcg *blkcg = css_to_blkcg(css); - struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); - struct blkcg_gq *blkg; -- int ret = -EINVAL; -+ int ret = -ERANGE; - - if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT) - return ret; -@@ -873,13 +858,18 @@ static int bfqio_cgroup_weight_write(struct cgroup_subsys_state *css, - return ret; - } - --static ssize_t bfqio_cgroup_weight_write_dfl(struct kernfs_open_file *of, -- char *buf, size_t nbytes, -- loff_t off) -+static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, -+ char *buf, size_t nbytes, -+ loff_t off) - { -+ u64 weight; - /* First unsigned long found in the file is used */ -- return bfqio_cgroup_weight_write(of_css(of), NULL, -- simple_strtoull(strim(buf), NULL, 0)); -+ int ret = kstrtoull(strim(buf), 0, &weight); -+ -+ if (ret) -+ return ret; -+ -+ return bfq_io_set_weight_legacy(of_css(of), NULL, weight); - } - - static int bfqg_print_stat(struct seq_file *sf, void *v) -@@ -899,16 +889,17 @@ static int bfqg_print_rwstat(struct seq_file *sf, void *v) - static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, - struct blkg_policy_data *pd, int off) - { -- u64 sum = bfqg_stat_pd_recursive_sum(pd, off); -- -+ u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd), -+ &blkcg_policy_bfq, off); - return __blkg_prfill_u64(sf, pd, sum); - } - - static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, - struct blkg_policy_data *pd, int off) - { -- struct blkg_rwstat sum = bfqg_rwstat_pd_recursive_sum(pd, off); -- -+ struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd), -+ &blkcg_policy_bfq, -+ off); - return __blkg_prfill_rwstat(sf, pd, &sum); - } - -@@ -928,6 +919,41 @@ static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) - return 0; - } - -+static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, -+ int off) -+{ -+ u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes); -+ -+ return __blkg_prfill_u64(sf, pd, sum >> 9); -+} -+ -+static int bfqg_print_stat_sectors(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), -+ bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false); -+ return 0; -+} -+ 
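[Editor's note, illustration only, not part of the patch] With this change the bfq.sectors statistic is no longer read from a BFQ-private counter; bfqg_prfill_sectors above derives it from the blkcg byte counters, i.e. total read plus write bytes shifted right by 9 to obtain 512-byte sectors. The sketch below reproduces that arithmetic with made-up byte counts.

/*
 * Illustration only: how a byte total is converted to a sector count,
 * as done by bfqg_prfill_sectors above. The sample figures are made up.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t read_bytes  = 7340032;	/* e.g. 7 MiB of reads  */
	uint64_t write_bytes = 1048576;	/* e.g. 1 MiB of writes */

	/* total bytes divided by 512 (>> 9) gives 512-byte sectors */
	uint64_t sectors = (read_bytes + write_bytes) >> 9;

	printf("%llu\n", (unsigned long long)sectors);	/* prints 16384 */
	return 0;
}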
-+static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf, -+ struct blkg_policy_data *pd, int off) -+{ -+ struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL, -+ offsetof(struct blkcg_gq, stat_bytes)); -+ u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + -+ atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); -+ -+ return __blkg_prfill_u64(sf, pd, sum >> 9); -+} -+ -+static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), -+ bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0, -+ false); -+ return 0; -+} -+ -+ - static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf, - struct blkg_policy_data *pd, int off) - { -@@ -964,38 +990,15 @@ bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) - return blkg_to_bfqg(bfqd->queue->root_blkg); - } - --static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) --{ -- struct bfq_group_data *bgd; -- -- bgd = kzalloc(sizeof(*bgd), GFP_KERNEL); -- if (!bgd) -- return NULL; -- return &bgd->pd; --} -- --static void bfq_cpd_free(struct blkcg_policy_data *cpd) --{ -- kfree(cpd_to_bfqgd(cpd)); --} -- --static struct cftype bfqio_files_dfl[] = { -+static struct cftype bfq_blkcg_legacy_files[] = { - { -- .name = "weight", -+ .name = "bfq.weight", - .flags = CFTYPE_NOT_ON_ROOT, -- .seq_show = bfqio_cgroup_weight_read_dfl, -- .write = bfqio_cgroup_weight_write_dfl, -+ .seq_show = bfq_io_show_weight, -+ .write_u64 = bfq_io_set_weight_legacy, - }, -- {} /* terminate */ --}; - --static struct cftype bfqio_files[] = { -- { -- .name = "bfq.weight", -- .read_u64 = bfqio_cgroup_weight_read, -- .write_u64 = bfqio_cgroup_weight_write, -- }, -- /* statistics, cover only the tasks in the bfqg */ -+ /* statistics, covers only the tasks in the bfqg */ - { - .name = "bfq.time", - .private = offsetof(struct bfq_group, stats.time), -@@ -1003,18 +1006,17 @@ static struct cftype bfqio_files[] = { - }, - { - .name = "bfq.sectors", -- .private = offsetof(struct bfq_group, stats.sectors), -- .seq_show = bfqg_print_stat, -+ .seq_show = bfqg_print_stat_sectors, - }, - { - .name = "bfq.io_service_bytes", -- .private = offsetof(struct bfq_group, stats.service_bytes), -- .seq_show = bfqg_print_rwstat, -+ .private = (unsigned long)&blkcg_policy_bfq, -+ .seq_show = blkg_print_stat_bytes, - }, - { - .name = "bfq.io_serviced", -- .private = offsetof(struct bfq_group, stats.serviced), -- .seq_show = bfqg_print_rwstat, -+ .private = (unsigned long)&blkcg_policy_bfq, -+ .seq_show = blkg_print_stat_ios, - }, - { - .name = "bfq.io_service_time", -@@ -1045,18 +1047,17 @@ static struct cftype bfqio_files[] = { - }, - { - .name = "bfq.sectors_recursive", -- .private = offsetof(struct bfq_group, stats.sectors), -- .seq_show = bfqg_print_stat_recursive, -+ .seq_show = bfqg_print_stat_sectors_recursive, - }, - { - .name = "bfq.io_service_bytes_recursive", -- .private = offsetof(struct bfq_group, stats.service_bytes), -- .seq_show = bfqg_print_rwstat_recursive, -+ .private = (unsigned long)&blkcg_policy_bfq, -+ .seq_show = blkg_print_stat_bytes_recursive, - }, - { - .name = "bfq.io_serviced_recursive", -- .private = offsetof(struct bfq_group, stats.serviced), -- .seq_show = bfqg_print_rwstat_recursive, -+ .private = (unsigned long)&blkcg_policy_bfq, -+ .seq_show = blkg_print_stat_ios_recursive, - }, - { - .name = "bfq.io_service_time_recursive", -@@ -1102,31 +1103,42 @@ static struct cftype bfqio_files[] = { - .private = offsetof(struct bfq_group, stats.dequeue), - .seq_show = bfqg_print_stat, - }, -- 
{ -- .name = "bfq.unaccounted_time", -- .private = offsetof(struct bfq_group, stats.unaccounted_time), -- .seq_show = bfqg_print_stat, -- }, - { } /* terminate */ - }; - --static struct blkcg_policy blkcg_policy_bfq = { -- .dfl_cftypes = bfqio_files_dfl, -- .legacy_cftypes = bfqio_files, -- -- .pd_alloc_fn = bfq_pd_alloc, -- .pd_init_fn = bfq_pd_init, -- .pd_offline_fn = bfq_pd_offline, -- .pd_free_fn = bfq_pd_free, -- .pd_reset_stats_fn = bfq_pd_reset_stats, -- -- .cpd_alloc_fn = bfq_cpd_alloc, -- .cpd_init_fn = bfq_cpd_init, -- .cpd_bind_fn = bfq_cpd_init, -- .cpd_free_fn = bfq_cpd_free, -+static struct cftype bfq_blkg_files[] = { -+ { -+ .name = "bfq.weight", -+ .flags = CFTYPE_NOT_ON_ROOT, -+ .seq_show = bfq_io_show_weight, -+ .write = bfq_io_set_weight, -+ }, -+ {} /* terminate */ - }; - --#else -+#else /* CONFIG_BFQ_GROUP_IOSCHED */ -+ -+static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg, -+ struct bfq_queue *bfqq, unsigned int op) { } -+static inline void -+bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { } -+static inline void -+bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { } -+static inline void bfqg_stats_update_completion(struct bfq_group *bfqg, -+ uint64_t start_time, uint64_t io_start_time, -+ unsigned int op) { } -+static inline void -+bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, -+ struct bfq_group *curr_bfqg) { } -+static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { } -+static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } -+static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { } -+static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } -+static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } -+static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } -+ -+static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct bfq_group *bfqg) {} - - static void bfq_init_entity(struct bfq_entity *entity, - struct bfq_group *bfqg) -@@ -1142,35 +1154,22 @@ static void bfq_init_entity(struct bfq_entity *entity, - entity->sched_data = &bfqg->sched_data; - } - --static struct bfq_group * --bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) --{ -- struct bfq_data *bfqd = bic_to_bfqd(bic); -- -- return bfqd->root_group; --} -- --static void bfq_bfqq_move(struct bfq_data *bfqd, -- struct bfq_queue *bfqq, -- struct bfq_entity *entity, -- struct bfq_group *bfqg) --{ --} -+static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) {} - - static void bfq_end_wr_async(struct bfq_data *bfqd) - { - bfq_end_wr_async_queues(bfqd, bfqd->root_group); - } - --static void bfq_disconnect_groups(struct bfq_data *bfqd) -+static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, -+ struct blkcg *blkcg) - { -- bfq_put_async_queues(bfqd, bfqd->root_group); -+ return bfqd->root_group; - } - --static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, -- struct blkcg *blkcg) -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) - { -- return bfqd->root_group; -+ return bfqq->bfqd->root_group; - } - - static struct bfq_group * -diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c -index cf3e9b1..a56888e 100644 ---- a/block/bfq-iosched.c -+++ b/block/bfq-iosched.c -@@ -1,5 +1,5 @@ - /* -- * Budget Fair Queueing (BFQ) disk scheduler. -+ * Budget Fair Queueing (BFQ) I/O scheduler. 
- * - * Based on ideas and code from CFQ: - * Copyright (C) 2003 Jens Axboe -@@ -7,25 +7,34 @@ - * Copyright (C) 2008 Fabio Checconi - * Paolo Valente - * -- * Copyright (C) 2010 Paolo Valente -+ * Copyright (C) 2015 Paolo Valente -+ * -+ * Copyright (C) 2017 Paolo Valente - * - * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ - * file. - * -- * BFQ is a proportional-share storage-I/O scheduling algorithm based on -- * the slice-by-slice service scheme of CFQ. But BFQ assigns budgets, -- * measured in number of sectors, to processes instead of time slices. The -- * device is not granted to the in-service process for a given time slice, -- * but until it has exhausted its assigned budget. This change from the time -- * to the service domain allows BFQ to distribute the device throughput -- * among processes as desired, without any distortion due to ZBR, workload -- * fluctuations or other factors. BFQ uses an ad hoc internal scheduler, -- * called B-WF2Q+, to schedule processes according to their budgets. More -- * precisely, BFQ schedules queues associated to processes. Thanks to the -- * accurate policy of B-WF2Q+, BFQ can afford to assign high budgets to -- * I/O-bound processes issuing sequential requests (to boost the -- * throughput), and yet guarantee a low latency to interactive and soft -- * real-time applications. -+ * BFQ is a proportional-share I/O scheduler, with some extra -+ * low-latency capabilities. BFQ also supports full hierarchical -+ * scheduling through cgroups. Next paragraphs provide an introduction -+ * on BFQ inner workings. Details on BFQ benefits and usage can be -+ * found in Documentation/block/bfq-iosched.txt. -+ * -+ * BFQ is a proportional-share storage-I/O scheduling algorithm based -+ * on the slice-by-slice service scheme of CFQ. But BFQ assigns -+ * budgets, measured in number of sectors, to processes instead of -+ * time slices. The device is not granted to the in-service process -+ * for a given time slice, but until it has exhausted its assigned -+ * budget. This change from the time to the service domain enables BFQ -+ * to distribute the device throughput among processes as desired, -+ * without any distortion due to throughput fluctuations, or to device -+ * internal queueing. BFQ uses an ad hoc internal scheduler, called -+ * B-WF2Q+, to schedule processes according to their budgets. More -+ * precisely, BFQ schedules queues associated with processes. Thanks to -+ * the accurate policy of B-WF2Q+, BFQ can afford to assign high -+ * budgets to I/O-bound processes issuing sequential requests (to -+ * boost the throughput), and yet guarantee a low latency to -+ * interactive and soft real-time applications. - * - * BFQ is described in [1], where also a reference to the initial, more - * theoretical paper on BFQ can be found. The interested reader can find -@@ -40,10 +49,10 @@ - * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N) - * complexity derives from the one introduced with EEVDF in [3]. - * -- * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness -- * with the BFQ Disk I/O Scheduler'', -- * Proceedings of the 5th Annual International Systems and Storage -- * Conference (SYSTOR '12), June 2012. -+ * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O -+ * Scheduler", Proceedings of the First Workshop on Mobile System -+ * Technologies (MST-2015), May 2015. 
-+ * http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf - * - * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf - * -@@ -67,27 +76,26 @@ - #include - #include - #include --#include "bfq.h" - #include "blk.h" -+#include "bfq.h" - --/* Expiration time of sync (0) and async (1) requests, in jiffies. */ --static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; -+/* Expiration time of sync (0) and async (1) requests, in ns. */ -+static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; - - /* Maximum backwards seek, in KiB. */ --static const int bfq_back_max = 16 * 1024; -+static const int bfq_back_max = (16 * 1024); - - /* Penalty of a backwards seek, in number of sectors. */ - static const int bfq_back_penalty = 2; - --/* Idling period duration, in jiffies. */ --static int bfq_slice_idle = HZ / 125; -+/* Idling period duration, in ns. */ -+static u32 bfq_slice_idle = (NSEC_PER_SEC / 125); - - /* Minimum number of assigned budgets for which stats are safe to compute. */ - static const int bfq_stats_min_budgets = 194; - - /* Default maximum budget values, in sectors and number of requests. */ --static const int bfq_default_max_budget = 16 * 1024; --static const int bfq_max_budget_async_rq = 4; -+static const int bfq_default_max_budget = (16 * 1024); - - /* - * Async to sync throughput distribution is controlled as follows: -@@ -97,23 +105,28 @@ static const int bfq_max_budget_async_rq = 4; - static const int bfq_async_charge_factor = 10; - - /* Default timeout values, in jiffies, approximating CFQ defaults. */ --static const int bfq_timeout_sync = HZ / 8; --static int bfq_timeout_async = HZ / 25; -+static const int bfq_timeout = (HZ / 8); - --struct kmem_cache *bfq_pool; -+static struct kmem_cache *bfq_pool; - --/* Below this threshold (in ms), we consider thinktime immediate. */ --#define BFQ_MIN_TT 2 -+/* Below this threshold (in ns), we consider thinktime immediate. */ -+#define BFQ_MIN_TT (2 * NSEC_PER_MSEC) - - /* hw_tag detection: parallel requests threshold and min samples needed. */ - #define BFQ_HW_QUEUE_THRESHOLD 4 - #define BFQ_HW_QUEUE_SAMPLES 32 - --#define BFQQ_SEEK_THR (sector_t)(8 * 1024) --#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR) -+#define BFQQ_SEEK_THR (sector_t)(8 * 100) -+#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32) -+#define BFQQ_CLOSE_THR (sector_t)(8 * 1024) -+#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8) - --/* Min samples used for peak rate estimation (for autotuning). */ --#define BFQ_PEAK_RATE_SAMPLES 32 -+/* Min number of samples required to perform peak-rate update */ -+#define BFQ_RATE_MIN_SAMPLES 32 -+/* Min observation time interval required to perform a peak-rate update (ns) */ -+#define BFQ_RATE_MIN_INTERVAL (300*NSEC_PER_MSEC) -+/* Target observation time interval for a peak-rate update (ns) */ -+#define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC - - /* Shift used for peak rate fixed precision calculations. */ - #define BFQ_RATE_SHIFT 16 -@@ -141,16 +154,24 @@ struct kmem_cache *bfq_pool; - * The device's speed class is dynamically (re)detected in - * bfq_update_peak_rate() every time the estimated peak rate is updated. - * -- * In the following definitions, R_slow[0]/R_fast[0] and T_slow[0]/T_fast[0] -- * are the reference values for a slow/fast rotational device, whereas -- * R_slow[1]/R_fast[1] and T_slow[1]/T_fast[1] are the reference values for -- * a slow/fast non-rotational device. 
Finally, device_speed_thresh are the -- * thresholds used to switch between speed classes. -+ * In the following definitions, R_slow[0]/R_fast[0] and -+ * T_slow[0]/T_fast[0] are the reference values for a slow/fast -+ * rotational device, whereas R_slow[1]/R_fast[1] and -+ * T_slow[1]/T_fast[1] are the reference values for a slow/fast -+ * non-rotational device. Finally, device_speed_thresh are the -+ * thresholds used to switch between speed classes. The reference -+ * rates are not the actual peak rates of the devices used as a -+ * reference, but slightly lower values. The reason for using these -+ * slightly lower values is that the peak-rate estimator tends to -+ * yield slightly lower values than the actual peak rate (it can yield -+ * the actual peak rate only if there is only one process doing I/O, -+ * and the process does sequential I/O). -+ * - * Both the reference peak rates and the thresholds are measured in - * sectors/usec, left-shifted by BFQ_RATE_SHIFT. - */ --static int R_slow[2] = {1536, 10752}; --static int R_fast[2] = {17415, 34791}; -+static int R_slow[2] = {1000, 10700}; -+static int R_fast[2] = {14000, 33000}; - /* - * To improve readability, a conversion function is used to initialize the - * following arrays, which entails that they can be initialized only in a -@@ -178,18 +199,6 @@ static void bfq_schedule_dispatch(struct bfq_data *bfqd); - #define bfq_sample_valid(samples) ((samples) > 80) - - /* -- * We regard a request as SYNC, if either it's a read or has the SYNC bit -- * set (in which case it could also be a direct WRITE). -- */ --static int bfq_bio_sync(struct bio *bio) --{ -- if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC)) -- return 1; -- -- return 0; --} -- --/* - * Scheduler run of queue, if there are requests pending and no one in the - * driver that will restart queueing. - */ -@@ -409,11 +418,7 @@ static bool bfq_differentiated_weights(struct bfq_data *bfqd) - */ - static bool bfq_symmetric_scenario(struct bfq_data *bfqd) - { -- return --#ifdef CONFIG_BFQ_GROUP_IOSCHED -- !bfqd->active_numerous_groups && --#endif -- !bfq_differentiated_weights(bfqd); -+ return !bfq_differentiated_weights(bfqd); - } - - /* -@@ -469,6 +474,22 @@ static void bfq_weights_tree_add(struct bfq_data *bfqd, - - entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter), - GFP_ATOMIC); -+ -+ /* -+ * In the unlucky event of an allocation failure, we just -+ * exit. This will cause the weight of entity to not be -+ * considered in bfq_differentiated_weights, which, in its -+ * turn, causes the scenario to be deemed wrongly symmetric in -+ * case entity's weight would have been the only weight making -+ * the scenario asymmetric. On the bright side, no unbalance -+ * will however occur when entity becomes inactive again (the -+ * invocation of this function is triggered by an activation -+ * of entity). In fact, bfq_weights_tree_remove does nothing -+ * if !entity->weight_counter. -+ */ -+ if (unlikely(!entity->weight_counter)) -+ return; -+ - entity->weight_counter->weight = entity->weight; - rb_link_node(&entity->weight_counter->weights_node, parent, new); - rb_insert_color(&entity->weight_counter->weights_node, root); -@@ -505,13 +526,45 @@ static void bfq_weights_tree_remove(struct bfq_data *bfqd, - entity->weight_counter = NULL; - } - -+/* -+ * Return expired entry, or NULL to just start from scratch in rbtree. 
-+ */ -+static struct request *bfq_check_fifo(struct bfq_queue *bfqq, -+ struct request *last) -+{ -+ struct request *rq; -+ -+ if (bfq_bfqq_fifo_expire(bfqq)) -+ return NULL; -+ -+ bfq_mark_bfqq_fifo_expire(bfqq); -+ -+ rq = rq_entry_fifo(bfqq->fifo.next); -+ -+ if (rq == last || ktime_get_ns() < rq->fifo_time) -+ return NULL; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq); -+ BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); -+ return rq; -+} -+ - static struct request *bfq_find_next_rq(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - struct request *last) - { - struct rb_node *rbnext = rb_next(&last->rb_node); - struct rb_node *rbprev = rb_prev(&last->rb_node); -- struct request *next = NULL, *prev = NULL; -+ struct request *next, *prev = NULL; -+ -+ BUG_ON(list_empty(&bfqq->fifo)); -+ -+ /* Follow expired path, else get first next available. */ -+ next = bfq_check_fifo(bfqq, last); -+ if (next) { -+ BUG_ON(next == last); -+ return next; -+ } - - BUG_ON(RB_EMPTY_NODE(&last->rb_node)); - -@@ -533,9 +586,19 @@ static struct request *bfq_find_next_rq(struct bfq_data *bfqd, - static unsigned long bfq_serv_to_charge(struct request *rq, - struct bfq_queue *bfqq) - { -- return blk_rq_sectors(rq) * -- (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->wr_coeff == 1) * -- bfq_async_charge_factor)); -+ if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1) -+ return blk_rq_sectors(rq); -+ -+ /* -+ * If there are no weight-raised queues, then amplify service -+ * by just the async charge factor; otherwise amplify service -+ * by twice the async charge factor, to further reduce latency -+ * for weight-raised queues. -+ */ -+ if (bfqq->bfqd->wr_busy_queues == 0) -+ return blk_rq_sectors(rq) * bfq_async_charge_factor; -+ -+ return blk_rq_sectors(rq) * 2 * bfq_async_charge_factor; - } - - /** -@@ -576,7 +639,7 @@ static void bfq_updated_next_req(struct bfq_data *bfqd, - entity->budget = new_budget; - bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", - new_budget); -- bfq_activate_bfqq(bfqd, bfqq); -+ bfq_requeue_bfqq(bfqd, bfqq); - } - } - -@@ -590,12 +653,23 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd) - dur = bfqd->RT_prod; - do_div(dur, bfqd->peak_rate); - -- return dur; --} -+ /* -+ * Limit duration between 3 and 13 seconds. Tests show that -+ * higher values than 13 seconds often yield the opposite of -+ * the desired result, i.e., worsen responsiveness by letting -+ * non-interactive and non-soft-real-time applications -+ * preserve weight raising for a too long time interval. -+ * -+ * On the other end, lower values than 3 seconds make it -+ * difficult for most interactive tasks to complete their jobs -+ * before weight-raising finishes. -+ */ -+ if (dur > msecs_to_jiffies(13000)) -+ dur = msecs_to_jiffies(13000); -+ else if (dur < msecs_to_jiffies(3000)) -+ dur = msecs_to_jiffies(3000); - --static unsigned int bfq_bfqq_cooperations(struct bfq_queue *bfqq) --{ -- return bfqq->bic ? 
bfqq->bic->cooperations : 0; -+ return dur; - } - - static void -@@ -605,31 +679,31 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic) - bfq_mark_bfqq_idle_window(bfqq); - else - bfq_clear_bfqq_idle_window(bfqq); -+ - if (bic->saved_IO_bound) - bfq_mark_bfqq_IO_bound(bfqq); - else - bfq_clear_bfqq_IO_bound(bfqq); -- /* Assuming that the flag in_large_burst is already correctly set */ -- if (bic->wr_time_left && bfqq->bfqd->low_latency && -- !bfq_bfqq_in_large_burst(bfqq) && -- bic->cooperations < bfqq->bfqd->bfq_coop_thresh) { -- /* -- * Start a weight raising period with the duration given by -- * the raising_time_left snapshot. -- */ -- if (bfq_bfqq_busy(bfqq)) -- bfqq->bfqd->wr_busy_queues++; -- bfqq->wr_coeff = bfqq->bfqd->bfq_wr_coeff; -- bfqq->wr_cur_max_time = bic->wr_time_left; -- bfqq->last_wr_start_finish = jiffies; -- bfqq->entity.prio_changed = 1; -+ -+ bfqq->wr_coeff = bic->saved_wr_coeff; -+ bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; -+ BUG_ON(time_is_after_jiffies(bfqq->wr_start_at_switch_to_srt)); -+ bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish; -+ bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; -+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); -+ -+ if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || -+ time_is_before_jiffies(bfqq->last_wr_start_finish + -+ bfqq->wr_cur_max_time))) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "resume state: switching off wr (%lu + %lu < %lu)", -+ bfqq->last_wr_start_finish, bfqq->wr_cur_max_time, -+ jiffies); -+ -+ bfqq->wr_coeff = 1; - } -- /* -- * Clear wr_time_left to prevent bfq_bfqq_save_state() from -- * getting confused about the queue's need of a weight-raising -- * period. -- */ -- bic->wr_time_left = 0; -+ /* make sure weight will be updated, however we got here */ -+ bfqq->entity.prio_changed = 1; - } - - static int bfqq_process_refs(struct bfq_queue *bfqq) -@@ -639,7 +713,7 @@ static int bfqq_process_refs(struct bfq_queue *bfqq) - lockdep_assert_held(bfqq->bfqd->queue->queue_lock); - - io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; -- process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st; -+ process_refs = bfqq->ref - io_refs - bfqq->entity.on_st; - BUG_ON(process_refs < 0); - return process_refs; - } -@@ -654,6 +728,7 @@ static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq) - hlist_del_init(&item->burst_list_node); - hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); - bfqd->burst_size = 1; -+ bfqd->burst_parent_entity = bfqq->entity.parent; - } - - /* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */ -@@ -662,6 +737,10 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) - /* Increment burst size to take into account also bfqq */ - bfqd->burst_size++; - -+ bfq_log_bfqq(bfqd, bfqq, "add_to_burst %d", bfqd->burst_size); -+ -+ BUG_ON(bfqd->burst_size > bfqd->bfq_large_burst_thresh); -+ - if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) { - struct bfq_queue *pos, *bfqq_item; - struct hlist_node *n; -@@ -671,15 +750,19 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) - * other to consider this burst as large. - */ - bfqd->large_burst = true; -+ bfq_log_bfqq(bfqd, bfqq, "add_to_burst: large burst started"); - - /* - * We can now mark all queues in the burst list as - * belonging to a large burst. 
- */ - hlist_for_each_entry(bfqq_item, &bfqd->burst_list, -- burst_list_node) -+ burst_list_node) { - bfq_mark_bfqq_in_large_burst(bfqq_item); -+ bfq_log_bfqq(bfqd, bfqq_item, "marked in large burst"); -+ } - bfq_mark_bfqq_in_large_burst(bfqq); -+ bfq_log_bfqq(bfqd, bfqq, "marked in large burst"); - - /* - * From now on, and until the current burst finishes, any -@@ -691,67 +774,79 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) - hlist_for_each_entry_safe(pos, n, &bfqd->burst_list, - burst_list_node) - hlist_del_init(&pos->burst_list_node); -- } else /* burst not yet large: add bfqq to the burst list */ -+ } else /* -+ * Burst not yet large: add bfqq to the burst list. Do -+ * not increment the ref counter for bfqq, because bfqq -+ * is removed from the burst list before freeing bfqq -+ * in put_queue. -+ */ - hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); - } - - /* -- * If many queues happen to become active shortly after each other, then, -- * to help the processes associated to these queues get their job done as -- * soon as possible, it is usually better to not grant either weight-raising -- * or device idling to these queues. In this comment we describe, firstly, -- * the reasons why this fact holds, and, secondly, the next function, which -- * implements the main steps needed to properly mark these queues so that -- * they can then be treated in a different way. -+ * If many queues belonging to the same group happen to be created -+ * shortly after each other, then the processes associated with these -+ * queues have typically a common goal. In particular, bursts of queue -+ * creations are usually caused by services or applications that spawn -+ * many parallel threads/processes. Examples are systemd during boot, -+ * or git grep. To help these processes get their job done as soon as -+ * possible, it is usually better to not grant either weight-raising -+ * or device idling to their queues. - * -- * As for the terminology, we say that a queue becomes active, i.e., -- * switches from idle to backlogged, either when it is created (as a -- * consequence of the arrival of an I/O request), or, if already existing, -- * when a new request for the queue arrives while the queue is idle. -- * Bursts of activations, i.e., activations of different queues occurring -- * shortly after each other, are typically caused by services or applications -- * that spawn or reactivate many parallel threads/processes. Examples are -- * systemd during boot or git grep. -+ * In this comment we describe, firstly, the reasons why this fact -+ * holds, and, secondly, the next function, which implements the main -+ * steps needed to properly mark these queues so that they can then be -+ * treated in a different way. - * -- * These services or applications benefit mostly from a high throughput: -- * the quicker the requests of the activated queues are cumulatively served, -- * the sooner the target job of these queues gets completed. As a consequence, -- * weight-raising any of these queues, which also implies idling the device -- * for it, is almost always counterproductive: in most cases it just lowers -- * throughput. -+ * The above services or applications benefit mostly from a high -+ * throughput: the quicker the requests of the activated queues are -+ * cumulatively served, the sooner the target job of these queues gets -+ * completed. 
As a consequence, weight-raising any of these queues, -+ * which also implies idling the device for it, is almost always -+ * counterproductive. In most cases it just lowers throughput. - * -- * On the other hand, a burst of activations may be also caused by the start -- * of an application that does not consist in a lot of parallel I/O-bound -- * threads. In fact, with a complex application, the burst may be just a -- * consequence of the fact that several processes need to be executed to -- * start-up the application. To start an application as quickly as possible, -- * the best thing to do is to privilege the I/O related to the application -- * with respect to all other I/O. Therefore, the best strategy to start as -- * quickly as possible an application that causes a burst of activations is -- * to weight-raise all the queues activated during the burst. This is the -+ * On the other hand, a burst of queue creations may be caused also by -+ * the start of an application that does not consist of a lot of -+ * parallel I/O-bound threads. In fact, with a complex application, -+ * several short processes may need to be executed to start-up the -+ * application. In this respect, to start an application as quickly as -+ * possible, the best thing to do is in any case to privilege the I/O -+ * related to the application with respect to all other -+ * I/O. Therefore, the best strategy to start as quickly as possible -+ * an application that causes a burst of queue creations is to -+ * weight-raise all the queues created during the burst. This is the - * exact opposite of the best strategy for the other type of bursts. - * -- * In the end, to take the best action for each of the two cases, the two -- * types of bursts need to be distinguished. Fortunately, this seems -- * relatively easy to do, by looking at the sizes of the bursts. In -- * particular, we found a threshold such that bursts with a larger size -- * than that threshold are apparently caused only by services or commands -- * such as systemd or git grep. For brevity, hereafter we call just 'large' -- * these bursts. BFQ *does not* weight-raise queues whose activations occur -- * in a large burst. In addition, for each of these queues BFQ performs or -- * does not perform idling depending on which choice boosts the throughput -- * most. The exact choice depends on the device and request pattern at -+ * In the end, to take the best action for each of the two cases, the -+ * two types of bursts need to be distinguished. Fortunately, this -+ * seems relatively easy, by looking at the sizes of the bursts. In -+ * particular, we found a threshold such that only bursts with a -+ * larger size than that threshold are apparently caused by -+ * services or commands such as systemd or git grep. For brevity, -+ * hereafter we call just 'large' these bursts. BFQ *does not* -+ * weight-raise queues whose creation occurs in a large burst. In -+ * addition, for each of these queues BFQ performs or does not perform -+ * idling depending on which choice boosts the throughput more. The -+ * exact choice depends on the device and request pattern at - * hand. - * -- * Turning back to the next function, it implements all the steps needed -- * to detect the occurrence of a large burst and to properly mark all the -- * queues belonging to it (so that they can then be treated in a different -- * way). This goal is achieved by maintaining a special "burst list" that -- * holds, temporarily, the queues that belong to the burst in progress. 
The -- * list is then used to mark these queues as belonging to a large burst if -- * the burst does become large. The main steps are the following. -+ * Unfortunately, false positives may occur while an interactive task -+ * is starting (e.g., an application is being started). The -+ * consequence is that the queues associated with the task do not -+ * enjoy weight raising as expected. Fortunately these false positives -+ * are very rare. They typically occur if some service happens to -+ * start doing I/O exactly when the interactive task starts. -+ * -+ * Turning back to the next function, it implements all the steps -+ * needed to detect the occurrence of a large burst and to properly -+ * mark all the queues belonging to it (so that they can then be -+ * treated in a different way). This goal is achieved by maintaining a -+ * "burst list" that holds, temporarily, the queues that belong to the -+ * burst in progress. The list is then used to mark these queues as -+ * belonging to a large burst if the burst does become large. The main -+ * steps are the following. - * -- * . when the very first queue is activated, the queue is inserted into the -+ * . when the very first queue is created, the queue is inserted into the - * list (as it could be the first queue in a possible burst) - * - * . if the current burst has not yet become large, and a queue Q that does -@@ -772,13 +867,13 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) - * - * . the device enters a large-burst mode - * -- * . if a queue Q that does not belong to the burst is activated while -+ * . if a queue Q that does not belong to the burst is created while - * the device is in large-burst mode and shortly after the last time - * at which a queue either entered the burst list or was marked as - * belonging to the current large burst, then Q is immediately marked - * as belonging to a large burst. - * -- * . if a queue Q that does not belong to the burst is activated a while -+ * . if a queue Q that does not belong to the burst is created a while - * later, i.e., not shortly after, than the last time at which a queue - * either entered the burst list or was marked as belonging to the - * current large burst, then the current burst is deemed as finished and: -@@ -791,52 +886,44 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) - * in a possible new burst (then the burst list contains just Q - * after this step). - */ --static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq, -- bool idle_for_long_time) -+static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) - { - /* -- * If bfqq happened to be activated in a burst, but has been idle -- * for at least as long as an interactive queue, then we assume -- * that, in the overall I/O initiated in the burst, the I/O -- * associated to bfqq is finished. So bfqq does not need to be -- * treated as a queue belonging to a burst anymore. Accordingly, -- * we reset bfqq's in_large_burst flag if set, and remove bfqq -- * from the burst list if it's there. We do not decrement instead -- * burst_size, because the fact that bfqq does not need to belong -- * to the burst list any more does not invalidate the fact that -- * bfqq may have been activated during the current burst. 
-- */ -- if (idle_for_long_time) { -- hlist_del_init(&bfqq->burst_list_node); -- bfq_clear_bfqq_in_large_burst(bfqq); -- } -- -- /* - * If bfqq is already in the burst list or is part of a large -- * burst, then there is nothing else to do. -+ * burst, or finally has just been split, then there is -+ * nothing else to do. - */ - if (!hlist_unhashed(&bfqq->burst_list_node) || -- bfq_bfqq_in_large_burst(bfqq)) -+ bfq_bfqq_in_large_burst(bfqq) || -+ time_is_after_eq_jiffies(bfqq->split_time + -+ msecs_to_jiffies(10))) - return; - - /* -- * If bfqq's activation happens late enough, then the current -- * burst is finished, and related data structures must be reset. -+ * If bfqq's creation happens late enough, or bfqq belongs to -+ * a different group than the burst group, then the current -+ * burst is finished, and related data structures must be -+ * reset. - * -- * In this respect, consider the special case where bfqq is the very -- * first queue being activated. In this case, last_ins_in_burst is -- * not yet significant when we get here. But it is easy to verify -- * that, whether or not the following condition is true, bfqq will -- * end up being inserted into the burst list. In particular the -- * list will happen to contain only bfqq. And this is exactly what -- * has to happen, as bfqq may be the first queue in a possible -+ * In this respect, consider the special case where bfqq is -+ * the very first queue created after BFQ is selected for this -+ * device. In this case, last_ins_in_burst and -+ * burst_parent_entity are not yet significant when we get -+ * here. But it is easy to verify that, whether or not the -+ * following condition is true, bfqq will end up being -+ * inserted into the burst list. In particular the list will -+ * happen to contain only bfqq. And this is exactly what has -+ * to happen, as bfqq may be the first queue of the first - * burst. - */ - if (time_is_before_jiffies(bfqd->last_ins_in_burst + -- bfqd->bfq_burst_interval)) { -+ bfqd->bfq_burst_interval) || -+ bfqq->entity.parent != bfqd->burst_parent_entity) { - bfqd->large_burst = false; - bfq_reset_burst_list(bfqd, bfqq); -- return; -+ bfq_log_bfqq(bfqd, bfqq, -+ "handle_burst: late activation or different group"); -+ goto end; - } - - /* -@@ -845,8 +932,9 @@ static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq, - * bfqq as belonging to this large burst immediately. - */ - if (bfqd->large_burst) { -+ bfq_log_bfqq(bfqd, bfqq, "handle_burst: marked in burst"); - bfq_mark_bfqq_in_large_burst(bfqq); -- return; -+ goto end; - } - - /* -@@ -855,25 +943,489 @@ static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq, - * queue. Then we add bfqq to the burst. - */ - bfq_add_to_burst(bfqd, bfqq); -+end: -+ /* -+ * At this point, bfqq either has been added to the current -+ * burst or has caused the current burst to terminate and a -+ * possible new burst to start. In particular, in the second -+ * case, bfqq has become the first queue in the possible new -+ * burst. In both cases last_ins_in_burst needs to be moved -+ * forward. 
-+ */ -+ bfqd->last_ins_in_burst = jiffies; -+ -+} -+ -+static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ return entity->budget - entity->service; -+} -+ -+/* -+ * If enough samples have been computed, return the current max budget -+ * stored in bfqd, which is dynamically updated according to the -+ * estimated disk peak rate; otherwise return the default max budget -+ */ -+static int bfq_max_budget(struct bfq_data *bfqd) -+{ -+ if (bfqd->budgets_assigned < bfq_stats_min_budgets) -+ return bfq_default_max_budget; -+ else -+ return bfqd->bfq_max_budget; -+} -+ -+/* -+ * Return min budget, which is a fraction of the current or default -+ * max budget (trying with 1/32) -+ */ -+static int bfq_min_budget(struct bfq_data *bfqd) -+{ -+ if (bfqd->budgets_assigned < bfq_stats_min_budgets) -+ return bfq_default_max_budget / 32; -+ else -+ return bfqd->bfq_max_budget / 32; -+} -+ -+static void bfq_bfqq_expire(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ bool compensate, -+ enum bfqq_expiration reason); -+ -+/* -+ * The next function, invoked after the input queue bfqq switches from -+ * idle to busy, updates the budget of bfqq. The function also tells -+ * whether the in-service queue should be expired, by returning -+ * true. The purpose of expiring the in-service queue is to give bfqq -+ * the chance to possibly preempt the in-service queue, and the reason -+ * for preempting the in-service queue is to achieve one of the two -+ * goals below. -+ * -+ * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has -+ * expired because it has remained idle. In particular, bfqq may have -+ * expired for one of the following two reasons: -+ * -+ * - BFQ_BFQQ_NO_MORE_REQUEST bfqq did not enjoy any device idling and -+ * did not make it to issue a new request before its last request -+ * was served; -+ * -+ * - BFQ_BFQQ_TOO_IDLE bfqq did enjoy device idling, but did not issue -+ * a new request before the expiration of the idling-time. -+ * -+ * Even if bfqq has expired for one of the above reasons, the process -+ * associated with the queue may be however issuing requests greedily, -+ * and thus be sensitive to the bandwidth it receives (bfqq may have -+ * remained idle for other reasons: CPU high load, bfqq not enjoying -+ * idling, I/O throttling somewhere in the path from the process to -+ * the I/O scheduler, ...). But if, after every expiration for one of -+ * the above two reasons, bfqq has to wait for the service of at least -+ * one full budget of another queue before being served again, then -+ * bfqq is likely to get a much lower bandwidth or resource time than -+ * its reserved ones. To address this issue, two countermeasures need -+ * to be taken. -+ * -+ * First, the budget and the timestamps of bfqq need to be updated in -+ * a special way on bfqq reactivation: they need to be updated as if -+ * bfqq did not remain idle and did not expire. In fact, if they are -+ * computed as if bfqq expired and remained idle until reactivation, -+ * then the process associated with bfqq is treated as if, instead of -+ * being greedy, it stopped issuing requests when bfqq remained idle, -+ * and restarts issuing requests only on this reactivation. In other -+ * words, the scheduler does not help the process recover the "service -+ * hole" between bfqq expiration and reactivation. As a consequence, -+ * the process receives a lower bandwidth than its reserved one. 
In -+ * contrast, to recover this hole, the budget must be updated as if -+ * bfqq was not expired at all before this reactivation, i.e., it must -+ * be set to the value of the remaining budget when bfqq was -+ * expired. Along the same line, timestamps need to be assigned the -+ * value they had the last time bfqq was selected for service, i.e., -+ * before last expiration. Thus timestamps need to be back-shifted -+ * with respect to their normal computation (see [1] for more details -+ * on this tricky aspect). -+ * -+ * Secondly, to allow the process to recover the hole, the in-service -+ * queue must be expired too, to give bfqq the chance to preempt it -+ * immediately. In fact, if bfqq has to wait for a full budget of the -+ * in-service queue to be completed, then it may become impossible to -+ * let the process recover the hole, even if the back-shifted -+ * timestamps of bfqq are lower than those of the in-service queue. If -+ * this happens for most or all of the holes, then the process may not -+ * receive its reserved bandwidth. In this respect, it is worth noting -+ * that, being the service of outstanding requests unpreemptible, a -+ * little fraction of the holes may however be unrecoverable, thereby -+ * causing a little loss of bandwidth. -+ * -+ * The last important point is detecting whether bfqq does need this -+ * bandwidth recovery. In this respect, the next function deems the -+ * process associated with bfqq greedy, and thus allows it to recover -+ * the hole, if: 1) the process is waiting for the arrival of a new -+ * request (which implies that bfqq expired for one of the above two -+ * reasons), and 2) such a request has arrived soon. The first -+ * condition is controlled through the flag non_blocking_wait_rq, -+ * while the second through the flag arrived_in_time. If both -+ * conditions hold, then the function computes the budget in the -+ * above-described special way, and signals that the in-service queue -+ * should be expired. Timestamp back-shifting is done later in -+ * __bfq_activate_entity. -+ * -+ * 2. Reduce latency. Even if timestamps are not backshifted to let -+ * the process associated with bfqq recover a service hole, bfqq may -+ * however happen to have, after being (re)activated, a lower finish -+ * timestamp than the in-service queue. That is, the next budget of -+ * bfqq may have to be completed before the one of the in-service -+ * queue. If this is the case, then preempting the in-service queue -+ * allows this goal to be achieved, apart from the unpreemptible, -+ * outstanding requests mentioned above. -+ * -+ * Unfortunately, regardless of which of the above two goals one wants -+ * to achieve, service trees need first to be updated to know whether -+ * the in-service queue must be preempted. To have service trees -+ * correctly updated, the in-service queue must be expired and -+ * rescheduled, and bfqq must be scheduled too. This is one of the -+ * most costly operations (in future versions, the scheduling -+ * mechanism may be re-designed in such a way to make it possible to -+ * know whether preemption is needed without needing to update service -+ * trees). In addition, queue preemptions almost always cause random -+ * I/O, and thus loss of throughput. 
Because of these facts, the next -+ * function adopts the following simple scheme to avoid both costly -+ * operations and too frequent preemptions: it requests the expiration -+ * of the in-service queue (unconditionally) only for queues that need -+ * to recover a hole, or that either are weight-raised or deserve to -+ * be weight-raised. -+ */ -+static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ bool arrived_in_time, -+ bool wr_or_deserves_wr) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) { -+ /* -+ * We do not clear the flag non_blocking_wait_rq here, as -+ * the latter is used in bfq_activate_bfqq to signal -+ * that timestamps need to be back-shifted (and is -+ * cleared right after). -+ */ -+ -+ /* -+ * In next assignment we rely on that either -+ * entity->service or entity->budget are not updated -+ * on expiration if bfqq is empty (see -+ * __bfq_bfqq_recalc_budget). Thus both quantities -+ * remain unchanged after such an expiration, and the -+ * following statement therefore assigns to -+ * entity->budget the remaining budget on such an -+ * expiration. For clarity, entity->service is not -+ * updated on expiration in any case, and, in normal -+ * operation, is reset only when bfqq is selected for -+ * service (see bfq_get_next_queue). -+ */ -+ BUG_ON(bfqq->max_budget < 0); -+ entity->budget = min_t(unsigned long, -+ bfq_bfqq_budget_left(bfqq), -+ bfqq->max_budget); -+ -+ BUG_ON(entity->budget < 0); -+ return true; -+ } -+ -+ BUG_ON(bfqq->max_budget < 0); -+ entity->budget = max_t(unsigned long, bfqq->max_budget, -+ bfq_serv_to_charge(bfqq->next_rq, bfqq)); -+ BUG_ON(entity->budget < 0); -+ -+ bfq_clear_bfqq_non_blocking_wait_rq(bfqq); -+ return wr_or_deserves_wr; -+} -+ -+static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ unsigned int old_wr_coeff, -+ bool wr_or_deserves_wr, -+ bool interactive, -+ bool in_burst, -+ bool soft_rt) -+{ -+ if (old_wr_coeff == 1 && wr_or_deserves_wr) { -+ /* start a weight-raising period */ -+ if (interactive) { -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ } else { -+ bfqq->wr_start_at_switch_to_srt = jiffies; -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff * -+ BFQ_SOFTRT_WEIGHT_FACTOR; -+ bfqq->wr_cur_max_time = -+ bfqd->bfq_wr_rt_max_time; -+ } -+ /* -+ * If needed, further reduce budget to make sure it is -+ * close to bfqq's backlog, so as to reduce the -+ * scheduling-error component due to a too large -+ * budget. Do not care about throughput consequences, -+ * but only about latency. Finally, do not assign a -+ * too small budget either, to avoid increasing -+ * latency by causing too frequent expirations. -+ */ -+ bfqq->entity.budget = min_t(unsigned long, -+ bfqq->entity.budget, -+ 2 * bfq_min_budget(bfqd)); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "wrais starting at %lu, rais_max_time %u", -+ jiffies, -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } else if (old_wr_coeff > 1) { -+ if (interactive) { /* update wr coeff and duration */ -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ } else if (in_burst) { -+ bfqq->wr_coeff = 1; -+ bfq_log_bfqq(bfqd, bfqq, -+ "wrais ending at %lu, rais_max_time %u", -+ jiffies, -+ jiffies_to_msecs(bfqq-> -+ wr_cur_max_time)); -+ } else if (soft_rt) { -+ /* -+ * The application is now or still meeting the -+ * requirements for being deemed soft rt. 
We -+ * can then correctly and safely (re)charge -+ * the weight-raising duration for the -+ * application with the weight-raising -+ * duration for soft rt applications. -+ * -+ * In particular, doing this recharge now, i.e., -+ * before the weight-raising period for the -+ * application finishes, reduces the probability -+ * of the following negative scenario: -+ * 1) the weight of a soft rt application is -+ * raised at startup (as for any newly -+ * created application), -+ * 2) since the application is not interactive, -+ * at a certain time weight-raising is -+ * stopped for the application, -+ * 3) at that time the application happens to -+ * still have pending requests, and hence -+ * is destined to not have a chance to be -+ * deemed soft rt before these requests are -+ * completed (see the comments to the -+ * function bfq_bfqq_softrt_next_start() -+ * for details on soft rt detection), -+ * 4) these pending requests experience a high -+ * latency because the application is not -+ * weight-raised while they are pending. -+ */ -+ if (bfqq->wr_cur_max_time != -+ bfqd->bfq_wr_rt_max_time) { -+ bfqq->wr_start_at_switch_to_srt = -+ bfqq->last_wr_start_finish; -+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); -+ -+ bfqq->wr_cur_max_time = -+ bfqd->bfq_wr_rt_max_time; -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff * -+ BFQ_SOFTRT_WEIGHT_FACTOR; -+ bfq_log_bfqq(bfqd, bfqq, -+ "switching to soft_rt wr"); -+ } else -+ bfq_log_bfqq(bfqd, bfqq, -+ "moving forward soft_rt wr duration"); -+ bfqq->last_wr_start_finish = jiffies; -+ } -+ } -+} -+ -+static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ return bfqq->dispatched == 0 && -+ time_is_before_jiffies( -+ bfqq->budget_timeout + -+ bfqd->bfq_wr_min_idle_time); -+} -+ -+static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ int old_wr_coeff, -+ struct request *rq, -+ bool *interactive) -+{ -+ bool soft_rt, in_burst, wr_or_deserves_wr, -+ bfqq_wants_to_preempt, -+ idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq), -+ /* -+ * See the comments on -+ * bfq_bfqq_update_budg_for_activation for -+ * details on the usage of the next variable. 
-+ */ -+ arrived_in_time = ktime_get_ns() <= -+ RQ_BIC(rq)->ttime.last_end_request + -+ bfqd->bfq_slice_idle * 3; -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "bfq_add_request non-busy: " -+ "jiffies %lu, in_time %d, idle_long %d busyw %d " -+ "wr_coeff %u", -+ jiffies, arrived_in_time, -+ idle_for_long_time, -+ bfq_bfqq_non_blocking_wait_rq(bfqq), -+ old_wr_coeff); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ BUG_ON(bfqq == bfqd->in_service_queue); -+ bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags); -+ -+ /* -+ * bfqq deserves to be weight-raised if: -+ * - it is sync, -+ * - it does not belong to a large burst, -+ * - it has been idle for enough time or is soft real-time, -+ * - is linked to a bfq_io_cq (it is not shared in any sense) -+ */ -+ in_burst = bfq_bfqq_in_large_burst(bfqq); -+ soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && -+ !in_burst && -+ time_is_before_jiffies(bfqq->soft_rt_next_start); -+ *interactive = -+ !in_burst && -+ idle_for_long_time; -+ wr_or_deserves_wr = bfqd->low_latency && -+ (bfqq->wr_coeff > 1 || -+ (bfq_bfqq_sync(bfqq) && -+ bfqq->bic && (*interactive || soft_rt))); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "bfq_add_request: " -+ "in_burst %d, " -+ "soft_rt %d (next %lu), inter %d, bic %p", -+ bfq_bfqq_in_large_burst(bfqq), soft_rt, -+ bfqq->soft_rt_next_start, -+ *interactive, -+ bfqq->bic); -+ -+ /* -+ * Using the last flag, update budget and check whether bfqq -+ * may want to preempt the in-service queue. -+ */ -+ bfqq_wants_to_preempt = -+ bfq_bfqq_update_budg_for_activation(bfqd, bfqq, -+ arrived_in_time, -+ wr_or_deserves_wr); -+ -+ /* -+ * If bfqq happened to be activated in a burst, but has been -+ * idle for much more than an interactive queue, then we -+ * assume that, in the overall I/O initiated in the burst, the -+ * I/O associated with bfqq is finished. So bfqq does not need -+ * to be treated as a queue belonging to a burst -+ * anymore. Accordingly, we reset bfqq's in_large_burst flag -+ * if set, and remove bfqq from the burst list if it's -+ * there. We do not decrement burst_size, because the fact -+ * that bfqq does not need to belong to the burst list any -+ * more does not invalidate the fact that bfqq was created in -+ * a burst. 
-+ */ -+ if (likely(!bfq_bfqq_just_created(bfqq)) && -+ idle_for_long_time && -+ time_is_before_jiffies( -+ bfqq->budget_timeout + -+ msecs_to_jiffies(10000))) { -+ hlist_del_init(&bfqq->burst_list_node); -+ bfq_clear_bfqq_in_large_burst(bfqq); -+ } -+ -+ bfq_clear_bfqq_just_created(bfqq); -+ -+ if (!bfq_bfqq_IO_bound(bfqq)) { -+ if (arrived_in_time) { -+ bfqq->requests_within_timer++; -+ if (bfqq->requests_within_timer >= -+ bfqd->bfq_requests_within_timer) -+ bfq_mark_bfqq_IO_bound(bfqq); -+ } else -+ bfqq->requests_within_timer = 0; -+ bfq_log_bfqq(bfqd, bfqq, "requests in time %d", -+ bfqq->requests_within_timer); -+ } -+ -+ if (bfqd->low_latency) { -+ if (unlikely(time_is_after_jiffies(bfqq->split_time))) -+ /* wraparound */ -+ bfqq->split_time = -+ jiffies - bfqd->bfq_wr_min_idle_time - 1; -+ -+ if (time_is_before_jiffies(bfqq->split_time + -+ bfqd->bfq_wr_min_idle_time)) { -+ bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq, -+ old_wr_coeff, -+ wr_or_deserves_wr, -+ *interactive, -+ in_burst, -+ soft_rt); -+ -+ if (old_wr_coeff != bfqq->wr_coeff) -+ bfqq->entity.prio_changed = 1; -+ } -+ } -+ -+ bfqq->last_idle_bklogged = jiffies; -+ bfqq->service_from_backlogged = 0; -+ bfq_clear_bfqq_softrt_update(bfqq); -+ -+ bfq_add_bfqq_busy(bfqd, bfqq); -+ -+ /* -+ * Expire in-service queue only if preemption may be needed -+ * for guarantees. In this respect, the function -+ * next_queue_may_preempt just checks a simple, necessary -+ * condition, and not a sufficient condition based on -+ * timestamps. In fact, for the latter condition to be -+ * evaluated, timestamps would need first to be updated, and -+ * this operation is quite costly (see the comments on the -+ * function bfq_bfqq_update_budg_for_activation). -+ */ -+ if (bfqd->in_service_queue && bfqq_wants_to_preempt && -+ bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff && -+ next_queue_may_preempt(bfqd)) { -+ struct bfq_queue *in_serv = -+ bfqd->in_service_queue; -+ BUG_ON(in_serv == bfqq); -+ -+ bfq_bfqq_expire(bfqd, bfqd->in_service_queue, -+ false, BFQ_BFQQ_PREEMPTED); -+ } - } - - static void bfq_add_request(struct request *rq) - { - struct bfq_queue *bfqq = RQ_BFQQ(rq); -- struct bfq_entity *entity = &bfqq->entity; - struct bfq_data *bfqd = bfqq->bfqd; - struct request *next_rq, *prev; -- unsigned long old_wr_coeff = bfqq->wr_coeff; -+ unsigned int old_wr_coeff = bfqq->wr_coeff; - bool interactive = false; - -- bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq)); -+ bfq_log_bfqq(bfqd, bfqq, "add_request: size %u %s", -+ blk_rq_sectors(rq), rq_is_sync(rq) ? "S" : "A"); -+ -+ if (bfqq->wr_coeff > 1) /* queue is being weight-raised */ -+ bfq_log_bfqq(bfqd, bfqq, -+ "raising period dur %u/%u msec, old coeff %u, w %d(%d)", -+ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), -+ jiffies_to_msecs(bfqq->wr_cur_max_time), -+ bfqq->wr_coeff, -+ bfqq->entity.weight, bfqq->entity.orig_weight); -+ - bfqq->queued[rq_is_sync(rq)]++; - bfqd->queued++; - - elv_rb_add(&bfqq->sort_list, rq); - - /* -- * Check if this request is a better next-serve candidate. -+ * Check if this request is a better next-to-serve candidate. 
- */ - prev = bfqq->next_rq; - next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); -@@ -886,160 +1438,10 @@ static void bfq_add_request(struct request *rq) - if (prev != bfqq->next_rq) - bfq_pos_tree_add_move(bfqd, bfqq); - -- if (!bfq_bfqq_busy(bfqq)) { -- bool soft_rt, coop_or_in_burst, -- idle_for_long_time = time_is_before_jiffies( -- bfqq->budget_timeout + -- bfqd->bfq_wr_min_idle_time); -- --#ifdef CONFIG_BFQ_GROUP_IOSCHED -- bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, -- rq->cmd_flags); --#endif -- if (bfq_bfqq_sync(bfqq)) { -- bool already_in_burst = -- !hlist_unhashed(&bfqq->burst_list_node) || -- bfq_bfqq_in_large_burst(bfqq); -- bfq_handle_burst(bfqd, bfqq, idle_for_long_time); -- /* -- * If bfqq was not already in the current burst, -- * then, at this point, bfqq either has been -- * added to the current burst or has caused the -- * current burst to terminate. In particular, in -- * the second case, bfqq has become the first -- * queue in a possible new burst. -- * In both cases last_ins_in_burst needs to be -- * moved forward. -- */ -- if (!already_in_burst) -- bfqd->last_ins_in_burst = jiffies; -- } -- -- coop_or_in_burst = bfq_bfqq_in_large_burst(bfqq) || -- bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh; -- soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && -- !coop_or_in_burst && -- time_is_before_jiffies(bfqq->soft_rt_next_start); -- interactive = !coop_or_in_burst && idle_for_long_time; -- entity->budget = max_t(unsigned long, bfqq->max_budget, -- bfq_serv_to_charge(next_rq, bfqq)); -- -- if (!bfq_bfqq_IO_bound(bfqq)) { -- if (time_before(jiffies, -- RQ_BIC(rq)->ttime.last_end_request + -- bfqd->bfq_slice_idle)) { -- bfqq->requests_within_timer++; -- if (bfqq->requests_within_timer >= -- bfqd->bfq_requests_within_timer) -- bfq_mark_bfqq_IO_bound(bfqq); -- } else -- bfqq->requests_within_timer = 0; -- } -- -- if (!bfqd->low_latency) -- goto add_bfqq_busy; -- -- if (bfq_bfqq_just_split(bfqq)) -- goto set_prio_changed; -- -- /* -- * If the queue: -- * - is not being boosted, -- * - has been idle for enough time, -- * - is not a sync queue or is linked to a bfq_io_cq (it is -- * shared "for its nature" or it is not shared and its -- * requests have not been redirected to a shared queue) -- * start a weight-raising period. -- */ -- if (old_wr_coeff == 1 && (interactive || soft_rt) && -- (!bfq_bfqq_sync(bfqq) || bfqq->bic)) { -- bfqq->wr_coeff = bfqd->bfq_wr_coeff; -- if (interactive) -- bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -- else -- bfqq->wr_cur_max_time = -- bfqd->bfq_wr_rt_max_time; -- bfq_log_bfqq(bfqd, bfqq, -- "wrais starting at %lu, rais_max_time %u", -- jiffies, -- jiffies_to_msecs(bfqq->wr_cur_max_time)); -- } else if (old_wr_coeff > 1) { -- if (interactive) -- bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -- else if (coop_or_in_burst || -- (bfqq->wr_cur_max_time == -- bfqd->bfq_wr_rt_max_time && -- !soft_rt)) { -- bfqq->wr_coeff = 1; -- bfq_log_bfqq(bfqd, bfqq, -- "wrais ending at %lu, rais_max_time %u", -- jiffies, -- jiffies_to_msecs(bfqq-> -- wr_cur_max_time)); -- } else if (time_before( -- bfqq->last_wr_start_finish + -- bfqq->wr_cur_max_time, -- jiffies + -- bfqd->bfq_wr_rt_max_time) && -- soft_rt) { -- /* -- * -- * The remaining weight-raising time is lower -- * than bfqd->bfq_wr_rt_max_time, which means -- * that the application is enjoying weight -- * raising either because deemed soft-rt in -- * the near past, or because deemed interactive -- * a long ago. 
-- * In both cases, resetting now the current -- * remaining weight-raising time for the -- * application to the weight-raising duration -- * for soft rt applications would not cause any -- * latency increase for the application (as the -- * new duration would be higher than the -- * remaining time). -- * -- * In addition, the application is now meeting -- * the requirements for being deemed soft rt. -- * In the end we can correctly and safely -- * (re)charge the weight-raising duration for -- * the application with the weight-raising -- * duration for soft rt applications. -- * -- * In particular, doing this recharge now, i.e., -- * before the weight-raising period for the -- * application finishes, reduces the probability -- * of the following negative scenario: -- * 1) the weight of a soft rt application is -- * raised at startup (as for any newly -- * created application), -- * 2) since the application is not interactive, -- * at a certain time weight-raising is -- * stopped for the application, -- * 3) at that time the application happens to -- * still have pending requests, and hence -- * is destined to not have a chance to be -- * deemed soft rt before these requests are -- * completed (see the comments to the -- * function bfq_bfqq_softrt_next_start() -- * for details on soft rt detection), -- * 4) these pending requests experience a high -- * latency because the application is not -- * weight-raised while they are pending. -- */ -- bfqq->last_wr_start_finish = jiffies; -- bfqq->wr_cur_max_time = -- bfqd->bfq_wr_rt_max_time; -- } -- } --set_prio_changed: -- if (old_wr_coeff != bfqq->wr_coeff) -- entity->prio_changed = 1; --add_bfqq_busy: -- bfqq->last_idle_bklogged = jiffies; -- bfqq->service_from_backlogged = 0; -- bfq_clear_bfqq_softrt_update(bfqq); -- bfq_add_bfqq_busy(bfqd, bfqq); -- } else { -+ if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */ -+ bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff, -+ rq, &interactive); -+ else { - if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) && - time_is_before_jiffies( - bfqq->last_wr_start_finish + -@@ -1048,21 +1450,48 @@ static void bfq_add_request(struct request *rq) - bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); - - bfqd->wr_busy_queues++; -- entity->prio_changed = 1; -+ bfqq->entity.prio_changed = 1; - bfq_log_bfqq(bfqd, bfqq, -- "non-idle wrais starting at %lu, rais_max_time %u", -- jiffies, -- jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ "non-idle wrais starting, " -+ "wr_max_time %u wr_busy %d", -+ jiffies_to_msecs(bfqq->wr_cur_max_time), -+ bfqd->wr_busy_queues); - } - if (prev != bfqq->next_rq) - bfq_updated_next_req(bfqd, bfqq); - } - -- if (bfqd->low_latency && -- (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive)) -- bfqq->last_wr_start_finish = jiffies; --} -- -+ /* -+ * Assign jiffies to last_wr_start_finish in the following -+ * cases: -+ * -+ * . if bfqq is not going to be weight-raised, because, for -+ * non weight-raised queues, last_wr_start_finish stores the -+ * arrival time of the last request; as of now, this piece -+ * of information is used only for deciding whether to -+ * weight-raise async queues -+ * -+ * . if bfqq is not weight-raised, because, if bfqq is now -+ * switching to weight-raised, then last_wr_start_finish -+ * stores the time when weight-raising starts -+ * -+ * . 
if bfqq is interactive, because, regardless of whether -+ * bfqq is currently weight-raised, the weight-raising -+ * period must start or restart (this case is considered -+ * separately because it is not detected by the above -+ * conditions, if bfqq is already weight-raised) -+ * -+ * last_wr_start_finish has to be updated also if bfqq is soft -+ * real-time, because the weight-raising period is constantly -+ * restarted on idle-to-busy transitions for these queues, but -+ * this is already done in bfq_bfqq_handle_idle_busy_switch if -+ * needed. -+ */ -+ if (bfqd->low_latency && -+ (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive)) -+ bfqq->last_wr_start_finish = jiffies; -+} -+ - static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, - struct bio *bio) - { -@@ -1074,21 +1503,31 @@ static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, - if (!bic) - return NULL; - -- bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); -+ bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); - if (bfqq) - return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); - - return NULL; - } - -+static sector_t get_sdist(sector_t last_pos, struct request *rq) -+{ -+ sector_t sdist = 0; -+ -+ if (last_pos) { -+ if (last_pos < blk_rq_pos(rq)) -+ sdist = blk_rq_pos(rq) - last_pos; -+ else -+ sdist = last_pos - blk_rq_pos(rq); -+ } -+ -+ return sdist; -+} -+ - static void bfq_activate_request(struct request_queue *q, struct request *rq) - { - struct bfq_data *bfqd = q->elevator->elevator_data; -- - bfqd->rq_in_driver++; -- bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); -- bfq_log(bfqd, "activate_request: new bfqd->last_position %llu", -- (unsigned long long) bfqd->last_position); - } - - static void bfq_deactivate_request(struct request_queue *q, struct request *rq) -@@ -1105,6 +1544,9 @@ static void bfq_remove_request(struct request *rq) - struct bfq_data *bfqd = bfqq->bfqd; - const int sync = rq_is_sync(rq); - -+ BUG_ON(bfqq->entity.service > bfqq->entity.budget && -+ bfqq == bfqd->in_service_queue); -+ - if (bfqq->next_rq == rq) { - bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); - bfq_updated_next_req(bfqd, bfqq); -@@ -1118,8 +1560,29 @@ static void bfq_remove_request(struct request *rq) - elv_rb_del(&bfqq->sort_list, rq); - - if (RB_EMPTY_ROOT(&bfqq->sort_list)) { -- if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) -- bfq_del_bfqq_busy(bfqd, bfqq, 1); -+ bfqq->next_rq = NULL; -+ -+ BUG_ON(bfqq->entity.budget < 0); -+ -+ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) { -+ BUG_ON(bfqq->ref < 2); /* referred by rq and on tree */ -+ bfq_del_bfqq_busy(bfqd, bfqq, false); -+ /* -+ * bfqq emptied. In normal operation, when -+ * bfqq is empty, bfqq->entity.service and -+ * bfqq->entity.budget must contain, -+ * respectively, the service received and the -+ * budget used last time bfqq emptied. These -+ * facts do not hold in this case, as at least -+ * this last removal occurred while bfqq is -+ * not in service. To avoid inconsistencies, -+ * reset both bfqq->entity.service and -+ * bfqq->entity.budget, if bfqq has still a -+ * process that may issue I/O requests to it. -+ */ -+ bfqq->entity.budget = bfqq->entity.service = 0; -+ } -+ - /* - * Remove queue from request-position tree as it is empty. 
- */ -@@ -1133,19 +1596,17 @@ static void bfq_remove_request(struct request *rq) - BUG_ON(bfqq->meta_pending == 0); - bfqq->meta_pending--; - } --#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); --#endif - } - --static int bfq_merge(struct request_queue *q, struct request **req, -- struct bio *bio) -+static enum elv_merge bfq_merge(struct request_queue *q, struct request **req, -+ struct bio *bio) - { - struct bfq_data *bfqd = q->elevator->elevator_data; - struct request *__rq; - - __rq = bfq_find_rq_fmerge(bfqd, bio); -- if (__rq && elv_rq_merge_ok(__rq, bio)) { -+ if (__rq && elv_bio_merge_ok(__rq, bio)) { - *req = __rq; - return ELEVATOR_FRONT_MERGE; - } -@@ -1154,7 +1615,7 @@ static int bfq_merge(struct request_queue *q, struct request **req, - } - - static void bfq_merged_request(struct request_queue *q, struct request *req, -- int type) -+ enum elv_merge type) - { - if (type == ELEVATOR_FRONT_MERGE && - rb_prev(&req->rb_node) && -@@ -1190,7 +1651,7 @@ static void bfq_merged_request(struct request_queue *q, struct request *req, - static void bfq_bio_merged(struct request_queue *q, struct request *req, - struct bio *bio) - { -- bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio->bi_rw); -+ bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio->bi_opf); - } - #endif - -@@ -1210,7 +1671,7 @@ static void bfq_merged_requests(struct request_queue *q, struct request *rq, - */ - if (bfqq == next_bfqq && - !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && -- time_before(next->fifo_time, rq->fifo_time)) { -+ next->fifo_time < rq->fifo_time) { - list_del_init(&rq->queuelist); - list_replace_init(&next->queuelist, &rq->queuelist); - rq->fifo_time = next->fifo_time; -@@ -1220,21 +1681,30 @@ static void bfq_merged_requests(struct request_queue *q, struct request *rq, - bfqq->next_rq = rq; - - bfq_remove_request(next); --#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); --#endif - } - - /* Must be called with bfqq != NULL */ - static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) - { - BUG_ON(!bfqq); -+ - if (bfq_bfqq_busy(bfqq)) - bfqq->bfqd->wr_busy_queues--; - bfqq->wr_coeff = 1; - bfqq->wr_cur_max_time = 0; -- /* Trigger a weight change on the next activation of the queue */ -+ bfqq->last_wr_start_finish = jiffies; -+ /* -+ * Trigger a weight change on the next invocation of -+ * __bfq_entity_update_weight_prio. -+ */ - bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "end_wr: wrais ending at %lu, rais_max_time %u", -+ bfqq->last_wr_start_finish, -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "end_wr: wr_busy %d", -+ bfqq->bfqd->wr_busy_queues); - } - - static void bfq_end_wr_async_queues(struct bfq_data *bfqd, -@@ -1277,7 +1747,7 @@ static int bfq_rq_close_to_sector(void *io_struct, bool request, - sector_t sector) - { - return abs(bfq_io_struct_pos(io_struct, request) - sector) <= -- BFQQ_SEEK_THR; -+ BFQQ_CLOSE_THR; - } - - static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, -@@ -1399,7 +1869,7 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) - * throughput. 
- */ - bfqq->new_bfqq = new_bfqq; -- atomic_add(process_refs, &new_bfqq->ref); -+ new_bfqq->ref += process_refs; - return new_bfqq; - } - -@@ -1430,9 +1900,23 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, - } - - /* -- * Attempt to schedule a merge of bfqq with the currently in-service queue -- * or with a close queue among the scheduled queues. -- * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue -+ * If this function returns true, then bfqq cannot be merged. The idea -+ * is that true cooperation happens very early after processes start -+ * to do I/O. Usually, late cooperations are just accidental false -+ * positives. In case bfqq is weight-raised, such false positives -+ * would evidently degrade latency guarantees for bfqq. -+ */ -+static bool wr_from_too_long(struct bfq_queue *bfqq) -+{ -+ return bfqq->wr_coeff > 1 && -+ time_is_before_jiffies(bfqq->last_wr_start_finish + -+ msecs_to_jiffies(100)); -+} -+ -+/* -+ * Attempt to schedule a merge of bfqq with the currently in-service -+ * queue or with a close queue among the scheduled queues. Return -+ * NULL if no merge was scheduled, a pointer to the shared bfq_queue - * structure otherwise. - * - * The OOM queue is not allowed to participate to cooperation: in fact, since -@@ -1441,6 +1925,18 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, - * handle merging with the OOM queue would be quite complex and expensive - * to maintain. Besides, in such a critical condition as an out of memory, - * the benefits of queue merging may be little relevant, or even negligible. -+ * -+ * Weight-raised queues can be merged only if their weight-raising -+ * period has just started. In fact cooperating processes are usually -+ * started together. Thus, with this filter we avoid false positives -+ * that would jeopardize low-latency guarantees. -+ * -+ * WARNING: queue merging may impair fairness among non-weight raised -+ * queues, for at least two reasons: 1) the original weight of a -+ * merged queue may change during the merged state, 2) even being the -+ * weight the same, a merged queue may be bloated with many more -+ * requests than the ones produced by its originally-associated -+ * process. - */ - static struct bfq_queue * - bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, -@@ -1450,16 +1946,32 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, - - if (bfqq->new_bfqq) - return bfqq->new_bfqq; -- if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) -+ -+ if (io_struct && wr_from_too_long(bfqq) && -+ likely(bfqq != &bfqd->oom_bfqq)) -+ bfq_log_bfqq(bfqd, bfqq, -+ "would have looked for coop, but bfq%d wr", -+ bfqq->pid); -+ -+ if (!io_struct || -+ wr_from_too_long(bfqq) || -+ unlikely(bfqq == &bfqd->oom_bfqq)) - return NULL; -- /* If device has only one backlogged bfq_queue, don't search. */ -+ -+ /* If there is only one backlogged queue, don't search. 
*/ - if (bfqd->busy_queues == 1) - return NULL; - - in_service_bfqq = bfqd->in_service_queue; - -+ if (in_service_bfqq && in_service_bfqq != bfqq && -+ bfqd->in_service_bic && wr_from_too_long(in_service_bfqq) -+ && likely(in_service_bfqq == &bfqd->oom_bfqq)) -+ bfq_log_bfqq(bfqd, bfqq, -+ "would have tried merge with in-service-queue, but wr"); -+ - if (!in_service_bfqq || in_service_bfqq == bfqq || -- !bfqd->in_service_bic || -+ !bfqd->in_service_bic || wr_from_too_long(in_service_bfqq) || - unlikely(in_service_bfqq == &bfqd->oom_bfqq)) - goto check_scheduled; - -@@ -1481,7 +1993,15 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, - - BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent); - -- if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) && -+ if (new_bfqq && wr_from_too_long(new_bfqq) && -+ likely(new_bfqq != &bfqd->oom_bfqq) && -+ bfq_may_be_close_cooperator(bfqq, new_bfqq)) -+ bfq_log_bfqq(bfqd, bfqq, -+ "would have merged with bfq%d, but wr", -+ new_bfqq->pid); -+ -+ if (new_bfqq && !wr_from_too_long(new_bfqq) && -+ likely(new_bfqq != &bfqd->oom_bfqq) && - bfq_may_be_close_cooperator(bfqq, new_bfqq)) - return bfq_setup_merge(bfqq, new_bfqq); - -@@ -1490,53 +2010,25 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, - - static void bfq_bfqq_save_state(struct bfq_queue *bfqq) - { -+ struct bfq_io_cq *bic = bfqq->bic; -+ - /* - * If !bfqq->bic, the queue is already shared or its requests - * have already been redirected to a shared queue; both idle window - * and weight raising state have already been saved. Do nothing. - */ -- if (!bfqq->bic) -+ if (!bic) - return; -- if (bfqq->bic->wr_time_left) -- /* -- * This is the queue of a just-started process, and would -- * deserve weight raising: we set wr_time_left to the full -- * weight-raising duration to trigger weight-raising when -- * and if the queue is split and the first request of the -- * queue is enqueued. -- */ -- bfqq->bic->wr_time_left = bfq_wr_duration(bfqq->bfqd); -- else if (bfqq->wr_coeff > 1) { -- unsigned long wr_duration = -- jiffies - bfqq->last_wr_start_finish; -- /* -- * It may happen that a queue's weight raising period lasts -- * longer than its wr_cur_max_time, as weight raising is -- * handled only when a request is enqueued or dispatched (it -- * does not use any timer). If the weight raising period is -- * about to end, don't save it. -- */ -- if (bfqq->wr_cur_max_time <= wr_duration) -- bfqq->bic->wr_time_left = 0; -- else -- bfqq->bic->wr_time_left = -- bfqq->wr_cur_max_time - wr_duration; -- /* -- * The bfq_queue is becoming shared or the requests of the -- * process owning the queue are being redirected to a shared -- * queue. Stop the weight raising period of the queue, as in -- * both cases it should not be owned by an interactive or -- * soft real-time application. 
-- */ -- bfq_bfqq_end_wr(bfqq); -- } else -- bfqq->bic->wr_time_left = 0; -- bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); -- bfqq->bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); -- bfqq->bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); -- bfqq->bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); -- bfqq->bic->cooperations++; -- bfqq->bic->failed_cooperations = 0; -+ -+ bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); -+ bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); -+ bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); -+ bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); -+ bic->saved_wr_coeff = bfqq->wr_coeff; -+ bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; -+ bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; -+ bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; -+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); - } - - static void bfq_get_bic_reference(struct bfq_queue *bfqq) -@@ -1561,6 +2053,41 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, - if (bfq_bfqq_IO_bound(bfqq)) - bfq_mark_bfqq_IO_bound(new_bfqq); - bfq_clear_bfqq_IO_bound(bfqq); -+ -+ /* -+ * If bfqq is weight-raised, then let new_bfqq inherit -+ * weight-raising. To reduce false positives, neglect the case -+ * where bfqq has just been created, but has not yet made it -+ * to be weight-raised (which may happen because EQM may merge -+ * bfqq even before bfq_add_request is executed for the first -+ * time for bfqq). Handling this case would however be very -+ * easy, thanks to the flag just_created. -+ */ -+ if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) { -+ new_bfqq->wr_coeff = bfqq->wr_coeff; -+ new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time; -+ new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish; -+ new_bfqq->wr_start_at_switch_to_srt = -+ bfqq->wr_start_at_switch_to_srt; -+ if (bfq_bfqq_busy(new_bfqq)) -+ bfqd->wr_busy_queues++; -+ new_bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqd, new_bfqq, -+ "wr start after merge with %d, rais_max_time %u", -+ bfqq->pid, -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } -+ -+ if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */ -+ bfqq->wr_coeff = 1; -+ bfqq->entity.prio_changed = 1; -+ if (bfq_bfqq_busy(bfqq)) -+ bfqd->wr_busy_queues--; -+ } -+ -+ bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d", -+ bfqd->wr_busy_queues); -+ - /* - * Grab a reference to the bic, to prevent it from being destroyed - * before being possibly touched by a bfq_split_bfqq(). 
-@@ -1584,33 +2111,23 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, - */ - new_bfqq->bic = NULL; - bfqq->bic = NULL; -+ /* release process reference to bfqq */ - bfq_put_queue(bfqq); - } - --static void bfq_bfqq_increase_failed_cooperations(struct bfq_queue *bfqq) --{ -- struct bfq_io_cq *bic = bfqq->bic; -- struct bfq_data *bfqd = bfqq->bfqd; -- -- if (bic && bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh) { -- bic->failed_cooperations++; -- if (bic->failed_cooperations >= bfqd->bfq_failed_cooperations) -- bic->cooperations = 0; -- } --} -- --static int bfq_allow_merge(struct request_queue *q, struct request *rq, -- struct bio *bio) -+static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq, -+ struct bio *bio) - { - struct bfq_data *bfqd = q->elevator->elevator_data; -+ bool is_sync = op_is_sync(bio->bi_opf); - struct bfq_io_cq *bic; - struct bfq_queue *bfqq, *new_bfqq; - - /* - * Disallow merge of a sync bio into an async request. - */ -- if (bfq_bio_sync(bio) && !rq_is_sync(rq)) -- return 0; -+ if (is_sync && !rq_is_sync(rq)) -+ return false; - - /* - * Lookup the bfqq that this bio will be queued with. Allow -@@ -1619,9 +2136,9 @@ static int bfq_allow_merge(struct request_queue *q, struct request *rq, - */ - bic = bfq_bic_lookup(bfqd, current->io_context); - if (!bic) -- return 0; -+ return false; - -- bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); -+ bfqq = bic_to_bfqq(bic, is_sync); - /* - * We take advantage of this function to perform an early merge - * of the queues of possible cooperating processes. -@@ -1636,30 +2153,111 @@ static int bfq_allow_merge(struct request_queue *q, struct request *rq, - * to decide whether bio and rq can be merged. - */ - bfqq = new_bfqq; -- } else -- bfq_bfqq_increase_failed_cooperations(bfqq); -+ } - } - - return bfqq == RQ_BFQQ(rq); - } - -+static int bfq_allow_rq_merge(struct request_queue *q, struct request *rq, -+ struct request *next) -+{ -+ return RQ_BFQQ(rq) == RQ_BFQQ(next); -+} -+ -+/* -+ * Set the maximum time for the in-service queue to consume its -+ * budget. This prevents seeky processes from lowering the throughput. -+ * In practice, a time-slice service scheme is used with seeky -+ * processes. 
-+ */ -+static void bfq_set_budget_timeout(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ unsigned int timeout_coeff; -+ -+ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) -+ timeout_coeff = 1; -+ else -+ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; -+ -+ bfqd->last_budget_start = ktime_get(); -+ -+ bfqq->budget_timeout = jiffies + -+ bfqd->bfq_timeout * timeout_coeff; -+ -+ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", -+ jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff)); -+} -+ - static void __bfq_set_in_service_queue(struct bfq_data *bfqd, - struct bfq_queue *bfqq) - { - if (bfqq) { --#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); --#endif - bfq_mark_bfqq_must_alloc(bfqq); -- bfq_mark_bfqq_budget_new(bfqq); - bfq_clear_bfqq_fifo_expire(bfqq); - - bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; - -+ BUG_ON(bfqq == bfqd->in_service_queue); -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ -+ if (time_is_before_jiffies(bfqq->last_wr_start_finish) && -+ bfqq->wr_coeff > 1 && -+ bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && -+ time_is_before_jiffies(bfqq->budget_timeout)) { -+ /* -+ * For soft real-time queues, move the start -+ * of the weight-raising period forward by the -+ * time the queue has not received any -+ * service. Otherwise, a relatively long -+ * service delay is likely to cause the -+ * weight-raising period of the queue to end, -+ * because of the short duration of the -+ * weight-raising period of a soft real-time -+ * queue. It is worth noting that this move -+ * is not so dangerous for the other queues, -+ * because soft real-time queues are not -+ * greedy. -+ * -+ * To not add a further variable, we use the -+ * overloaded field budget_timeout to -+ * determine for how long the queue has not -+ * received service, i.e., how much time has -+ * elapsed since the queue expired. However, -+ * this is a little imprecise, because -+ * budget_timeout is set to jiffies if bfqq -+ * not only expires, but also remains with no -+ * request. 
-+ */ -+ if (time_after(bfqq->budget_timeout, -+ bfqq->last_wr_start_finish)) -+ bfqq->last_wr_start_finish += -+ jiffies - bfqq->budget_timeout; -+ else -+ bfqq->last_wr_start_finish = jiffies; -+ -+ if (time_is_after_jiffies(bfqq->last_wr_start_finish)) { -+ pr_crit( -+ "BFQ WARNING:last %lu budget %lu jiffies %lu", -+ bfqq->last_wr_start_finish, -+ bfqq->budget_timeout, -+ jiffies); -+ pr_crit("diff %lu", jiffies - -+ max_t(unsigned long, -+ bfqq->last_wr_start_finish, -+ bfqq->budget_timeout)); -+ bfqq->last_wr_start_finish = jiffies; -+ } -+ } -+ -+ bfq_set_budget_timeout(bfqd, bfqq); - bfq_log_bfqq(bfqd, bfqq, - "set_in_service_queue, cur-budget = %d", - bfqq->entity.budget); -- } -+ } else -+ bfq_log(bfqd, "set_in_service_queue: NULL"); - - bfqd->in_service_queue = bfqq; - } -@@ -1675,36 +2273,11 @@ static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) - return bfqq; - } - --/* -- * If enough samples have been computed, return the current max budget -- * stored in bfqd, which is dynamically updated according to the -- * estimated disk peak rate; otherwise return the default max budget -- */ --static int bfq_max_budget(struct bfq_data *bfqd) --{ -- if (bfqd->budgets_assigned < bfq_stats_min_budgets) -- return bfq_default_max_budget; -- else -- return bfqd->bfq_max_budget; --} -- --/* -- * Return min budget, which is a fraction of the current or default -- * max budget (trying with 1/32) -- */ --static int bfq_min_budget(struct bfq_data *bfqd) --{ -- if (bfqd->budgets_assigned < bfq_stats_min_budgets) -- return bfq_default_max_budget / 32; -- else -- return bfqd->bfq_max_budget / 32; --} -- - static void bfq_arm_slice_timer(struct bfq_data *bfqd) - { - struct bfq_queue *bfqq = bfqd->in_service_queue; - struct bfq_io_cq *bic; -- unsigned long sl; -+ u32 sl; - - BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); - -@@ -1728,119 +2301,366 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd) - sl = bfqd->bfq_slice_idle; - /* - * Unless the queue is being weight-raised or the scenario is -- * asymmetric, grant only minimum idle time if the queue either -- * has been seeky for long enough or has already proved to be -- * constantly seeky. -+ * asymmetric, grant only minimum idle time if the queue -+ * is seeky. A long idling is preserved for a weight-raised -+ * queue, or, more in general, in an asymemtric scenario, -+ * because a long idling is needed for guaranteeing to a queue -+ * its reserved share of the throughput (in particular, it is -+ * needed if the queue has a higher weight than some other -+ * queue). 
- */ -- if (bfq_sample_valid(bfqq->seek_samples) && -- ((BFQQ_SEEKY(bfqq) && bfqq->entity.service > -- bfq_max_budget(bfqq->bfqd) / 8) || -- bfq_bfqq_constantly_seeky(bfqq)) && bfqq->wr_coeff == 1 && -+ if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 && - bfq_symmetric_scenario(bfqd)) -- sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT)); -- else if (bfqq->wr_coeff > 1) -- sl = sl * 3; -+ sl = min_t(u32, sl, BFQ_MIN_TT); -+ - bfqd->last_idling_start = ktime_get(); -- mod_timer(&bfqd->idle_slice_timer, jiffies + sl); --#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl), -+ HRTIMER_MODE_REL); - bfqg_stats_set_start_idle_time(bfqq_group(bfqq)); --#endif -- bfq_log(bfqd, "arm idle: %u/%u ms", -- jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle)); -+ bfq_log(bfqd, "arm idle: %ld/%ld ms", -+ sl / NSEC_PER_MSEC, bfqd->bfq_slice_idle / NSEC_PER_MSEC); - } - - /* -- * Set the maximum time for the in-service queue to consume its -- * budget. This prevents seeky processes from lowering the disk -- * throughput (always guaranteed with a time slice scheme as in CFQ). -+ * In autotuning mode, max_budget is dynamically recomputed as the -+ * amount of sectors transferred in timeout at the estimated peak -+ * rate. This enables BFQ to utilize a full timeslice with a full -+ * budget, even if the in-service queue is served at peak rate. And -+ * this maximises throughput with sequential workloads. - */ --static void bfq_set_budget_timeout(struct bfq_data *bfqd) -+static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd) - { -- struct bfq_queue *bfqq = bfqd->in_service_queue; -- unsigned int timeout_coeff; -+ return (u64)bfqd->peak_rate * USEC_PER_MSEC * -+ jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT; -+} - -- if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) -- timeout_coeff = 1; -- else -- timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; -+/* -+ * Update parameters related to throughput and responsiveness, as a -+ * function of the estimated peak rate. See comments on -+ * bfq_calc_max_budget(), and on T_slow and T_fast arrays. -+ */ -+static void update_thr_responsiveness_params(struct bfq_data *bfqd) -+{ -+ int dev_type = blk_queue_nonrot(bfqd->queue); -+ -+ if (bfqd->bfq_user_max_budget == 0) { -+ bfqd->bfq_max_budget = -+ bfq_calc_max_budget(bfqd); -+ BUG_ON(bfqd->bfq_max_budget < 0); -+ bfq_log(bfqd, "new max_budget = %d", -+ bfqd->bfq_max_budget); -+ } - -- bfqd->last_budget_start = ktime_get(); -+ if (bfqd->device_speed == BFQ_BFQD_FAST && -+ bfqd->peak_rate < device_speed_thresh[dev_type]) { -+ bfqd->device_speed = BFQ_BFQD_SLOW; -+ bfqd->RT_prod = R_slow[dev_type] * -+ T_slow[dev_type]; -+ } else if (bfqd->device_speed == BFQ_BFQD_SLOW && -+ bfqd->peak_rate > device_speed_thresh[dev_type]) { -+ bfqd->device_speed = BFQ_BFQD_FAST; -+ bfqd->RT_prod = R_fast[dev_type] * -+ T_fast[dev_type]; -+ } - -- bfq_clear_bfqq_budget_new(bfqq); -- bfqq->budget_timeout = jiffies + -- bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff; -+ bfq_log(bfqd, -+"dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu setcs/sec", -+ dev_type == 0 ? "ROT" : "NONROT", -+ bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW", -+ bfqd->device_speed == BFQ_BFQD_FAST ? 
-+ (USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT : -+ (USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT, -+ (USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>> -+ BFQ_RATE_SHIFT); -+} - -- bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", -- jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * -- timeout_coeff)); -+static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq) -+{ -+ if (rq != NULL) { /* new rq dispatch now, reset accordingly */ -+ bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns() ; -+ bfqd->peak_rate_samples = 1; -+ bfqd->sequential_samples = 0; -+ bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size = -+ blk_rq_sectors(rq); -+ } else /* no new rq dispatched, just reset the number of samples */ -+ bfqd->peak_rate_samples = 0; /* full re-init on next disp. */ -+ -+ bfq_log(bfqd, -+ "reset_rate_computation at end, sample %u/%u tot_sects %llu", -+ bfqd->peak_rate_samples, bfqd->sequential_samples, -+ bfqd->tot_sectors_dispatched); - } - --/* -- * Move request from internal lists to the request queue dispatch list. -- */ --static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) -+static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) - { -- struct bfq_data *bfqd = q->elevator->elevator_data; -- struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ u32 rate, weight, divisor; - - /* -- * For consistency, the next instruction should have been executed -- * after removing the request from the queue and dispatching it. -- * We execute instead this instruction before bfq_remove_request() -- * (and hence introduce a temporary inconsistency), for efficiency. -- * In fact, in a forced_dispatch, this prevents two counters related -- * to bfqq->dispatched to risk to be uselessly decremented if bfqq -- * is not in service, and then to be incremented again after -- * incrementing bfqq->dispatched. -+ * For the convergence property to hold (see comments on -+ * bfq_update_peak_rate()) and for the assessment to be -+ * reliable, a minimum number of samples must be present, and -+ * a minimum amount of time must have elapsed. If not so, do -+ * not compute new rate. Just reset parameters, to get ready -+ * for a new evaluation attempt. - */ -- bfqq->dispatched++; -- bfq_remove_request(rq); -- elv_dispatch_sort(q, rq); -+ if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES || -+ bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL) { -+ bfq_log(bfqd, -+ "update_rate_reset: only resetting, delta_first %lluus samples %d", -+ bfqd->delta_from_first>>10, bfqd->peak_rate_samples); -+ goto reset_computation; -+ } - -- if (bfq_bfqq_sync(bfqq)) -- bfqd->sync_flight++; --#ifdef CONFIG_BFQ_GROUP_IOSCHED -- bfqg_stats_update_dispatch(bfqq_group(bfqq), blk_rq_bytes(rq), -- rq->cmd_flags); --#endif -+ /* -+ * If a new request completion has occurred after last -+ * dispatch, then, to approximate the rate at which requests -+ * have been served by the device, it is more precise to -+ * extend the observation interval to the last completion. -+ */ -+ bfqd->delta_from_first = -+ max_t(u64, bfqd->delta_from_first, -+ bfqd->last_completion - bfqd->first_dispatch); -+ -+ BUG_ON(bfqd->delta_from_first == 0); -+ /* -+ * Rate computed in sects/usec, and not sects/nsec, for -+ * precision issues. 
-+ */ -+ rate = div64_ul(bfqd->tot_sectors_dispatched<delta_from_first, NSEC_PER_USEC)); -+ -+ bfq_log(bfqd, -+"update_rate_reset: tot_sects %llu delta_first %lluus rate %llu sects/s (%d)", -+ bfqd->tot_sectors_dispatched, bfqd->delta_from_first>>10, -+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), -+ rate > 20< 20M sectors/sec) -+ */ -+ if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 && -+ rate <= bfqd->peak_rate) || -+ rate > 20<peak_rate_samples, bfqd->sequential_samples, -+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), -+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); -+ goto reset_computation; -+ } else { -+ bfq_log(bfqd, -+ "update_rate_reset: do update, samples %u/%u rate/peak %llu/%llu", -+ bfqd->peak_rate_samples, bfqd->sequential_samples, -+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), -+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); -+ } -+ -+ /* -+ * We have to update the peak rate, at last! To this purpose, -+ * we use a low-pass filter. We compute the smoothing constant -+ * of the filter as a function of the 'weight' of the new -+ * measured rate. -+ * -+ * As can be seen in next formulas, we define this weight as a -+ * quantity proportional to how sequential the workload is, -+ * and to how long the observation time interval is. -+ * -+ * The weight runs from 0 to 8. The maximum value of the -+ * weight, 8, yields the minimum value for the smoothing -+ * constant. At this minimum value for the smoothing constant, -+ * the measured rate contributes for half of the next value of -+ * the estimated peak rate. -+ * -+ * So, the first step is to compute the weight as a function -+ * of how sequential the workload is. Note that the weight -+ * cannot reach 9, because bfqd->sequential_samples cannot -+ * become equal to bfqd->peak_rate_samples, which, in its -+ * turn, holds true because bfqd->sequential_samples is not -+ * incremented for the first sample. -+ */ -+ weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples; -+ -+ /* -+ * Second step: further refine the weight as a function of the -+ * duration of the observation interval. -+ */ -+ weight = min_t(u32, 8, -+ div_u64(weight * bfqd->delta_from_first, -+ BFQ_RATE_REF_INTERVAL)); -+ -+ /* -+ * Divisor ranging from 10, for minimum weight, to 2, for -+ * maximum weight. -+ */ -+ divisor = 10 - weight; -+ BUG_ON(divisor == 0); -+ -+ /* -+ * Finally, update peak rate: -+ * -+ * peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor -+ */ -+ bfqd->peak_rate *= divisor-1; -+ bfqd->peak_rate /= divisor; -+ rate /= divisor; /* smoothing constant alpha = 1/divisor */ -+ -+ bfq_log(bfqd, -+ "update_rate_reset: divisor %d tmp_peak_rate %llu tmp_rate %u", -+ divisor, -+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT), -+ (u32)((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT)); -+ -+ BUG_ON(bfqd->peak_rate == 0); -+ BUG_ON(bfqd->peak_rate > 20<peak_rate += rate; -+ update_thr_responsiveness_params(bfqd); -+ BUG_ON(bfqd->peak_rate > 20<peak_rate_samples == 0) { /* first dispatch */ -+ bfq_log(bfqd, -+ "update_peak_rate: goto reset, samples %d", -+ bfqd->peak_rate_samples) ; -+ bfq_reset_rate_computation(bfqd, rq); -+ goto update_last_values; /* will add one sample */ -+ } - -- if (bfq_bfqq_fifo_expire(bfqq)) -- return NULL; -+ /* -+ * Device idle for very long: the observation interval lasting -+ * up to this dispatch cannot be a valid observation interval -+ * for computing a new peak rate (similarly to the late- -+ * completion event in bfq_completed_request()). 
Go to -+ * update_rate_and_reset to have the following three steps -+ * taken: -+ * - close the observation interval at the last (previous) -+ * request dispatch or completion -+ * - compute rate, if possible, for that observation interval -+ * - start a new observation interval with this dispatch -+ */ -+ if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC && -+ bfqd->rq_in_driver == 0) { -+ bfq_log(bfqd, -+"update_peak_rate: jumping to updating&resetting delta_last %lluus samples %d", -+ (now_ns - bfqd->last_dispatch)>>10, -+ bfqd->peak_rate_samples) ; -+ goto update_rate_and_reset; -+ } - -- bfq_mark_bfqq_fifo_expire(bfqq); -+ /* Update sampling information */ -+ bfqd->peak_rate_samples++; - -- if (list_empty(&bfqq->fifo)) -- return NULL; -+ if ((bfqd->rq_in_driver > 0 || -+ now_ns - bfqd->last_completion < BFQ_MIN_TT) -+ && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR) -+ bfqd->sequential_samples++; - -- rq = rq_entry_fifo(bfqq->fifo.next); -+ bfqd->tot_sectors_dispatched += blk_rq_sectors(rq); - -- if (time_before(jiffies, rq->fifo_time)) -- return NULL; -+ /* Reset max observed rq size every 32 dispatches */ -+ if (likely(bfqd->peak_rate_samples % 32)) -+ bfqd->last_rq_max_size = -+ max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size); -+ else -+ bfqd->last_rq_max_size = blk_rq_sectors(rq); - -- return rq; -+ bfqd->delta_from_first = now_ns - bfqd->first_dispatch; -+ -+ bfq_log(bfqd, -+ "update_peak_rate: added samples %u/%u tot_sects %llu delta_first %lluus", -+ bfqd->peak_rate_samples, bfqd->sequential_samples, -+ bfqd->tot_sectors_dispatched, -+ bfqd->delta_from_first>>10); -+ -+ /* Target observation interval not yet reached, go on sampling */ -+ if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL) -+ goto update_last_values; -+ -+update_rate_and_reset: -+ bfq_update_rate_reset(bfqd, rq); -+update_last_values: -+ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); -+ bfqd->last_dispatch = now_ns; -+ -+ bfq_log(bfqd, -+ "update_peak_rate: delta_first %lluus last_pos %llu peak_rate %llu", -+ (now_ns - bfqd->first_dispatch)>>10, -+ (unsigned long long) bfqd->last_position, -+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); -+ bfq_log(bfqd, -+ "update_peak_rate: samples at end %d", bfqd->peak_rate_samples); - } - --static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) -+/* -+ * Move request from internal lists to the dispatch list of the request queue -+ */ -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) - { -- struct bfq_entity *entity = &bfqq->entity; -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); - -- return entity->budget - entity->service; -+ /* -+ * For consistency, the next instruction should have been executed -+ * after removing the request from the queue and dispatching it. -+ * We execute instead this instruction before bfq_remove_request() -+ * (and hence introduce a temporary inconsistency), for efficiency. -+ * In fact, in a forced_dispatch, this prevents two counters related -+ * to bfqq->dispatched to risk to be uselessly decremented if bfqq -+ * is not in service, and then to be incremented again after -+ * incrementing bfqq->dispatched. 
-+ */ -+ bfqq->dispatched++; -+ bfq_update_peak_rate(q->elevator->elevator_data, rq); -+ -+ bfq_remove_request(rq); -+ elv_dispatch_sort(q, rq); - } - - static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) - { - BUG_ON(bfqq != bfqd->in_service_queue); - -- __bfq_bfqd_reset_in_service(bfqd); -- - /* - * If this bfqq is shared between multiple processes, check - * to make sure that those processes are still issuing I/Os -@@ -1851,20 +2671,30 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) - bfq_mark_bfqq_split_coop(bfqq); - - if (RB_EMPTY_ROOT(&bfqq->sort_list)) { -- /* -- * Overloading budget_timeout field to store the time -- * at which the queue remains with no backlog; used by -- * the weight-raising mechanism. -- */ -- bfqq->budget_timeout = jiffies; -- bfq_del_bfqq_busy(bfqd, bfqq, 1); -+ if (bfqq->dispatched == 0) -+ /* -+ * Overloading budget_timeout field to store -+ * the time at which the queue remains with no -+ * backlog and no outstanding request; used by -+ * the weight-raising mechanism. -+ */ -+ bfqq->budget_timeout = jiffies; -+ -+ bfq_del_bfqq_busy(bfqd, bfqq, true); - } else { -- bfq_activate_bfqq(bfqd, bfqq); -+ bfq_requeue_bfqq(bfqd, bfqq); - /* - * Resort priority tree of potential close cooperators. - */ - bfq_pos_tree_add_move(bfqd, bfqq); - } -+ -+ /* -+ * All in-service entities must have been properly deactivated -+ * or requeued before executing the next function, which -+ * resets all in-service entites as no more in service. -+ */ -+ __bfq_bfqd_reset_in_service(bfqd); - } - - /** -@@ -1883,10 +2713,19 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, - struct request *next_rq; - int budget, min_budget; - -- budget = bfqq->max_budget; -+ BUG_ON(bfqq != bfqd->in_service_queue); -+ - min_budget = bfq_min_budget(bfqd); - -- BUG_ON(bfqq != bfqd->in_service_queue); -+ if (bfqq->wr_coeff == 1) -+ budget = bfqq->max_budget; -+ else /* -+ * Use a constant, low budget for weight-raised queues, -+ * to help achieve a low latency. Keep it slightly higher -+ * than the minimum possible budget, to cause a little -+ * bit fewer expirations. -+ */ -+ budget = 2 * min_budget; - - bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d", - bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); -@@ -1895,7 +2734,7 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, - bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", - bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); - -- if (bfq_bfqq_sync(bfqq)) { -+ if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) { - switch (reason) { - /* - * Caveat: in all the following cases we trade latency -@@ -1937,14 +2776,10 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, - break; - case BFQ_BFQQ_BUDGET_TIMEOUT: - /* -- * We double the budget here because: 1) it -- * gives the chance to boost the throughput if -- * this is not a seeky process (which may have -- * bumped into this timeout because of, e.g., -- * ZBR), 2) together with charge_full_budget -- * it helps give seeky processes higher -- * timestamps, and hence be served less -- * frequently. -+ * We double the budget here because it gives -+ * the chance to boost the throughput if this -+ * is not a seeky process (and has bumped into -+ * this timeout because of, e.g., ZBR). 
- */ - budget = min(budget * 2, bfqd->bfq_max_budget); - break; -@@ -1961,17 +2796,49 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, - budget = min(budget * 4, bfqd->bfq_max_budget); - break; - case BFQ_BFQQ_NO_MORE_REQUESTS: -- /* -- * Leave the budget unchanged. -- */ -+ /* -+ * For queues that expire for this reason, it -+ * is particularly important to keep the -+ * budget close to the actual service they -+ * need. Doing so reduces the timestamp -+ * misalignment problem described in the -+ * comments in the body of -+ * __bfq_activate_entity. In fact, suppose -+ * that a queue systematically expires for -+ * BFQ_BFQQ_NO_MORE_REQUESTS and presents a -+ * new request in time to enjoy timestamp -+ * back-shifting. The larger the budget of the -+ * queue is with respect to the service the -+ * queue actually requests in each service -+ * slot, the more times the queue can be -+ * reactivated with the same virtual finish -+ * time. It follows that, even if this finish -+ * time is pushed to the system virtual time -+ * to reduce the consequent timestamp -+ * misalignment, the queue unjustly enjoys for -+ * many re-activations a lower finish time -+ * than all newly activated queues. -+ * -+ * The service needed by bfqq is measured -+ * quite precisely by bfqq->entity.service. -+ * Since bfqq does not enjoy device idling, -+ * bfqq->entity.service is equal to the number -+ * of sectors that the process associated with -+ * bfqq requested to read/write before waiting -+ * for request completions, or blocking for -+ * other reasons. -+ */ -+ budget = max_t(int, bfqq->entity.service, min_budget); -+ break; - default: - return; - } -- } else -+ } else if (!bfq_bfqq_sync(bfqq)) - /* -- * Async queues get always the maximum possible budget -- * (their ability to dispatch is limited by -- * @bfqd->bfq_max_budget_async_rq). -+ * Async queues get always the maximum possible -+ * budget, as for them we do not care about latency -+ * (in addition, their ability to dispatch is limited -+ * by the charging factor). - */ - budget = bfqd->bfq_max_budget; - -@@ -1982,160 +2849,120 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, - bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget); - - /* -- * Make sure that we have enough budget for the next request. -- * Since the finish time of the bfqq must be kept in sync with -- * the budget, be sure to call __bfq_bfqq_expire() after the -+ * If there is still backlog, then assign a new budget, making -+ * sure that it is large enough for the next request. Since -+ * the finish time of bfqq must be kept in sync with the -+ * budget, be sure to call __bfq_bfqq_expire() *after* this - * update. -+ * -+ * If there is no backlog, then no need to update the budget; -+ * it will be updated on the arrival of a new request. - */ - next_rq = bfqq->next_rq; -- if (next_rq) -+ if (next_rq) { -+ BUG_ON(reason == BFQ_BFQQ_TOO_IDLE || -+ reason == BFQ_BFQQ_NO_MORE_REQUESTS); - bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, - bfq_serv_to_charge(next_rq, bfqq)); -- else -- bfqq->entity.budget = bfqq->max_budget; -+ BUG_ON(!bfq_bfqq_busy(bfqq)); -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ } - - bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d", - next_rq ? 
blk_rq_sectors(next_rq) : 0, - bfqq->entity.budget); - } - --static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout) --{ -- unsigned long max_budget; -- -- /* -- * The max_budget calculated when autotuning is equal to the -- * amount of sectors transfered in timeout_sync at the -- * estimated peak rate. -- */ -- max_budget = (unsigned long)(peak_rate * 1000 * -- timeout >> BFQ_RATE_SHIFT); -- -- return max_budget; --} -- - /* -- * In addition to updating the peak rate, checks whether the process -- * is "slow", and returns 1 if so. This slow flag is used, in addition -- * to the budget timeout, to reduce the amount of service provided to -- * seeky processes, and hence reduce their chances to lower the -- * throughput. See the code for more details. -+ * Return true if the process associated with bfqq is "slow". The slow -+ * flag is used, in addition to the budget timeout, to reduce the -+ * amount of service provided to seeky processes, and thus reduce -+ * their chances to lower the throughput. More details in the comments -+ * on the function bfq_bfqq_expire(). -+ * -+ * An important observation is in order: as discussed in the comments -+ * on the function bfq_update_peak_rate(), with devices with internal -+ * queues, it is hard if ever possible to know when and for how long -+ * an I/O request is processed by the device (apart from the trivial -+ * I/O pattern where a new request is dispatched only after the -+ * previous one has been completed). This makes it hard to evaluate -+ * the real rate at which the I/O requests of each bfq_queue are -+ * served. In fact, for an I/O scheduler like BFQ, serving a -+ * bfq_queue means just dispatching its requests during its service -+ * slot (i.e., until the budget of the queue is exhausted, or the -+ * queue remains idle, or, finally, a timeout fires). But, during the -+ * service slot of a bfq_queue, around 100 ms at most, the device may -+ * be even still processing requests of bfq_queues served in previous -+ * service slots. On the opposite end, the requests of the in-service -+ * bfq_queue may be completed after the service slot of the queue -+ * finishes. -+ * -+ * Anyway, unless more sophisticated solutions are used -+ * (where possible), the sum of the sizes of the requests dispatched -+ * during the service slot of a bfq_queue is probably the only -+ * approximation available for the service received by the bfq_queue -+ * during its service slot. And this sum is the quantity used in this -+ * function to evaluate the I/O speed of a process. - */ --static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, -- bool compensate, enum bfqq_expiration reason) -+static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ bool compensate, enum bfqq_expiration reason, -+ unsigned long *delta_ms) - { -- u64 bw, usecs, expected, timeout; -- ktime_t delta; -- int update = 0; -+ ktime_t delta_ktime; -+ u32 delta_usecs; -+ bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */ - -- if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq)) -+ if (!bfq_bfqq_sync(bfqq)) - return false; - - if (compensate) -- delta = bfqd->last_idling_start; -+ delta_ktime = bfqd->last_idling_start; - else -- delta = ktime_get(); -- delta = ktime_sub(delta, bfqd->last_budget_start); -- usecs = ktime_to_us(delta); -- -- /* Don't trust short/unrealistic values. */ -- if (usecs < 100 || usecs >= LONG_MAX) -- return false; -- -- /* -- * Calculate the bandwidth for the last slice. 
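
The low-pass filter described in the bfq_update_rate_reset comments further above can be restated compactly. Below is a minimal standalone sketch (plain userspace C, not taken from the patch; function and parameter names are illustrative): the weight grows from 0 to 8 with the fraction of sequential samples and with the length of the observation interval, the divisor is 10 - weight, and the new estimate blends the old peak rate and the measured rate as peak*(divisor-1)/divisor + rate/divisor.

#include <stdint.h>
#include <stdio.h>

static uint64_t filter_peak_rate(uint64_t peak_rate, uint64_t rate,
				 uint32_t sequential_samples,
				 uint32_t total_samples,
				 uint64_t delta_ns, uint64_t ref_interval_ns)
{
	/*
	 * Weight proportional to how sequential the workload is: 0..8,
	 * never 9, because sequential_samples < total_samples.
	 */
	uint64_t weight = (9ULL * sequential_samples) / total_samples;

	/* Refine the weight by the length of the observation interval. */
	weight = weight * delta_ns / ref_interval_ns;
	if (weight > 8)
		weight = 8;

	/*
	 * Divisor runs from 10 (minimum weight) down to 2 (maximum
	 * weight); with divisor 2 the measured rate contributes half of
	 * the new estimate.
	 */
	uint64_t divisor = 10 - weight;

	return peak_rate * (divisor - 1) / divisor + rate / divisor;
}

int main(void)
{
	/*
	 * Illustrative numbers: 90 of 100 samples sequential, over a full
	 * reference interval, pull the estimate halfway toward `rate'.
	 */
	uint64_t peak = filter_peak_rate(1000, 3000, 90, 100,
					 1000000000ULL, 1000000000ULL);

	printf("new peak-rate estimate: %llu\n", (unsigned long long)peak);
	return 0;
}
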
We use a 64 bit -- * value to store the peak rate, in sectors per usec in fixed -- * point math. We do so to have enough precision in the estimate -- * and to avoid overflows. -- */ -- bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT; -- do_div(bw, (unsigned long)usecs); -+ delta_ktime = ktime_get(); -+ delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start); -+ delta_usecs = ktime_to_us(delta_ktime); -+ -+ /* don't use too short time intervals */ -+ if (delta_usecs < 1000) { -+ if (blk_queue_nonrot(bfqd->queue)) -+ /* -+ * give same worst-case guarantees as idling -+ * for seeky -+ */ -+ *delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC; -+ else /* charge at least one seek */ -+ *delta_ms = bfq_slice_idle / NSEC_PER_MSEC; -+ -+ bfq_log(bfqd, "bfq_bfqq_is_slow: too short %u", delta_usecs); -+ -+ return slow; -+ } - -- timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); -+ *delta_ms = delta_usecs / USEC_PER_MSEC; - - /* -- * Use only long (> 20ms) intervals to filter out spikes for -- * the peak rate estimation. -+ * Use only long (> 20ms) intervals to filter out excessive -+ * spikes in service rate estimation. - */ -- if (usecs > 20000) { -- if (bw > bfqd->peak_rate || -- (!BFQQ_SEEKY(bfqq) && -- reason == BFQ_BFQQ_BUDGET_TIMEOUT)) { -- bfq_log(bfqd, "measured bw =%llu", bw); -- /* -- * To smooth oscillations use a low-pass filter with -- * alpha=7/8, i.e., -- * new_rate = (7/8) * old_rate + (1/8) * bw -- */ -- do_div(bw, 8); -- if (bw == 0) -- return 0; -- bfqd->peak_rate *= 7; -- do_div(bfqd->peak_rate, 8); -- bfqd->peak_rate += bw; -- update = 1; -- bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate); -- } -- -- update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1; -- -- if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES) -- bfqd->peak_rate_samples++; -- -- if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES && -- update) { -- int dev_type = blk_queue_nonrot(bfqd->queue); -- -- if (bfqd->bfq_user_max_budget == 0) { -- bfqd->bfq_max_budget = -- bfq_calc_max_budget(bfqd->peak_rate, -- timeout); -- bfq_log(bfqd, "new max_budget=%d", -- bfqd->bfq_max_budget); -- } -- if (bfqd->device_speed == BFQ_BFQD_FAST && -- bfqd->peak_rate < device_speed_thresh[dev_type]) { -- bfqd->device_speed = BFQ_BFQD_SLOW; -- bfqd->RT_prod = R_slow[dev_type] * -- T_slow[dev_type]; -- } else if (bfqd->device_speed == BFQ_BFQD_SLOW && -- bfqd->peak_rate > device_speed_thresh[dev_type]) { -- bfqd->device_speed = BFQ_BFQD_FAST; -- bfqd->RT_prod = R_fast[dev_type] * -- T_fast[dev_type]; -- } -- } -+ if (delta_usecs > 20000) { -+ /* -+ * Caveat for rotational devices: processes doing I/O -+ * in the slower disk zones tend to be slow(er) even -+ * if not seeky. In this respect, the estimated peak -+ * rate is likely to be an average over the disk -+ * surface. Accordingly, to not be too harsh with -+ * unlucky processes, a process is deemed slow only if -+ * its rate has been lower than half of the estimated -+ * peak rate. -+ */ -+ slow = bfqq->entity.service < bfqd->bfq_max_budget / 2; -+ bfq_log(bfqd, "bfq_bfqq_is_slow: relative rate %d/%d", -+ bfqq->entity.service, bfqd->bfq_max_budget); - } - -- /* -- * If the process has been served for a too short time -- * interval to let its possible sequential accesses prevail on -- * the initial seek time needed to move the disk head on the -- * first sector it requested, then give the process a chance -- * and for the moment return false. 
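
A compact restatement of the slowness test just above may help (standalone sketch, not the patch code; parameter names are illustrative): intervals shorter than 1 ms carry no information and fall back to the seekiness flag, while intervals longer than 20 ms declare the queue slow only if it consumed less than half of the maximum budget during its service slot.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Decide whether a queue behaved "slowly" during its last service slot. */
static bool queue_is_slow(uint32_t delta_usecs, uint32_t service_sectors,
			  uint32_t max_budget, bool seeky)
{
	/* Too short to measure anything: trust the seekiness flag. */
	if (delta_usecs < 1000)
		return seeky;

	/*
	 * Only long intervals (> 20 ms) are used, to filter out spikes;
	 * the queue is slow if it consumed less than half of the maximum
	 * budget while it owned the device.
	 */
	if (delta_usecs > 20000)
		return service_sectors < max_budget / 2;

	/* Intermediate intervals keep the seekiness-based default. */
	return seeky;
}

int main(void)
{
	printf("%d\n", queue_is_slow(25000, 100, 1000, false)); /* slow: 1 */
	printf("%d\n", queue_is_slow(25000, 900, 1000, false)); /* fast: 0 */
	return 0;
}
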
-- */ -- if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8) -- return false; -- -- /* -- * A process is considered ``slow'' (i.e., seeky, so that we -- * cannot treat it fairly in the service domain, as it would -- * slow down too much the other processes) if, when a slice -- * ends for whatever reason, it has received service at a -- * rate that would not be high enough to complete the budget -- * before the budget timeout expiration. -- */ -- expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT; -+ bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow); - -- /* -- * Caveat: processes doing IO in the slower disk zones will -- * tend to be slow(er) even if not seeky. And the estimated -- * peak rate will actually be an average over the disk -- * surface. Hence, to not be too harsh with unlucky processes, -- * we keep a budget/3 margin of safety before declaring a -- * process slow. -- */ -- return expected > (4 * bfqq->entity.budget) / 3; -+ return slow; - } - - /* -@@ -2193,20 +3020,35 @@ static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, - static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, - struct bfq_queue *bfqq) - { -+ bfq_log_bfqq(bfqd, bfqq, -+"softrt_next_start: service_blkg %lu soft_rate %u sects/sec interval %u", -+ bfqq->service_from_backlogged, -+ bfqd->bfq_wr_max_softrt_rate, -+ jiffies_to_msecs(HZ * bfqq->service_from_backlogged / -+ bfqd->bfq_wr_max_softrt_rate)); -+ - return max(bfqq->last_idle_bklogged + - HZ * bfqq->service_from_backlogged / - bfqd->bfq_wr_max_softrt_rate, -- jiffies + bfqq->bfqd->bfq_slice_idle + 4); -+ jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); - } - - /* -- * Return the largest-possible time instant such that, for as long as possible, -- * the current time will be lower than this time instant according to the macro -- * time_is_before_jiffies(). -+ * Return the farthest future time instant according to jiffies -+ * macros. - */ --static unsigned long bfq_infinity_from_now(unsigned long now) -+static unsigned long bfq_greatest_from_now(void) - { -- return now + ULONG_MAX / 2; -+ return jiffies + MAX_JIFFY_OFFSET; -+} -+ -+/* -+ * Return the farthest past time instant according to jiffies -+ * macros. -+ */ -+static unsigned long bfq_smallest_from_now(void) -+{ -+ return jiffies - MAX_JIFFY_OFFSET; - } - - /** -@@ -2216,28 +3058,24 @@ static unsigned long bfq_infinity_from_now(unsigned long now) - * @compensate: if true, compensate for the time spent idling. - * @reason: the reason causing the expiration. - * -+ * If the process associated with bfqq does slow I/O (e.g., because it -+ * issues random requests), we charge bfqq with the time it has been -+ * in service instead of the service it has received (see -+ * bfq_bfqq_charge_time for details on how this goal is achieved). As -+ * a consequence, bfqq will typically get higher timestamps upon -+ * reactivation, and hence it will be rescheduled as if it had -+ * received more service than what it has actually received. In the -+ * end, bfqq receives less service in proportion to how slowly its -+ * associated process consumes its budgets (and hence how seriously it -+ * tends to lower the throughput). In addition, this time-charging -+ * strategy guarantees time fairness among slow processes. In -+ * contrast, if the process associated with bfqq is not slow, we -+ * charge bfqq exactly with the service it has received. 
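
The soft real-time computation in bfq_bfqq_softrt_next_start above reduces to a bandwidth bound: a queue that consumed S sectors since it last emptied cannot be classified as soft real-time again before enough wall-clock time has passed for S sectors to fit under the configured soft real-time rate. A minimal standalone sketch follows (not patch code; HZ = 250 and all names are assumptions for illustration only).

#include <stdint.h>
#include <stdio.h>

#define HZ 250U			/* assumed tick rate, for illustration only */

static uint64_t max_u64(uint64_t a, uint64_t b) { return a > b ? a : b; }

static uint64_t softrt_next_start(uint64_t jiffies_now,
				  uint64_t last_idle_bklogged,
				  uint64_t service_from_backlogged,
				  uint64_t max_softrt_rate,
				  uint64_t slice_idle_jiffies)
{
	/* Earliest instant compatible with the soft real-time rate cap. */
	uint64_t bw_bound = last_idle_bklogged +
		HZ * service_from_backlogged / max_softrt_rate;

	/* Never earlier than "a little after one idle slice from now". */
	return max_u64(bw_bound, jiffies_now + slice_idle_jiffies + 4);
}

int main(void)
{
	/*
	 * 7000 sectors consumed against a 7000 sectors/s cap: the queue
	 * may be deemed soft real-time again only one second (HZ jiffies)
	 * after it last became idle with no backlog.
	 */
	printf("%llu\n", (unsigned long long)
	       softrt_next_start(10000, 10000, 7000, 7000, 2));
	return 0;
}
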
- * -- * If the process associated to the queue is slow (i.e., seeky), or in -- * case of budget timeout, or, finally, if it is async, we -- * artificially charge it an entire budget (independently of the -- * actual service it received). As a consequence, the queue will get -- * higher timestamps than the correct ones upon reactivation, and -- * hence it will be rescheduled as if it had received more service -- * than what it actually received. In the end, this class of processes -- * will receive less service in proportion to how slowly they consume -- * their budgets (and hence how seriously they tend to lower the -- * throughput). -- * -- * In contrast, when a queue expires because it has been idling for -- * too much or because it exhausted its budget, we do not touch the -- * amount of service it has received. Hence when the queue will be -- * reactivated and its timestamps updated, the latter will be in sync -- * with the actual service received by the queue until expiration. -- * -- * Charging a full budget to the first type of queues and the exact -- * service to the others has the effect of using the WF2Q+ policy to -- * schedule the former on a timeslice basis, without violating the -- * service domain guarantees of the latter. -+ * Charging time to the first type of queues and the exact service to -+ * the other has the effect of using the WF2Q+ policy to schedule the -+ * former on a timeslice basis, without violating service domain -+ * guarantees among the latter. - */ - static void bfq_bfqq_expire(struct bfq_data *bfqd, - struct bfq_queue *bfqq, -@@ -2245,41 +3083,53 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, - enum bfqq_expiration reason) - { - bool slow; -+ unsigned long delta = 0; -+ struct bfq_entity *entity = &bfqq->entity; -+ int ref; - - BUG_ON(bfqq != bfqd->in_service_queue); - - /* -- * Update disk peak rate for autotuning and check whether the -- * process is slow (see bfq_update_peak_rate). -+ * Check whether the process is slow (see bfq_bfqq_is_slow). - */ -- slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason); -+ slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); - - /* -- * As above explained, 'punish' slow (i.e., seeky), timed-out -- * and async queues, to favor sequential sync workloads. -- * -- * Processes doing I/O in the slower disk zones will tend to be -- * slow(er) even if not seeky. Hence, since the estimated peak -- * rate is actually an average over the disk surface, these -- * processes may timeout just for bad luck. To avoid punishing -- * them we do not charge a full budget to a process that -- * succeeded in consuming at least 2/3 of its budget. -+ * Increase service_from_backlogged before next statement, -+ * because the possible next invocation of -+ * bfq_bfqq_charge_time would likely inflate -+ * entity->service. In contrast, service_from_backlogged must -+ * contain real service, to enable the soft real-time -+ * heuristic to correctly compute the bandwidth consumed by -+ * bfqq. - */ -- if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT && -- bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)) -- bfq_bfqq_charge_full_budget(bfqq); -+ bfqq->service_from_backlogged += entity->service; - -- bfqq->service_from_backlogged += bfqq->entity.service; -+ /* -+ * As above explained, charge slow (typically seeky) and -+ * timed-out queues with the time and not the service -+ * received, to favor sequential workloads. -+ * -+ * Processes doing I/O in the slower disk zones will tend to -+ * be slow(er) even if not seeky. 
Therefore, since the -+ * estimated peak rate is actually an average over the disk -+ * surface, these processes may timeout just for bad luck. To -+ * avoid punishing them, do not charge time to processes that -+ * succeeded in consuming at least 2/3 of their budget. This -+ * allows BFQ to preserve enough elasticity to still perform -+ * bandwidth, and not time, distribution with little unlucky -+ * or quasi-sequential processes. -+ */ -+ if (bfqq->wr_coeff == 1 && -+ (slow || -+ (reason == BFQ_BFQQ_BUDGET_TIMEOUT && -+ bfq_bfqq_budget_left(bfqq) >= entity->budget / 3))) -+ bfq_bfqq_charge_time(bfqd, bfqq, delta); - -- if (BFQQ_SEEKY(bfqq) && reason == BFQ_BFQQ_BUDGET_TIMEOUT && -- !bfq_bfqq_constantly_seeky(bfqq)) { -- bfq_mark_bfqq_constantly_seeky(bfqq); -- if (!blk_queue_nonrot(bfqd->queue)) -- bfqd->const_seeky_busy_in_flight_queues++; -- } -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); - - if (reason == BFQ_BFQQ_TOO_IDLE && -- bfqq->entity.service <= 2 * bfqq->entity.budget / 10) -+ entity->service <= 2 * entity->budget / 10) - bfq_clear_bfqq_IO_bound(bfqq); - - if (bfqd->low_latency && bfqq->wr_coeff == 1) -@@ -2288,19 +3138,23 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, - if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 && - RB_EMPTY_ROOT(&bfqq->sort_list)) { - /* -- * If we get here, and there are no outstanding requests, -- * then the request pattern is isochronous (see the comments -- * to the function bfq_bfqq_softrt_next_start()). Hence we -- * can compute soft_rt_next_start. If, instead, the queue -- * still has outstanding requests, then we have to wait -- * for the completion of all the outstanding requests to -+ * If we get here, and there are no outstanding -+ * requests, then the request pattern is isochronous -+ * (see the comments on the function -+ * bfq_bfqq_softrt_next_start()). Thus we can compute -+ * soft_rt_next_start. If, instead, the queue still -+ * has outstanding requests, then we have to wait for -+ * the completion of all the outstanding requests to - * discover whether the request pattern is actually - * isochronous. - */ -- if (bfqq->dispatched == 0) -+ BUG_ON(bfqd->busy_queues < 1); -+ if (bfqq->dispatched == 0) { - bfqq->soft_rt_next_start = - bfq_bfqq_softrt_next_start(bfqd, bfqq); -- else { -+ bfq_log_bfqq(bfqd, bfqq, "new soft_rt_next %lu", -+ bfqq->soft_rt_next_start); -+ } else { - /* - * The application is still waiting for the - * completion of one or more requests: -@@ -2317,7 +3171,7 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, - * happened to be in the past. - */ - bfqq->soft_rt_next_start = -- bfq_infinity_from_now(jiffies); -+ bfq_greatest_from_now(); - /* - * Schedule an update of soft_rt_next_start to when - * the task may be discovered to be isochronous. -@@ -2327,15 +3181,30 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, - } - - bfq_log_bfqq(bfqd, bfqq, -- "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, -- slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq)); -+ "expire (%d, slow %d, num_disp %d, idle_win %d, weight %d)", -+ reason, slow, bfqq->dispatched, -+ bfq_bfqq_idle_window(bfqq), entity->weight); - - /* - * Increase, decrease or leave budget unchanged according to - * reason. 
- */ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); - __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); -+ BUG_ON(bfqq->next_rq == NULL && -+ bfqq->entity.budget < bfqq->entity.service); -+ ref = bfqq->ref; - __bfq_bfqq_expire(bfqd, bfqq); -+ -+ BUG_ON(ref > 1 && -+ !bfq_bfqq_busy(bfqq) && reason == BFQ_BFQQ_BUDGET_EXHAUSTED && -+ !bfq_class_idle(bfqq)); -+ -+ /* mark bfqq as waiting a request only if a bic still points to it */ -+ if (ref > 1 && !bfq_bfqq_busy(bfqq) && -+ reason != BFQ_BFQQ_BUDGET_TIMEOUT && -+ reason != BFQ_BFQQ_BUDGET_EXHAUSTED) -+ bfq_mark_bfqq_non_blocking_wait_rq(bfqq); - } - - /* -@@ -2345,20 +3214,17 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, - */ - static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) - { -- if (bfq_bfqq_budget_new(bfqq) || -- time_before(jiffies, bfqq->budget_timeout)) -- return false; -- return true; -+ return time_is_before_eq_jiffies(bfqq->budget_timeout); - } - - /* -- * If we expire a queue that is waiting for the arrival of a new -- * request, we may prevent the fictitious timestamp back-shifting that -- * allows the guarantees of the queue to be preserved (see [1] for -- * this tricky aspect). Hence we return true only if this condition -- * does not hold, or if the queue is slow enough to deserve only to be -- * kicked off for preserving a high throughput. --*/ -+ * If we expire a queue that is actively waiting (i.e., with the -+ * device idled) for the arrival of a new request, then we may incur -+ * the timestamp misalignment problem described in the body of the -+ * function __bfq_activate_entity. Hence we return true only if this -+ * condition does not hold, or if the queue is slow enough to deserve -+ * only to be kicked off for preserving a high throughput. -+ */ - static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) - { - bfq_log_bfqq(bfqq->bfqd, bfqq, -@@ -2400,10 +3266,12 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) - { - struct bfq_data *bfqd = bfqq->bfqd; - bool idling_boosts_thr, idling_boosts_thr_without_issues, -- all_queues_seeky, on_hdd_and_not_all_queues_seeky, - idling_needed_for_service_guarantees, - asymmetric_scenario; - -+ if (bfqd->strict_guarantees) -+ return true; -+ - /* - * The next variable takes into account the cases where idling - * boosts the throughput. -@@ -2466,74 +3334,27 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) - bfqd->wr_busy_queues == 0; - - /* -- * There are then two cases where idling must be performed not -+ * There is then a case where idling must be performed not - * for throughput concerns, but to preserve service -- * guarantees. In the description of these cases, we say, for -- * short, that a queue is sequential/random if the process -- * associated to the queue issues sequential/random requests -- * (in the second case the queue may be tagged as seeky or -- * even constantly_seeky). -+ * guarantees. - * -- * To introduce the first case, we note that, since -- * bfq_bfqq_idle_window(bfqq) is false if the device is -- * NCQ-capable and bfqq is random (see -- * bfq_update_idle_window()), then, from the above two -- * assignments it follows that -- * idling_boosts_thr_without_issues is false if the device is -- * NCQ-capable and bfqq is random. Therefore, for this case, -- * device idling would never be allowed if we used just -- * idling_boosts_thr_without_issues to decide whether to allow -- * it. And, beneficially, this would imply that throughput -- * would always be boosted also with random I/O on NCQ-capable -- * HDDs. 
-- * -- * But we must be careful on this point, to avoid an unfair -- * treatment for bfqq. In fact, because of the same above -- * assignments, idling_boosts_thr_without_issues is, on the -- * other hand, true if 1) the device is an HDD and bfqq is -- * sequential, and 2) there are no busy weight-raised -- * queues. As a consequence, if we used just -- * idling_boosts_thr_without_issues to decide whether to idle -- * the device, then with an HDD we might easily bump into a -- * scenario where queues that are sequential and I/O-bound -- * would enjoy idling, whereas random queues would not. The -- * latter might then get a low share of the device throughput, -- * simply because the former would get many requests served -- * after being set as in service, while the latter would not. -- * -- * To address this issue, we start by setting to true a -- * sentinel variable, on_hdd_and_not_all_queues_seeky, if the -- * device is rotational and not all queues with pending or -- * in-flight requests are constantly seeky (i.e., there are -- * active sequential queues, and bfqq might then be mistreated -- * if it does not enjoy idling because it is random). -- */ -- all_queues_seeky = bfq_bfqq_constantly_seeky(bfqq) && -- bfqd->busy_in_flight_queues == -- bfqd->const_seeky_busy_in_flight_queues; -- -- on_hdd_and_not_all_queues_seeky = -- !blk_queue_nonrot(bfqd->queue) && !all_queues_seeky; -- -- /* -- * To introduce the second case where idling needs to be -- * performed to preserve service guarantees, we can note that -- * allowing the drive to enqueue more than one request at a -- * time, and hence delegating de facto final scheduling -- * decisions to the drive's internal scheduler, causes loss of -- * control on the actual request service order. In particular, -- * the critical situation is when requests from different -- * processes happens to be present, at the same time, in the -- * internal queue(s) of the drive. In such a situation, the -- * drive, by deciding the service order of the -- * internally-queued requests, does determine also the actual -- * throughput distribution among these processes. But the -- * drive typically has no notion or concern about per-process -- * throughput distribution, and makes its decisions only on a -- * per-request basis. Therefore, the service distribution -- * enforced by the drive's internal scheduler is likely to -- * coincide with the desired device-throughput distribution -- * only in a completely symmetric scenario where: -+ * To introduce this case, we can note that allowing the drive -+ * to enqueue more than one request at a time, and hence -+ * delegating de facto final scheduling decisions to the -+ * drive's internal scheduler, entails loss of control on the -+ * actual request service order. In particular, the critical -+ * situation is when requests from different processes happen -+ * to be present, at the same time, in the internal queue(s) -+ * of the drive. In such a situation, the drive, by deciding -+ * the service order of the internally-queued requests, does -+ * determine also the actual throughput distribution among -+ * these processes. But the drive typically has no notion or -+ * concern about per-process throughput distribution, and -+ * makes its decisions only on a per-request basis. 
Therefore, -+ * the service distribution enforced by the drive's internal -+ * scheduler is likely to coincide with the desired -+ * device-throughput distribution only in a completely -+ * symmetric scenario where: - * (i) each of these processes must get the same throughput as - * the others; - * (ii) all these processes have the same I/O pattern -@@ -2555,26 +3376,53 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) - * words, only if sub-condition (i) holds, then idling is - * allowed, and the device tends to be prevented from queueing - * many requests, possibly of several processes. The reason -- * for not controlling also sub-condition (ii) is that, first, -- * in the case of an HDD, the asymmetry in terms of types of -- * I/O patterns is already taken in to account in the above -- * sentinel variable -- * on_hdd_and_not_all_queues_seeky. Secondly, in the case of a -- * flash-based device, we prefer however to privilege -- * throughput (and idling lowers throughput for this type of -- * devices), for the following reasons: -- * 1) differently from HDDs, the service time of random -- * requests is not orders of magnitudes lower than the service -- * time of sequential requests; thus, even if processes doing -- * sequential I/O get a preferential treatment with respect to -- * others doing random I/O, the consequences are not as -- * dramatic as with HDDs; -- * 2) if a process doing random I/O does need strong -- * throughput guarantees, it is hopefully already being -- * weight-raised, or the user is likely to have assigned it a -- * higher weight than the other processes (and thus -- * sub-condition (i) is likely to be false, which triggers -- * idling). -+ * for not controlling also sub-condition (ii) is that we -+ * exploit preemption to preserve guarantees in case of -+ * symmetric scenarios, even if (ii) does not hold, as -+ * explained in the next two paragraphs. -+ * -+ * Even if a queue, say Q, is expired when it remains idle, Q -+ * can still preempt the new in-service queue if the next -+ * request of Q arrives soon (see the comments on -+ * bfq_bfqq_update_budg_for_activation). If all queues and -+ * groups have the same weight, this form of preemption, -+ * combined with the hole-recovery heuristic described in the -+ * comments on function bfq_bfqq_update_budg_for_activation, -+ * are enough to preserve a correct bandwidth distribution in -+ * the mid term, even without idling. In fact, even if not -+ * idling allows the internal queues of the device to contain -+ * many requests, and thus to reorder requests, we can rather -+ * safely assume that the internal scheduler still preserves a -+ * minimum of mid-term fairness. The motivation for using -+ * preemption instead of idling is that, by not idling, -+ * service guarantees are preserved without minimally -+ * sacrificing throughput. In other words, both a high -+ * throughput and its desired distribution are obtained. -+ * -+ * More precisely, this preemption-based, idleless approach -+ * provides fairness in terms of IOPS, and not sectors per -+ * second. This can be seen with a simple example. Suppose -+ * that there are two queues with the same weight, but that -+ * the first queue receives requests of 8 sectors, while the -+ * second queue receives requests of 1024 sectors. In -+ * addition, suppose that each of the two queues contains at -+ * most one request at a time, which implies that each queue -+ * always remains idle after it is served. 
Finally, after -+ * remaining idle, each queue receives very quickly a new -+ * request. It follows that the two queues are served -+ * alternatively, preempting each other if needed. This -+ * implies that, although both queues have the same weight, -+ * the queue with large requests receives a service that is -+ * 1024/8 times as high as the service received by the other -+ * queue. -+ * -+ * On the other hand, device idling is performed, and thus -+ * pure sector-domain guarantees are provided, for the -+ * following queues, which are likely to need stronger -+ * throughput guarantees: weight-raised queues, and queues -+ * with a higher weight than other queues. When such queues -+ * are active, sub-condition (i) is false, which triggers -+ * device idling. - * - * According to the above considerations, the next variable is - * true (only) if sub-condition (i) holds. To compute the -@@ -2582,7 +3430,7 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) - * the function bfq_symmetric_scenario(), but also check - * whether bfqq is being weight-raised, because - * bfq_symmetric_scenario() does not take into account also -- * weight-raised queues (see comments to -+ * weight-raised queues (see comments on - * bfq_weights_tree_add()). - * - * As a side note, it is worth considering that the above -@@ -2604,17 +3452,16 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) - * bfqq. Such a case is when bfqq became active in a burst of - * queue activations. Queues that became active during a large - * burst benefit only from throughput, as discussed in the -- * comments to bfq_handle_burst. Thus, if bfqq became active -+ * comments on bfq_handle_burst. Thus, if bfqq became active - * in a burst and not idling the device maximizes throughput, - * then the device must no be idled, because not idling the - * device provides bfqq and all other queues in the burst with -- * maximum benefit. Combining this and the two cases above, we -- * can now establish when idling is actually needed to -- * preserve service guarantees. -+ * maximum benefit. Combining this and the above case, we can -+ * now establish when idling is actually needed to preserve -+ * service guarantees. - */ - idling_needed_for_service_guarantees = -- (on_hdd_and_not_all_queues_seeky || asymmetric_scenario) && -- !bfq_bfqq_in_large_burst(bfqq); -+ asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq); - - /* - * We have now all the components we need to compute the return -@@ -2624,6 +3471,16 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) - * 2) idling either boosts the throughput (without issues), or - * is necessary to preserve service guarantees. - */ -+ bfq_log_bfqq(bfqd, bfqq, "may_idle: sync %d idling_boosts_thr %d", -+ bfq_bfqq_sync(bfqq), idling_boosts_thr); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "may_idle: wr_busy %d boosts %d IO-bound %d guar %d", -+ bfqd->wr_busy_queues, -+ idling_boosts_thr_without_issues, -+ bfq_bfqq_IO_bound(bfqq), -+ idling_needed_for_service_guarantees); -+ - return bfq_bfqq_sync(bfqq) && - (idling_boosts_thr_without_issues || - idling_needed_for_service_guarantees); -@@ -2635,7 +3492,7 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) - * 1) the queue must remain in service and cannot be expired, and - * 2) the device must be idled to wait for the possible arrival of a new - * request for the queue. 
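
The 8-sector versus 1024-sector example in the comment above boils down to a fixed ratio; the tiny standalone computation below (not part of the patch) makes the claimed skew explicit.

#include <stdio.h>

/*
 * Sector-domain skew of the preemption-based, idleless scheme described
 * above, for two equal-weight queues served strictly alternately.
 */
int main(void)
{
	unsigned int small_req_sectors = 8;	/* queue A: 8-sector requests */
	unsigned int large_req_sectors = 1024;	/* queue B: 1024-sector requests */
	unsigned int rounds = 1000;		/* alternating dispatches */

	unsigned long a_sectors = (unsigned long)rounds * small_req_sectors;
	unsigned long b_sectors = (unsigned long)rounds * large_req_sectors;

	/*
	 * Same IOPS for both queues, but B gets 1024/8 = 128 times the
	 * sector throughput of A, despite the equal weights.
	 */
	printf("sector throughput ratio B/A = %lu\n", b_sectors / a_sectors);
	return 0;
}
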
-- * See the comments to the function bfq_bfqq_may_idle for the reasons -+ * See the comments on the function bfq_bfqq_may_idle for the reasons - * why performing device idling is the best choice to boost the throughput - * and preserve service guarantees when bfq_bfqq_may_idle itself - * returns true. -@@ -2665,18 +3522,33 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); - - if (bfq_may_expire_for_budg_timeout(bfqq) && -- !timer_pending(&bfqd->idle_slice_timer) && -+ !hrtimer_active(&bfqd->idle_slice_timer) && - !bfq_bfqq_must_idle(bfqq)) - goto expire; - -+check_queue: -+ /* -+ * This loop is rarely executed more than once. Even when it -+ * happens, it is much more convenient to re-execute this loop -+ * than to return NULL and trigger a new dispatch to get a -+ * request served. -+ */ - next_rq = bfqq->next_rq; - /* - * If bfqq has requests queued and it has enough budget left to - * serve them, keep the queue, otherwise expire it. - */ - if (next_rq) { -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ - if (bfq_serv_to_charge(next_rq, bfqq) > - bfq_bfqq_budget_left(bfqq)) { -+ /* -+ * Expire the queue for budget exhaustion, -+ * which makes sure that the next budget is -+ * enough to serve the next request, even if -+ * it comes from the fifo expired path. -+ */ - reason = BFQ_BFQQ_BUDGET_EXHAUSTED; - goto expire; - } else { -@@ -2685,7 +3557,8 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - * not disable disk idling even when a new request - * arrives. - */ -- if (timer_pending(&bfqd->idle_slice_timer)) { -+ if (bfq_bfqq_wait_request(bfqq)) { -+ BUG_ON(!hrtimer_active(&bfqd->idle_slice_timer)); - /* - * If we get here: 1) at least a new request - * has arrived but we have not disabled the -@@ -2700,10 +3573,8 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - * So we disable idling. - */ - bfq_clear_bfqq_wait_request(bfqq); -- del_timer(&bfqd->idle_slice_timer); --#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ hrtimer_try_to_cancel(&bfqd->idle_slice_timer); - bfqg_stats_update_idle_time(bfqq_group(bfqq)); --#endif - } - goto keep_queue; - } -@@ -2714,7 +3585,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - * for a new request, or has requests waiting for a completion and - * may idle after their completion, then keep it anyway. - */ -- if (timer_pending(&bfqd->idle_slice_timer) || -+ if (hrtimer_active(&bfqd->idle_slice_timer) || - (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) { - bfqq = NULL; - goto keep_queue; -@@ -2725,9 +3596,16 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - bfq_bfqq_expire(bfqd, bfqq, false, reason); - new_queue: - bfqq = bfq_set_in_service_queue(bfqd); -- bfq_log(bfqd, "select_queue: new queue %d returned", -- bfqq ? 
bfqq->pid : 0); -+ if (bfqq) { -+ bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue"); -+ goto check_queue; -+ } - keep_queue: -+ if (bfqq) -+ bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue"); -+ else -+ bfq_log(bfqd, "select_queue: no queue returned"); -+ - return bfqq; - } - -@@ -2736,6 +3614,9 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) - struct bfq_entity *entity = &bfqq->entity; - - if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */ -+ BUG_ON(bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && -+ time_is_after_jiffies(bfqq->last_wr_start_finish)); -+ - bfq_log_bfqq(bfqd, bfqq, - "raising period dur %u/%u msec, old coeff %u, w %d(%d)", - jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), -@@ -2749,22 +3630,30 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) - bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change"); - - /* -- * If the queue was activated in a burst, or -- * too much time has elapsed from the beginning -- * of this weight-raising period, or the queue has -- * exceeded the acceptable number of cooperations, -- * then end weight raising. -+ * If the queue was activated in a burst, or too much -+ * time has elapsed from the beginning of this -+ * weight-raising period, then end weight raising. - */ -- if (bfq_bfqq_in_large_burst(bfqq) || -- bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh || -- time_is_before_jiffies(bfqq->last_wr_start_finish + -- bfqq->wr_cur_max_time)) { -- bfqq->last_wr_start_finish = jiffies; -- bfq_log_bfqq(bfqd, bfqq, -- "wrais ending at %lu, rais_max_time %u", -- bfqq->last_wr_start_finish, -- jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ if (bfq_bfqq_in_large_burst(bfqq)) - bfq_bfqq_end_wr(bfqq); -+ else if (time_is_before_jiffies(bfqq->last_wr_start_finish + -+ bfqq->wr_cur_max_time)) { -+ if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time || -+ time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt + -+ bfq_wr_duration(bfqd))) -+ bfq_bfqq_end_wr(bfqq); -+ else { -+ /* switch back to interactive wr */ -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ bfqq->last_wr_start_finish = -+ bfqq->wr_start_at_switch_to_srt; -+ BUG_ON(time_is_after_jiffies( -+ bfqq->last_wr_start_finish)); -+ bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqd, bfqq, -+ "back to interactive wr"); -+ } - } - } - /* Update weight both if it must be raised and if it must be lowered */ -@@ -2782,46 +3671,34 @@ static int bfq_dispatch_request(struct bfq_data *bfqd, - struct bfq_queue *bfqq) - { - int dispatched = 0; -- struct request *rq; -+ struct request *rq = bfqq->next_rq; - unsigned long service_to_charge; - - BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -- -- /* Follow expired path, else get first next available. */ -- rq = bfq_check_fifo(bfqq); -- if (!rq) -- rq = bfqq->next_rq; -+ BUG_ON(!rq); - service_to_charge = bfq_serv_to_charge(rq, bfqq); - -- if (service_to_charge > bfq_bfqq_budget_left(bfqq)) { -- /* -- * This may happen if the next rq is chosen in fifo order -- * instead of sector order. The budget is properly -- * dimensioned to be always sufficient to serve the next -- * request only if it is chosen in sector order. The reason -- * is that it would be quite inefficient and little useful -- * to always make sure that the budget is large enough to -- * serve even the possible next rq in fifo order. -- * In fact, requests are seldom served in fifo order. 
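
The weight-raising expiry logic in bfq_update_wr_data above (end raising for queues in a large burst; otherwise, when a soft real-time period runs out, fall back to interactive raising if the interrupted interactive window has not elapsed yet) can be modelled as a small state machine. The sketch below is standalone and illustrative only; names and time units are the sketch's own, not the patch's.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum wr_state { WR_NONE, WR_INTERACTIVE, WR_SOFT_RT };

struct wr_queue {
	enum wr_state state;
	uint64_t period_start;		/* start of the current raising period */
	uint64_t period_len;		/* length of the current raising period */
	uint64_t interactive_start;	/* start of the interactive period that
					 * was interrupted by a switch to soft rt */
	bool in_large_burst;
};

static void wr_update(struct wr_queue *q, uint64_t now, uint64_t interactive_len)
{
	if (q->state == WR_NONE)
		return;

	if (q->in_large_burst) {		/* bursts get no latency help */
		q->state = WR_NONE;
		return;
	}

	if (now <= q->period_start + q->period_len)
		return;				/* current period still running */

	if (q->state == WR_SOFT_RT &&
	    now <= q->interactive_start + interactive_len) {
		/*
		 * Soft real-time period over, but the interrupted
		 * interactive period is not: fall back to interactive
		 * weight raising instead of ending raising altogether.
		 */
		q->state = WR_INTERACTIVE;
		q->period_start = q->interactive_start;
		q->period_len = interactive_len;
		return;
	}

	q->state = WR_NONE;			/* raising really ends */
}

int main(void)
{
	struct wr_queue q = {
		.state = WR_SOFT_RT,
		.period_start = 1000, .period_len = 300, /* soft rt ends at 1300 */
		.interactive_start = 800,
		.in_large_burst = false,
	};

	wr_update(&q, 1400, 2000);	/* soft rt expired, interactive not */
	printf("%d\n", q.state);	/* prints 1 (WR_INTERACTIVE) */
	return 0;
}
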
-- * -- * Expire the queue for budget exhaustion, and make sure -- * that the next act_budget is enough to serve the next -- * request, even if it comes from the fifo expired path. -- */ -- bfqq->next_rq = rq; -- /* -- * Since this dispatch is failed, make sure that -- * a new one will be performed -- */ -- if (!bfqd->rq_in_driver) -- bfq_schedule_dispatch(bfqd); -- goto expire; -- } -+ BUG_ON(service_to_charge > bfq_bfqq_budget_left(bfqq)); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); - -- /* Finally, insert request into driver dispatch list. */ - bfq_bfqq_served(bfqq, service_to_charge); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ - bfq_dispatch_insert(bfqd->queue, rq); - -+ /* -+ * If weight raising has to terminate for bfqq, then next -+ * function causes an immediate update of bfqq's weight, -+ * without waiting for next activation. As a consequence, on -+ * expiration, bfqq will be timestamped as if has never been -+ * weight-raised during this service slot, even if it has -+ * received part or even most of the service as a -+ * weight-raised queue. This inflates bfqq's timestamps, which -+ * is beneficial, as bfqq is then more willing to leave the -+ * device immediately to possible other weight-raised queues. -+ */ - bfq_update_wr_data(bfqd, bfqq); - - bfq_log_bfqq(bfqd, bfqq, -@@ -2837,9 +3714,7 @@ static int bfq_dispatch_request(struct bfq_data *bfqd, - bfqd->in_service_bic = RQ_BIC(rq); - } - -- if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) && -- dispatched >= bfqd->bfq_max_budget_async_rq) || -- bfq_class_idle(bfqq))) -+ if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq)) - goto expire; - - return dispatched; -@@ -2885,8 +3760,8 @@ static int bfq_forced_dispatch(struct bfq_data *bfqd) - st = bfq_entity_service_tree(&bfqq->entity); - - dispatched += __bfq_forced_dispatch_bfqq(bfqq); -- bfqq->max_budget = bfq_max_budget(bfqd); - -+ bfqq->max_budget = bfq_max_budget(bfqd); - bfq_forget_idle(st); - } - -@@ -2899,37 +3774,37 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) - { - struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_queue *bfqq; -- int max_dispatch; - - bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); -+ - if (bfqd->busy_queues == 0) - return 0; - - if (unlikely(force)) - return bfq_forced_dispatch(bfqd); - -+ /* -+ * Force device to serve one request at a time if -+ * strict_guarantees is true. Forcing this service scheme is -+ * currently the ONLY way to guarantee that the request -+ * service order enforced by the scheduler is respected by a -+ * queueing device. Otherwise the device is free even to make -+ * some unlucky request wait for as long as the device -+ * wishes. -+ * -+ * Of course, serving one request at at time may cause loss of -+ * throughput. 
-+ */ -+ if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0) -+ return 0; -+ - bfqq = bfq_select_queue(bfqd); - if (!bfqq) - return 0; - -- if (bfq_class_idle(bfqq)) -- max_dispatch = 1; -- -- if (!bfq_bfqq_sync(bfqq)) -- max_dispatch = bfqd->bfq_max_budget_async_rq; -- -- if (!bfq_bfqq_sync(bfqq) && bfqq->dispatched >= max_dispatch) { -- if (bfqd->busy_queues > 1) -- return 0; -- if (bfqq->dispatched >= 4 * max_dispatch) -- return 0; -- } -- -- if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq)) -- return 0; -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); - -- bfq_clear_bfqq_wait_request(bfqq); -- BUG_ON(timer_pending(&bfqd->idle_slice_timer)); -+ BUG_ON(bfq_bfqq_wait_request(bfqq)); - - if (!bfq_dispatch_request(bfqd, bfqq)) - return 0; -@@ -2937,6 +3812,8 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) - bfq_log_bfqq(bfqd, bfqq, "dispatched %s request", - bfq_bfqq_sync(bfqq) ? "sync" : "async"); - -+ BUG_ON(bfqq->next_rq == NULL && -+ bfqq->entity.budget < bfqq->entity.service); - return 1; - } - -@@ -2944,27 +3821,26 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) - * Task holds one reference to the queue, dropped when task exits. Each rq - * in-flight on this queue also holds a reference, dropped when rq is freed. - * -- * Queue lock must be held here. -+ * Queue lock must be held here. Recall not to use bfqq after calling -+ * this function on it. - */ - static void bfq_put_queue(struct bfq_queue *bfqq) - { -- struct bfq_data *bfqd = bfqq->bfqd; - #ifdef CONFIG_BFQ_GROUP_IOSCHED - struct bfq_group *bfqg = bfqq_group(bfqq); - #endif - -- BUG_ON(atomic_read(&bfqq->ref) <= 0); -+ BUG_ON(bfqq->ref <= 0); - -- bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq, -- atomic_read(&bfqq->ref)); -- if (!atomic_dec_and_test(&bfqq->ref)) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref); -+ bfqq->ref--; -+ if (bfqq->ref) - return; - - BUG_ON(rb_first(&bfqq->sort_list)); - BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); - BUG_ON(bfqq->entity.tree); - BUG_ON(bfq_bfqq_busy(bfqq)); -- BUG_ON(bfqd->in_service_queue == bfqq); - - if (bfq_bfqq_sync(bfqq)) - /* -@@ -2977,7 +3853,7 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - */ - hlist_del_init(&bfqq->burst_list_node); - -- bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); - - kmem_cache_free(bfq_pool, bfqq); - #ifdef CONFIG_BFQ_GROUP_IOSCHED -@@ -3011,38 +3887,16 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) - bfq_schedule_dispatch(bfqd); - } - -- bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, -- atomic_read(&bfqq->ref)); -+ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref); - - bfq_put_cooperator(bfqq); - -- bfq_put_queue(bfqq); -+ bfq_put_queue(bfqq); /* release process reference */ - } - - static void bfq_init_icq(struct io_cq *icq) - { -- struct bfq_io_cq *bic = icq_to_bic(icq); -- -- bic->ttime.last_end_request = jiffies; -- /* -- * A newly created bic indicates that the process has just -- * started doing I/O, and is probably mapping into memory its -- * executable and libraries: it definitely needs weight raising. -- * There is however the possibility that the process performs, -- * for a while, I/O close to some other process. EQM intercepts -- * this behavior and may merge the queue corresponding to the -- * process with some other queue, BEFORE the weight of the queue -- * is raised. 
Merged queues are not weight-raised (they are assumed -- * to belong to processes that benefit only from high throughput). -- * If the merge is basically the consequence of an accident, then -- * the queue will be split soon and will get back its old weight. -- * It is then important to write down somewhere that this queue -- * does need weight raising, even if it did not make it to get its -- * weight raised before being merged. To this purpose, we overload -- * the field raising_time_left and assign 1 to it, to mark the queue -- * as needing weight raising. -- */ -- bic->wr_time_left = 1; -+ icq_to_bic(icq)->ttime.last_end_request = ktime_get_ns() - (1ULL<<32); - } - - static void bfq_exit_icq(struct io_cq *icq) -@@ -3050,21 +3904,21 @@ static void bfq_exit_icq(struct io_cq *icq) - struct bfq_io_cq *bic = icq_to_bic(icq); - struct bfq_data *bfqd = bic_to_bfqd(bic); - -- if (bic->bfqq[BLK_RW_ASYNC]) { -- bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]); -- bic->bfqq[BLK_RW_ASYNC] = NULL; -+ if (bic_to_bfqq(bic, false)) { -+ bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, false)); -+ bic_set_bfqq(bic, NULL, false); - } - -- if (bic->bfqq[BLK_RW_SYNC]) { -+ if (bic_to_bfqq(bic, true)) { - /* - * If the bic is using a shared queue, put the reference - * taken on the io_context when the bic started using a - * shared bfq_queue. - */ -- if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC])) -+ if (bfq_bfqq_coop(bic_to_bfqq(bic, true))) - put_io_context(icq->ioc); -- bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]); -- bic->bfqq[BLK_RW_SYNC] = NULL; -+ bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, true)); -+ bic_set_bfqq(bic, NULL, true); - } - } - -@@ -3072,8 +3926,8 @@ static void bfq_exit_icq(struct io_cq *icq) - * Update the entity prio values; note that the new values will not - * be used until the next (re)activation. 
- */ --static void --bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) -+static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, -+ struct bfq_io_cq *bic) - { - struct task_struct *tsk = current; - int ioprio_class; -@@ -3081,7 +3935,7 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) - ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); - switch (ioprio_class) { - default: -- dev_err(bfqq->bfqd->queue->backing_dev_info.dev, -+ dev_err(bfqq->bfqd->queue->backing_dev_info->dev, - "bfq: bad prio class %d\n", ioprio_class); - case IOPRIO_CLASS_NONE: - /* -@@ -3105,7 +3959,7 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) - break; - } - -- if (bfqq->new_ioprio < 0 || bfqq->new_ioprio >= IOPRIO_BE_NR) { -+ if (bfqq->new_ioprio >= IOPRIO_BE_NR) { - pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n", - bfqq->new_ioprio); - BUG(); -@@ -3113,45 +3967,41 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) - - bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); - bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "set_next_ioprio_data: bic_class %d prio %d class %d", -+ ioprio_class, bfqq->new_ioprio, bfqq->new_ioprio_class); - } - - static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) - { -- struct bfq_data *bfqd; -- struct bfq_queue *bfqq, *new_bfqq; -+ struct bfq_data *bfqd = bic_to_bfqd(bic); -+ struct bfq_queue *bfqq; - unsigned long uninitialized_var(flags); - int ioprio = bic->icq.ioc->ioprio; - -- bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), -- &flags); - /* - * This condition may trigger on a newly created bic, be sure to - * drop the lock before returning. - */ - if (unlikely(!bfqd) || likely(bic->ioprio == ioprio)) -- goto out; -+ return; - - bic->ioprio = ioprio; - -- bfqq = bic->bfqq[BLK_RW_ASYNC]; -+ bfqq = bic_to_bfqq(bic, false); - if (bfqq) { -- new_bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic, -- GFP_ATOMIC); -- if (new_bfqq) { -- bic->bfqq[BLK_RW_ASYNC] = new_bfqq; -- bfq_log_bfqq(bfqd, bfqq, -- "check_ioprio_change: bfqq %p %d", -- bfqq, atomic_read(&bfqq->ref)); -- bfq_put_queue(bfqq); -- } -+ /* release process reference on this queue */ -+ bfq_put_queue(bfqq); -+ bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); -+ bic_set_bfqq(bic, bfqq, false); -+ bfq_log_bfqq(bfqd, bfqq, -+ "check_ioprio_change: bfqq %p %d", -+ bfqq, bfqq->ref); - } - -- bfqq = bic->bfqq[BLK_RW_SYNC]; -+ bfqq = bic_to_bfqq(bic, true); - if (bfqq) - bfq_set_next_ioprio_data(bfqq, bic); -- --out: -- bfq_put_bfqd_unlock(bfqd, &flags); - } - - static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, -@@ -3160,8 +4010,9 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, - RB_CLEAR_NODE(&bfqq->entity.rb_node); - INIT_LIST_HEAD(&bfqq->fifo); - INIT_HLIST_NODE(&bfqq->burst_list_node); -+ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); - -- atomic_set(&bfqq->ref, 0); -+ bfqq->ref = 0; - bfqq->bfqd = bfqd; - - if (bic) -@@ -3171,6 +4022,7 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, - if (!bfq_class_idle(bfqq)) - bfq_mark_bfqq_idle_window(bfqq); - bfq_mark_bfqq_sync(bfqq); -+ bfq_mark_bfqq_just_created(bfqq); - } else - bfq_clear_bfqq_sync(bfqq); - bfq_mark_bfqq_IO_bound(bfqq); -@@ -3180,72 +4032,19 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bfqq->pid = pid; - - bfqq->wr_coeff = 1; -- bfqq->last_wr_start_finish = 0; -+ 
bfqq->last_wr_start_finish = jiffies; -+ bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now(); -+ bfqq->budget_timeout = bfq_smallest_from_now(); -+ bfqq->split_time = bfq_smallest_from_now(); -+ - /* - * Set to the value for which bfqq will not be deemed as - * soft rt when it becomes backlogged. - */ -- bfqq->soft_rt_next_start = bfq_infinity_from_now(jiffies); --} -- --static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd, -- struct bio *bio, int is_sync, -- struct bfq_io_cq *bic, -- gfp_t gfp_mask) --{ -- struct bfq_group *bfqg; -- struct bfq_queue *bfqq, *new_bfqq = NULL; -- struct blkcg *blkcg; -- --retry: -- rcu_read_lock(); -- -- blkcg = bio_blkcg(bio); -- bfqg = bfq_find_alloc_group(bfqd, blkcg); -- /* bic always exists here */ -- bfqq = bic_to_bfqq(bic, is_sync); -- -- /* -- * Always try a new alloc if we fall back to the OOM bfqq -- * originally, since it should just be a temporary situation. -- */ -- if (!bfqq || bfqq == &bfqd->oom_bfqq) { -- bfqq = NULL; -- if (new_bfqq) { -- bfqq = new_bfqq; -- new_bfqq = NULL; -- } else if (gfpflags_allow_blocking(gfp_mask)) { -- rcu_read_unlock(); -- spin_unlock_irq(bfqd->queue->queue_lock); -- new_bfqq = kmem_cache_alloc_node(bfq_pool, -- gfp_mask | __GFP_ZERO, -- bfqd->queue->node); -- spin_lock_irq(bfqd->queue->queue_lock); -- if (new_bfqq) -- goto retry; -- } else { -- bfqq = kmem_cache_alloc_node(bfq_pool, -- gfp_mask | __GFP_ZERO, -- bfqd->queue->node); -- } -+ bfqq->soft_rt_next_start = bfq_greatest_from_now(); - -- if (bfqq) { -- bfq_init_bfqq(bfqd, bfqq, bic, current->pid, -- is_sync); -- bfq_init_entity(&bfqq->entity, bfqg); -- bfq_log_bfqq(bfqd, bfqq, "allocated"); -- } else { -- bfqq = &bfqd->oom_bfqq; -- bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); -- } -- } -- -- if (new_bfqq) -- kmem_cache_free(bfq_pool, new_bfqq); -- -- rcu_read_unlock(); -- -- return bfqq; -+ /* first request is almost certainly seeky */ -+ bfqq->seek_history = 1; - } - - static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, -@@ -3268,90 +4067,93 @@ static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, - } - - static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, -- struct bio *bio, int is_sync, -- struct bfq_io_cq *bic, gfp_t gfp_mask) -+ struct bio *bio, bool is_sync, -+ struct bfq_io_cq *bic) - { - const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); - const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); - struct bfq_queue **async_bfqq = NULL; -- struct bfq_queue *bfqq = NULL; -+ struct bfq_queue *bfqq; -+ struct bfq_group *bfqg; - -- if (!is_sync) { -- struct blkcg *blkcg; -- struct bfq_group *bfqg; -+ rcu_read_lock(); -+ -+ bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio)); -+ if (!bfqg) { -+ bfqq = &bfqd->oom_bfqq; -+ goto out; -+ } - -- rcu_read_lock(); -- blkcg = bio_blkcg(bio); -- rcu_read_unlock(); -- bfqg = bfq_find_alloc_group(bfqd, blkcg); -+ if (!is_sync) { - async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, - ioprio); - bfqq = *async_bfqq; -+ if (bfqq) -+ goto out; - } - -- if (!bfqq) -- bfqq = bfq_find_alloc_queue(bfqd, bio, is_sync, bic, gfp_mask); -+ bfqq = kmem_cache_alloc_node(bfq_pool, -+ GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN, -+ bfqd->queue->node); -+ -+ if (bfqq) { -+ bfq_init_bfqq(bfqd, bfqq, bic, current->pid, -+ is_sync); -+ bfq_init_entity(&bfqq->entity, bfqg); -+ bfq_log_bfqq(bfqd, bfqq, "allocated"); -+ } else { -+ bfqq = &bfqd->oom_bfqq; -+ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); -+ goto out; -+ } - - /* - * Pin the queue now that it's allocated, scheduler 
exit will - * prune it. - */ -- if (!is_sync && !(*async_bfqq)) { -- atomic_inc(&bfqq->ref); -+ if (async_bfqq) { -+ bfqq->ref++; /* -+ * Extra group reference, w.r.t. sync -+ * queue. This extra reference is removed -+ * only if bfqq->bfqg disappears, to -+ * guarantee that this queue is not freed -+ * until its group goes away. -+ */ - bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", -- bfqq, atomic_read(&bfqq->ref)); -+ bfqq, bfqq->ref); - *async_bfqq = bfqq; - } - -- atomic_inc(&bfqq->ref); -- bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, -- atomic_read(&bfqq->ref)); -+out: -+ bfqq->ref++; /* get a process reference to this queue */ -+ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref); -+ rcu_read_unlock(); - return bfqq; - } - - static void bfq_update_io_thinktime(struct bfq_data *bfqd, - struct bfq_io_cq *bic) - { -- unsigned long elapsed = jiffies - bic->ttime.last_end_request; -- unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle); -+ struct bfq_ttime *ttime = &bic->ttime; -+ u64 elapsed = ktime_get_ns() - bic->ttime.last_end_request; - -- bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; -- bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8; -- bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) / -- bic->ttime.ttime_samples; -+ elapsed = min_t(u64, elapsed, 2 * bfqd->bfq_slice_idle); -+ -+ ttime->ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; -+ ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); -+ ttime->ttime_mean = div64_ul(ttime->ttime_total + 128, -+ ttime->ttime_samples); - } - --static void bfq_update_io_seektime(struct bfq_data *bfqd, -- struct bfq_queue *bfqq, -- struct request *rq) -+static void -+bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct request *rq) - { -- sector_t sdist; -- u64 total; -- -- if (bfqq->last_request_pos < blk_rq_pos(rq)) -- sdist = blk_rq_pos(rq) - bfqq->last_request_pos; -- else -- sdist = bfqq->last_request_pos - blk_rq_pos(rq); -- -- /* -- * Don't allow the seek distance to get too large from the -- * odd fragment, pagein, etc. -- */ -- if (bfqq->seek_samples == 0) /* first request, not really a seek */ -- sdist = 0; -- else if (bfqq->seek_samples <= 60) /* second & third seek */ -- sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024); -- else -- sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64); -- -- bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8; -- bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8; -- total = bfqq->seek_total + (bfqq->seek_samples/2); -- do_div(total, bfqq->seek_samples); -- bfqq->seek_mean = (sector_t)total; -- -- bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist, -- (u64)bfqq->seek_mean); -+ bfqq->seek_history <<= 1; -+ bfqq->seek_history |= -+ get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR && -+ (!blk_queue_nonrot(bfqd->queue) || -+ blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT); - } - - /* -@@ -3369,7 +4171,8 @@ static void bfq_update_idle_window(struct bfq_data *bfqd, - return; - - /* Idle window just restored, statistics are meaningless. 
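The two hunks above replace the old seek_mean/seek_samples statistics with a 32-bit seek_history bitmap and switch think-time accounting from jiffies to nanoseconds. A minimal user-space sketch of the new seekiness test, assuming a 1024-sector distance threshold and a popcount cut-off as stand-ins for BFQQ_SEEK_THR and BFQQ_SEEKY() (illustration only, not code from the patch):

#include <stdbool.h>
#include <stdint.h>

#define SEEK_THR_SECTORS 1024U   /* assumed stand-in for BFQQ_SEEK_THR */
#define SEEKY_MIN_BITS   4       /* assumed stand-in for the BFQQ_SEEKY() cut-off */

struct queue_seek {
	uint32_t seek_history;     /* one bit per recent request: 1 = "far from the previous one" */
	uint64_t last_request_pos; /* sector of the previous request */
};

/* Shift the history and record whether this request was a long seek. */
static void update_seek_history(struct queue_seek *q, uint64_t req_pos)
{
	uint64_t sdist = req_pos > q->last_request_pos ?
			 req_pos - q->last_request_pos :
			 q->last_request_pos - req_pos;

	q->seek_history <<= 1;
	q->seek_history |= (sdist > SEEK_THR_SECTORS);
	q->last_request_pos = req_pos;
}

/* A queue is "seeky" when enough of its last 32 requests were long seeks. */
static bool queue_is_seeky(const struct queue_seek *q)
{
	return __builtin_popcount(q->seek_history) >= SEEKY_MIN_BITS;
}

Compared with the removed seek_mean/seek_total code, a single odd fragment (a page-in, say) sets one bit and ages out after 32 requests instead of skewing a running average.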
*/ -- if (bfq_bfqq_just_split(bfqq)) -+ if (time_is_after_eq_jiffies(bfqq->split_time + -+ bfqd->bfq_wr_min_idle_time)) - return; - - enable_idle = bfq_bfqq_idle_window(bfqq); -@@ -3409,22 +4212,13 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, - - bfq_update_io_thinktime(bfqd, bic); - bfq_update_io_seektime(bfqd, bfqq, rq); -- if (!BFQQ_SEEKY(bfqq) && bfq_bfqq_constantly_seeky(bfqq)) { -- bfq_clear_bfqq_constantly_seeky(bfqq); -- if (!blk_queue_nonrot(bfqd->queue)) { -- BUG_ON(!bfqd->const_seeky_busy_in_flight_queues); -- bfqd->const_seeky_busy_in_flight_queues--; -- } -- } - if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || - !BFQQ_SEEKY(bfqq)) - bfq_update_idle_window(bfqd, bfqq, bic); -- bfq_clear_bfqq_just_split(bfqq); - - bfq_log_bfqq(bfqd, bfqq, -- "rq_enqueued: idle_window=%d (seeky %d, mean %llu)", -- bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq), -- (unsigned long long) bfqq->seek_mean); -+ "rq_enqueued: idle_window=%d (seeky %d)", -+ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq)); - - bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); - -@@ -3438,14 +4232,15 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, - * is small and the queue is not to be expired, then - * just exit. - * -- * In this way, if the disk is being idled to wait for -- * a new request from the in-service queue, we avoid -- * unplugging the device and committing the disk to serve -- * just a small request. On the contrary, we wait for -- * the block layer to decide when to unplug the device: -- * hopefully, new requests will be merged to this one -- * quickly, then the device will be unplugged and -- * larger requests will be dispatched. -+ * In this way, if the device is being idled to wait -+ * for a new request from the in-service queue, we -+ * avoid unplugging the device and committing the -+ * device to serve just a small request. On the -+ * contrary, we wait for the block layer to decide -+ * when to unplug the device: hopefully, new requests -+ * will be merged to this one quickly, then the device -+ * will be unplugged and larger requests will be -+ * dispatched. - */ - if (small_req && !budget_timeout) - return; -@@ -3457,10 +4252,8 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, - * timer. - */ - bfq_clear_bfqq_wait_request(bfqq); -- del_timer(&bfqd->idle_slice_timer); --#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ hrtimer_try_to_cancel(&bfqd->idle_slice_timer); - bfqg_stats_update_idle_time(bfqq_group(bfqq)); --#endif - - /* - * The queue is not empty, because a new request just -@@ -3504,28 +4297,24 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq) - */ - new_bfqq->allocated[rq_data_dir(rq)]++; - bfqq->allocated[rq_data_dir(rq)]--; -- atomic_inc(&new_bfqq->ref); -- bfq_put_queue(bfqq); -+ new_bfqq->ref++; -+ bfq_clear_bfqq_just_created(bfqq); - if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) - bfq_merge_bfqqs(bfqd, RQ_BIC(rq), - bfqq, new_bfqq); -+ /* -+ * rq is about to be enqueued into new_bfqq, -+ * release rq reference on bfqq -+ */ -+ bfq_put_queue(bfqq); - rq->elv.priv[1] = new_bfqq; - bfqq = new_bfqq; -- } else -- bfq_bfqq_increase_failed_cooperations(bfqq); -+ } - } - - bfq_add_request(rq); - -- /* -- * Here a newly-created bfq_queue has already started a weight-raising -- * period: clear raising_time_left to prevent bfq_bfqq_save_state() -- * from assigning it a full weight-raising period. See the detailed -- * comments about this field in bfq_init_icq(). 
-- */ -- if (bfqq->bic) -- bfqq->bic->wr_time_left = 0; -+ rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; - list_add_tail(&rq->queuelist, &bfqq->fifo); - - bfq_rq_enqueued(bfqd, bfqq, rq); -@@ -3533,8 +4322,8 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq) - - static void bfq_update_hw_tag(struct bfq_data *bfqd) - { -- bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver, -- bfqd->rq_in_driver); -+ bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver, -+ bfqd->rq_in_driver); - - if (bfqd->hw_tag == 1) - return; -@@ -3560,48 +4349,85 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) - { - struct bfq_queue *bfqq = RQ_BFQQ(rq); - struct bfq_data *bfqd = bfqq->bfqd; -- bool sync = bfq_bfqq_sync(bfqq); -+ u64 now_ns; -+ u32 delta_us; - -- bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left (%d)", -- blk_rq_sectors(rq), sync); -+ bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left", -+ blk_rq_sectors(rq)); - -+ assert_spin_locked(bfqd->queue->queue_lock); - bfq_update_hw_tag(bfqd); - - BUG_ON(!bfqd->rq_in_driver); - BUG_ON(!bfqq->dispatched); - bfqd->rq_in_driver--; - bfqq->dispatched--; --#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_completion(bfqq_group(bfqq), - rq_start_time_ns(rq), -- rq_io_start_time_ns(rq), rq->cmd_flags); --#endif -+ rq_io_start_time_ns(rq), -+ rq->cmd_flags); - - if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); -+ /* -+ * Set budget_timeout (which we overload to store the -+ * time at which the queue remains with no backlog and -+ * no outstanding request; used by the weight-raising -+ * mechanism). -+ */ -+ bfqq->budget_timeout = jiffies; -+ - bfq_weights_tree_remove(bfqd, &bfqq->entity, - &bfqd->queue_weights_tree); -- if (!blk_queue_nonrot(bfqd->queue)) { -- BUG_ON(!bfqd->busy_in_flight_queues); -- bfqd->busy_in_flight_queues--; -- if (bfq_bfqq_constantly_seeky(bfqq)) { -- BUG_ON(!bfqd-> -- const_seeky_busy_in_flight_queues); -- bfqd->const_seeky_busy_in_flight_queues--; -- } -- } - } - -- if (sync) { -- bfqd->sync_flight--; -- RQ_BIC(rq)->ttime.last_end_request = jiffies; -- } -+ now_ns = ktime_get_ns(); -+ -+ RQ_BIC(rq)->ttime.last_end_request = now_ns; -+ -+ /* -+ * Using us instead of ns, to get a reasonable precision in -+ * computing rate in next check. -+ */ -+ delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC); -+ -+ bfq_log(bfqd, "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu", -+ delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size, -+ (USEC_PER_SEC* -+ (u64)((bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us)) -+ >>BFQ_RATE_SHIFT, -+ (USEC_PER_SEC*(u64)(1UL<<(BFQ_RATE_SHIFT-10)))>>BFQ_RATE_SHIFT); -+ -+ /* -+ * If the request took rather long to complete, and, according -+ * to the maximum request size recorded, this completion latency -+ * implies that the request was certainly served at a very low -+ * rate (less than 1M sectors/sec), then the whole observation -+ * interval that lasts up to this time instant cannot be a -+ * valid time interval for computing a new peak rate.
Invoke -+ * bfq_update_rate_reset to have the following three steps -+ * taken: -+ * - close the observation interval at the last (previous) -+ * request dispatch or completion -+ * - compute rate, if possible, for that observation interval -+ * - reset to zero samples, which will trigger a proper -+ * re-initialization of the observation interval on next -+ * dispatch -+ */ -+ if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC && -+ (bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us < -+ 1UL<<(BFQ_RATE_SHIFT - 10)) -+ bfq_update_rate_reset(bfqd, NULL); -+ bfqd->last_completion = now_ns; - - /* -- * If we are waiting to discover whether the request pattern of the -- * task associated with the queue is actually isochronous, and -- * both requisites for this condition to hold are satisfied, then -- * compute soft_rt_next_start (see the comments to the function -- * bfq_bfqq_softrt_next_start()). -+ * If we are waiting to discover whether the request pattern -+ * of the task associated with the queue is actually -+ * isochronous, and both requisites for this condition to hold -+ * are now satisfied, then compute soft_rt_next_start (see the -+ * comments on the function bfq_bfqq_softrt_next_start()). We -+ * schedule this delayed check when bfqq expires, if it still -+ * has in-flight requests. - */ - if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 && - RB_EMPTY_ROOT(&bfqq->sort_list)) -@@ -3613,10 +4439,7 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) - * or if we want to idle in case it has no pending requests. - */ - if (bfqd->in_service_queue == bfqq) { -- if (bfq_bfqq_budget_new(bfqq)) -- bfq_set_budget_timeout(bfqd); -- -- if (bfq_bfqq_must_idle(bfqq)) { -+ if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) { - bfq_arm_slice_timer(bfqd); - goto out; - } else if (bfq_may_expire_for_budg_timeout(bfqq)) -@@ -3646,7 +4469,7 @@ static int __bfq_may_queue(struct bfq_queue *bfqq) - return ELV_MQUEUE_MAY; - } - --static int bfq_may_queue(struct request_queue *q, int rw) -+static int bfq_may_queue(struct request_queue *q, unsigned int op) - { - struct bfq_data *bfqd = q->elevator->elevator_data; - struct task_struct *tsk = current; -@@ -3663,7 +4486,7 @@ static int bfq_may_queue(struct request_queue *q, int rw) - if (!bic) - return ELV_MQUEUE_MAY; - -- bfqq = bic_to_bfqq(bic, rw_is_sync(rw)); -+ bfqq = bic_to_bfqq(bic, op_is_sync(op)); - if (bfqq) - return __bfq_may_queue(bfqq); - -@@ -3687,14 +4510,14 @@ static void bfq_put_request(struct request *rq) - rq->elv.priv[1] = NULL; - - bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", -- bfqq, atomic_read(&bfqq->ref)); -+ bfqq, bfqq->ref); - bfq_put_queue(bfqq); - } - } - - /* - * Returns NULL if a new bfqq should be allocated, or the old bfqq if this -- * was the last process referring to said bfqq. -+ * was the last process referring to that bfqq.
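The completion path in the hunk above discards the current peak-rate observation interval when a request both arrives long after the previous completion and implies a service rate below roughly one million sectors per second. A self-contained sketch of that test, with RATE_SHIFT and MIN_TT_US as assumed stand-ins for BFQ_RATE_SHIFT and BFQ_MIN_TT (not the patch's code verbatim):

#include <stdbool.h>
#include <stdint.h>

#define RATE_SHIFT 16        /* assumed fixed-point shift for rates      */
#define MIN_TT_US  2000U     /* assumed minimum think time, microseconds */

/*
 * Rates are kept in sectors/usec, left-shifted by RATE_SHIFT.
 * 1 << (RATE_SHIFT - 10) therefore corresponds to 1/1024 sector/usec,
 * i.e. roughly one million sectors per second.
 */
static bool completion_invalidates_interval(uint32_t max_rq_sectors,
					    uint32_t delta_us)
{
	uint64_t rate = delta_us ?
		((uint64_t)max_rq_sectors << RATE_SHIFT) / delta_us : UINT64_MAX;

	return delta_us > MIN_TT_US && rate < (1ULL << (RATE_SHIFT - 10));
}

For example, a 512-sector request completing 4000 us after the previous one yields about 0.128 sectors/us, far below the cut-off, so the sample window is reset instead of absorbing an idle gap.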
- */ - static struct bfq_queue * - bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) -@@ -3732,37 +4555,60 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, - unsigned long flags; - bool split = false; - -- might_sleep_if(gfpflags_allow_blocking(gfp_mask)); -- -- bfq_check_ioprio_change(bic, bio); -- - spin_lock_irqsave(q->queue_lock, flags); -+ bfq_check_ioprio_change(bic, bio); - - if (!bic) - goto queue_fail; - -+ bfq_check_ioprio_change(bic, bio); -+ - bfq_bic_update_cgroup(bic, bio); - - new_queue: - bfqq = bic_to_bfqq(bic, is_sync); - if (!bfqq || bfqq == &bfqd->oom_bfqq) { -- bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, gfp_mask); -+ if (bfqq) -+ bfq_put_queue(bfqq); -+ bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); -+ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); -+ - bic_set_bfqq(bic, bfqq, is_sync); - if (split && is_sync) { -+ bfq_log_bfqq(bfqd, bfqq, -+ "set_request: was_in_list %d " -+ "was_in_large_burst %d " -+ "large burst in progress %d", -+ bic->was_in_burst_list, -+ bic->saved_in_large_burst, -+ bfqd->large_burst); -+ - if ((bic->was_in_burst_list && bfqd->large_burst) || -- bic->saved_in_large_burst) -+ bic->saved_in_large_burst) { -+ bfq_log_bfqq(bfqd, bfqq, -+ "set_request: marking in " -+ "large burst"); - bfq_mark_bfqq_in_large_burst(bfqq); -- else { -+ } else { -+ bfq_log_bfqq(bfqd, bfqq, -+ "set_request: clearing in " -+ "large burst"); - bfq_clear_bfqq_in_large_burst(bfqq); - if (bic->was_in_burst_list) - hlist_add_head(&bfqq->burst_list_node, - &bfqd->burst_list); - } -+ bfqq->split_time = jiffies; - } - } else { - /* If the queue was seeky for too long, break it apart. */ - if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { - bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); -+ -+ /* Update bic before losing reference to bfqq */ -+ if (bfq_bfqq_in_large_burst(bfqq)) -+ bic->saved_in_large_burst = true; -+ - bfqq = bfq_split_bfqq(bic, bfqq); - split = true; - if (!bfqq) -@@ -3771,9 +4617,8 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, - } - - bfqq->allocated[rw]++; -- atomic_inc(&bfqq->ref); -- bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, -- atomic_read(&bfqq->ref)); -+ bfqq->ref++; -+ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, bfqq->ref); - - rq->elv.priv[0] = bic; - rq->elv.priv[1] = bfqq; -@@ -3788,7 +4633,6 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, - if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { - bfqq->bic = bic; - if (split) { -- bfq_mark_bfqq_just_split(bfqq); - /* - * If the queue has just been split from a shared - * queue, restore the idle window and the possible -@@ -3798,6 +4642,9 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, - } - } - -+ if (unlikely(bfq_bfqq_just_created(bfqq))) -+ bfq_handle_burst(bfqd, bfqq); -+ - spin_unlock_irqrestore(q->queue_lock, flags); - - return 0; -@@ -3824,9 +4671,10 @@ static void bfq_kick_queue(struct work_struct *work) - * Handler of the expiration of the timer running if the in-service queue - * is idling inside its time slice. 
- */ --static void bfq_idle_slice_timer(unsigned long data) -+static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) - { -- struct bfq_data *bfqd = (struct bfq_data *)data; -+ struct bfq_data *bfqd = container_of(timer, struct bfq_data, -+ idle_slice_timer); - struct bfq_queue *bfqq; - unsigned long flags; - enum bfqq_expiration reason; -@@ -3844,6 +4692,8 @@ static void bfq_idle_slice_timer(unsigned long data) - */ - if (bfqq) { - bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); -+ bfq_clear_bfqq_wait_request(bfqq); -+ - if (bfq_bfqq_budget_timeout(bfqq)) - /* - * Also here the queue can be safely expired -@@ -3869,25 +4719,26 @@ static void bfq_idle_slice_timer(unsigned long data) - bfq_schedule_dispatch(bfqd); - - spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); -+ return HRTIMER_NORESTART; - } - - static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) - { -- del_timer_sync(&bfqd->idle_slice_timer); -+ hrtimer_cancel(&bfqd->idle_slice_timer); - cancel_work_sync(&bfqd->unplug_work); - } - - static void __bfq_put_async_bfqq(struct bfq_data *bfqd, -- struct bfq_queue **bfqq_ptr) -+ struct bfq_queue **bfqq_ptr) - { - struct bfq_group *root_group = bfqd->root_group; - struct bfq_queue *bfqq = *bfqq_ptr; - - bfq_log(bfqd, "put_async_bfqq: %p", bfqq); - if (bfqq) { -- bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group); -+ bfq_bfqq_move(bfqd, bfqq, root_group); - bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", -- bfqq, atomic_read(&bfqq->ref)); -+ bfqq, bfqq->ref); - bfq_put_queue(bfqq); - *bfqq_ptr = NULL; - } -@@ -3922,19 +4773,18 @@ static void bfq_exit_queue(struct elevator_queue *e) - - BUG_ON(bfqd->in_service_queue); - list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) -- bfq_deactivate_bfqq(bfqd, bfqq, 0); -+ bfq_deactivate_bfqq(bfqd, bfqq, false, false); - - spin_unlock_irq(q->queue_lock); - - bfq_shutdown_timer_wq(bfqd); - -- synchronize_rcu(); -- -- BUG_ON(timer_pending(&bfqd->idle_slice_timer)); -+ BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); - - #ifdef CONFIG_BFQ_GROUP_IOSCHED - blkcg_deactivate_policy(q, &blkcg_policy_bfq); - #else -+ bfq_put_async_queues(bfqd, bfqd->root_group); - kfree(bfqd->root_group); - #endif - -@@ -3954,6 +4804,7 @@ static void bfq_init_root_group(struct bfq_group *root_group, - root_group->rq_pos_tree = RB_ROOT; - for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) - root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; -+ root_group->sched_data.bfq_class_idle_last_service = jiffies; - } - - static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) -@@ -3978,11 +4829,14 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - * will not attempt to free it. - */ - bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); -- atomic_inc(&bfqd->oom_bfqq.ref); -+ bfqd->oom_bfqq.ref++; - bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; - bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE; - bfqd->oom_bfqq.entity.new_weight = - bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio); -+ -+ /* oom_bfqq does not participate to bursts */ -+ bfq_clear_bfqq_just_created(&bfqd->oom_bfqq); - /* - * Trigger weight initialization, according to ioprio, at the - * oom_bfqq's first activation. 
The oom_bfqq's ioprio and ioprio -@@ -4001,13 +4855,10 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - goto out_free; - bfq_init_root_group(bfqd->root_group, bfqd); - bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); --#ifdef CONFIG_BFQ_GROUP_IOSCHED -- bfqd->active_numerous_groups = 0; --#endif - -- init_timer(&bfqd->idle_slice_timer); -+ hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, -+ HRTIMER_MODE_REL); - bfqd->idle_slice_timer.function = bfq_idle_slice_timer; -- bfqd->idle_slice_timer.data = (unsigned long)bfqd; - - bfqd->queue_weights_tree = RB_ROOT; - bfqd->group_weights_tree = RB_ROOT; -@@ -4027,21 +4878,19 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - bfqd->bfq_back_max = bfq_back_max; - bfqd->bfq_back_penalty = bfq_back_penalty; - bfqd->bfq_slice_idle = bfq_slice_idle; -- bfqd->bfq_class_idle_last_service = 0; -- bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq; -- bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async; -- bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync; -+ bfqd->bfq_timeout = bfq_timeout; - -- bfqd->bfq_coop_thresh = 2; -- bfqd->bfq_failed_cooperations = 7000; - bfqd->bfq_requests_within_timer = 120; - -- bfqd->bfq_large_burst_thresh = 11; -- bfqd->bfq_burst_interval = msecs_to_jiffies(500); -+ bfqd->bfq_large_burst_thresh = 8; -+ bfqd->bfq_burst_interval = msecs_to_jiffies(180); - - bfqd->low_latency = true; - -- bfqd->bfq_wr_coeff = 20; -+ /* -+ * Trade-off between responsiveness and fairness. -+ */ -+ bfqd->bfq_wr_coeff = 30; - bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); - bfqd->bfq_wr_max_time = 0; - bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); -@@ -4053,16 +4902,15 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - * video. - */ - bfqd->wr_busy_queues = 0; -- bfqd->busy_in_flight_queues = 0; -- bfqd->const_seeky_busy_in_flight_queues = 0; - - /* -- * Begin by assuming, optimistically, that the device peak rate is -- * equal to the highest reference rate. -+ * Begin by assuming, optimistically, that the device is a -+ * high-speed one, and that its peak rate is equal to 2/3 of -+ * the highest reference rate. 
- */ - bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] * - T_fast[blk_queue_nonrot(bfqd->queue)]; -- bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)]; -+ bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3; - bfqd->device_speed = BFQ_BFQD_FAST; - - return 0; -@@ -4088,7 +4936,7 @@ static int __init bfq_slab_setup(void) - - static ssize_t bfq_var_show(unsigned int var, char *page) - { -- return sprintf(page, "%d\n", var); -+ return sprintf(page, "%u\n", var); - } - - static ssize_t bfq_var_store(unsigned long *var, const char *page, -@@ -4159,21 +5007,21 @@ static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) - static ssize_t __FUNC(struct elevator_queue *e, char *page) \ - { \ - struct bfq_data *bfqd = e->elevator_data; \ -- unsigned int __data = __VAR; \ -- if (__CONV) \ -+ u64 __data = __VAR; \ -+ if (__CONV == 1) \ - __data = jiffies_to_msecs(__data); \ -+ else if (__CONV == 2) \ -+ __data = div_u64(__data, NSEC_PER_MSEC); \ - return bfq_var_show(__data, (page)); \ - } --SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1); --SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1); -+SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2); -+SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2); - SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); - SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); --SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1); -+SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2); - SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); --SHOW_FUNCTION(bfq_max_budget_async_rq_show, -- bfqd->bfq_max_budget_async_rq, 0); --SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1); --SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1); -+SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1); -+SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0); - SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); - SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0); - SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1); -@@ -4183,6 +5031,17 @@ SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async, - SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0); - #undef SHOW_FUNCTION - -+#define USEC_SHOW_FUNCTION(__FUNC, __VAR) \ -+static ssize_t __FUNC(struct elevator_queue *e, char *page) \ -+{ \ -+ struct bfq_data *bfqd = e->elevator_data; \ -+ u64 __data = __VAR; \ -+ __data = div_u64(__data, NSEC_PER_USEC); \ -+ return bfq_var_show(__data, (page)); \ -+} -+USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle); -+#undef USEC_SHOW_FUNCTION -+ - #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ - static ssize_t \ - __FUNC(struct elevator_queue *e, const char *page, size_t count) \ -@@ -4194,24 +5053,22 @@ __FUNC(struct elevator_queue *e, const char *page, size_t count) \ - __data = (MIN); \ - else if (__data > (MAX)) \ - __data = (MAX); \ -- if (__CONV) \ -+ if (__CONV == 1) \ - *(__PTR) = msecs_to_jiffies(__data); \ -+ else if (__CONV == 2) \ -+ *(__PTR) = (u64)__data * NSEC_PER_MSEC; \ - else \ - *(__PTR) = __data; \ - return ret; \ - } - STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, -- INT_MAX, 1); -+ INT_MAX, 2); - STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, -- INT_MAX, 1); -+ INT_MAX, 2); - 
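The sysfs macros above now distinguish three storage conventions: plain integers, jiffies shown as milliseconds (__CONV == 1), nanoseconds shown as milliseconds (__CONV == 2), plus a separate *_us attribute pair for slice_idle. A small stand-alone sketch of the same idea (the field and function names here are invented for illustration):

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_MSEC 1000000ULL
#define NSEC_PER_USEC 1000ULL

/* One internal value kept in nanoseconds, two user-visible views of it. */
static uint64_t slice_idle_ns = 8 * NSEC_PER_MSEC;   /* example default */

static uint64_t show_slice_idle_ms(void) { return slice_idle_ns / NSEC_PER_MSEC; }
static uint64_t show_slice_idle_us(void) { return slice_idle_ns / NSEC_PER_USEC; }

static void store_slice_idle_ms(uint64_t ms) { slice_idle_ns = ms * NSEC_PER_MSEC; }
static void store_slice_idle_us(uint64_t us) { slice_idle_ns = us * NSEC_PER_USEC; }

int main(void)
{
	store_slice_idle_us(125);   /* like: echo 125 > .../iosched/slice_idle_us */
	printf("%llu ms, %llu us\n",
	       (unsigned long long)show_slice_idle_ms(),
	       (unsigned long long)show_slice_idle_us());
	return 0;
}

A 125 us setting reads back as 0 through the millisecond view, which is presumably why the patch exposes slice_idle_us alongside slice_idle rather than rounding.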
STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); - STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, - INT_MAX, 0); --STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1); --STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq, -- 1, INT_MAX, 0); --STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0, -- INT_MAX, 1); -+STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2); - STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0); - STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1); - STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX, -@@ -4224,6 +5081,23 @@ STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0, - INT_MAX, 0); - #undef STORE_FUNCTION - -+#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ -+static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\ -+{ \ -+ struct bfq_data *bfqd = e->elevator_data; \ -+ unsigned long uninitialized_var(__data); \ -+ int ret = bfq_var_store(&__data, (page), count); \ -+ if (__data < (MIN)) \ -+ __data = (MIN); \ -+ else if (__data > (MAX)) \ -+ __data = (MAX); \ -+ *(__PTR) = (u64)__data * NSEC_PER_USEC; \ -+ return ret; \ -+} -+USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0, -+ UINT_MAX); -+#undef USEC_STORE_FUNCTION -+ - /* do nothing for the moment */ - static ssize_t bfq_weights_store(struct elevator_queue *e, - const char *page, size_t count) -@@ -4231,16 +5105,6 @@ static ssize_t bfq_weights_store(struct elevator_queue *e, - return count; - } - --static unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd) --{ -- u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); -- -- if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES) -- return bfq_calc_max_budget(bfqd->peak_rate, timeout); -- else -- return bfq_default_max_budget; --} -- - static ssize_t bfq_max_budget_store(struct elevator_queue *e, - const char *page, size_t count) - { -@@ -4249,7 +5113,7 @@ static ssize_t bfq_max_budget_store(struct elevator_queue *e, - int ret = bfq_var_store(&__data, (page), count); - - if (__data == 0) -- bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); -+ bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); - else { - if (__data > INT_MAX) - __data = INT_MAX; -@@ -4261,6 +5125,10 @@ static ssize_t bfq_max_budget_store(struct elevator_queue *e, - return ret; - } - -+/* -+ * Leaving this name to preserve name compatibility with cfq -+ * parameters, but this timeout is used for both sync and async. 
-+ */ - static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, - const char *page, size_t count) - { -@@ -4273,9 +5141,27 @@ static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, - else if (__data > INT_MAX) - __data = INT_MAX; - -- bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data); -+ bfqd->bfq_timeout = msecs_to_jiffies(__data); - if (bfqd->bfq_user_max_budget == 0) -- bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); -+ bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); -+ -+ return ret; -+} -+ -+static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ unsigned long uninitialized_var(__data); -+ int ret = bfq_var_store(&__data, (page), count); -+ -+ if (__data > 1) -+ __data = 1; -+ if (!bfqd->strict_guarantees && __data == 1 -+ && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC) -+ bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC; -+ -+ bfqd->strict_guarantees = __data; - - return ret; - } -@@ -4305,10 +5191,10 @@ static struct elv_fs_entry bfq_attrs[] = { - BFQ_ATTR(back_seek_max), - BFQ_ATTR(back_seek_penalty), - BFQ_ATTR(slice_idle), -+ BFQ_ATTR(slice_idle_us), - BFQ_ATTR(max_budget), -- BFQ_ATTR(max_budget_async_rq), - BFQ_ATTR(timeout_sync), -- BFQ_ATTR(timeout_async), -+ BFQ_ATTR(strict_guarantees), - BFQ_ATTR(low_latency), - BFQ_ATTR(wr_coeff), - BFQ_ATTR(wr_max_time), -@@ -4321,14 +5207,15 @@ static struct elv_fs_entry bfq_attrs[] = { - }; - - static struct elevator_type iosched_bfq = { -- .ops = { -+ .ops.sq = { - .elevator_merge_fn = bfq_merge, - .elevator_merged_fn = bfq_merged_request, - .elevator_merge_req_fn = bfq_merged_requests, - #ifdef CONFIG_BFQ_GROUP_IOSCHED - .elevator_bio_merged_fn = bfq_bio_merged, - #endif -- .elevator_allow_merge_fn = bfq_allow_merge, -+ .elevator_allow_bio_merge_fn = bfq_allow_bio_merge, -+ .elevator_allow_rq_merge_fn = bfq_allow_rq_merge, - .elevator_dispatch_fn = bfq_dispatch_requests, - .elevator_add_req_fn = bfq_insert_request, - .elevator_activate_req_fn = bfq_activate_request, -@@ -4351,18 +5238,28 @@ static struct elevator_type iosched_bfq = { - .elevator_owner = THIS_MODULE, - }; - -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+static struct blkcg_policy blkcg_policy_bfq = { -+ .dfl_cftypes = bfq_blkg_files, -+ .legacy_cftypes = bfq_blkcg_legacy_files, -+ -+ .cpd_alloc_fn = bfq_cpd_alloc, -+ .cpd_init_fn = bfq_cpd_init, -+ .cpd_bind_fn = bfq_cpd_init, -+ .cpd_free_fn = bfq_cpd_free, -+ -+ .pd_alloc_fn = bfq_pd_alloc, -+ .pd_init_fn = bfq_pd_init, -+ .pd_offline_fn = bfq_pd_offline, -+ .pd_free_fn = bfq_pd_free, -+ .pd_reset_stats_fn = bfq_pd_reset_stats, -+}; -+#endif -+ - static int __init bfq_init(void) - { - int ret; -- -- /* -- * Can be 0 on HZ < 1000 setups. -- */ -- if (bfq_slice_idle == 0) -- bfq_slice_idle = 1; -- -- if (bfq_timeout_async == 0) -- bfq_timeout_async = 1; -+ char msg[60] = "BFQ I/O-scheduler: v8r11"; - - #ifdef CONFIG_BFQ_GROUP_IOSCHED - ret = blkcg_policy_register(&blkcg_policy_bfq); -@@ -4375,27 +5272,46 @@ static int __init bfq_init(void) - goto err_pol_unreg; - - /* -- * Times to load large popular applications for the typical systems -- * installed on the reference devices (see the comments before the -- * definitions of the two arrays). -+ * Times to load large popular applications for the typical -+ * systems installed on the reference devices (see the -+ * comments before the definitions of the next two -+ * arrays). 
Actually, we use slightly slower values, as the -+ * estimated peak rate tends to be smaller than the actual -+ * peak rate. The reason for this last fact is that estimates -+ * are computed over much shorter time intervals than the long -+ * intervals typically used for benchmarking. Why? First, to -+ * adapt more quickly to variations. Second, because an I/O -+ * scheduler cannot rely on a peak-rate-evaluation workload to -+ * be run for a long time. - */ -- T_slow[0] = msecs_to_jiffies(2600); -- T_slow[1] = msecs_to_jiffies(1000); -- T_fast[0] = msecs_to_jiffies(5500); -- T_fast[1] = msecs_to_jiffies(2000); -+ T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */ -+ T_slow[1] = msecs_to_jiffies(6000); /* actually 6.5 sec */ -+ T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */ -+ T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */ - - /* -- * Thresholds that determine the switch between speed classes (see -- * the comments before the definition of the array). -+ * Thresholds that determine the switch between speed classes -+ * (see the comments before the definition of the array -+ * device_speed_thresh). These thresholds are biased towards -+ * transitions to the fast class. This is safer than the -+ * opposite bias. In fact, a wrong transition to the slow -+ * class results in short weight-raising periods, because the -+ * speed of the device then tends to be higher that the -+ * reference peak rate. On the opposite end, a wrong -+ * transition to the fast class tends to increase -+ * weight-raising periods, because of the opposite reason. - */ -- device_speed_thresh[0] = (R_fast[0] + R_slow[0]) / 2; -- device_speed_thresh[1] = (R_fast[1] + R_slow[1]) / 2; -+ device_speed_thresh[0] = (4 * R_slow[0]) / 3; -+ device_speed_thresh[1] = (4 * R_slow[1]) / 3; - - ret = elv_register(&iosched_bfq); - if (ret) - goto err_pol_unreg; - -- pr_info("BFQ I/O-scheduler: v7r11"); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ strcat(msg, " (with cgroups support)"); -+#endif -+ pr_info("%s", msg); - - return 0; - -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index a5ed694..1fde070 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -7,28 +7,173 @@ - * Copyright (C) 2008 Fabio Checconi - * Paolo Valente - * -- * Copyright (C) 2010 Paolo Valente -+ * Copyright (C) 2015 Paolo Valente -+ * -+ * Copyright (C) 2016 Paolo Valente - */ - --#ifdef CONFIG_BFQ_GROUP_IOSCHED --#define for_each_entity(entity) \ -- for (; entity ; entity = entity->parent) -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); - --#define for_each_entity_safe(entity, parent) \ -- for (; entity && ({ parent = entity->parent; 1; }); entity = parent) -+/** -+ * bfq_gt - compare two timestamps. -+ * @a: first ts. -+ * @b: second ts. -+ * -+ * Return @a > @b, dealing with wrapping correctly. -+ */ -+static int bfq_gt(u64 a, u64 b) -+{ -+ return (s64)(a - b) > 0; -+} -+ -+static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree) -+{ -+ struct rb_node *node = tree->rb_node; - -+ return rb_entry(node, struct bfq_entity, rb_node); -+} - --static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, -- int extract, -- struct bfq_data *bfqd); -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd); - --static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); -+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service); -+ -+/** -+ * bfq_update_next_in_service - update sd->next_in_service -+ * @sd: sched_data for which to perform the update. 
-+ * @new_entity: if not NULL, pointer to the entity whose activation, -+ * requeueing or repositionig triggered the invocation of -+ * this function. -+ * -+ * This function is called to update sd->next_in_service, which, in -+ * its turn, may change as a consequence of the insertion or -+ * extraction of an entity into/from one of the active trees of -+ * sd. These insertions/extractions occur as a consequence of -+ * activations/deactivations of entities, with some activations being -+ * 'true' activations, and other activations being requeueings (i.e., -+ * implementing the second, requeueing phase of the mechanism used to -+ * reposition an entity in its active tree; see comments on -+ * __bfq_activate_entity and __bfq_requeue_entity for details). In -+ * both the last two activation sub-cases, new_entity points to the -+ * just activated or requeued entity. -+ * -+ * Returns true if sd->next_in_service changes in such a way that -+ * entity->parent may become the next_in_service for its parent -+ * entity. -+ */ -+static bool bfq_update_next_in_service(struct bfq_sched_data *sd, -+ struct bfq_entity *new_entity) -+{ -+ struct bfq_entity *next_in_service = sd->next_in_service; -+ struct bfq_queue *bfqq; -+ bool parent_sched_may_change = false; -+ -+ /* -+ * If this update is triggered by the activation, requeueing -+ * or repositiong of an entity that does not coincide with -+ * sd->next_in_service, then a full lookup in the active tree -+ * can be avoided. In fact, it is enough to check whether the -+ * just-modified entity has a higher priority than -+ * sd->next_in_service, or, even if it has the same priority -+ * as sd->next_in_service, is eligible and has a lower virtual -+ * finish time than sd->next_in_service. If this compound -+ * condition holds, then the new entity becomes the new -+ * next_in_service. Otherwise no change is needed. -+ */ -+ if (new_entity && new_entity != sd->next_in_service) { -+ /* -+ * Flag used to decide whether to replace -+ * sd->next_in_service with new_entity. Tentatively -+ * set to true, and left as true if -+ * sd->next_in_service is NULL. -+ */ -+ bool replace_next = true; -+ -+ /* -+ * If there is already a next_in_service candidate -+ * entity, then compare class priorities or timestamps -+ * to decide whether to replace sd->service_tree with -+ * new_entity. -+ */ -+ if (next_in_service) { -+ unsigned int new_entity_class_idx = -+ bfq_class_idx(new_entity); -+ struct bfq_service_tree *st = -+ sd->service_tree + new_entity_class_idx; -+ -+ /* -+ * For efficiency, evaluate the most likely -+ * sub-condition first. 
-+ */ -+ replace_next = -+ (new_entity_class_idx == -+ bfq_class_idx(next_in_service) -+ && -+ !bfq_gt(new_entity->start, st->vtime) -+ && -+ bfq_gt(next_in_service->finish, -+ new_entity->finish)) -+ || -+ new_entity_class_idx < -+ bfq_class_idx(next_in_service); -+ } -+ -+ if (replace_next) -+ next_in_service = new_entity; -+ } else /* invoked because of a deactivation: lookup needed */ -+ next_in_service = bfq_lookup_next_entity(sd); -+ -+ if (next_in_service) { -+ parent_sched_may_change = !sd->next_in_service || -+ bfq_update_parent_budget(next_in_service); -+ } else -+ parent_sched_may_change = sd->next_in_service; -+ -+ sd->next_in_service = next_in_service; -+ -+ if (!next_in_service) -+ return parent_sched_may_change; -+ -+ bfqq = bfq_entity_to_bfqq(next_in_service); -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "update_next_in_service: chosen this queue"); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ else { -+ struct bfq_group *bfqg = -+ container_of(next_in_service, -+ struct bfq_group, entity); - --static void bfq_update_budget(struct bfq_entity *next_in_service) -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "update_next_in_service: chosen this entity"); -+ } -+#endif -+ return parent_sched_may_change; -+} -+ -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+/* both next loops stop at one of the child entities of the root group */ -+#define for_each_entity(entity) \ -+ for (; entity ; entity = entity->parent) -+ -+/* -+ * For each iteration, compute parent in advance, so as to be safe if -+ * entity is deallocated during the iteration. Such a deallocation may -+ * happen as a consequence of a bfq_put_queue that frees the bfq_queue -+ * containing entity. -+ */ -+#define for_each_entity_safe(entity, parent) \ -+ for (; entity && ({ parent = entity->parent; 1; }); entity = parent) -+ -+/* -+ * Returns true if this budget changes may let next_in_service->parent -+ * become the next_in_service entity for its parent entity. -+ */ -+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) - { - struct bfq_entity *bfqg_entity; - struct bfq_group *bfqg; - struct bfq_sched_data *group_sd; -+ bool ret = false; - - BUG_ON(!next_in_service); - -@@ -41,60 +186,68 @@ static void bfq_update_budget(struct bfq_entity *next_in_service) - * as it must never become an in-service entity. - */ - bfqg_entity = bfqg->my_entity; -- if (bfqg_entity) -+ if (bfqg_entity) { -+ if (bfqg_entity->budget > next_in_service->budget) -+ ret = true; - bfqg_entity->budget = next_in_service->budget; -+ } -+ -+ return ret; - } - --static int bfq_update_next_in_service(struct bfq_sched_data *sd) -+/* -+ * This function tells whether entity stops being a candidate for next -+ * service, according to the following logic. -+ * -+ * This function is invoked for an entity that is about to be set in -+ * service. If such an entity is a queue, then the entity is no longer -+ * a candidate for next service (i.e, a candidate entity to serve -+ * after the in-service entity is expired). The function then returns -+ * true. -+ * -+ * In contrast, the entity could stil be a candidate for next service -+ * if it is not a queue, and has more than one child. In fact, even if -+ * one of its children is about to be set in service, other children -+ * may still be the next to serve. As a consequence, a non-queue -+ * entity is not a candidate for next-service only if it has only one -+ * child. And only if this condition holds, then the function returns -+ * true for a non-queue entity. 
-+ */ -+static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) - { -- struct bfq_entity *next_in_service; -+ struct bfq_group *bfqg; - -- if (sd->in_service_entity) -- /* will update/requeue at the end of service */ -- return 0; -+ if (bfq_entity_to_bfqq(entity)) -+ return true; - -- /* -- * NOTE: this can be improved in many ways, such as returning -- * 1 (and thus propagating upwards the update) only when the -- * budget changes, or caching the bfqq that will be scheduled -- * next from this subtree. By now we worry more about -- * correctness than about performance... -- */ -- next_in_service = bfq_lookup_next_entity(sd, 0, NULL); -- sd->next_in_service = next_in_service; -+ bfqg = container_of(entity, struct bfq_group, entity); - -- if (next_in_service) -- bfq_update_budget(next_in_service); -+ BUG_ON(bfqg == ((struct bfq_data *)(bfqg->bfqd))->root_group); -+ BUG_ON(bfqg->active_entities == 0); -+ if (bfqg->active_entities == 1) -+ return true; - -- return 1; -+ return false; - } - --static void bfq_check_next_in_service(struct bfq_sched_data *sd, -- struct bfq_entity *entity) --{ -- BUG_ON(sd->next_in_service != entity); --} --#else -+#else /* CONFIG_BFQ_GROUP_IOSCHED */ - #define for_each_entity(entity) \ - for (; entity ; entity = NULL) - - #define for_each_entity_safe(entity, parent) \ - for (parent = NULL; entity ; entity = parent) - --static int bfq_update_next_in_service(struct bfq_sched_data *sd) -+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) - { -- return 0; -+ return false; - } - --static void bfq_check_next_in_service(struct bfq_sched_data *sd, -- struct bfq_entity *entity) -+static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) - { -+ return true; - } - --static void bfq_update_budget(struct bfq_entity *next_in_service) --{ --} --#endif -+#endif /* CONFIG_BFQ_GROUP_IOSCHED */ - - /* - * Shift for timestamp calculations. This actually limits the maximum -@@ -105,18 +258,6 @@ static void bfq_update_budget(struct bfq_entity *next_in_service) - */ - #define WFQ_SERVICE_SHIFT 22 - --/** -- * bfq_gt - compare two timestamps. -- * @a: first ts. -- * @b: second ts. -- * -- * Return @a > @b, dealing with wrapping correctly. 
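bfq_gt(), whose old definition is removed below after being moved near the top of bfq-sched.c, compares 64-bit virtual-time stamps through a signed subtraction so that the ordering survives wrap-around. A tiny stand-alone illustration (not from the patch):

#include <assert.h>
#include <stdint.h>

/* Same trick as bfq_gt(): wrap-safe "a is later than b". */
static int ts_after(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) > 0;
}

int main(void)
{
	uint64_t b = UINT64_MAX - 5;   /* just before the 64-bit wrap */
	uint64_t a = b + 10;           /* wraps around to 4           */

	assert(a < b);                 /* a naive compare gets the order wrong */
	assert(ts_after(a, b));        /* the signed difference gets it right  */
	return 0;
}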
-- */ --static int bfq_gt(u64 a, u64 b) --{ -- return (s64)(a - b) > 0; --} -- - static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) - { - struct bfq_queue *bfqq = NULL; -@@ -151,20 +292,36 @@ static u64 bfq_delta(unsigned long service, unsigned long weight) - static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) - { - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ unsigned long long start, finish, delta; - - BUG_ON(entity->weight == 0); - - entity->finish = entity->start + - bfq_delta(service, entity->weight); - -+ start = ((entity->start>>10)*1000)>>12; -+ finish = ((entity->finish>>10)*1000)>>12; -+ delta = ((bfq_delta(service, entity->weight)>>10)*1000)>>12; -+ - if (bfqq) { - bfq_log_bfqq(bfqq->bfqd, bfqq, - "calc_finish: serv %lu, w %d", - service, entity->weight); - bfq_log_bfqq(bfqq->bfqd, bfqq, - "calc_finish: start %llu, finish %llu, delta %llu", -- entity->start, entity->finish, -- bfq_delta(service, entity->weight)); -+ start, finish, delta); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ } else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "calc_finish group: serv %lu, w %d", -+ service, entity->weight); -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "calc_finish group: start %llu, finish %llu, delta %llu", -+ start, finish, delta); -+#endif - } - } - -@@ -293,10 +450,26 @@ static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node) - static void bfq_update_active_node(struct rb_node *node) - { - struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node); -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - - entity->min_start = entity->start; - bfq_update_min(entity, node->rb_right); - bfq_update_min(entity, node->rb_left); -+ -+ if (bfqq) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "update_active_node: new min_start %llu", -+ ((entity->min_start>>10)*1000)>>12); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ } else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "update_active_node: new min_start %llu", -+ ((entity->min_start>>10)*1000)>>12); -+#endif -+ } - } - - /** -@@ -386,8 +559,6 @@ static void bfq_active_insert(struct bfq_service_tree *st, - BUG_ON(!bfqg); - BUG_ON(!bfqd); - bfqg->active_entities++; -- if (bfqg->active_entities == 2) -- bfqd->active_numerous_groups++; - } - #endif - } -@@ -399,7 +570,7 @@ static void bfq_active_insert(struct bfq_service_tree *st, - static unsigned short bfq_ioprio_to_weight(int ioprio) - { - BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); -- return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - ioprio; -+ return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF; - } - - /** -@@ -422,9 +593,9 @@ static void bfq_get_entity(struct bfq_entity *entity) - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - - if (bfqq) { -- atomic_inc(&bfqq->ref); -+ bfqq->ref++; - bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", -- bfqq, atomic_read(&bfqq->ref)); -+ bfqq, bfqq->ref); - } - } - -@@ -499,10 +670,6 @@ static void bfq_active_extract(struct bfq_service_tree *st, - BUG_ON(!bfqd); - BUG_ON(!bfqg->active_entities); - bfqg->active_entities--; -- if (bfqg->active_entities == 1) { -- BUG_ON(!bfqd->active_numerous_groups); -- bfqd->active_numerous_groups--; -- } - } - #endif - } -@@ -531,28 +698,32 @@ static void bfq_idle_insert(struct bfq_service_tree *st, - } - - /** -- * bfq_forget_entity - 
remove an entity from the wfq trees. -+ * bfq_forget_entity - do not consider entity any longer for scheduling - * @st: the service tree. - * @entity: the entity being removed. -+ * @is_in_service: true if entity is currently the in-service entity. - * -- * Update the device status and forget everything about @entity, putting -- * the device reference to it, if it is a queue. Entities belonging to -- * groups are not refcounted. -+ * Forget everything about @entity. In addition, if entity represents -+ * a queue, and the latter is not in service, then release the service -+ * reference to the queue (the one taken through bfq_get_entity). In -+ * fact, in this case, there is really no more service reference to -+ * the queue, as the latter is also outside any service tree. If, -+ * instead, the queue is in service, then __bfq_bfqd_reset_in_service -+ * will take care of putting the reference when the queue finally -+ * stops being served. - */ - static void bfq_forget_entity(struct bfq_service_tree *st, -- struct bfq_entity *entity) -+ struct bfq_entity *entity, -+ bool is_in_service) - { - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -- struct bfq_sched_data *sd; -- - BUG_ON(!entity->on_st); - -- entity->on_st = 0; -+ entity->on_st = false; - st->wsum -= entity->weight; -- if (bfqq) { -- sd = entity->sched_data; -- bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d", -- bfqq, atomic_read(&bfqq->ref)); -+ if (bfqq && !is_in_service) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity (before): %p %d", -+ bfqq, bfqq->ref); - bfq_put_queue(bfqq); - } - } -@@ -566,7 +737,8 @@ static void bfq_put_idle_entity(struct bfq_service_tree *st, - struct bfq_entity *entity) - { - bfq_idle_extract(st, entity); -- bfq_forget_entity(st, entity); -+ bfq_forget_entity(st, entity, -+ entity == entity->sched_data->in_service_entity); - } - - /** -@@ -602,7 +774,7 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, - - if (entity->prio_changed) { - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -- unsigned short prev_weight, new_weight; -+ unsigned int prev_weight, new_weight; - struct bfq_data *bfqd = NULL; - struct rb_root *root; - #ifdef CONFIG_BFQ_GROUP_IOSCHED -@@ -630,7 +802,10 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, - entity->new_weight > BFQ_MAX_WEIGHT) { - pr_crit("update_weight_prio: new_weight %d\n", - entity->new_weight); -- BUG(); -+ if (entity->new_weight < BFQ_MIN_WEIGHT) -+ entity->new_weight = BFQ_MIN_WEIGHT; -+ else -+ entity->new_weight = BFQ_MAX_WEIGHT; - } - entity->orig_weight = entity->new_weight; - if (bfqq) -@@ -661,6 +836,13 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, - * associated with its new weight. - */ - if (prev_weight != new_weight) { -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "weight changed %d %d(%d %d)", -+ prev_weight, new_weight, -+ entity->orig_weight, -+ bfqq->wr_coeff); -+ - root = bfqq ? 
&bfqd->queue_weights_tree : - &bfqd->group_weights_tree; - bfq_weights_tree_remove(bfqd, entity, root); -@@ -707,7 +889,7 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) - st = bfq_entity_service_tree(entity); - - entity->service += served; -- BUG_ON(entity->service > entity->budget); -+ - BUG_ON(st->wsum == 0); - - st->vtime += bfq_delta(served, st->wsum); -@@ -716,234 +898,589 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) - #ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_set_start_empty_time(bfqq_group(bfqq)); - #endif -- bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served); -+ st = bfq_entity_service_tree(&bfqq->entity); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs, vtime %llu on %p", -+ served, ((st->vtime>>10)*1000)>>12, st); - } - - /** -- * bfq_bfqq_charge_full_budget - set the service to the entity budget. -+ * bfq_bfqq_charge_time - charge an amount of service equivalent to the length -+ * of the time interval during which bfqq has been in -+ * service. -+ * @bfqd: the device - * @bfqq: the queue that needs a service update. -+ * @time_ms: the amount of time during which the queue has received service -+ * -+ * If a queue does not consume its budget fast enough, then providing -+ * the queue with service fairness may impair throughput, more or less -+ * severely. For this reason, queues that consume their budget slowly -+ * are provided with time fairness instead of service fairness. This -+ * goal is achieved through the BFQ scheduling engine, even if such an -+ * engine works in the service, and not in the time domain. The trick -+ * is charging these queues with an inflated amount of service, equal -+ * to the amount of service that they would have received during their -+ * service slot if they had been fast, i.e., if their requests had -+ * been dispatched at a rate equal to the estimated peak rate. - * -- * When it's not possible to be fair in the service domain, because -- * a queue is not consuming its budget fast enough (the meaning of -- * fast depends on the timeout parameter), we charge it a full -- * budget. In this way we should obtain a sort of time-domain -- * fairness among all the seeky/slow queues. -+ * It is worth noting that time fairness can cause important -+ * distortions in terms of bandwidth distribution, on devices with -+ * internal queueing. The reason is that I/O requests dispatched -+ * during the service slot of a queue may be served after that service -+ * slot is finished, and may have a total processing time loosely -+ * correlated with the duration of the service slot. This is -+ * especially true for short service slots. 
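The bfq_bfqq_charge_time() comment above boils down to one proportion: a queue that held the device for time_ms out of a timeout_ms slot is charged time_ms/timeout_ms of the maximum budget, and never less than the service it actually consumed. A numeric sketch (the 16384-sector budget and 125 ms slot are example figures, not guaranteed BFQ defaults):

#include <stdio.h>

/* Charge a slow queue as if it had been fast: a proportional share of the
 * maximum budget, floored at the service it really received. */
static long serv_to_charge(long service, long max_budget,
			   unsigned long time_ms, unsigned long timeout_ms)
{
	long charge = service;

	if (time_ms > 0 && time_ms < timeout_ms)
		charge = (long)((max_budget * (long)time_ms) / (long)timeout_ms);

	return charge < service ? service : charge;
}

int main(void)
{
	/* 16384-sector budget, 125 ms slot, device held for 40 ms while
	 * actually transferring only 800 sectors: charged ~5242 sectors. */
	printf("%ld\n", serv_to_charge(800, 16384, 40, 125));
	return 0;
}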
- */ --static void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq) -+static void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ unsigned long time_ms) - { - struct bfq_entity *entity = &bfqq->entity; -+ int tot_serv_to_charge = entity->service; -+ unsigned int timeout_ms = jiffies_to_msecs(bfq_timeout); -+ -+ if (time_ms > 0 && time_ms < timeout_ms) -+ tot_serv_to_charge = -+ (bfqd->bfq_max_budget * time_ms) / timeout_ms; -+ -+ if (tot_serv_to_charge < entity->service) -+ tot_serv_to_charge = entity->service; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "charge_time: %lu/%u ms, %d/%d/%d sectors", -+ time_ms, timeout_ms, entity->service, -+ tot_serv_to_charge, entity->budget); -+ -+ /* Increase budget to avoid inconsistencies */ -+ if (tot_serv_to_charge > entity->budget) -+ entity->budget = tot_serv_to_charge; -+ -+ bfq_bfqq_served(bfqq, -+ max_t(int, 0, tot_serv_to_charge - entity->service)); -+} -+ -+static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, -+ struct bfq_service_tree *st, -+ bool backshifted) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ struct bfq_sched_data *sd = entity->sched_data; - -- bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget"); -+ st = __bfq_entity_update_weight_prio(st, entity); -+ bfq_calc_finish(entity, entity->budget); -+ -+ /* -+ * If some queues enjoy backshifting for a while, then their -+ * (virtual) finish timestamps may happen to become lower and -+ * lower than the system virtual time. In particular, if -+ * these queues often happen to be idle for short time -+ * periods, and during such time periods other queues with -+ * higher timestamps happen to be busy, then the backshifted -+ * timestamps of the former queues can become much lower than -+ * the system virtual time. In fact, to serve the queues with -+ * higher timestamps while the ones with lower timestamps are -+ * idle, the system virtual time may be pushed-up to much -+ * higher values than the finish timestamps of the idle -+ * queues. As a consequence, the finish timestamps of all new -+ * or newly activated queues may end up being much larger than -+ * those of lucky queues with backshifted timestamps. The -+ * latter queues may then monopolize the device for a lot of -+ * time. This would simply break service guarantees. -+ * -+ * To reduce this problem, push up a little bit the -+ * backshifted timestamps of the queue associated with this -+ * entity (only a queue can happen to have the backshifted -+ * flag set): just enough to let the finish timestamp of the -+ * queue be equal to the current value of the system virtual -+ * time. This may introduce a little unfairness among queues -+ * with backshifted timestamps, but it does not break -+ * worst-case fairness guarantees. -+ * -+ * As a special case, if bfqq is weight-raised, push up -+ * timestamps much less, to keep very low the probability that -+ * this push up causes the backshifted finish timestamps of -+ * weight-raised queues to become higher than the backshifted -+ * finish timestamps of non weight-raised queues. 
-+ */ -+ if (backshifted && bfq_gt(st->vtime, entity->finish)) { -+ unsigned long delta = st->vtime - entity->finish; -+ -+ if (bfqq) -+ delta /= bfqq->wr_coeff; - -- bfq_bfqq_served(bfqq, entity->budget - entity->service); -+ entity->start += delta; -+ entity->finish += delta; -+ -+ if (bfqq) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "__activate_entity: new queue finish %llu", -+ ((entity->finish>>10)*1000)>>12); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ } else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "__activate_entity: new group finish %llu", -+ ((entity->finish>>10)*1000)>>12); -+#endif -+ } -+ } -+ -+ bfq_active_insert(st, entity); -+ -+ if (bfqq) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "__activate_entity: queue %seligible in st %p", -+ entity->start <= st->vtime ? "" : "non ", st); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ } else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "__activate_entity: group %seligible in st %p", -+ entity->start <= st->vtime ? "" : "non ", st); -+#endif -+ } -+ BUG_ON(RB_EMPTY_ROOT(&st->active)); -+ BUG_ON(&st->active != &sd->service_tree->active && -+ &st->active != &(sd->service_tree+1)->active && -+ &st->active != &(sd->service_tree+2)->active); - } - - /** -- * __bfq_activate_entity - activate an entity. -+ * __bfq_activate_entity - handle activation of entity. - * @entity: the entity being activated. -+ * @non_blocking_wait_rq: true if entity was waiting for a request -+ * -+ * Called for a 'true' activation, i.e., if entity is not active and -+ * one of its children receives a new request. - * -- * Called whenever an entity is activated, i.e., it is not active and one -- * of its children receives a new request, or has to be reactivated due to -- * budget exhaustion. It uses the current budget of the entity (and the -- * service received if @entity is active) of the queue to calculate its -- * timestamps. -+ * Basically, this function updates the timestamps of entity and -+ * inserts entity into its active tree, ater possible extracting it -+ * from its idle tree. - */ --static void __bfq_activate_entity(struct bfq_entity *entity) -+static void __bfq_activate_entity(struct bfq_entity *entity, -+ bool non_blocking_wait_rq) - { - struct bfq_sched_data *sd = entity->sched_data; - struct bfq_service_tree *st = bfq_entity_service_tree(entity); -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ bool backshifted = false; -+ unsigned long long min_vstart; - -- if (entity == sd->in_service_entity) { -- BUG_ON(entity->tree); -- /* -- * If we are requeueing the current entity we have -- * to take care of not charging to it service it has -- * not received. -- */ -- bfq_calc_finish(entity, entity->service); -- entity->start = entity->finish; -- sd->in_service_entity = NULL; -- } else if (entity->tree == &st->active) { -- /* -- * Requeueing an entity due to a change of some -- * next_in_service entity below it. We reuse the -- * old start time. 
-- */ -- bfq_active_extract(st, entity); -- } else if (entity->tree == &st->idle) { -+ BUG_ON(!sd); -+ BUG_ON(!st); -+ -+ /* See comments on bfq_fqq_update_budg_for_activation */ -+ if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) { -+ backshifted = true; -+ min_vstart = entity->finish; -+ } else -+ min_vstart = st->vtime; -+ -+ if (entity->tree == &st->idle) { - /* - * Must be on the idle tree, bfq_idle_extract() will - * check for that. - */ - bfq_idle_extract(st, entity); -- entity->start = bfq_gt(st->vtime, entity->finish) ? -- st->vtime : entity->finish; -+ entity->start = bfq_gt(min_vstart, entity->finish) ? -+ min_vstart : entity->finish; - } else { - /* - * The finish time of the entity may be invalid, and - * it is in the past for sure, otherwise the queue - * would have been on the idle tree. - */ -- entity->start = st->vtime; -+ entity->start = min_vstart; - st->wsum += entity->weight; -+ /* -+ * entity is about to be inserted into a service tree, -+ * and then set in service: get a reference to make -+ * sure entity does not disappear until it is no -+ * longer in service or scheduled for service. -+ */ - bfq_get_entity(entity); - -- BUG_ON(entity->on_st); -- entity->on_st = 1; -+ BUG_ON(entity->on_st && bfqq); -+ -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ if (entity->on_st && !bfqq) { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, -+ entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, -+ bfqg, -+ "activate bug, class %d in_service %p", -+ bfq_class_idx(entity), sd->in_service_entity); -+ } -+#endif -+ BUG_ON(entity->on_st && !bfqq); -+ entity->on_st = true; - } - -- st = __bfq_entity_update_weight_prio(st, entity); -- bfq_calc_finish(entity, entity->budget); -- bfq_active_insert(st, entity); -+ bfq_update_fin_time_enqueue(entity, st, backshifted); - } - - /** -- * bfq_activate_entity - activate an entity and its ancestors if necessary. -- * @entity: the entity to activate. -+ * __bfq_requeue_entity - handle requeueing or repositioning of an entity. -+ * @entity: the entity being requeued or repositioned. -+ * -+ * Requeueing is needed if this entity stops being served, which -+ * happens if a leaf descendant entity has expired. On the other hand, -+ * repositioning is needed if the next_inservice_entity for the child -+ * entity has changed. See the comments inside the function for -+ * details. - * -- * Activate @entity and all the entities on the path from it to the root. -+ * Basically, this function: 1) removes entity from its active tree if -+ * present there, 2) updates the timestamps of entity and 3) inserts -+ * entity back into its active tree (in the new, right position for -+ * the new values of the timestamps). 
- */ --static void bfq_activate_entity(struct bfq_entity *entity) -+static void __bfq_requeue_entity(struct bfq_entity *entity) -+{ -+ struct bfq_sched_data *sd = entity->sched_data; -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); -+ -+ BUG_ON(!sd); -+ BUG_ON(!st); -+ -+ BUG_ON(entity != sd->in_service_entity && -+ entity->tree != &st->active); -+ -+ if (entity == sd->in_service_entity) { -+ /* -+ * We are requeueing the current in-service entity, -+ * which may have to be done for one of the following -+ * reasons: -+ * - entity represents the in-service queue, and the -+ * in-service queue is being requeued after an -+ * expiration; -+ * - entity represents a group, and its budget has -+ * changed because one of its child entities has -+ * just been either activated or requeued for some -+ * reason; the timestamps of the entity need then to -+ * be updated, and the entity needs to be enqueued -+ * or repositioned accordingly. -+ * -+ * In particular, before requeueing, the start time of -+ * the entity must be moved forward to account for the -+ * service that the entity has received while in -+ * service. This is done by the next instructions. The -+ * finish time will then be updated according to this -+ * new value of the start time, and to the budget of -+ * the entity. -+ */ -+ bfq_calc_finish(entity, entity->service); -+ entity->start = entity->finish; -+ BUG_ON(entity->tree && entity->tree != &st->active); -+ /* -+ * In addition, if the entity had more than one child -+ * when set in service, then was not extracted from -+ * the active tree. This implies that the position of -+ * the entity in the active tree may need to be -+ * changed now, because we have just updated the start -+ * time of the entity, and we will update its finish -+ * time in a moment (the requeueing is then, more -+ * precisely, a repositioning in this case). To -+ * implement this repositioning, we: 1) dequeue the -+ * entity here, 2) update the finish time and -+ * requeue the entity according to the new -+ * timestamps below. -+ */ -+ if (entity->tree) -+ bfq_active_extract(st, entity); -+ } else { /* The entity is already active, and not in service */ -+ /* -+ * In this case, this function gets called only if the -+ * next_in_service entity below this entity has -+ * changed, and this change has caused the budget of -+ * this entity to change, which, finally implies that -+ * the finish time of this entity must be -+ * updated. Such an update may cause the scheduling, -+ * i.e., the position in the active tree, of this -+ * entity to change. We handle this change by: 1) -+ * dequeueing the entity here, 2) updating the finish -+ * time and requeueing the entity according to the new -+ * timestamps below. This is the same approach as the -+ * non-extracted-entity sub-case above. -+ */ -+ bfq_active_extract(st, entity); -+ } -+ -+ bfq_update_fin_time_enqueue(entity, st, false); -+} -+ -+static void __bfq_activate_requeue_entity(struct bfq_entity *entity, -+ struct bfq_sched_data *sd, -+ bool non_blocking_wait_rq) -+{ -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); -+ -+ if (sd->in_service_entity == entity || entity->tree == &st->active) -+ /* -+ * in service or already queued on the active tree, -+ * requeue or reposition -+ */ -+ __bfq_requeue_entity(entity); -+ else -+ /* -+ * Not in service and not queued on its active tree: -+ * the activity is idle and this is a true activation. 
-+ */ -+ __bfq_activate_entity(entity, non_blocking_wait_rq); -+} -+ -+ -+/** -+ * bfq_activate_entity - activate or requeue an entity representing a bfq_queue, -+ * and activate, requeue or reposition all ancestors -+ * for which such an update becomes necessary. -+ * @entity: the entity to activate. -+ * @non_blocking_wait_rq: true if this entity was waiting for a request -+ * @requeue: true if this is a requeue, which implies that bfqq is -+ * being expired; thus ALL its ancestors stop being served and must -+ * therefore be requeued -+ */ -+static void bfq_activate_requeue_entity(struct bfq_entity *entity, -+ bool non_blocking_wait_rq, -+ bool requeue) - { - struct bfq_sched_data *sd; - - for_each_entity(entity) { -- __bfq_activate_entity(entity); -- -+ BUG_ON(!entity); - sd = entity->sched_data; -- if (!bfq_update_next_in_service(sd)) -- /* -- * No need to propagate the activation to the -- * upper entities, as they will be updated when -- * the in-service entity is rescheduled. -- */ -+ __bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq); -+ -+ BUG_ON(RB_EMPTY_ROOT(&sd->service_tree->active) && -+ RB_EMPTY_ROOT(&(sd->service_tree+1)->active) && -+ RB_EMPTY_ROOT(&(sd->service_tree+2)->active)); -+ -+ if (!bfq_update_next_in_service(sd, entity) && !requeue) { -+ BUG_ON(!sd->next_in_service); - break; -+ } -+ BUG_ON(!sd->next_in_service); - } - } - - /** - * __bfq_deactivate_entity - deactivate an entity from its service tree. - * @entity: the entity to deactivate. -- * @requeue: if false, the entity will not be put into the idle tree. -- * -- * Deactivate an entity, independently from its previous state. If the -- * entity was not on a service tree just return, otherwise if it is on -- * any scheduler tree, extract it from that tree, and if necessary -- * and if the caller did not specify @requeue, put it on the idle tree. -+ * @ins_into_idle_tree: if false, the entity will not be put into the -+ * idle tree. - * -- * Return %1 if the caller should update the entity hierarchy, i.e., -- * if the entity was in service or if it was the next_in_service for -- * its sched_data; return %0 otherwise. -+ * Deactivates an entity, independently from its previous state. Must -+ * be invoked only if entity is on a service tree. Extracts the entity -+ * from that tree, and if necessary and allowed, puts it on the idle -+ * tree. - */ --static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue) -+static bool __bfq_deactivate_entity(struct bfq_entity *entity, -+ bool ins_into_idle_tree) - { - struct bfq_sched_data *sd = entity->sched_data; - struct bfq_service_tree *st; -- int was_in_service; -- int ret = 0; -+ bool is_in_service; - -- if (sd == NULL || !entity->on_st) /* never activated, or inactive */ -- return 0; -+ if (!entity->on_st) { /* entity never activated, or already inactive */ -+ BUG_ON(sd && entity == sd->in_service_entity); -+ return false; -+ } - -+ /* -+ * If we get here, then entity is active, which implies that -+ * bfq_group_set_parent has already been invoked for the group -+ * represented by entity. Therefore, the field -+ * entity->sched_data has been set, and we can safely use it. 
-+ */ - st = bfq_entity_service_tree(entity); -- was_in_service = entity == sd->in_service_entity; -+ is_in_service = entity == sd->in_service_entity; - -- BUG_ON(was_in_service && entity->tree); -+ BUG_ON(is_in_service && entity->tree && entity->tree != &st->active); - -- if (was_in_service) { -+ if (is_in_service) - bfq_calc_finish(entity, entity->service); -- sd->in_service_entity = NULL; -- } else if (entity->tree == &st->active) -+ -+ if (entity->tree == &st->active) - bfq_active_extract(st, entity); -- else if (entity->tree == &st->idle) -+ else if (!is_in_service && entity->tree == &st->idle) - bfq_idle_extract(st, entity); - else if (entity->tree) - BUG(); - -- if (was_in_service || sd->next_in_service == entity) -- ret = bfq_update_next_in_service(sd); -- -- if (!requeue || !bfq_gt(entity->finish, st->vtime)) -- bfq_forget_entity(st, entity); -+ if (!ins_into_idle_tree || !bfq_gt(entity->finish, st->vtime)) -+ bfq_forget_entity(st, entity, is_in_service); - else - bfq_idle_insert(st, entity); - -- BUG_ON(sd->in_service_entity == entity); -- BUG_ON(sd->next_in_service == entity); -- -- return ret; -+ return true; - } - - /** -- * bfq_deactivate_entity - deactivate an entity. -+ * bfq_deactivate_entity - deactivate an entity representing a bfq_queue. - * @entity: the entity to deactivate. -- * @requeue: true if the entity can be put on the idle tree -+ * @ins_into_idle_tree: true if the entity can be put on the idle tree - */ --static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue) -+static void bfq_deactivate_entity(struct bfq_entity *entity, -+ bool ins_into_idle_tree, -+ bool expiration) - { - struct bfq_sched_data *sd; -- struct bfq_entity *parent; -+ struct bfq_entity *parent = NULL; - - for_each_entity_safe(entity, parent) { - sd = entity->sched_data; - -- if (!__bfq_deactivate_entity(entity, requeue)) -+ BUG_ON(sd == NULL); /* -+ * It would mean that this is the -+ * root group. -+ */ -+ -+ BUG_ON(expiration && entity != sd->in_service_entity); -+ -+ BUG_ON(entity != sd->in_service_entity && -+ entity->tree == -+ &bfq_entity_service_tree(entity)->active && -+ !sd->next_in_service); -+ -+ if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) { - /* -- * The parent entity is still backlogged, and -- * we don't need to update it as it is still -- * in service. -+ * entity is not in any tree any more, so -+ * this deactivation is a no-op, and there is -+ * nothing to change for upper-level entities -+ * (in case of expiration, this can never -+ * happen). - */ -- break; -+ BUG_ON(expiration); /* -+ * entity cannot be already out of -+ * any tree -+ */ -+ return; -+ } -+ -+ if (sd->next_in_service == entity) -+ /* -+ * entity was the next_in_service entity, -+ * then, since entity has just been -+ * deactivated, a new one must be found. -+ */ -+ bfq_update_next_in_service(sd, NULL); - -- if (sd->next_in_service) -+ if (sd->next_in_service) { - /* -- * The parent entity is still backlogged and -- * the budgets on the path towards the root -- * need to be updated. -+ * The parent entity is still backlogged, -+ * because next_in_service is not NULL. So, no -+ * further upwards deactivation must be -+ * performed. Yet, next_in_service has -+ * changed. Then the schedule does need to be -+ * updated upwards. - */ -- goto update; -+ BUG_ON(sd->next_in_service == entity); -+ break; -+ } - - /* -- * If we reach there the parent is no more backlogged and -- * we want to propagate the dequeue upwards. 
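
The deactivation loop around this point proceeds in two phases: climb the hierarchy removing entities whose level has nothing left to serve, then requeue or reposition the surviving ancestors, whose head-of-line candidate may have changed. A minimal sketch with a toy parent chain and a per-level counter standing in for the service trees (illustrative only, hypothetical types):

struct toy_level {
	struct toy_level *parent;
	int backlogged;			/* children that still have work */
};

static void toy_reposition(struct toy_level *level)
{
	(void)level;			/* update timestamps and re-insert here */
}

static void toy_deactivate(struct toy_level *level)
{
	/* Phase 1: climb while a level runs out of backlogged children. */
	while (level && --level->backlogged == 0)
		level = level->parent;	/* its own entity goes idle too */

	/*
	 * Phase 2: from the first level that stayed backlogged up to the
	 * root, the next candidate to serve may have changed: reposition.
	 */
	for (; level; level = level->parent)
		toy_reposition(level);
}
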
-+ * If we get here, then the parent is no more -+ * backlogged and we need to propagate the -+ * deactivation upwards. Thus let the loop go on. - */ -- requeue = 1; -- } - -- return; -+ /* -+ * Also let parent be queued into the idle tree on -+ * deactivation, to preserve service guarantees, and -+ * assuming that who invoked this function does not -+ * need parent entities too to be removed completely. -+ */ -+ ins_into_idle_tree = true; -+ } - --update: -+ /* -+ * If the deactivation loop is fully executed, then there are -+ * no more entities to touch and next loop is not executed at -+ * all. Otherwise, requeue remaining entities if they are -+ * about to stop receiving service, or reposition them if this -+ * is not the case. -+ */ - entity = parent; - for_each_entity(entity) { -- __bfq_activate_entity(entity); -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ /* -+ * Invoke __bfq_requeue_entity on entity, even if -+ * already active, to requeue/reposition it in the -+ * active tree (because sd->next_in_service has -+ * changed) -+ */ -+ __bfq_requeue_entity(entity); - - sd = entity->sched_data; -- if (!bfq_update_next_in_service(sd)) -+ BUG_ON(expiration && sd->in_service_entity != entity); -+ -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "invoking udpdate_next for this queue"); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ else { -+ struct bfq_group *bfqg = -+ container_of(entity, -+ struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "invoking udpdate_next for this entity"); -+ } -+#endif -+ if (!bfq_update_next_in_service(sd, entity) && -+ !expiration) -+ /* -+ * next_in_service unchanged or not causing -+ * any change in entity->parent->sd, and no -+ * requeueing needed for expiration: stop -+ * here. -+ */ - break; - } - } - - /** -- * bfq_update_vtime - update vtime if necessary. -+ * bfq_calc_vtime_jump - compute the value to which the vtime should jump, -+ * if needed, to have at least one entity eligible. - * @st: the service tree to act upon. - * -- * If necessary update the service tree vtime to have at least one -- * eligible entity, skipping to its start time. Assumes that the -- * active tree of the device is not empty. -- * -- * NOTE: this hierarchical implementation updates vtimes quite often, -- * we may end up with reactivated processes getting timestamps after a -- * vtime skip done because we needed a ->first_active entity on some -- * intermediate node. -+ * Assumes that st is not empty. 
- */ --static void bfq_update_vtime(struct bfq_service_tree *st) -+static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st) - { -- struct bfq_entity *entry; -- struct rb_node *node = st->active.rb_node; -+ struct bfq_entity *root_entity = bfq_root_active_entity(&st->active); -+ -+ if (bfq_gt(root_entity->min_start, st->vtime)) { -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(root_entity); - -- entry = rb_entry(node, struct bfq_entity, rb_node); -- if (bfq_gt(entry->min_start, st->vtime)) { -- st->vtime = entry->min_start; -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "calc_vtime_jump: new value %llu", -+ root_entity->min_start); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ else { -+ struct bfq_group *bfqg = -+ container_of(root_entity, struct bfq_group, -+ entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "calc_vtime_jump: new value %llu", -+ root_entity->min_start); -+ } -+#endif -+ return root_entity->min_start; -+ } -+ return st->vtime; -+} -+ -+static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value) -+{ -+ if (new_value > st->vtime) { -+ st->vtime = new_value; - bfq_forget_idle(st); - } - } -@@ -952,6 +1489,7 @@ static void bfq_update_vtime(struct bfq_service_tree *st) - * bfq_first_active_entity - find the eligible entity with - * the smallest finish time - * @st: the service tree to select from. -+ * @vtime: the system virtual to use as a reference for eligibility - * - * This function searches the first schedulable entity, starting from the - * root of the tree and going on the left every time on this side there is -@@ -959,7 +1497,8 @@ static void bfq_update_vtime(struct bfq_service_tree *st) - * the right is followed only if a) the left subtree contains no eligible - * entities and b) no eligible entity has been found yet. - */ --static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st) -+static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st, -+ u64 vtime) - { - struct bfq_entity *entry, *first = NULL; - struct rb_node *node = st->active.rb_node; -@@ -967,15 +1506,15 @@ static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st) - while (node) { - entry = rb_entry(node, struct bfq_entity, rb_node); - left: -- if (!bfq_gt(entry->start, st->vtime)) -+ if (!bfq_gt(entry->start, vtime)) - first = entry; - -- BUG_ON(bfq_gt(entry->min_start, st->vtime)); -+ BUG_ON(bfq_gt(entry->min_start, vtime)); - - if (node->rb_left) { - entry = rb_entry(node->rb_left, - struct bfq_entity, rb_node); -- if (!bfq_gt(entry->min_start, st->vtime)) { -+ if (!bfq_gt(entry->min_start, vtime)) { - node = node->rb_left; - goto left; - } -@@ -993,31 +1532,84 @@ static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st) - * __bfq_lookup_next_entity - return the first eligible entity in @st. - * @st: the service tree. - * -- * Update the virtual time in @st and return the first eligible entity -- * it contains. -+ * If there is no in-service entity for the sched_data st belongs to, -+ * then return the entity that will be set in service if: -+ * 1) the parent entity this st belongs to is set in service; -+ * 2) no entity belonging to such parent entity undergoes a state change -+ * that would influence the timestamps of the entity (e.g., becomes idle, -+ * becomes backlogged, changes its budget, ...). -+ * -+ * In this first case, update the virtual time in @st too (see the -+ * comments on this update inside the function). 
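
The selection rule implemented by bfq_calc_vtime_jump(), bfq_update_vtime() and bfq_first_active_entity() above boils down to: an entity is eligible when its start timestamp does not exceed the virtual time, the virtual time jumps forward when nothing is eligible, and among eligible entities the smallest finish time wins. A flat-array sketch of that rule, O(n) instead of the O(log n) augmented rb-tree and assuming a non-empty set, purely for illustration:

struct toy_ent {
	unsigned long long start, finish;
};

static int toy_pick_next(const struct toy_ent *e, int n,
			 unsigned long long *vtime)
{
	unsigned long long min_start = e[0].start;
	int i, best = -1;

	for (i = 1; i < n; i++)		/* what min_start caches per subtree */
		if (e[i].start < min_start)
			min_start = e[i].start;

	if (min_start > *vtime)		/* no eligible entity: the vtime jumps */
		*vtime = min_start;

	for (i = 0; i < n; i++)		/* smallest finish among the eligible */
		if (e[i].start <= *vtime &&
		    (best < 0 || e[i].finish < e[best].finish))
			best = i;

	return best;			/* index of the entity to serve next */
}
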
-+ * -+ * In constrast, if there is an in-service entity, then return the -+ * entity that would be set in service if not only the above -+ * conditions, but also the next one held true: the currently -+ * in-service entity, on expiration, -+ * 1) gets a finish time equal to the current one, or -+ * 2) is not eligible any more, or -+ * 3) is idle. - */ --static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st, -- bool force) -+static struct bfq_entity * -+__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service -+#if 0 -+ , bool force -+#endif -+ ) - { -- struct bfq_entity *entity, *new_next_in_service = NULL; -+ struct bfq_entity *entity -+#if 0 -+ , *new_next_in_service = NULL -+#endif -+ ; -+ u64 new_vtime; -+ struct bfq_queue *bfqq; - - if (RB_EMPTY_ROOT(&st->active)) - return NULL; - -- bfq_update_vtime(st); -- entity = bfq_first_active_entity(st); -- BUG_ON(bfq_gt(entity->start, st->vtime)); -+ /* -+ * Get the value of the system virtual time for which at -+ * least one entity is eligible. -+ */ -+ new_vtime = bfq_calc_vtime_jump(st); - - /* -- * If the chosen entity does not match with the sched_data's -- * next_in_service and we are forcedly serving the IDLE priority -- * class tree, bubble up budget update. -+ * If there is no in-service entity for the sched_data this -+ * active tree belongs to, then push the system virtual time -+ * up to the value that guarantees that at least one entity is -+ * eligible. If, instead, there is an in-service entity, then -+ * do not make any such update, because there is already an -+ * eligible entity, namely the in-service one (even if the -+ * entity is not on st, because it was extracted when set in -+ * service). - */ -- if (unlikely(force && entity != entity->sched_data->next_in_service)) { -- new_next_in_service = entity; -- for_each_entity(new_next_in_service) -- bfq_update_budget(new_next_in_service); -+ if (!in_service) -+ bfq_update_vtime(st, new_vtime); -+ -+ entity = bfq_first_active_entity(st, new_vtime); -+ BUG_ON(bfq_gt(entity->start, new_vtime)); -+ -+ /* Log some information */ -+ bfqq = bfq_entity_to_bfqq(entity); -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "__lookup_next: start %llu vtime %llu st %p", -+ ((entity->start>>10)*1000)>>12, -+ ((new_vtime>>10)*1000)>>12, st); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "__lookup_next: start %llu vtime %llu st %p", -+ ((entity->start>>10)*1000)>>12, -+ ((new_vtime>>10)*1000)>>12, st); - } -+#endif -+ -+ BUG_ON(!entity); - - return entity; - } -@@ -1025,50 +1617,81 @@ static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st, - /** - * bfq_lookup_next_entity - return the first eligible entity in @sd. - * @sd: the sched_data. -- * @extract: if true the returned entity will be also extracted from @sd. - * -- * NOTE: since we cache the next_in_service entity at each level of the -- * hierarchy, the complexity of the lookup can be decreased with -- * absolutely no effort just returning the cached next_in_service value; -- * we prefer to do full lookups to test the consistency of * the data -- * structures. -+ * This function is invoked when there has been a change in the trees -+ * for sd, and we need know what is the new next entity after this -+ * change. 
- */ --static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, -- int extract, -- struct bfq_data *bfqd) -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd) - { - struct bfq_service_tree *st = sd->service_tree; -- struct bfq_entity *entity; -- int i = 0; -- -- BUG_ON(sd->in_service_entity); -+ struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1); -+ struct bfq_entity *entity = NULL; -+ struct bfq_queue *bfqq; -+ int class_idx = 0; - -- if (bfqd && -- jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) { -- entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, -- true); -- if (entity) { -- i = BFQ_IOPRIO_CLASSES - 1; -- bfqd->bfq_class_idle_last_service = jiffies; -- sd->next_in_service = entity; -- } -+ BUG_ON(!sd); -+ BUG_ON(!st); -+ /* -+ * Choose from idle class, if needed to guarantee a minimum -+ * bandwidth to this class (and if there is some active entity -+ * in idle class). This should also mitigate -+ * priority-inversion problems in case a low priority task is -+ * holding file system resources. -+ */ -+ if (time_is_before_jiffies(sd->bfq_class_idle_last_service + -+ BFQ_CL_IDLE_TIMEOUT)) { -+ if (!RB_EMPTY_ROOT(&idle_class_st->active)) -+ class_idx = BFQ_IOPRIO_CLASSES - 1; -+ /* About to be served if backlogged, or not yet backlogged */ -+ sd->bfq_class_idle_last_service = jiffies; - } -- for (; i < BFQ_IOPRIO_CLASSES; i++) { -- entity = __bfq_lookup_next_entity(st + i, false); -- if (entity) { -- if (extract) { -- bfq_check_next_in_service(sd, entity); -- bfq_active_extract(st + i, entity); -- sd->in_service_entity = entity; -- sd->next_in_service = NULL; -- } -+ -+ /* -+ * Find the next entity to serve for the highest-priority -+ * class, unless the idle class needs to be served. -+ */ -+ for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) { -+ entity = __bfq_lookup_next_entity(st + class_idx, -+ sd->in_service_entity); -+ -+ if (entity) - break; -- } - } - -+ BUG_ON(!entity && -+ (!RB_EMPTY_ROOT(&st->active) || !RB_EMPTY_ROOT(&(st+1)->active) || -+ !RB_EMPTY_ROOT(&(st+2)->active))); -+ -+ if (!entity) -+ return NULL; -+ -+ /* Log some information */ -+ bfqq = bfq_entity_to_bfqq(entity); -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "chosen from st %p %d", -+ st + class_idx, class_idx); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "chosen from st %p %d", -+ st + class_idx, class_idx); -+ } -+#endif -+ - return entity; - } - -+static bool next_queue_may_preempt(struct bfq_data *bfqd) -+{ -+ struct bfq_sched_data *sd = &bfqd->root_group->sched_data; -+ -+ return sd->next_in_service != sd->in_service_entity; -+} -+ - /* - * Get next queue for service. - */ -@@ -1083,58 +1706,273 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) - if (bfqd->busy_queues == 0) - return NULL; - -+ /* -+ * Traverse the path from the root to the leaf entity to -+ * serve. Set in service all the entities visited along the -+ * way. 
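
The class selection performed by bfq_lookup_next_entity() above can be summarized as: scan the service trees in priority order (RT, BE, IDLE), but start from the idle class when it has not been served for longer than a threshold and actually has backlog, so that CLASS_IDLE is not starved. A compact sketch of that policy; the threshold value and the tick-based clock are arbitrary stand-ins, not BFQ's:

#define TOY_NR_CLASSES		3
#define TOY_IDLE_CLASS		(TOY_NR_CLASSES - 1)
#define TOY_IDLE_TIMEOUT	100	/* hypothetical, in clock ticks */

static int toy_pick_class(const int backlogged[TOY_NR_CLASSES],
			  unsigned long now, unsigned long *idle_last_service)
{
	int first = 0, class;

	if (now - *idle_last_service > TOY_IDLE_TIMEOUT) {
		if (backlogged[TOY_IDLE_CLASS])
			first = TOY_IDLE_CLASS;
		/* About to be served if backlogged, or nothing to serve. */
		*idle_last_service = now;
	}

	for (class = first; class < TOY_NR_CLASSES; class++)
		if (backlogged[class])
			return class;	/* tree to search for the next entity */

	return -1;			/* no backlog at this level */
}
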
-+ */ - sd = &bfqd->root_group->sched_data; - for (; sd ; sd = entity->my_sched_data) { -- entity = bfq_lookup_next_entity(sd, 1, bfqd); -- BUG_ON(!entity); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ if (entity) { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg(bfqd, bfqg, -+ "get_next_queue: lookup in this group"); -+ if (!sd->next_in_service) -+ pr_crit("get_next_queue: lookup in this group"); -+ } else { -+ bfq_log_bfqg(bfqd, bfqd->root_group, -+ "get_next_queue: lookup in root group"); -+ if (!sd->next_in_service) -+ pr_crit("get_next_queue: lookup in root group"); -+ } -+#endif -+ -+ BUG_ON(!sd->next_in_service); -+ -+ /* -+ * WARNING. We are about to set the in-service entity -+ * to sd->next_in_service, i.e., to the (cached) value -+ * returned by bfq_lookup_next_entity(sd) the last -+ * time it was invoked, i.e., the last time when the -+ * service order in sd changed as a consequence of the -+ * activation or deactivation of an entity. In this -+ * respect, if we execute bfq_lookup_next_entity(sd) -+ * in this very moment, it may, although with low -+ * probability, yield a different entity than that -+ * pointed to by sd->next_in_service. This rare event -+ * happens in case there was no CLASS_IDLE entity to -+ * serve for sd when bfq_lookup_next_entity(sd) was -+ * invoked for the last time, while there is now one -+ * such entity. -+ * -+ * If the above event happens, then the scheduling of -+ * such entity in CLASS_IDLE is postponed until the -+ * service of the sd->next_in_service entity -+ * finishes. In fact, when the latter is expired, -+ * bfq_lookup_next_entity(sd) gets called again, -+ * exactly to update sd->next_in_service. -+ */ -+ -+ /* Make next_in_service entity become in_service_entity */ -+ entity = sd->next_in_service; -+ sd->in_service_entity = entity; -+ -+ /* -+ * Reset the accumulator of the amount of service that -+ * the entity is about to receive. -+ */ - entity->service = 0; -+ -+ /* -+ * If entity is no longer a candidate for next -+ * service, then we extract it from its active tree, -+ * for the following reason. To further boost the -+ * throughput in some special case, BFQ needs to know -+ * which is the next candidate entity to serve, while -+ * there is already an entity in service. In this -+ * respect, to make it easy to compute/update the next -+ * candidate entity to serve after the current -+ * candidate has been set in service, there is a case -+ * where it is necessary to extract the current -+ * candidate from its service tree. Such a case is -+ * when the entity just set in service cannot be also -+ * a candidate for next service. Details about when -+ * this conditions holds are reported in the comments -+ * on the function bfq_no_longer_next_in_service() -+ * invoked below. -+ */ -+ if (bfq_no_longer_next_in_service(entity)) -+ bfq_active_extract(bfq_entity_service_tree(entity), -+ entity); -+ -+ /* -+ * For the same reason why we may have just extracted -+ * entity from its active tree, we may need to update -+ * next_in_service for the sched_data of entity too, -+ * regardless of whether entity has been extracted. -+ * In fact, even if entity has not been extracted, a -+ * descendant entity may get extracted. Such an event -+ * would cause a change in next_in_service for the -+ * level of the descendant entity, and thus possibly -+ * back to upper levels. 
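
The descent just described, from the root scheduler data to the leaf queue, promotes at each level the cached next-in-service candidate to in-service and resets its service accumulator. A toy version of that walk, with hypothetical types and without the extraction and logging details:

struct toy_sched;

struct toy_node {
	struct toy_sched *my_sched;	/* NULL when the node is a leaf queue */
	int service;
};

struct toy_sched {
	struct toy_node *in_service;
	struct toy_node *next_in_service;
};

static struct toy_node *toy_descend(struct toy_sched *root)
{
	struct toy_sched *sched = root;
	struct toy_node *node = NULL;

	for (; sched; sched = node->my_sched) {
		node = sched->next_in_service;	/* assumed non-NULL while busy */
		sched->in_service = node;
		node->service = 0;		/* reset the service accumulator */
	}
	return node;				/* the queue that will be served */
}
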
-+ * -+ * We cannot perform the resulting needed update -+ * before the end of this loop, because, to know which -+ * is the correct next-to-serve candidate entity for -+ * each level, we need first to find the leaf entity -+ * to set in service. In fact, only after we know -+ * which is the next-to-serve leaf entity, we can -+ * discover whether the parent entity of the leaf -+ * entity becomes the next-to-serve, and so on. -+ */ -+ -+ /* Log some information */ -+ bfqq = bfq_entity_to_bfqq(entity); -+ if (bfqq) -+ bfq_log_bfqq(bfqd, bfqq, -+ "get_next_queue: this queue, finish %llu", -+ (((entity->finish>>10)*1000)>>10)>>2); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg(bfqd, bfqg, -+ "get_next_queue: this entity, finish %llu", -+ (((entity->finish>>10)*1000)>>10)>>2); -+ } -+#endif -+ - } - -+ BUG_ON(!entity); - bfqq = bfq_entity_to_bfqq(entity); - BUG_ON(!bfqq); - -+ /* -+ * We can finally update all next-to-serve entities along the -+ * path from the leaf entity just set in service to the root. -+ */ -+ for_each_entity(entity) { -+ struct bfq_sched_data *sd = entity->sched_data; -+ -+ if(!bfq_update_next_in_service(sd, NULL)) -+ break; -+ } -+ - return bfqq; - } - - static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) - { -+ struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue; -+ struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity; -+ struct bfq_entity *entity = in_serv_entity; -+ - if (bfqd->in_service_bic) { - put_io_context(bfqd->in_service_bic->icq.ioc); - bfqd->in_service_bic = NULL; - } - -+ bfq_clear_bfqq_wait_request(in_serv_bfqq); -+ hrtimer_try_to_cancel(&bfqd->idle_slice_timer); - bfqd->in_service_queue = NULL; -- del_timer(&bfqd->idle_slice_timer); -+ -+ /* -+ * When this function is called, all in-service entities have -+ * been properly deactivated or requeued, so we can safely -+ * execute the final step: reset in_service_entity along the -+ * path from entity to the root. -+ */ -+ for_each_entity(entity) -+ entity->sched_data->in_service_entity = NULL; -+ -+ /* -+ * in_serv_entity is no longer in service, so, if it is in no -+ * service tree either, then release the service reference to -+ * the queue it represents (taken with bfq_get_entity). 
-+ */ -+ if (!in_serv_entity->on_st) -+ bfq_put_queue(in_serv_bfqq); -+} -+ -+static void set_next_in_service_bfqq(struct bfq_data *bfqd) -+{ -+ struct bfq_entity *entity = NULL; -+ struct bfq_queue *bfqq; -+ struct bfq_sched_data *sd = &bfqd->root_group->sched_data; -+ -+ BUG_ON(!sd); -+ -+ /* Traverse the path from the root to the in-service leaf entity */ -+ for (; sd ; sd = entity->my_sched_data) { -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ if (entity) { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg(bfqd, bfqg, -+ "set_next_in_service_bfqq: lookup in this group"); -+ } else -+ bfq_log_bfqg(bfqd, bfqd->root_group, -+ "set_next_in_service_bfqq: lookup in root group"); -+#endif -+ -+ entity = sd->next_in_service; -+ -+ if (!entity) { -+ bfqd->next_in_service_queue = NULL; -+ return; -+ } -+ -+ /* Log some information */ -+ bfqq = bfq_entity_to_bfqq(entity); -+ if (bfqq) -+ bfq_log_bfqq(bfqd, bfqq, -+ "set_next_in_service_bfqq: this queue, finish %llu", -+ (((entity->finish>>10)*1000)>>10)>>2); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg(bfqd, bfqg, -+ "set_next_in_service_bfqq: this entity, finish %llu", -+ (((entity->finish>>10)*1000)>>10)>>2); -+ } -+#endif -+ -+ } -+ BUG_ON(!bfq_entity_to_bfqq(entity)); -+ -+ bfqd->next_in_service_queue = bfq_entity_to_bfqq(entity); - } - - static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, -- int requeue) -+ bool ins_into_idle_tree, bool expiration) - { - struct bfq_entity *entity = &bfqq->entity; - -- if (bfqq == bfqd->in_service_queue) -- __bfq_bfqd_reset_in_service(bfqd); -- -- bfq_deactivate_entity(entity, requeue); -+ bfq_deactivate_entity(entity, ins_into_idle_tree, expiration); -+ set_next_in_service_bfqq(bfqd); - } - - static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) - { - struct bfq_entity *entity = &bfqq->entity; -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); -+ -+ BUG_ON(bfqq == bfqd->in_service_queue); -+ BUG_ON(entity->tree != &st->active && entity->tree != &st->idle && -+ entity->on_st); - -- bfq_activate_entity(entity); -+ bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq), -+ false); -+ bfq_clear_bfqq_non_blocking_wait_rq(bfqq); -+ set_next_in_service_bfqq(bfqd); -+} -+ -+static void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ bfq_activate_requeue_entity(entity, false, -+ bfqq == bfqd->in_service_queue); -+ set_next_in_service_bfqq(bfqd); - } - --#ifdef CONFIG_BFQ_GROUP_IOSCHED - static void bfqg_stats_update_dequeue(struct bfq_group *bfqg); --#endif - - /* - * Called when the bfqq no longer has requests pending, remove it from -- * the service tree. -+ * the service tree. As a special case, it can be invoked during an -+ * expiration. 
- */ - static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, -- int requeue) -+ bool expiration) - { - BUG_ON(!bfq_bfqq_busy(bfqq)); - BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); -@@ -1146,27 +1984,18 @@ static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, - BUG_ON(bfqd->busy_queues == 0); - bfqd->busy_queues--; - -- if (!bfqq->dispatched) { -+ if (!bfqq->dispatched) - bfq_weights_tree_remove(bfqd, &bfqq->entity, - &bfqd->queue_weights_tree); -- if (!blk_queue_nonrot(bfqd->queue)) { -- BUG_ON(!bfqd->busy_in_flight_queues); -- bfqd->busy_in_flight_queues--; -- if (bfq_bfqq_constantly_seeky(bfqq)) { -- BUG_ON(!bfqd-> -- const_seeky_busy_in_flight_queues); -- bfqd->const_seeky_busy_in_flight_queues--; -- } -- } -- } -+ - if (bfqq->wr_coeff > 1) - bfqd->wr_busy_queues--; - --#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_dequeue(bfqq_group(bfqq)); --#endif - -- bfq_deactivate_bfqq(bfqd, bfqq, requeue); -+ BUG_ON(bfqq->entity.budget < 0); -+ -+ bfq_deactivate_bfqq(bfqd, bfqq, true, expiration); - } - - /* -@@ -1184,16 +2013,11 @@ static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) - bfq_mark_bfqq_busy(bfqq); - bfqd->busy_queues++; - -- if (!bfqq->dispatched) { -+ if (!bfqq->dispatched) - if (bfqq->wr_coeff == 1) - bfq_weights_tree_add(bfqd, &bfqq->entity, - &bfqd->queue_weights_tree); -- if (!blk_queue_nonrot(bfqd->queue)) { -- bfqd->busy_in_flight_queues++; -- if (bfq_bfqq_constantly_seeky(bfqq)) -- bfqd->const_seeky_busy_in_flight_queues++; -- } -- } -+ - if (bfqq->wr_coeff > 1) - bfqd->wr_busy_queues++; - } -diff --git a/block/bfq.h b/block/bfq.h -index fcce855..5f08990 100644 ---- a/block/bfq.h -+++ b/block/bfq.h -@@ -1,5 +1,5 @@ - /* -- * BFQ-v7r11 for 4.5.0: data structures and common functions prototypes. -+ * BFQ v8r11 for 4.11.0: data structures and common functions prototypes. - * - * Based on ideas and code from CFQ: - * Copyright (C) 2003 Jens Axboe -@@ -7,7 +7,9 @@ - * Copyright (C) 2008 Fabio Checconi - * Paolo Valente - * -- * Copyright (C) 2010 Paolo Valente -+ * Copyright (C) 2015 Paolo Valente -+ * -+ * Copyright (C) 2017 Paolo Valente - */ - - #ifndef _BFQ_H -@@ -15,8 +17,6 @@ - - #include - #include --#include --#include - #include - - #define BFQ_IOPRIO_CLASSES 3 -@@ -28,20 +28,21 @@ - - #define BFQ_DEFAULT_QUEUE_IOPRIO 4 - --#define BFQ_DEFAULT_GRP_WEIGHT 10 -+#define BFQ_WEIGHT_LEGACY_DFL 100 - #define BFQ_DEFAULT_GRP_IOPRIO 0 - #define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE - -+/* -+ * Soft real-time applications are extremely more latency sensitive -+ * than interactive ones. Over-raise the weight of the former to -+ * privilege them against the latter. -+ */ -+#define BFQ_SOFTRT_WEIGHT_FACTOR 100 -+ - struct bfq_entity; - - /** - * struct bfq_service_tree - per ioprio_class service tree. -- * @active: tree for active entities (i.e., those backlogged). -- * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i). -- * @first_idle: idle entity with minimum F_i. -- * @last_idle: idle entity with maximum F_i. -- * @vtime: scheduler virtual time. -- * @wsum: scheduler weight sum; active and idle entities contribute to it. - * - * Each service tree represents a B-WF2Q+ scheduler on its own. Each - * ioprio_class has its own independent scheduler, and so its own -@@ -49,27 +50,28 @@ struct bfq_entity; - * of the containing bfqd. 
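
The over-raising mentioned in the BFQ_SOFTRT_WEIGHT_FACTOR comment can be pictured as follows, under the assumption that a queue is scheduled with its original weight multiplied by the weight-raising coefficient and that, for soft real-time queues, the coefficient is further scaled by the factor; the numbers are illustrative, not the kernel defaults:

#define TOY_SOFTRT_WEIGHT_FACTOR	100

static unsigned int toy_effective_weight(unsigned int orig_weight,
					 unsigned int wr_coeff, int soft_rt)
{
	if (soft_rt)
		wr_coeff *= TOY_SOFTRT_WEIGHT_FACTOR;	/* over-raise soft rt */

	return orig_weight * wr_coeff;	/* weight actually used by B-WF2Q+ */
}
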
- */ - struct bfq_service_tree { -+ /* tree for active entities (i.e., those backlogged) */ - struct rb_root active; -+ /* tree for idle entities (i.e., not backlogged, with V <= F_i)*/ - struct rb_root idle; - -- struct bfq_entity *first_idle; -- struct bfq_entity *last_idle; -+ struct bfq_entity *first_idle; /* idle entity with minimum F_i */ -+ struct bfq_entity *last_idle; /* idle entity with maximum F_i */ - -- u64 vtime; -+ u64 vtime; /* scheduler virtual time */ -+ /* scheduler weight sum; active and idle entities contribute to it */ - unsigned long wsum; - }; - - /** - * struct bfq_sched_data - multi-class scheduler. -- * @in_service_entity: entity in service. -- * @next_in_service: head-of-the-line entity in the scheduler. -- * @service_tree: array of service trees, one per ioprio_class. - * - * bfq_sched_data is the basic scheduler queue. It supports three -- * ioprio_classes, and can be used either as a toplevel queue or as -- * an intermediate queue on a hierarchical setup. -- * @next_in_service points to the active entity of the sched_data -- * service trees that will be scheduled next. -+ * ioprio_classes, and can be used either as a toplevel queue or as an -+ * intermediate queue on a hierarchical setup. @next_in_service -+ * points to the active entity of the sched_data service trees that -+ * will be scheduled next. It is used to reduce the number of steps -+ * needed for each hierarchical-schedule update. - * - * The supported ioprio_classes are the same as in CFQ, in descending - * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. -@@ -79,48 +81,32 @@ struct bfq_service_tree { - * All the fields are protected by the queue lock of the containing bfqd. - */ - struct bfq_sched_data { -- struct bfq_entity *in_service_entity; -+ struct bfq_entity *in_service_entity; /* entity in service */ -+ /* head-of-the-line entity in the scheduler (see comments above) */ - struct bfq_entity *next_in_service; -+ /* array of service trees, one per ioprio_class */ - struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; -+ /* last time CLASS_IDLE was served */ -+ unsigned long bfq_class_idle_last_service; -+ - }; - - /** - * struct bfq_weight_counter - counter of the number of all active entities - * with a given weight. -- * @weight: weight of the entities that this counter refers to. -- * @num_active: number of active entities with this weight. -- * @weights_node: weights tree member (see bfq_data's @queue_weights_tree -- * and @group_weights_tree). - */ - struct bfq_weight_counter { -- short int weight; -- unsigned int num_active; -+ unsigned int weight; /* weight of the entities this counter refers to */ -+ unsigned int num_active; /* nr of active entities with this weight */ -+ /* -+ * Weights tree member (see bfq_data's @queue_weights_tree and -+ * @group_weights_tree) -+ */ - struct rb_node weights_node; - }; - - /** - * struct bfq_entity - schedulable entity. -- * @rb_node: service_tree member. -- * @weight_counter: pointer to the weight counter associated with this entity. -- * @on_st: flag, true if the entity is on a tree (either the active or -- * the idle one of its service_tree). -- * @finish: B-WF2Q+ finish timestamp (aka F_i). -- * @start: B-WF2Q+ start timestamp (aka S_i). -- * @tree: tree the entity is enqueued into; %NULL if not on a tree. -- * @min_start: minimum start time of the (active) subtree rooted at -- * this entity; used for O(log N) lookups into active trees. -- * @service: service received during the last round of service. 
-- * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight. -- * @weight: weight of the queue -- * @parent: parent entity, for hierarchical scheduling. -- * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the -- * associated scheduler queue, %NULL on leaf nodes. -- * @sched_data: the scheduler queue this entity belongs to. -- * @ioprio: the ioprio in use. -- * @new_weight: when a weight change is requested, the new weight value. -- * @orig_weight: original weight, used to implement weight boosting -- * @prio_changed: flag, true when the user requested a weight, ioprio or -- * ioprio_class change. - * - * A bfq_entity is used to represent either a bfq_queue (leaf node in the - * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each -@@ -147,27 +133,52 @@ struct bfq_weight_counter { - * containing bfqd. - */ - struct bfq_entity { -- struct rb_node rb_node; -+ struct rb_node rb_node; /* service_tree member */ -+ /* pointer to the weight counter associated with this entity */ - struct bfq_weight_counter *weight_counter; - -- int on_st; -+ /* -+ * Flag, true if the entity is on a tree (either the active or -+ * the idle one of its service_tree) or is in service. -+ */ -+ bool on_st; - -- u64 finish; -- u64 start; -+ u64 finish; /* B-WF2Q+ finish timestamp (aka F_i) */ -+ u64 start; /* B-WF2Q+ start timestamp (aka S_i) */ - -+ /* tree the entity is enqueued into; %NULL if not on a tree */ - struct rb_root *tree; - -+ /* -+ * minimum start time of the (active) subtree rooted at this -+ * entity; used for O(log N) lookups into active trees -+ */ - u64 min_start; - -- int service, budget; -- unsigned short weight, new_weight; -- unsigned short orig_weight; -+ /* amount of service received during the last service slot */ -+ int service; -+ -+ /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */ -+ int budget; -+ -+ unsigned int weight; /* weight of the queue */ -+ unsigned int new_weight; /* next weight if a change is in progress */ -+ -+ /* original weight, used to implement weight boosting */ -+ unsigned int orig_weight; - -+ /* parent entity, for hierarchical scheduling */ - struct bfq_entity *parent; - -+ /* -+ * For non-leaf nodes in the hierarchy, the associated -+ * scheduler queue, %NULL on leaf nodes. -+ */ - struct bfq_sched_data *my_sched_data; -+ /* the scheduler queue this entity belongs to */ - struct bfq_sched_data *sched_data; - -+ /* flag, set to request a weight, ioprio or ioprio_class change */ - int prio_changed; - }; - -@@ -175,56 +186,6 @@ struct bfq_group; - - /** - * struct bfq_queue - leaf schedulable entity. -- * @ref: reference counter. -- * @bfqd: parent bfq_data. -- * @new_ioprio: when an ioprio change is requested, the new ioprio value. -- * @ioprio_class: the ioprio_class in use. -- * @new_ioprio_class: when an ioprio_class change is requested, the new -- * ioprio_class value. -- * @new_bfqq: shared bfq_queue if queue is cooperating with -- * one or more other queues. -- * @pos_node: request-position tree member (see bfq_group's @rq_pos_tree). -- * @pos_root: request-position tree root (see bfq_group's @rq_pos_tree). -- * @sort_list: sorted list of pending requests. -- * @next_rq: if fifo isn't expired, next request to serve. -- * @queued: nr of requests queued in @sort_list. -- * @allocated: currently allocated requests. -- * @meta_pending: pending metadata requests. -- * @fifo: fifo list of requests in sort_list. -- * @entity: entity representing this queue in the scheduler. 
-- * @max_budget: maximum budget allowed from the feedback mechanism. -- * @budget_timeout: budget expiration (in jiffies). -- * @dispatched: number of requests on the dispatch list or inside driver. -- * @flags: status flags. -- * @bfqq_list: node for active/idle bfqq list inside our bfqd. -- * @burst_list_node: node for the device's burst list. -- * @seek_samples: number of seeks sampled -- * @seek_total: sum of the distances of the seeks sampled -- * @seek_mean: mean seek distance -- * @last_request_pos: position of the last request enqueued -- * @requests_within_timer: number of consecutive pairs of request completion -- * and arrival, such that the queue becomes idle -- * after the completion, but the next request arrives -- * within an idle time slice; used only if the queue's -- * IO_bound has been cleared. -- * @pid: pid of the process owning the queue, used for logging purposes. -- * @last_wr_start_finish: start time of the current weight-raising period if -- * the @bfq-queue is being weight-raised, otherwise -- * finish time of the last weight-raising period -- * @wr_cur_max_time: current max raising time for this queue -- * @soft_rt_next_start: minimum time instant such that, only if a new -- * request is enqueued after this time instant in an -- * idle @bfq_queue with no outstanding requests, then -- * the task associated with the queue it is deemed as -- * soft real-time (see the comments to the function -- * bfq_bfqq_softrt_next_start()) -- * @last_idle_bklogged: time of the last transition of the @bfq_queue from -- * idle to backlogged -- * @service_from_backlogged: cumulative service received from the @bfq_queue -- * since the last transition from idle to -- * backlogged -- * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the -- * queue is shared - * - * A bfq_queue is a leaf request queue; it can be associated with an - * io_context or more, if it is async or shared between cooperating -@@ -235,117 +196,175 @@ struct bfq_group; - * All the fields are protected by the queue lock of the containing bfqd. - */ - struct bfq_queue { -- atomic_t ref; -+ /* reference counter */ -+ int ref; -+ /* parent bfq_data */ - struct bfq_data *bfqd; - -- unsigned short ioprio, new_ioprio; -- unsigned short ioprio_class, new_ioprio_class; -+ /* current ioprio and ioprio class */ -+ unsigned short ioprio, ioprio_class; -+ /* next ioprio and ioprio class if a change is in progress */ -+ unsigned short new_ioprio, new_ioprio_class; - -- /* fields for cooperating queues handling */ -+ /* -+ * Shared bfq_queue if queue is cooperating with one or more -+ * other queues. 
-+ */ - struct bfq_queue *new_bfqq; -+ /* request-position tree member (see bfq_group's @rq_pos_tree) */ - struct rb_node pos_node; -+ /* request-position tree root (see bfq_group's @rq_pos_tree) */ - struct rb_root *pos_root; - -+ /* sorted list of pending requests */ - struct rb_root sort_list; -+ /* if fifo isn't expired, next request to serve */ - struct request *next_rq; -+ /* number of sync and async requests queued */ - int queued[2]; -+ /* number of sync and async requests currently allocated */ - int allocated[2]; -+ /* number of pending metadata requests */ - int meta_pending; -+ /* fifo list of requests in sort_list */ - struct list_head fifo; - -+ /* entity representing this queue in the scheduler */ - struct bfq_entity entity; - -+ /* maximum budget allowed from the feedback mechanism */ - int max_budget; -+ /* budget expiration (in jiffies) */ - unsigned long budget_timeout; - -+ /* number of requests on the dispatch list or inside driver */ - int dispatched; - -- unsigned int flags; -+ unsigned int flags; /* status flags.*/ - -+ /* node for active/idle bfqq list inside parent bfqd */ - struct list_head bfqq_list; - -+ /* bit vector: a 1 for each seeky requests in history */ -+ u32 seek_history; -+ -+ /* node for the device's burst list */ - struct hlist_node burst_list_node; - -- unsigned int seek_samples; -- u64 seek_total; -- sector_t seek_mean; -+ /* position of the last request enqueued */ - sector_t last_request_pos; - -+ /* Number of consecutive pairs of request completion and -+ * arrival, such that the queue becomes idle after the -+ * completion, but the next request arrives within an idle -+ * time slice; used only if the queue's IO_bound flag has been -+ * cleared. -+ */ - unsigned int requests_within_timer; - -+ /* pid of the process owning the queue, used for logging purposes */ - pid_t pid; -+ -+ /* -+ * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL -+ * if the queue is shared. -+ */ - struct bfq_io_cq *bic; - -- /* weight-raising fields */ -+ /* current maximum weight-raising time for this queue */ - unsigned long wr_cur_max_time; -+ /* -+ * Minimum time instant such that, only if a new request is -+ * enqueued after this time instant in an idle @bfq_queue with -+ * no outstanding requests, then the task associated with the -+ * queue it is deemed as soft real-time (see the comments on -+ * the function bfq_bfqq_softrt_next_start()) -+ */ - unsigned long soft_rt_next_start; -+ /* -+ * Start time of the current weight-raising period if -+ * the @bfq-queue is being weight-raised, otherwise -+ * finish time of the last weight-raising period. -+ */ - unsigned long last_wr_start_finish; -+ /* factor by which the weight of this queue is multiplied */ - unsigned int wr_coeff; -+ /* -+ * Time of the last transition of the @bfq_queue from idle to -+ * backlogged. -+ */ - unsigned long last_idle_bklogged; -+ /* -+ * Cumulative service received from the @bfq_queue since the -+ * last transition from idle to backlogged. -+ */ - unsigned long service_from_backlogged; -+ /* -+ * Value of wr start time when switching to soft rt -+ */ -+ unsigned long wr_start_at_switch_to_srt; -+ -+ unsigned long split_time; /* time of last split */ - }; - - /** - * struct bfq_ttime - per process thinktime stats. 
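
The seek_history bit vector declared a few lines above lends itself to a simple reading: one bit is shifted in per request (1 for a seeky request), and the queue is treated as seeky when enough recent bits are set. A user-space sketch of that idea; the window width and threshold are arbitrary example values, not the ones BFQ uses, and __builtin_popcount is a GCC/Clang builtin:

#include <stdint.h>

#define TOY_SEEKY_THRESH	19	/* example: more than half of 32 bits */

static inline void toy_record_request(uint32_t *history, int seeky)
{
	*history = (*history << 1) | (seeky ? 1u : 0u);
}

static inline int toy_queue_is_seeky(uint32_t history)
{
	return __builtin_popcount(history) > TOY_SEEKY_THRESH;
}
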
-- * @ttime_total: total process thinktime -- * @ttime_samples: number of thinktime samples -- * @ttime_mean: average process thinktime - */ - struct bfq_ttime { -- unsigned long last_end_request; -+ u64 last_end_request; /* completion time of last request */ -+ -+ u64 ttime_total; /* total process thinktime */ -+ unsigned long ttime_samples; /* number of thinktime samples */ -+ u64 ttime_mean; /* average process thinktime */ - -- unsigned long ttime_total; -- unsigned long ttime_samples; -- unsigned long ttime_mean; - }; - - /** - * struct bfq_io_cq - per (request_queue, io_context) structure. -- * @icq: associated io_cq structure -- * @bfqq: array of two process queues, the sync and the async -- * @ttime: associated @bfq_ttime struct -- * @ioprio: per (request_queue, blkcg) ioprio. -- * @blkcg_id: id of the blkcg the related io_cq belongs to. -- * @wr_time_left: snapshot of the time left before weight raising ends -- * for the sync queue associated to this process; this -- * snapshot is taken to remember this value while the weight -- * raising is suspended because the queue is merged with a -- * shared queue, and is used to set @raising_cur_max_time -- * when the queue is split from the shared queue and its -- * weight is raised again -- * @saved_idle_window: same purpose as the previous field for the idle -- * window -- * @saved_IO_bound: same purpose as the previous two fields for the I/O -- * bound classification of a queue -- * @saved_in_large_burst: same purpose as the previous fields for the -- * value of the field keeping the queue's belonging -- * to a large burst -- * @was_in_burst_list: true if the queue belonged to a burst list -- * before its merge with another cooperating queue -- * @cooperations: counter of consecutive successful queue merges underwent -- * by any of the process' @bfq_queues -- * @failed_cooperations: counter of consecutive failed queue merges of any -- * of the process' @bfq_queues - */ - struct bfq_io_cq { -+ /* associated io_cq structure */ - struct io_cq icq; /* must be the first member */ -+ /* array of two process queues, the sync and the async */ - struct bfq_queue *bfqq[2]; -+ /* associated @bfq_ttime struct */ - struct bfq_ttime ttime; -+ /* per (request_queue, blkcg) ioprio */ - int ioprio; -- - #ifdef CONFIG_BFQ_GROUP_IOSCHED -- uint64_t blkcg_id; /* the current blkcg ID */ -+ uint64_t blkcg_serial_nr; /* the current blkcg serial */ - #endif - -- unsigned int wr_time_left; -+ /* -+ * Snapshot of the idle window before merging; taken to -+ * remember this value while the queue is merged, so as to be -+ * able to restore it in case of split. -+ */ - bool saved_idle_window; -+ /* -+ * Same purpose as the previous two fields for the I/O bound -+ * classification of a queue. -+ */ - bool saved_IO_bound; - -+ /* -+ * Same purpose as the previous fields for the value of the -+ * field keeping the queue's belonging to a large burst -+ */ - bool saved_in_large_burst; -+ /* -+ * True if the queue belonged to a burst list before its merge -+ * with another cooperating queue. -+ */ - bool was_in_burst_list; - -- unsigned int cooperations; -- unsigned int failed_cooperations; -+ /* -+ * Similar to previous fields: save wr information. -+ */ -+ unsigned long saved_wr_coeff; -+ unsigned long saved_last_wr_start_finish; -+ unsigned long saved_wr_start_at_switch_to_srt; -+ unsigned int saved_wr_cur_max_time; - }; - - enum bfq_device_speed { -@@ -354,224 +373,234 @@ enum bfq_device_speed { - }; - - /** -- * struct bfq_data - per device data structure. 
-- * @queue: request queue for the managed device. -- * @root_group: root bfq_group for the device. -- * @active_numerous_groups: number of bfq_groups containing more than one -- * active @bfq_entity. -- * @queue_weights_tree: rbtree of weight counters of @bfq_queues, sorted by -- * weight. Used to keep track of whether all @bfq_queues -- * have the same weight. The tree contains one counter -- * for each distinct weight associated to some active -- * and not weight-raised @bfq_queue (see the comments to -- * the functions bfq_weights_tree_[add|remove] for -- * further details). -- * @group_weights_tree: rbtree of non-queue @bfq_entity weight counters, sorted -- * by weight. Used to keep track of whether all -- * @bfq_groups have the same weight. The tree contains -- * one counter for each distinct weight associated to -- * some active @bfq_group (see the comments to the -- * functions bfq_weights_tree_[add|remove] for further -- * details). -- * @busy_queues: number of bfq_queues containing requests (including the -- * queue in service, even if it is idling). -- * @busy_in_flight_queues: number of @bfq_queues containing pending or -- * in-flight requests, plus the @bfq_queue in -- * service, even if idle but waiting for the -- * possible arrival of its next sync request. This -- * field is updated only if the device is rotational, -- * but used only if the device is also NCQ-capable. -- * The reason why the field is updated also for non- -- * NCQ-capable rotational devices is related to the -- * fact that the value of @hw_tag may be set also -- * later than when busy_in_flight_queues may need to -- * be incremented for the first time(s). Taking also -- * this possibility into account, to avoid unbalanced -- * increments/decrements, would imply more overhead -- * than just updating busy_in_flight_queues -- * regardless of the value of @hw_tag. -- * @const_seeky_busy_in_flight_queues: number of constantly-seeky @bfq_queues -- * (that is, seeky queues that expired -- * for budget timeout at least once) -- * containing pending or in-flight -- * requests, including the in-service -- * @bfq_queue if constantly seeky. This -- * field is updated only if the device -- * is rotational, but used only if the -- * device is also NCQ-capable (see the -- * comments to @busy_in_flight_queues). -- * @wr_busy_queues: number of weight-raised busy @bfq_queues. -- * @queued: number of queued requests. -- * @rq_in_driver: number of requests dispatched and waiting for completion. -- * @sync_flight: number of sync requests in the driver. -- * @max_rq_in_driver: max number of reqs in driver in the last -- * @hw_tag_samples completed requests. -- * @hw_tag_samples: nr of samples used to calculate hw_tag. -- * @hw_tag: flag set to one if the driver is showing a queueing behavior. -- * @budgets_assigned: number of budgets assigned. -- * @idle_slice_timer: timer set when idling for the next sequential request -- * from the queue in service. -- * @unplug_work: delayed work to restart dispatching on the request queue. -- * @in_service_queue: bfq_queue in service. -- * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue. -- * @last_position: on-disk position of the last served request. -- * @last_budget_start: beginning of the last budget. -- * @last_idling_start: beginning of the last idle slice. -- * @peak_rate: peak transfer rate observed for a budget. -- * @peak_rate_samples: number of samples used to calculate @peak_rate. 
-- * @bfq_max_budget: maximum budget allotted to a bfq_queue before -- * rescheduling. -- * @active_list: list of all the bfq_queues active on the device. -- * @idle_list: list of all the bfq_queues idle on the device. -- * @bfq_fifo_expire: timeout for async/sync requests; when it expires -- * requests are served in fifo order. -- * @bfq_back_penalty: weight of backward seeks wrt forward ones. -- * @bfq_back_max: maximum allowed backward seek. -- * @bfq_slice_idle: maximum idling time. -- * @bfq_user_max_budget: user-configured max budget value -- * (0 for auto-tuning). -- * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to -- * async queues. -- * @bfq_timeout: timeout for bfq_queues to consume their budget; used to -- * to prevent seeky queues to impose long latencies to well -- * behaved ones (this also implies that seeky queues cannot -- * receive guarantees in the service domain; after a timeout -- * they are charged for the whole allocated budget, to try -- * to preserve a behavior reasonably fair among them, but -- * without service-domain guarantees). -- * @bfq_coop_thresh: number of queue merges after which a @bfq_queue is -- * no more granted any weight-raising. -- * @bfq_failed_cooperations: number of consecutive failed cooperation -- * chances after which weight-raising is restored -- * to a queue subject to more than bfq_coop_thresh -- * queue merges. -- * @bfq_requests_within_timer: number of consecutive requests that must be -- * issued within the idle time slice to set -- * again idling to a queue which was marked as -- * non-I/O-bound (see the definition of the -- * IO_bound flag for further details). -- * @last_ins_in_burst: last time at which a queue entered the current -- * burst of queues being activated shortly after -- * each other; for more details about this and the -- * following parameters related to a burst of -- * activations, see the comments to the function -- * @bfq_handle_burst. -- * @bfq_burst_interval: reference time interval used to decide whether a -- * queue has been activated shortly after -- * @last_ins_in_burst. -- * @burst_size: number of queues in the current burst of queue activations. -- * @bfq_large_burst_thresh: maximum burst size above which the current -- * queue-activation burst is deemed as 'large'. -- * @large_burst: true if a large queue-activation burst is in progress. -- * @burst_list: head of the burst list (as for the above fields, more details -- * in the comments to the function bfq_handle_burst). -- * @low_latency: if set to true, low-latency heuristics are enabled. -- * @bfq_wr_coeff: maximum factor by which the weight of a weight-raised -- * queue is multiplied. -- * @bfq_wr_max_time: maximum duration of a weight-raising period (jiffies). -- * @bfq_wr_rt_max_time: maximum duration for soft real-time processes. -- * @bfq_wr_min_idle_time: minimum idle period after which weight-raising -- * may be reactivated for a queue (in jiffies). -- * @bfq_wr_min_inter_arr_async: minimum period between request arrivals -- * after which weight-raising may be -- * reactivated for an already busy queue -- * (in jiffies). -- * @bfq_wr_max_softrt_rate: max service-rate for a soft real-time queue, -- * sectors per seconds. -- * @RT_prod: cached value of the product R*T used for computing the maximum -- * duration of the weight raising automatically. -- * @device_speed: device-speed class for the low-latency heuristic. -- * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions. 
-+ * struct bfq_data - per-device data structure. - * - * All the fields are protected by the @queue lock. - */ - struct bfq_data { -+ /* request queue for the device */ - struct request_queue *queue; - -+ /* root bfq_group for the device */ - struct bfq_group *root_group; - --#ifdef CONFIG_BFQ_GROUP_IOSCHED -- int active_numerous_groups; --#endif -- -+ /* -+ * rbtree of weight counters of @bfq_queues, sorted by -+ * weight. Used to keep track of whether all @bfq_queues have -+ * the same weight. The tree contains one counter for each -+ * distinct weight associated to some active and not -+ * weight-raised @bfq_queue (see the comments to the functions -+ * bfq_weights_tree_[add|remove] for further details). -+ */ - struct rb_root queue_weights_tree; -+ /* -+ * rbtree of non-queue @bfq_entity weight counters, sorted by -+ * weight. Used to keep track of whether all @bfq_groups have -+ * the same weight. The tree contains one counter for each -+ * distinct weight associated to some active @bfq_group (see -+ * the comments to the functions bfq_weights_tree_[add|remove] -+ * for further details). -+ */ - struct rb_root group_weights_tree; - -+ /* -+ * Number of bfq_queues containing requests (including the -+ * queue in service, even if it is idling). -+ */ - int busy_queues; -- int busy_in_flight_queues; -- int const_seeky_busy_in_flight_queues; -+ /* number of weight-raised busy @bfq_queues */ - int wr_busy_queues; -+ /* number of queued requests */ - int queued; -+ /* number of requests dispatched and waiting for completion */ - int rq_in_driver; -- int sync_flight; - -+ /* -+ * Maximum number of requests in driver in the last -+ * @hw_tag_samples completed requests. -+ */ - int max_rq_in_driver; -+ /* number of samples used to calculate hw_tag */ - int hw_tag_samples; -+ /* flag set to one if the driver is showing a queueing behavior */ - int hw_tag; - -+ /* number of budgets assigned */ - int budgets_assigned; - -- struct timer_list idle_slice_timer; -+ /* -+ * Timer set when idling (waiting) for the next request from -+ * the queue in service. -+ */ -+ struct hrtimer idle_slice_timer; -+ /* delayed work to restart dispatching on the request queue */ - struct work_struct unplug_work; - -+ /* bfq_queue in service */ - struct bfq_queue *in_service_queue; -+ /* candidate bfq_queue to become the next in-service queue */ -+ struct bfq_queue *next_in_service_queue; -+ /* bfq_io_cq (bic) associated with the @in_service_queue */ - struct bfq_io_cq *in_service_bic; - -+ /* on-disk position of the last served request */ - sector_t last_position; - -+ /* time of last request completion (ns) */ -+ u64 last_completion; -+ -+ /* time of first rq dispatch in current observation interval (ns) */ -+ u64 first_dispatch; -+ /* time of last rq dispatch in current observation interval (ns) */ -+ u64 last_dispatch; -+ -+ /* beginning of the last budget */ - ktime_t last_budget_start; -+ /* beginning of the last idle slice */ - ktime_t last_idling_start; -+ -+ /* number of samples in current observation interval */ - int peak_rate_samples; -- u64 peak_rate; -+ /* num of samples of seq dispatches in current observation interval */ -+ u32 sequential_samples; -+ /* total num of sectors transferred in current observation interval */ -+ u64 tot_sectors_dispatched; -+ /* max rq size seen during current observation interval (sectors) */ -+ u32 last_rq_max_size; -+ /* time elapsed from first dispatch in current observ. 
interval (us) */ -+ u64 delta_from_first; -+ /* current estimate of device peak rate */ -+ u32 peak_rate; -+ -+ /* maximum budget allotted to a bfq_queue before rescheduling */ - int bfq_max_budget; - -+ /* list of all the bfq_queues active on the device */ - struct list_head active_list; -+ /* list of all the bfq_queues idle on the device */ - struct list_head idle_list; - -- unsigned int bfq_fifo_expire[2]; -+ /* -+ * Timeout for async/sync requests; when it fires, requests -+ * are served in fifo order. -+ */ -+ u64 bfq_fifo_expire[2]; -+ /* weight of backward seeks wrt forward ones */ - unsigned int bfq_back_penalty; -+ /* maximum allowed backward seek */ - unsigned int bfq_back_max; -- unsigned int bfq_slice_idle; -- u64 bfq_class_idle_last_service; -+ /* maximum idling time */ -+ u32 bfq_slice_idle; - -+ /* user-configured max budget value (0 for auto-tuning) */ - int bfq_user_max_budget; -- int bfq_max_budget_async_rq; -- unsigned int bfq_timeout[2]; -- -- unsigned int bfq_coop_thresh; -- unsigned int bfq_failed_cooperations; -+ /* -+ * Timeout for bfq_queues to consume their budget; used to -+ * prevent seeky queues from imposing long latencies to -+ * sequential or quasi-sequential ones (this also implies that -+ * seeky queues cannot receive guarantees in the service -+ * domain; after a timeout they are charged for the time they -+ * have been in service, to preserve fairness among them, but -+ * without service-domain guarantees). -+ */ -+ unsigned int bfq_timeout; -+ -+ /* -+ * Number of consecutive requests that must be issued within -+ * the idle time slice to set again idling to a queue which -+ * was marked as non-I/O-bound (see the definition of the -+ * IO_bound flag for further details). -+ */ - unsigned int bfq_requests_within_timer; - -+ /* -+ * Force device idling whenever needed to provide accurate -+ * service guarantees, without caring about throughput -+ * issues. CAVEAT: this may even increase latencies, in case -+ * of useless idling for processes that did stop doing I/O. -+ */ -+ bool strict_guarantees; -+ -+ /* -+ * Last time at which a queue entered the current burst of -+ * queues being activated shortly after each other; for more -+ * details about this and the following parameters related to -+ * a burst of activations, see the comments on the function -+ * bfq_handle_burst. -+ */ - unsigned long last_ins_in_burst; -+ /* -+ * Reference time interval used to decide whether a queue has -+ * been activated shortly after @last_ins_in_burst. -+ */ - unsigned long bfq_burst_interval; -+ /* number of queues in the current burst of queue activations */ - int burst_size; -+ -+ /* common parent entity for the queues in the burst */ -+ struct bfq_entity *burst_parent_entity; -+ /* Maximum burst size above which the current queue-activation -+ * burst is deemed as 'large'. -+ */ - unsigned long bfq_large_burst_thresh; -+ /* true if a large queue-activation burst is in progress */ - bool large_burst; -+ /* -+ * Head of the burst list (as for the above fields, more -+ * details in the comments on the function bfq_handle_burst). -+ */ - struct hlist_head burst_list; - -+ /* if set to true, low-latency heuristics are enabled */ - bool low_latency; -- -- /* parameters of the low_latency heuristics */ -+ /* -+ * Maximum factor by which the weight of a weight-raised queue -+ * is multiplied. 
-+ */ - unsigned int bfq_wr_coeff; -+ /* maximum duration of a weight-raising period (jiffies) */ - unsigned int bfq_wr_max_time; -+ -+ /* Maximum weight-raising duration for soft real-time processes */ - unsigned int bfq_wr_rt_max_time; -+ /* -+ * Minimum idle period after which weight-raising may be -+ * reactivated for a queue (in jiffies). -+ */ - unsigned int bfq_wr_min_idle_time; -+ /* -+ * Minimum period between request arrivals after which -+ * weight-raising may be reactivated for an already busy async -+ * queue (in jiffies). -+ */ - unsigned long bfq_wr_min_inter_arr_async; -+ -+ /* Max service-rate for a soft real-time queue, in sectors/sec */ - unsigned int bfq_wr_max_softrt_rate; -+ /* -+ * Cached value of the product R*T, used for computing the -+ * maximum duration of weight raising automatically. -+ */ - u64 RT_prod; -+ /* device-speed class for the low-latency heuristic */ - enum bfq_device_speed device_speed; - -+ /* fallback dummy bfqq for extreme OOM conditions */ - struct bfq_queue oom_bfqq; - }; - - enum bfqq_state_flags { -- BFQ_BFQQ_FLAG_busy = 0, /* has requests or is in service */ -+ BFQ_BFQQ_FLAG_just_created = 0, /* queue just allocated */ -+ BFQ_BFQQ_FLAG_busy, /* has requests or is in service */ - BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ -+ BFQ_BFQQ_FLAG_non_blocking_wait_rq, /* -+ * waiting for a request -+ * without idling the device -+ */ - BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ - BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ - BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */ - BFQ_BFQQ_FLAG_sync, /* synchronous queue */ -- BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */ - BFQ_BFQQ_FLAG_IO_bound, /* - * bfqq has timed-out at least once - * having consumed at most 2/10 of -@@ -581,17 +610,12 @@ enum bfqq_state_flags { - * bfqq activated in a large burst, - * see comments to bfq_handle_burst. - */ -- BFQ_BFQQ_FLAG_constantly_seeky, /* -- * bfqq has proved to be slow and -- * seeky until budget timeout -- */ - BFQ_BFQQ_FLAG_softrt_update, /* - * may need softrt-next-start - * update - */ - BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ -- BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */ -- BFQ_BFQQ_FLAG_just_split, /* queue has just been split */ -+ BFQ_BFQQ_FLAG_split_coop /* shared bfqq will be split */ - }; - - #define BFQ_BFQQ_FNS(name) \ -@@ -608,28 +632,94 @@ static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ - return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ - } - -+BFQ_BFQQ_FNS(just_created); - BFQ_BFQQ_FNS(busy); - BFQ_BFQQ_FNS(wait_request); -+BFQ_BFQQ_FNS(non_blocking_wait_rq); - BFQ_BFQQ_FNS(must_alloc); - BFQ_BFQQ_FNS(fifo_expire); - BFQ_BFQQ_FNS(idle_window); - BFQ_BFQQ_FNS(sync); --BFQ_BFQQ_FNS(budget_new); - BFQ_BFQQ_FNS(IO_bound); - BFQ_BFQQ_FNS(in_large_burst); --BFQ_BFQQ_FNS(constantly_seeky); - BFQ_BFQQ_FNS(coop); - BFQ_BFQQ_FNS(split_coop); --BFQ_BFQQ_FNS(just_split); - BFQ_BFQQ_FNS(softrt_update); - #undef BFQ_BFQQ_FNS - - /* Logging facilities. */ --#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ -- blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args) -+#ifdef CONFIG_BFQ_REDIRECT_TO_CONSOLE -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); -+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) 
do { \ -+ char __pbuf[128]; \ -+ \ -+ assert_spin_locked((bfqd)->queue->queue_lock); \ -+ blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ -+ pr_crit("bfq%d%c %s " fmt "\n", \ -+ (bfqq)->pid, \ -+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ __pbuf, ##args); \ -+} while (0) -+ -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ -+ char __pbuf[128]; \ -+ \ -+ blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ -+ pr_crit("%s " fmt "\n", __pbuf, ##args); \ -+} while (0) -+ -+#else /* CONFIG_BFQ_GROUP_IOSCHED */ -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ -+ pr_crit("bfq%d%c " fmt "\n", (bfqq)->pid, \ -+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ ##args) -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) -+ -+#endif /* CONFIG_BFQ_GROUP_IOSCHED */ -+ -+#define bfq_log(bfqd, fmt, args...) \ -+ pr_crit("bfq " fmt "\n", ##args) -+ -+#else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); -+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ -+ char __pbuf[128]; \ -+ \ -+ assert_spin_locked((bfqd)->queue->queue_lock); \ -+ blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ -+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \ -+ (bfqq)->pid, \ -+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ __pbuf, ##args); \ -+} while (0) -+ -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ -+ char __pbuf[128]; \ -+ \ -+ blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ -+ blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \ -+} while (0) -+ -+#else /* CONFIG_BFQ_GROUP_IOSCHED */ -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ -+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \ -+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ ##args) -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) -+ -+#endif /* CONFIG_BFQ_GROUP_IOSCHED */ - - #define bfq_log(bfqd, fmt, args...) \ - blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) -+#endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ - - /* Expiration reasons. 
*/ - enum bfqq_expiration { -@@ -640,15 +730,12 @@ enum bfqq_expiration { - BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ - BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ - BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ -+ BFQ_BFQQ_PREEMPTED /* preemption in progress */ - }; - --#ifdef CONFIG_BFQ_GROUP_IOSCHED - - struct bfqg_stats { -- /* total bytes transferred */ -- struct blkg_rwstat service_bytes; -- /* total IOs serviced, post merge */ -- struct blkg_rwstat serviced; -+#ifdef CONFIG_BFQ_GROUP_IOSCHED - /* number of ios merged */ - struct blkg_rwstat merged; - /* total time spent on device in ns, may not be accurate w/ queueing */ -@@ -657,12 +744,8 @@ struct bfqg_stats { - struct blkg_rwstat wait_time; - /* number of IOs queued up */ - struct blkg_rwstat queued; -- /* total sectors transferred */ -- struct blkg_stat sectors; - /* total disk time and nr sectors dispatched by this group */ - struct blkg_stat time; -- /* time not charged to this cgroup */ -- struct blkg_stat unaccounted_time; - /* sum of number of ios queued across all samples */ - struct blkg_stat avg_queue_size_sum; - /* count of samples taken for average */ -@@ -680,8 +763,10 @@ struct bfqg_stats { - uint64_t start_idle_time; - uint64_t start_empty_time; - uint16_t flags; -+#endif - }; - -+#ifdef CONFIG_BFQ_GROUP_IOSCHED - /* - * struct bfq_group_data - per-blkcg storage for the blkio subsystem. - * -@@ -692,7 +777,7 @@ struct bfq_group_data { - /* must be the first member */ - struct blkcg_policy_data pd; - -- unsigned short weight; -+ unsigned int weight; - }; - - /** -@@ -712,7 +797,7 @@ struct bfq_group_data { - * unused for the root group. Used to know whether there - * are groups with more than one active @bfq_entity - * (see the comments to the function -- * bfq_bfqq_must_not_expire()). -+ * bfq_bfqq_may_idle()). - * @rq_pos_tree: rbtree sorted by next_request position, used when - * determining if two or more queues have interleaving - * requests (see bfq_find_close_cooperator()). -@@ -745,7 +830,6 @@ struct bfq_group { - struct rb_root rq_pos_tree; - - struct bfqg_stats stats; -- struct bfqg_stats dead_stats; /* stats pushed from dead children */ - }; - - #else -@@ -761,17 +845,38 @@ struct bfq_group { - - static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); - -+static unsigned int bfq_class_idx(struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ return bfqq ? bfqq->ioprio_class - 1 : -+ BFQ_DEFAULT_GRP_CLASS - 1; -+} -+ - static struct bfq_service_tree * - bfq_entity_service_tree(struct bfq_entity *entity) - { - struct bfq_sched_data *sched_data = entity->sched_data; - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -- unsigned int idx = bfqq ? 
bfqq->ioprio_class - 1 : -- BFQ_DEFAULT_GRP_CLASS; -+ unsigned int idx = bfq_class_idx(entity); - - BUG_ON(idx >= BFQ_IOPRIO_CLASSES); - BUG_ON(sched_data == NULL); - -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "entity_service_tree %p %d", -+ sched_data->service_tree + idx, idx); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "entity_service_tree %p %d", -+ sched_data->service_tree + idx, idx); -+ } -+#endif - return sched_data->service_tree + idx; - } - -@@ -791,47 +896,6 @@ static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) - return bic->icq.q->elevator->elevator_data; - } - --/** -- * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer. -- * @ptr: a pointer to a bfqd. -- * @flags: storage for the flags to be saved. -- * -- * This function allows bfqg->bfqd to be protected by the -- * queue lock of the bfqd they reference; the pointer is dereferenced -- * under RCU, so the storage for bfqd is assured to be safe as long -- * as the RCU read side critical section does not end. After the -- * bfqd->queue->queue_lock is taken the pointer is rechecked, to be -- * sure that no other writer accessed it. If we raced with a writer, -- * the function returns NULL, with the queue unlocked, otherwise it -- * returns the dereferenced pointer, with the queue locked. -- */ --static struct bfq_data *bfq_get_bfqd_locked(void **ptr, unsigned long *flags) --{ -- struct bfq_data *bfqd; -- -- rcu_read_lock(); -- bfqd = rcu_dereference(*(struct bfq_data **)ptr); -- -- if (bfqd != NULL) { -- spin_lock_irqsave(bfqd->queue->queue_lock, *flags); -- if (ptr == NULL) -- printk(KERN_CRIT "get_bfqd_locked pointer NULL\n"); -- else if (*ptr == bfqd) -- goto out; -- spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); -- } -- -- bfqd = NULL; --out: -- rcu_read_unlock(); -- return bfqd; --} -- --static void bfq_put_bfqd_unlock(struct bfq_data *bfqd, unsigned long *flags) --{ -- spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); --} -- - #ifdef CONFIG_BFQ_GROUP_IOSCHED - - static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) -@@ -857,11 +921,13 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); - static void bfq_put_queue(struct bfq_queue *bfqq); - static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); - static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, -- struct bio *bio, int is_sync, -- struct bfq_io_cq *bic, gfp_t gfp_mask); -+ struct bio *bio, bool is_sync, -+ struct bfq_io_cq *bic); - static void bfq_end_wr_async_queues(struct bfq_data *bfqd, - struct bfq_group *bfqg); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED - static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); -+#endif - static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); - - #endif /* _BFQ_H */ --- -2.10.0 - diff --git a/README.BFQ b/README.BFQ deleted file mode 100644 index 4aa2425..0000000 --- a/README.BFQ +++ /dev/null @@ -1,786 +0,0 @@ -Budget Fair Queueing I/O Scheduler -================================== - -This patchset introduces BFQ-v8r11 into Linux 4.11.0. 
-For further information: http://algogroup.unimore.it/people/paolo/disk_sched/ - -The overall diffstat is the following: - - Documentation/block/00-INDEX | 2 + - Documentation/block/bfq-iosched.txt | 530 ++++++ - Makefile | 2 +- - block/Kconfig.iosched | 30 + - block/Makefile | 1 + - block/bfq-cgroup.c | 1191 +++++++++++++ - block/bfq-ioc.c | 36 + - block/bfq-iosched.c | 5306 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - block/bfq-sched.c | 1989 ++++++++++++++++++++++ - block/bfq.h | 935 +++++++++++ - include/linux/blkdev.h | 2 +- - 11 files changed, 10022 insertions(+), 2 deletions(-) - -CHANGELOG - -BFQ v8r11 - - BUGFIX: use pointer entity->sched_data only if set. - In the function __bfq_deactivate_entity, the pointer - entity->sched_data could happen to be used before being properly - initialized. This led to a NULL pointer dereference. This commit fixes - this bug by just using this pointer only where it is safe to do so. - Reported-by: Tom Harrison - Tested-by: Tom Harrison - - BUGFIX: don't dereference bic before null checking it. - The call to bfq_check_ioprio_change will dereference bic, however, - the null check for bic is after this call. Move the the null - check on bic to before the call to avoid any potential null - pointer dereference issues. - Detected by CoverityScan, CID#1430138 ("Dereference before null check") - Signed-off-by: Colin Ian King - Signed-off-by: Jens Axboe - -BFQ v8r10 - - BUGFIX: Handle failure of weight-counter allocation - Reported-by: Bart Van Assche - - BUGFIX: Remove problematic check on max service duration - Reported-by: Bart Van Assche - -BFQ v8r9 - - BUGIFX: remove use of bfq queues after free - bfq queues occasionally happened to be used after being freed, because - they were accessed after some invocations of bfq_put_queue that could - cause them to be freed. This commit refactors code, when needed, to - avoid any occurrence of such a use-after-free of a bfq queue. This - commit also adds comments to make references to bfq queues easier to - follow. - -BFQ v8r8 - -. BUGFIX: Removed a wrong compilation warning, due to the compiler - not taking into account short circuit in a condition. - -. BUGIFX: Added several forgotten static qualifiers in function - definitions (completely harmless issue). - -. BUGFIX: Put async queues on exit also without cgroups - -. BUGFIX: The putting of async queues on scheduler exit was missing in case - cgroups support was not active. This fix adds the missing operation. - -. BUGFIX: In the peak-rate estimator, there was a serious error in the - check that the percentage of sequential I/O-request dispatches was high - enough to trigger an update of the peak-rate estimate. This commit fixes - that check. - -. IMPROVEMENT Luca Miccio has run a few responsiveness tests on recent - Android systems with average-speed storage devices. These tests have - shown that the following BFQ parameter was too low for these - systems: reference duration for slow storage devices of weight - raising for interactive applications. This commit raises that - duration to a value that is yelding optimal results in our - tests. Contributed by Luca Miccio. - -. IMPROVEMENT This commit anticipates the complete check of budget - exhaustion, for the in-service bfq_queue, to when the next bfq_queue - to serve is selected (during a dispatch operation). This enables a - new bfq_queue to be immediately selected for service in case the - in-service bfq_queue has actually exhausted its budget. 
As a - consequence, a second dispatch invocation is not needed any more, to - have a new request dispatched. To implement this improvement, this - commit implements a further improvement too: the field next_rq of a - bfq_queue now always contains the actual next request to dispatch - (or NULL if the bfq_queue is empty). - -. BUGFIX Make bfq_bic_update_cgroup() return nothing if - CONFIG_BFQ_GROUP_IOSCHED is disabled, as it happens if this option - is enabled. Contributed by Oleksandr Natalenko. - -BFQ v8r7 - -. BUGFIX: make BFQ compile also without hierarchical support - -BFQ v8r6 - -. BUGFIX Removed the check that, when the new queue to set in service - must be selected, the cached next_in_service entities coincide with - the entities chosen by __bfq_lookup_next_entity. This check, issuing - a warning on failure, was wrong, because the cached and the newly - chosen entity could differ in case of a CLASS_IDLE timeout. - -. EFFICIENCY IMPROVEMENT (this improvement is related to the above - BUGFIX) The cached next_in_service entities are now really used to - select the next queue to serve when the in-service queue - expires. Before this change, the cached values were used only for - extra (and in general wrong) consistency checks. This caused - additional overhead instead of reducing it. - -. EFFICIENCY IMPROVEMENT The next entity to serve, for each level of - the hierarchy, is now updated on every event that may change it, - i.e., on every activation or deactivation of any entity. This finer - granularity is not strictly needed for corectness, because it is - only on queue expirations that BFQ needs to know what are the next - entities to serve. Yet this change makes it possible to implement - optimizations in which it is necessary to know the next queue to - serve before the in-service queue expires. - -. SERVICE-ACCURACY IMPROVEMENT The per-device CLASS_IDLE service - timeout has been turned into a much more accurate per-group timeout. - -. CODE-QUALITY IMPROVEMENT The non-trivial parts touched by the above - improvements have been partially rewritten, and enriched of - comments, so as to improve their transparency and understandability. - -. IMPROVEMENT Ported and improved CFQ commit 41647e7a Before this - improvememtn, BFQ used the same logic for detecting seeky queues for - rotational disks and SSDs. This logic is appropriate for the former, - as it takes into account only inter-request distance, and the latter - is the dominant latency factor on a rotational device. Yet things - change with flash-based devices, where serving a large request still - yields a high throughput, even the request is far from the previous - request served. This commits extends seeky detection to take into - accoutn also this fact with flash-based devices. In particular, this - commit is an improved port of the original commit 41647e7a for CFQ. - -. CODE IMPROVEMENT Remove useless parameter from bfq_del_bfqq_busy - -. OPTIMIZATION Optimize the update of next_in_service entity. If the - update of the next_in_service candidate entity is triggered by the - activation of an entity, then it is not necessary to perform full - lookups in the active trees to update next_in_service. In fact, it - is enough to check whether the just-activated entity has a higher - priority than next_in_service, or, even if it has the same priority - as next_in_service, is eligible and has a lower virtual finish time - than next_in_service. If this compound condition holds, then the new - entity can be set as the new next_in_service. 
Otherwise no change is - needed. This commit implements this optimization. - -. BUGFIX Fix bug causing occasional loss of weight raising. When a - bfq_queue, say bfqq, is split after a merging with another - bfq_queue, BFQ checks whether it has to restore for bfqq the - weight-raising state that bfqq had before being merged. In - particular, the weight-raising is restored only if, according to the - weight-raising duration decided for bfqq when it started to be - weight-raised (before being merged), bfqq would not have already - finished its weight-raising period. Yet, by mistake, such a - duration was not saved when bfqq is merged. So, if bfqq was freed - and reallocated when it was split, then this duration was wrongly - set to zero on the split. As a consequence, the weight-raising state - of bfqq was wrongly not restored, which caused BFQ to fail in - guaranteeing a low latency to bfqq. This commit fixes this bug by - saving weight-raising duration when bfqq is merged, and correctly - restoring it when bfqq is split. - -. BUGFIX Fix wrong reset of in-service entities In-service entities - were reset with an indirect logic, which happened to be even buggy - for some cases. This commit fixes this bug in two important - steps. First, by replacing this indirect logic with a direct logic, - in which all involved entities are immediately reset, with a - bubble-up loop, when the in-service queue is reset. Second, by - restructuring the code related to this change, so as to become not - only correct with respect to this change, but also cleaner and - hopefully clearer. - -. CODE IMPROVEMENT Add code to be able to redirect trace log to - console. - -. BUGFIX Fixed bug in optimized update of next_in_service entity. - There was a case where bfq_update_next_in_service did not update - next_in_service, even if it might need to be changed: in case of - requeueing or repositioning of the entity that happened to be - pointed exactly by next_in_service. This could result in violation - of service guarantees, because, after a change of timestamps for - such an entity, it might be the case that next_in_service had to - point to a different entity. This commit fixes this bug. - -. OPTIMIZATION Stop bubble-up of next_in_service update if possible. - -. BUGFIX Fixed a false-positive warning for uninitialized var - -BFQ-v8r5 - -. DOCUMENTATION IMPROVEMENT Added documentation of BFQ benefits, inner - workings, interface and tunables. - -. BUGFIX: Replaced max wrongly used for modulo numbers. - -. DOCUMENTATION IMPROVEMENT Improved help message in Kconfig.iosched. - -. BUGFIX: Removed wrong conversion in use of bfq_fifo_expire. - -. CODE IMPROVEMENT Added parentheses to complex macros. - -v8r4 - -. BUGFIX The function bfq_find_set_group may return a NULL pointer, - which happened not to properly handled in the function - __bfq_bic_change_cgroup. This fix handles this case. Contributed by - Lee Tibbert. - -. BUGFIX Fix recovery of lost service for soft real-time - applications. This recovery is important for soft real-time - application to continue enjoying proper weight raising even if their - service happens to be delayed for a while. Contributed by Luca - Miccio. - -. BUGFIX Fix handling of wait_request state. The semantics of - hrtimers makes the following assumption false after invoking - hrtimer_try_to_cancel: the timer results as non - active. Unfortunately this assumption was used in the previous - version of the code. This change lets code comply with the new - semantics. - -. 
IMPROVEMENT Improve the peak-rate estimator. This change is a - complete rewrite of the peak-rate estimation algorithm. It is both - an improvement and a simplification: in particular it replaces the - previous, less effective, stable and clear algorithm for estimating - the peak rate. The previous algorihtm approximated the service rate - using the individual dispatch rates observed during the service - slots of queues. As such, it took into account not only just - individual queue workloads, but also rather short time intervals. - The new algorithm considers the global workload served by the - device, and computes the peak rate over much larger time - intervals. This makes the new algorihtm extremely more effective - with queueing devices and, in general, with devices with a - fluctuating bandwidth, either physical or virtual. - -. IMPROVEMENT Force the device to serve one request at a time if - strict_guarantees is true. Forcing this service scheme is currently - the ONLY way to guarantee that the request service order enforced by - the scheduler is respected by a queueing device. Otherwise the - device is free even to make some unlucky request wait for as long as - the device wishes. - Of course, serving one request at at time may cause loss of throughput. - -. IMPROVEMENT Let weight raising start for a soft real-time - application even while the application is till enjoying - weight-raising for interactive tasks. This allows soft real-time - applications to start enjoying the benefits of their special weight - raising as soon as possible. - -v8r3 - -. BUGFIX Update weight-raising coefficient when switching from - interactive to soft real-time. - -v8r2 - -. BUGFIX Removed variables that are not used if tracing is - disabled. Reported by Lee Tibbert - -. IMPROVEMENT Ported commit ae11889636: turned blkg_lookup_create into - blkg_lookup. As a side benefit, this finally enables BFQ to be used - as a module even with full hierarchical support. - -v8r1 - -. BUGFIX Fixed incorrect invariant check - -. IMPROVEMENT Privileged soft real-time applications against - interactive ones, to guarantee a lower and more stable latency to - the former - -v8 - -. BUGFIX: Fixed incorrect rcu locking in bfq_bic_update_cgroup - -. BUGFIX Fixed a few cgroups-related bugs, causing sporadic crashes - -. BUGFIX Fixed wrong computation of queue weights as a function of ioprios - -. BUGFIX Fixed wrong Kconfig.iosched dependency for BFQ_GROUP_IOSCHED - -. IMPROVEMENT Preemption-based, idle-less service guarantees. If - several processes are competing for the device at the same time, but - all processes and groups have the same weight, then the mechanism - introduced by this improvement enables BFQ to guarantee the expected - throughput distribution without ever idling the device. Throughput - is then much higher in this common scenario. - -. IMPROVEMENT Made burst handling more robust - -. IMPROVEMENT Reduced false positives in EQM - -. IMPROVEMENT Let queues preserve weight-raising also when shared - -. IMPROVEMENT Improved peak-rate estimation and autotuning of the - parameters related to the device rate - -. IMPROVEMENT Improved the weight-raising mechanism so as to further - reduce latency and to increase robustness - -. IMPROVEMENT Added a strict-guarantees tunable. If this tunable is - set, then device-idling is forced whenever needed to provide - accurate service guarantees. CAVEAT: idling unconditionally may even - increase latencies, in case of processes that did stop doing I/O. - -. 
IMPROVEMENT Improved handling of async (write) I/O requests - -. IMPROVEMENT Ported several good CFQ commits - -. CHANGE Changed default group weight to 100 - -. CODE IMPROVEMENT Refactored I/O-request-insertion code - -v7r11: -. BUGFIX Remove the group_list data structure, which ended up in an - inconsistent state if BFQ happened to be activated for some device - when some blkio groups already existed (these groups where not added - to the list). The blkg list for the request queue is now used where - the removed group_list was used. - -. BUGFIX Init and reset also dead_stats. - -. BUGFIX Added, in __bfq_deactivate_entity, the correct handling of the - case where the entity to deactivate has not yet been activated at all. - -. BUGFIX Added missing free of the root group for the case where full - hierarchical support is not activated. - -. IMPROVEMENT Removed the now useless bfq_disconnect_groups - function. The same functionality is achieved through multiple - invocations of bfq_pd_offline (which are in their turn guaranteed to - be executed, when needed, by the blk-cgroups code). - -v7r10 : -. BUGFIX: Fixed wrong check on whether cooperating processes belong - to the same cgroup. - -v7r9: -. IMPROVEMENT: Changed BFQ to use the blkio controller instead of its - own controller. BFQ now registers itself as a policy to the blkio - controller and implements its hierarchical scheduling support using - data structures that already exist in blk-cgroup. The bfqio - controller's code is completely removed. - -. CODE IMPROVEMENTS: Applied all suggestions from Tejun Heo, received - on the last submission to lkml: https://lkml.org/lkml/2014/5/27/314. - -v7r8: -. BUGFIX: Let weight-related fields of a bfq_entity be correctly initialized - (also) when the I/O priority of the entity is changed before the first - request is inserted into the bfq_queue associated to the entity. -. BUGFIX: When merging requests belonging to different bfq_queues, avoid - repositioning the surviving request. In fact, in this case the repositioning - may result in the surviving request being moved across bfq_queues, which - would ultimately cause bfq_queues' data structures to become inconsistent. -. BUGFIX: When merging requests belonging to the same bfq_queue, reposition - the surviving request so that it gets in the correct position, namely the - position of the dropped request, instead of always being moved to the head - of the FIFO of the bfq_queue (which means to let the request be considered - the eldest one). -. BUGFIX: Reduce the idling slice for seeky queues only if the scenario is - symmetric. This guarantees that also processes associated to seeky queues - do receive their reserved share of the throughput. - Contributed by Riccardo Pizzetti and Samuele Zecchini. -. IMPROVEMENT: Always perform device idling if the scenario is asymmetric in - terms of throughput distribution among processes. - This extends throughput-distribution guarantees to any process, regardless - of the properties of its request pattern and of the request patterns of the - other processes, and regardless of whether the device is NCQ-capable. -. IMPROVEMENT: Remove the current limitation on the maximum number of in-flight - requests allowed for a sync queue (limitation set in place for fairness - issues in CFQ, inherited by the first version of BFQ, but made unnecessary - by the latest accurate fairness strategies added to BFQ). 
Removing this - limitation enables devices with long internal queues to fill their queues - as much as they deem appropriate, also with sync requests. This avoids - throughput losses on these devices, because, to achieve a high throughput, - they often need to have a high number of requests queued internally. -. CODE IMPROVEMENT: Simplify I/O priority change logic by turning it into a - single-step procedure instead of a two-step one; improve readability by - rethinking the names of the functions involved in changing the I/O priority - of a bfq_queue. - -v7r7: -. BUGFIX: Prevent the OOM queue from being involved in the queue - cooperation mechanism. In fact, since the requests temporarily - redirected to the OOM queue could be redirected again to dedicated - queues at any time, the state needed to correctly handle merging - with the OOM queue would be quite complex and expensive to - maintain. Besides, in such a critical condition as an out of - memory, the benefits of queue merging may be little relevant, or - even negligible. -. IMPROVEMENT: Let the OOM queue be initialized only once. Previously, - the OOM queue was reinitialized, at each request enqueue, with the - parameters related to the process that issued that request. - Depending on the parameters of the processes doing I/O, this could - easily cause the OOM queue to be moved continuously across service - trees, or even across groups. It also caused the parameters of the - OOM queue to be continuously reset in any case. -. CODE IMPROVEMENT. Performed some minor code cleanups, and added some - BUG_ON()s that, if the weight of an entity becomes inconsistent, - should better help understand why. - -v7r6: -. IMPROVEMENT: Introduced a new mechanism that helps get the job done - more quickly with services and applications that create or reactivate - many parallel I/O-bound processes. This is the case, for example, with - systemd at boot, or with commands like git grep. -. CODE IMPROVEMENTS: Small code cleanups and improvements. - -v7r5: -. IMPROVEMENT: Improve throughput boosting by idling the device - only for processes that, in addition to perform sequential I/O, - are I/O-bound (apart from weight-raised queues, for which idling - is always performed to guarantee them a low latency). -. IMPROVEMENT: Improve throughput boosting by depriving processes - that cooperate often of weight-raising. -. CODE IMPROVEMENT: Pass of improvement of the readability of both - comments and actual code. - -v7r4: -. BUGFIX. Modified the code so as to be robust against late detection of - NCQ support for a rotational device. -. BUGFIX. Removed a bug that hindered the correct throughput distribution - on flash-based devices when not every process had to receive the same - fraction of the throughput. This fix entailed also a little efficiency - improvement, because it implied the removal of a short function executed - in a hot path. -. CODESTYLE IMPROVEMENT: removed quoted strings split across lines. - -v7r3: -. IMPROVEMENT: Improved throughput boosting with NCQ-capable HDDs and - random workloads. The mechanism that further boosts throghput with - these devices and workloads is activated only in the cases where it - does not cause any violation of throughput-distribution and latency - guarantees. -. IMPROVEMENT: Generalized the computation of the parameters of the - low-latency heuristic for interactive applications, so as to fit also - slower storage devices. 
The purpose of this improvement is to preserve - low-latency guarantees for interactive applications also on slower - devices, such as portable hard disks, multimedia and SD cards. -. BUGFIX: Re-added MODULE_LICENSE macro. -. CODE IMPROVEMENTS: Small code cleanups; introduced a coherent naming - scheme for all identifiers related to weight raising; refactored and - optimized a few hot paths. - -v7r2: -. BUGFIX/IMPROVEMENT. One of the requirements for an application to be - deemed as soft real-time is that it issues its requests in batches, and - stops doing I/O for a well-defined amount of time before issuing a new - batch. Imposing this minimum idle time allows BFQ to filter out I/O-bound - applications that may otherwise be incorrectly deemed as soft real-time - (under the circumstances described in detail in the comments to the - function bfq_bfqq_softrt_next_start()). Unfortunately, BFQ could however - start counting this idle time from two different events: either from the - expiration of the queue, if all requests of the queue had also been already - completed when the queue expired, or, if the previous condition did not - hold, from the first completion of one of the still outstanding requests. - In the second case, an application had more chances to be deemed as soft - real-time. - Actually, there was no reason for this differentiated treatment. We - addressed this issue by defining more precisely the above requirement for - an application to be deemed as soft real-time, and changing the code - consequently: a well-defined amount of time must elapse between the - completion of *all the requests* of the current pending batch and the - issuing of the first request of the next batch (this is, in the end, what - happens with a true soft real-time application). This change further - reduced false positives, and, as such, improved responsiveness and reduced - latency for actual soft real-time applications. -. CODE IMPROVEMENT. We cleaned up the code a little bit and addressed - some issues pointed out by the checkpatch.pl script. - -v7r1: -. BUGFIX. Replace the old value used to approximate 'infinity', with - the correct one to use in case times are compared through the macro - time_is_before_jiffies(). In fact, this macro, designed to take - wraparound issues into account, easily returns anomalous results if - its argument is equal to the value that we used as an approximation - of 'infinity', namely ((unsigned long) (-1)). The consequence was - that the logical expression used to determine whether a queue - belongs to a soft real-time application often yielded an incorrect - result. In the end, some application happened to be incorrectly - deemed as soft real-time and hence weight-raised. This affected both - throughput and latency guarantees. -. BUGFIX. Fixed a scriverner's error made in an attempt to use the - above macro in a logical expression. -. IMPROVEMENT/BUGFIX. On the expiration of a queue, use a more general - condition to allow a weight-raising period to start if the queue is - soft real-time. The previous condition could prevent an empty, - soft-real time queue from being correctly deemed as soft real-time. -. IMPROVEMENT/MINOR BUGFIX. Use jiffies-comparison macros also in the - following cases: - . to establish whether an application initially deemed as interactive - is now meeting the requirements for being classified as soft - real-time; - . to determine if a weight-raising period must be ended. -. CODE IMPROVEMENT. 
Change the type of the time quantities used in the - weight-raising heuristics to unsigned long, as the type of the time - (jiffies) is unsigned long. - -v7: -- IMPROVEMENT: In the presence of weight-raised queues and if the - device is NCQ-enabled, device idling is now disabled for non-raised - readers, i.e., for their associated sync queues. Hence a sync queue - is expired immediately if it becomes empty, and a new queue is - served. As explained in detail in the papers about BFQ, not idling - the device for sync queues when the latter become empty causes BFQ to - assign higher timestamps to these queues when they get backlogged - again, and hence to serve these queues less frequently. This fact, - plus to the fact that, because of the immediate expiration itself, - these queues get less service while they are granted access to the - disk, reduces the relative rate at which the processes associated to - these queues ask for requests from the I/O request pool. If the pool - is saturated, as it happens in the presence of write hogs, reducing - the above relative rate increases the probability that a request is - available (soon) in the pool when a weight-raised process needs it. - This change does seem to mitigate the typical starvation problems - that occur in the presence of write hogs and NCQ, and hence to - guarantee a higher application and system responsiveness in these - hostile scenarios. -- IMPROVEMENT/BUGFIX: Introduced a new classification rule to the soft - real-time heuristic, which takes into account also the isochronous - nature of such applications. The computation of next_start has been - fixed as well. Now it is correctly done from the time of the last - transition from idle to backlogged; the next_start is therefore - computed from the service received by the queue from its last - transition from idle to backlogged. Finally, the code which - preserved weight-raising for a soft real-time queue even with no - idle->backlogged transition has been removed. -- IMPROVEMENT: Add a few jiffies to the reference time interval used to - establish whether an application is greedy or not. This reference - interval was, by default, HZ/125 seconds, which could generate false - positives in the following two cases (especially if both cases occur): - 1) If HZ is so low that the duration of a jiffie is comparable to or - higher than the above reference time interval. This happens, e.g., - on slow devices with HZ=100. - 2) If jiffies, instead of increasing at a constant rate, may stop - increasing for some time, then suddenly 'jump' by several units to - recover the lost increments. This seems to happen, e.g., in virtual - machines. - The added number of jiffies has been found experimentally. In particular, - according to our experiments, adding this number of jiffies seems to make - the filter quite precise also in embedded systems and KVM/QEMU virtual - machines. Also contributed by - Alexander Spyridakis . -- IMPROVEMENT/BUGFIX: Keep disk idling also for NCQ-provided - rotational devices, which boosts the throughput on NCQ-enabled - rotational devices. -- BUGFIX: The budget-timeout condition in the bfq_rq_enqueued() function - was checked only if the request is large enough to provoke an unplug. As - a consequence, for a process always issuing small I/O requests the - budget timeout was never checked. The queue associated to the process - therefore expired only when its budget was exhausted, even if the - queue had already incurred a budget timeout from a while. 
- This fix lets a queue be checked for budget timeout at each request - enqueue, and, if needed, expires the queue accordingly even if the - request is small. -- BUGFIX: Make sure that weight-raising is resumed for a split queue, - if it was merged when already weight-raised. -- MINOR BUGFIX: Let bfq_end_raising_async() correctly end weight-raising - also for the queues belonging to the root group. -- IMPROVEMENT: Get rid of the some_coop_idle flag, which in its turn - was used to decide whether to disable idling for an in-service - shared queue whose seek mean decreased. In fact, disabling idling - for such a queue turned out to be useless. -- CODE IMPROVEMENT: The bfq_bfqq_must_idle() function and the - bfq_select_queue() function may not change the current in-service - queue in various cases. We have cleaned up the involved conditions, - by factoring out the common parts and getting rid of the useless - ones. -- MINOR CODE IMPROVEMENT: The idle_for_long_time condition in the - bfq_add_rq_rb() function should be evaluated only on an - idle->backlogged transition. Now the condition is set to false - by default, evaluating it only if the queue was not busy on a - request insertion. -- MINOR CODE IMPROVEMENT: Added a comment describing the rationale - behind the condition evaluated in the function - bfq_bfqq_must_not_expire(). - -v6r2: -- Fairness fix: the case of queue expiration for budget timeout is - now correctly handled also for sync queues, thus allowing also - the processes corresponding to these queues to be guaranteed their - reserved share of the disk throughput. -- Fixed a bug that prevented group weights from being correctly - set via the sysfs interface. -- Fixed a bug that cleared a previously-set group weight if the - same value was re-inserted via the sysfs interface. -- Fixed an EQM bug that allowed a newly-started process to skip - its initial weight-raising period if its queue was merged before - its first request was inserted. -- Fixed a bug that preserved already-started weight-raising periods - even if the low_latency tunable was disabled. -- The raising_max_time tunable now shows, more user-friendly, the - maximum raising time in milliseconds. - -v6r1: -- Fix use-after-free of queues in __bfq_bfqq_expire(). It may happen that - a call to bfq_del_bfqq_busy() puts the last reference taken on a queue - and frees it. Subsequent accesses to that same queue would result in a - use-after-free. Make sure that a queue that has just been deleted from - busy is no more touched. -- Use the uninitialized_var() macro when needed. It may happen that a - variable is initialized in a function that is called by the function - that defined it. Use the uninitialized_var() macro in these cases. - -v6: -- Replacement of the cooperating-queue merging mechanism borrowed from - CFQ with Early Queue Merge (EQM), a unified mechanism to get a - sequential read pattern, and hence a high throughput, with any set of - processes performing interleaved I/O. EQM also preserves low latency. - (see http://algogroup.unimore.it/people/paolo/disk_sched/description.php - for more details). Contributed by Mauro Andreolini and Arianna Avanzini. - The code for detecting whether two queues have to be merged is a - slightly modified version of the CFQ code for detecting whether two - queues belong to cooperating processes and whether the service of a - queue should be preempted to boost the throughput. -- Fix a bug that caused the peak rate of a disk to be computed as zero - in case of multiple I/O errors. 
Subsequent estimations of the weight - raising duration caused a division-by-zero error. - -v5r1: -- BUG FIX: Fixed stall occurring when the active queue is moved to - a different group while idling (this caused the idling timer to be - cancelled and hence no new queue to be selected, and no new - request to be dispatched). -- BUG FIX: Fixed wrong assignment of too high budgets to queues during - the first few seconds after initialization. -- BUG FIX: Added proper locking to the function handling the "weights" - tunable. - -v5: -- Added an heuristic that, if the tunable raising_max_time is set to - 0, automatically computes the duration of the weight raising - according to the estimated peak rate of the device. This enables - flash-based devices to reach maximum throughput as soon as possible, - without sacrificing latency. - -v4: -- Throughput-boosting for flash-based devices: improved version of commits - a68bbdd and f7d7b7a, which boosts the throughput while still preserving - latency guarantees for interactive and soft real-time applications. -- Better identification of NCQ-capable disks: port of commit e459dd0. - -v3-r4: -- Bugfixes - * Removed an important memory leak: under some circumstances the process references - to a queue were not decremented correctly, which prevented unused shared bfq_queue - to be correctly deallocated. - * Fixed various errors related to hierarchical scheduling: - * Removed an error causing tasks to be attached to the bfqio cgroup - controller even when BFQ was not the active scheduler - * Corrected wrong update of the budgets from the leaf to the root upon - forced selection of a service tree or a bfq_queue - * Fixed the way how active leaf entities are moved to the root group before - the group entity is deactivated when a cgroup is destroyed -- Throughput-boosting improvement for cooperating queues: close detection is now based - on a fixed threshold instead of the queue's average seek. This is a port of one of - the changes in the CFQ commit 3dde36d by Corrado Zoccolo. - -v3-r3: -- Bugfix: removed an important error causing occasional kernel panics when - moving a process to a new cgroup. The panic occurred if: - 1) the queue associated to the process was idle when the process was moved - and - 2) a new disk request was inserted into the queue just after the move. -- Further latency improvement through a better treatment of low-bandwidth - async queues. - -v3-r2: -- Bugfix: added a forgotten condition that prevents weights of low-bw async - queues from being raised when low_latency is off. -- Latency improvement: low-bw async queues are now better identified. - -v3-r1: -- Fixed an important request-dispatch bug causing occasional IO hangs. -- Added a new mechanism to reduce the latency of low-bw async queues. - This reduces the latency of also the sync queues synchronized with - the above async queues. -- Fixed a minor bug in iocontext locking (port of commits 9b50902 and 3181faa - from CFQ). - -v3: - -- Improved low-latency mechanisms, including a more accurate criterion to - distinguish between greedy-but-seeky and soft real-time applications. - Interactive applications now enjoy noticeably lower latencies. - -- Switch to the simpler one-request-dispatch-at-a-time scheme as in CFQ. - -- Ported cooperating-queues merging from CFQ (6d048f5, 1afba04, - d9e7620, a36e71f, 04dc6e7, 26a2ac0, 3ac6c9f, f2d1f0a, 83096eb, - 2e46e8b, df5fe3e, b3b6d04, e6c5bc7, c0324a0, f04a642, 8682e1f, - b9d8f4c, 2f7a2d8, ae54abe, e9ce335, 39c01b2, d02a2c0, c10b61f). 
- Contributed by Arianna Avanzini. Queues of processes performing IO - on interleaved, yet contiguous disk zones are merged to boost the - throughput. Some little optimizations to get a more stable throughput - have been added to the original CFQ version. - -- Added static fallback queue for extreme OOM conditions (porting of - CFQ commits d5036d7, 6118b70, b706f64, 32f2e80). Port contributed by - Francesco Allertsen. - -- Ported CFQ commits b0b78f8, 40bb54d, 30996f4, dddb745, ad5ebd2, cf7c25c; - mainly code cleanup and fix of minor bugs. Port contributed by - Francesco Allertsen. - -v2: - -- An issue that may cause little throughput loss on fast disks has been solved. - BFQ-v1 and CFQ may suffer from this problem. -- The disk-idling timeout has been better tuned to further file latency - (especially for the idle- or light-loaded-disk scenarios). -- One of the parameters of the low-latency heuristics has been tuned a little - bit more, so as to reduce the probability that a disk-bound process may - hamper the reduction of the latency of interactive and soft real-time - applications. - - - Same low-latency guarantees with and without NCQ. - - - Latency for interactive applications about halved with respect to BFQ-v1. - - - When the low_latency tunable is set, also soft real-time applications - now enjoy reduced latency. - - - A very little minimum bandwidth is now guaranteed to the - Idle IO-scheduling class also when the other classes are - backlogged, just to prevent them from starving. - -v1: - -This is a new version of BFQ with respect to the versions you can -find on Fabio's site: http://feanor.sssup.it/~fabio/linux/bfq. -Here is what we changed with respect to the previous versions: - -1) re-tuned the budget feedback mechanism: it is now slighlty more -biased toward assigning high budgets, to boost the aggregated -throughput more, and more quickly as new processes are started - -2) introduced more tolerance toward seeky queues (I verified that the -phenomena described below used to occur systematically): - - 2a: if a queue is expired after having received very little - service, then it is not punished as a seeky queue, even if it - occurred to consume that little service too slowly; the - rationale is that, if the new active queue has been served for - a too short time interval, then its possible sequential - accesses may not yet prevail on the initial latencies for - moving the disk head on the first sector requested - - 2b: the waiting time (disk idling) of a queue detected as seeky as - a function of the position of the requests it issued is reduced - to a very low value only after the queue has consumed a minimum - fraction of the assigned budget; this prevents processes - generating (partly) seeky workloads from being too ill-treated - - 2c: if a queue has consumed 'enough' budget upon a budget timeout, then, - even if it did not consume all of its budget, that queue is not punished - as any seeky queue; the rationale is that, depending on the disk zones, - a queue may be served at a lower rate than the estimated peak rate. - - Changes 2a and 2b have been critical in lowering latencies, whereas - change 2c, in addition to change 1, helped a lot increase the disk - throughput. 
- -3) slightly changed the peak rate estimator: a low-pass filter is now -used instead of just keeping the highest rate sampled; the rationale -is that the peak rate of a disk should be quite stable, so the filter -should converge more or less smoothly to the right value; it seemed to -correctly catch the peak rate with all disks we used - -4) added the low latency mechanism described in detail in -http://algogroup.unimore.it/people/paolo/disk_sched/description.php. diff --git a/fs-aufs4.patch b/fs-aufs4.patch index c7c9b7d..32587a5 100644 --- a/fs-aufs4.patch +++ b/fs-aufs4.patch @@ -1,6 +1,6 @@ diff --git a/Documentation/ABI/testing/debugfs-aufs b/Documentation/ABI/testing/debugfs-aufs new file mode 100644 -index 0000000..99642d1 +index 000000000000..99642d1055a2 --- /dev/null +++ b/Documentation/ABI/testing/debugfs-aufs @@ -0,0 +1,50 @@ @@ -56,7 +56,7 @@ index 0000000..99642d1 + will be empty. About XINO files, see the aufs manual. diff --git a/Documentation/ABI/testing/sysfs-aufs b/Documentation/ABI/testing/sysfs-aufs new file mode 100644 -index 0000000..82f9518 +index 000000000000..82f9518495ea --- /dev/null +++ b/Documentation/ABI/testing/sysfs-aufs @@ -0,0 +1,31 @@ @@ -93,7 +93,7 @@ index 0000000..82f9518 + will be empty. About XINO files, see the aufs manual. diff --git a/Documentation/filesystems/aufs/README b/Documentation/filesystems/aufs/README new file mode 100644 -index 0000000..fa82b63 +index 000000000000..fa82b6394f69 --- /dev/null +++ b/Documentation/filesystems/aufs/README @@ -0,0 +1,393 @@ @@ -492,10 +492,10 @@ index 0000000..fa82b63 +# End: ; diff --git a/Documentation/filesystems/aufs/design/01intro.txt b/Documentation/filesystems/aufs/design/01intro.txt new file mode 100644 -index 0000000..988772e +index 000000000000..34da01b3a2b8 --- /dev/null +++ b/Documentation/filesystems/aufs/design/01intro.txt -@@ -0,0 +1,170 @@ +@@ -0,0 +1,171 @@ + +# Copyright (C) 2005-2017 Junjiro R. Okajima +# @@ -521,6 +521,7 @@ index 0000000..988772e +3. abbrev. for "auf das" in German which means "on the" in English. + Ex. "Butter aufs Brot"(G) means "butter onto bread"(E). + But "Filesystem aufs Filesystem" is hard to understand. ++4. abbrev. for "African Urban Fashion Show". + +AUFS is a filesystem with features: +- multi layered stackable unification filesystem, the member directory @@ -668,7 +669,7 @@ index 0000000..988772e +about it. But currently I have implemented it in kernel space. diff --git a/Documentation/filesystems/aufs/design/02struct.txt b/Documentation/filesystems/aufs/design/02struct.txt new file mode 100644 -index 0000000..1d1ccde +index 000000000000..1d1ccde5de21 --- /dev/null +++ b/Documentation/filesystems/aufs/design/02struct.txt @@ -0,0 +1,258 @@ @@ -932,7 +933,7 @@ index 0000000..1d1ccde +For this purpose, use "aumvdown" command in aufs-util.git. diff --git a/Documentation/filesystems/aufs/design/03atomic_open.txt b/Documentation/filesystems/aufs/design/03atomic_open.txt new file mode 100644 -index 0000000..5f0aca4 +index 000000000000..5f0aca4421a6 --- /dev/null +++ b/Documentation/filesystems/aufs/design/03atomic_open.txt @@ -0,0 +1,85 @@ @@ -1023,7 +1024,7 @@ index 0000000..5f0aca4 + be implemented in aufs, but not all I am afraid. 
diff --git a/Documentation/filesystems/aufs/design/03lookup.txt b/Documentation/filesystems/aufs/design/03lookup.txt new file mode 100644 -index 0000000..8b8ac6e +index 000000000000..8b8ac6e0e273 --- /dev/null +++ b/Documentation/filesystems/aufs/design/03lookup.txt @@ -0,0 +1,113 @@ @@ -1142,7 +1143,7 @@ index 0000000..8b8ac6e + by over-mounting something (or another method). diff --git a/Documentation/filesystems/aufs/design/04branch.txt b/Documentation/filesystems/aufs/design/04branch.txt new file mode 100644 -index 0000000..5604ff8 +index 000000000000..5604ff8eb616 --- /dev/null +++ b/Documentation/filesystems/aufs/design/04branch.txt @@ -0,0 +1,74 @@ @@ -1222,7 +1223,7 @@ index 0000000..5604ff8 + same named entry on the upper branch. diff --git a/Documentation/filesystems/aufs/design/05wbr_policy.txt b/Documentation/filesystems/aufs/design/05wbr_policy.txt new file mode 100644 -index 0000000..1578469 +index 000000000000..1578469b32e9 --- /dev/null +++ b/Documentation/filesystems/aufs/design/05wbr_policy.txt @@ -0,0 +1,64 @@ @@ -1292,7 +1293,7 @@ index 0000000..1578469 + copyup policy. diff --git a/Documentation/filesystems/aufs/design/06fhsm.txt b/Documentation/filesystems/aufs/design/06fhsm.txt new file mode 100644 -index 0000000..9216478 +index 000000000000..9216478d803d --- /dev/null +++ b/Documentation/filesystems/aufs/design/06fhsm.txt @@ -0,0 +1,120 @@ @@ -1418,7 +1419,7 @@ index 0000000..9216478 +should restore the original file state after an error happens. diff --git a/Documentation/filesystems/aufs/design/06mmap.txt b/Documentation/filesystems/aufs/design/06mmap.txt new file mode 100644 -index 0000000..8fe4b6c +index 000000000000..8fe4b6cd379d --- /dev/null +++ b/Documentation/filesystems/aufs/design/06mmap.txt @@ -0,0 +1,72 @@ @@ -1496,7 +1497,7 @@ index 0000000..8fe4b6c +I have to give up this "looks-smater" approach. diff --git a/Documentation/filesystems/aufs/design/06xattr.txt b/Documentation/filesystems/aufs/design/06xattr.txt new file mode 100644 -index 0000000..37cdb4e +index 000000000000..37cdb4e795e4 --- /dev/null +++ b/Documentation/filesystems/aufs/design/06xattr.txt @@ -0,0 +1,96 @@ @@ -1598,7 +1599,7 @@ index 0000000..37cdb4e +now, aufs implements the branch attributes to ignore the error. diff --git a/Documentation/filesystems/aufs/design/07export.txt b/Documentation/filesystems/aufs/design/07export.txt new file mode 100644 -index 0000000..cd4ee6b +index 000000000000..cd4ee6b61833 --- /dev/null +++ b/Documentation/filesystems/aufs/design/07export.txt @@ -0,0 +1,58 @@ @@ -1662,7 +1663,7 @@ index 0000000..cd4ee6b + lookup_one_len(), vfs_getattr(), encode_fh() and others. diff --git a/Documentation/filesystems/aufs/design/08shwh.txt b/Documentation/filesystems/aufs/design/08shwh.txt new file mode 100644 -index 0000000..7e07e26 +index 000000000000..7e07e2609ec3 --- /dev/null +++ b/Documentation/filesystems/aufs/design/08shwh.txt @@ -0,0 +1,52 @@ @@ -1720,7 +1721,7 @@ index 0000000..7e07e26 +initramfs will use it to replace the old one at the next boot. diff --git a/Documentation/filesystems/aufs/design/10dynop.txt b/Documentation/filesystems/aufs/design/10dynop.txt new file mode 100644 -index 0000000..b7ba75d +index 000000000000..b7ba75d8843a --- /dev/null +++ b/Documentation/filesystems/aufs/design/10dynop.txt @@ -0,0 +1,47 @@ @@ -1772,10 +1773,10 @@ index 0000000..b7ba75d +Currently this approach is applied to address_space_operations for +regular files only. 
diff --git a/MAINTAINERS b/MAINTAINERS -index 38d3e4e..c2b31bf 100644 +index 767e9d202adf..2dd7fa4d2c35 100644 --- a/MAINTAINERS +++ b/MAINTAINERS -@@ -2319,6 +2319,19 @@ F: include/linux/audit.h +@@ -2348,6 +2348,19 @@ F: include/linux/audit.h F: include/uapi/linux/audit.h F: kernel/audit* @@ -1796,10 +1797,10 @@ index 38d3e4e..c2b31bf 100644 M: Miguel Ojeda Sandonis W: http://miguelojeda.es/auxdisplay.htm diff --git a/drivers/block/loop.c b/drivers/block/loop.c -index 0ecb646..cf39138 100644 +index ebbd0c3fe0ed..6b8a6a4cf749 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c -@@ -701,6 +701,24 @@ static inline int is_loop_device(struct file *file) +@@ -700,6 +700,24 @@ static inline int is_loop_device(struct file *file) return i && S_ISBLK(i->i_mode) && MAJOR(i->i_rdev) == LOOP_MAJOR; } @@ -1825,10 +1826,10 @@ index 0ecb646..cf39138 100644 static ssize_t loop_attr_show(struct device *dev, char *page, diff --git a/fs/Kconfig b/fs/Kconfig -index 83eab52..31f16c4 100644 +index b0e42b6a96b9..7fa4b682b474 100644 --- a/fs/Kconfig +++ b/fs/Kconfig -@@ -248,6 +248,7 @@ source "fs/pstore/Kconfig" +@@ -249,6 +249,7 @@ source "fs/pstore/Kconfig" source "fs/sysv/Kconfig" source "fs/ufs/Kconfig" source "fs/exofs/Kconfig" @@ -1837,7 +1838,7 @@ index 83eab52..31f16c4 100644 endif # MISC_FILESYSTEMS diff --git a/fs/Makefile b/fs/Makefile -index 7bbaca9..a026491 100644 +index 7bbaca9c67b1..a02649177328 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -128,3 +128,4 @@ obj-y += exofs/ # Multiple modules @@ -1847,7 +1848,7 @@ index 7bbaca9..a026491 100644 +obj-$(CONFIG_AUFS_FS) += aufs/ diff --git a/fs/aufs/Kconfig b/fs/aufs/Kconfig new file mode 100644 -index 0000000..63560ce +index 000000000000..63560ceda3a4 --- /dev/null +++ b/fs/aufs/Kconfig @@ -0,0 +1,185 @@ @@ -2038,7 +2039,7 @@ index 0000000..63560ce +endif diff --git a/fs/aufs/Makefile b/fs/aufs/Makefile new file mode 100644 -index 0000000..c7a501e +index 000000000000..c7a501e3718e --- /dev/null +++ b/fs/aufs/Makefile @@ -0,0 +1,44 @@ @@ -2088,7 +2089,7 @@ index 0000000..c7a501e +aufs-$(CONFIG_AUFS_MAGIC_SYSRQ) += sysrq.o diff --git a/fs/aufs/aufs.h b/fs/aufs/aufs.h new file mode 100644 -index 0000000..7f5eb78 +index 000000000000..7f5eb7890bc0 --- /dev/null +++ b/fs/aufs/aufs.h @@ -0,0 +1,59 @@ @@ -2153,10 +2154,10 @@ index 0000000..7f5eb78 +#endif /* __AUFS_H__ */ diff --git a/fs/aufs/branch.c b/fs/aufs/branch.c new file mode 100644 -index 0000000..1cca020 +index 000000000000..0947a2a3907c --- /dev/null +++ b/fs/aufs/branch.c -@@ -0,0 +1,1423 @@ +@@ -0,0 +1,1422 @@ +/* + * Copyright (C) 2005-2017 Junjiro R. 
Okajima + * @@ -2226,8 +2227,7 @@ index 0000000..1cca020 + lockdep_off(); + path_put(&br->br_path); + lockdep_on(); -+ if (wbr) -+ kfree(wbr); ++ kfree(wbr); + kfree(br); +} + @@ -2326,14 +2326,14 @@ index 0000000..1cca020 + err = au_di_realloc(au_di(root), new_nbranch, /*may_shrink*/0); + if (!err) { + inode = d_inode(root); -+ err = au_hinode_realloc(au_ii(inode), new_nbranch, /*may_shrink*/0); ++ err = au_hinode_realloc(au_ii(inode), new_nbranch, ++ /*may_shrink*/0); + } + if (!err) + return add_branch; /* success */ + +out_wbr: -+ if (add_branch->br_wbr) -+ kfree(add_branch->br_wbr); ++ kfree(add_branch->br_wbr); +out_hnotify: + au_hnotify_fin_br(add_branch); +out_xinondir: @@ -3582,7 +3582,7 @@ index 0000000..1cca020 +} diff --git a/fs/aufs/branch.h b/fs/aufs/branch.h new file mode 100644 -index 0000000..e9591cf +index 000000000000..e9591cfa2568 --- /dev/null +++ b/fs/aufs/branch.h @@ -0,0 +1,321 @@ @@ -3909,7 +3909,7 @@ index 0000000..e9591cf +#endif /* __AUFS_BRANCH_H__ */ diff --git a/fs/aufs/conf.mk b/fs/aufs/conf.mk new file mode 100644 -index 0000000..0bbb2d3 +index 000000000000..0bbb2d3a5285 --- /dev/null +++ b/fs/aufs/conf.mk @@ -0,0 +1,38 @@ @@ -3953,10 +3953,10 @@ index 0000000..0bbb2d3 +-include ${srctree}/${src}/conf_priv.mk diff --git a/fs/aufs/cpup.c b/fs/aufs/cpup.c new file mode 100644 -index 0000000..87ae97f0 +index 000000000000..1c4156962391 --- /dev/null +++ b/fs/aufs/cpup.c -@@ -0,0 +1,1407 @@ +@@ -0,0 +1,1442 @@ +/* + * Copyright (C) 2005-2017 Junjiro R. Okajima + * @@ -4324,6 +4324,59 @@ index 0000000..87ae97f0 + return err; +} + ++static int au_do_copy(struct file *dst, struct file *src, loff_t len) ++{ ++ int err; ++ struct super_block *h_src_sb; ++ struct inode *h_src_inode; ++ ++ h_src_inode = file_inode(src); ++ h_src_sb = h_src_inode->i_sb; ++ ++ /* XFS acquires inode_lock */ ++ if (!au_test_xfs(h_src_sb)) ++ err = au_copy_file(dst, src, len); ++ else { ++ inode_unlock(h_src_inode); ++ err = au_copy_file(dst, src, len); ++ inode_lock(h_src_inode); ++ } ++ ++ return err; ++} ++ ++static int au_clone_or_copy(struct file *dst, struct file *src, loff_t len) ++{ ++ int err; ++ struct super_block *h_src_sb; ++ struct inode *h_src_inode; ++ ++ h_src_inode = file_inode(src); ++ h_src_sb = h_src_inode->i_sb; ++ if (h_src_sb != file_inode(dst)->i_sb ++ || !dst->f_op->clone_file_range) { ++ err = au_do_copy(dst, src, len); ++ goto out; ++ } ++ ++ if (!au_test_nfs(h_src_sb)) { ++ inode_unlock(h_src_inode); ++ err = vfsub_clone_file_range(src, dst, len); ++ inode_lock(h_src_inode); ++ } else ++ err = vfsub_clone_file_range(src, dst, len); ++ /* older XFS has a condition in cloning */ ++ if (unlikely(err != -EOPNOTSUPP)) ++ goto out; ++ ++ /* the backend fs on NFS may not support cloning */ ++ err = au_do_copy(dst, src, len); ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++ +/* + * to support a sparse file which is opened with O_APPEND, + * we need to close the file. 
@@ -4373,25 +4426,7 @@ index 0000000..87ae97f0 + h_src_sb = h_src_inode->i_sb; + if (!au_test_nfs(h_src_sb)) + IMustLock(h_src_inode); -+ -+ if (h_src_sb != file_inode(file[DST].file)->i_sb -+ || !file[DST].file->f_op->clone_file_range) -+ err = au_copy_file(file[DST].file, file[SRC].file, cpg->len); -+ else { -+ if (!au_test_nfs(h_src_sb)) { -+ inode_unlock(h_src_inode); -+ err = vfsub_clone_file_range(file[SRC].file, -+ file[DST].file, cpg->len); -+ inode_lock(h_src_inode); -+ } else -+ err = vfsub_clone_file_range(file[SRC].file, -+ file[DST].file, cpg->len); -+ if (unlikely(err == -EOPNOTSUPP && au_test_nfs(h_src_sb))) -+ /* the backend fs on NFS may not support cloning */ -+ err = au_copy_file(file[DST].file, file[SRC].file, -+ cpg->len); -+ AuTraceErr(err); -+ } ++ err = au_clone_or_copy(file[DST].file, file[SRC].file, cpg->len); + + /* i wonder if we had O_NO_DELAY_FPUT flag */ + if (tsk->flags & PF_KTHREAD) @@ -5366,7 +5401,7 @@ index 0000000..87ae97f0 +} diff --git a/fs/aufs/cpup.h b/fs/aufs/cpup.h new file mode 100644 -index 0000000..9c20116 +index 000000000000..9c20116cc413 --- /dev/null +++ b/fs/aufs/cpup.h @@ -0,0 +1,94 @@ @@ -5466,7 +5501,7 @@ index 0000000..9c20116 +#endif /* __AUFS_CPUP_H__ */ diff --git a/fs/aufs/dbgaufs.c b/fs/aufs/dbgaufs.c new file mode 100644 -index 0000000..97a4950 +index 000000000000..dde0f9151a0a --- /dev/null +++ b/fs/aufs/dbgaufs.c @@ -0,0 +1,438 @@ @@ -5538,11 +5573,11 @@ index 0000000..97a4950 + if (!err) { + if (do_fcnt) + p->n = snprintf -+ (p->a, sizeof(p->a), "%ld, %llux%lu %lld\n", ++ (p->a, sizeof(p->a), "%ld, %llux%u %lld\n", + (long)file_count(xf), st.blocks, st.blksize, + (long long)st.size); + else -+ p->n = snprintf(p->a, sizeof(p->a), "%llux%lu %lld\n", ++ p->n = snprintf(p->a, sizeof(p->a), "%llux%u %lld\n", + st.blocks, st.blksize, + (long long)st.size); + AuDebugOn(p->n >= sizeof(p->a)); @@ -5910,7 +5945,7 @@ index 0000000..97a4950 +} diff --git a/fs/aufs/dbgaufs.h b/fs/aufs/dbgaufs.h new file mode 100644 -index 0000000..d0c01c8 +index 000000000000..d0c01c89c878 --- /dev/null +++ b/fs/aufs/dbgaufs.h @@ -0,0 +1,48 @@ @@ -5964,7 +5999,7 @@ index 0000000..d0c01c8 +#endif /* __DBGAUFS_H__ */ diff --git a/fs/aufs/dcsub.c b/fs/aufs/dcsub.c new file mode 100644 -index 0000000..0e02131 +index 000000000000..0e02131bd6f3 --- /dev/null +++ b/fs/aufs/dcsub.c @@ -0,0 +1,225 @@ @@ -6195,7 +6230,7 @@ index 0000000..0e02131 +} diff --git a/fs/aufs/dcsub.h b/fs/aufs/dcsub.h new file mode 100644 -index 0000000..92d6f91 +index 000000000000..92d6f91107d1 --- /dev/null +++ b/fs/aufs/dcsub.h @@ -0,0 +1,136 @@ @@ -6337,7 +6372,7 @@ index 0000000..92d6f91 +#endif /* __AUFS_DCSUB_H__ */ diff --git a/fs/aufs/debug.c b/fs/aufs/debug.c new file mode 100644 -index 0000000..12cc993 +index 000000000000..6cfcb14945f6 --- /dev/null +++ b/fs/aufs/debug.c @@ -0,0 +1,440 @@ @@ -6689,7 +6724,7 @@ index 0000000..12cc993 + return; + dpri("nw %d, gen %u, kobj %d\n", + atomic_read(&sbinfo->si_nowait.nw_len), sbinfo->si_generation, -+ atomic_read(&sbinfo->si_kobj.kref.refcount)); ++ kref_read(&sbinfo->si_kobj.kref)); + for (bindex = 0; bindex <= sbinfo->si_bbot; bindex++) + do_pri_br(bindex, sbinfo->si_branch[0 + bindex]); +} @@ -6783,7 +6818,7 @@ index 0000000..12cc993 +} diff --git a/fs/aufs/debug.h b/fs/aufs/debug.h new file mode 100644 -index 0000000..270628d +index 000000000000..270628d747bb --- /dev/null +++ b/fs/aufs/debug.h @@ -0,0 +1,225 @@ @@ -7014,7 +7049,7 @@ index 0000000..270628d +#endif /* __AUFS_DEBUG_H__ */ diff --git a/fs/aufs/dentry.c b/fs/aufs/dentry.c 
new file mode 100644 -index 0000000..91952bd +index 000000000000..91952bda557e --- /dev/null +++ b/fs/aufs/dentry.c @@ -0,0 +1,1130 @@ @@ -8150,7 +8185,7 @@ index 0000000..91952bd +}; diff --git a/fs/aufs/dentry.h b/fs/aufs/dentry.h new file mode 100644 -index 0000000..adb40ed +index 000000000000..adb40ed5918e --- /dev/null +++ b/fs/aufs/dentry.h @@ -0,0 +1,252 @@ @@ -8408,7 +8443,7 @@ index 0000000..adb40ed +#endif /* __AUFS_DENTRY_H__ */ diff --git a/fs/aufs/dinfo.c b/fs/aufs/dinfo.c new file mode 100644 -index 0000000..8b19f94 +index 000000000000..8b19f94c5654 --- /dev/null +++ b/fs/aufs/dinfo.c @@ -0,0 +1,553 @@ @@ -8967,7 +9002,7 @@ index 0000000..8b19f94 +} diff --git a/fs/aufs/dir.c b/fs/aufs/dir.c new file mode 100644 -index 0000000..a6b7e96 +index 000000000000..a6b7e962ce2d --- /dev/null +++ b/fs/aufs/dir.c @@ -0,0 +1,759 @@ @@ -9732,7 +9767,7 @@ index 0000000..a6b7e96 +}; diff --git a/fs/aufs/dir.h b/fs/aufs/dir.h new file mode 100644 -index 0000000..b107309 +index 000000000000..b107309a3d60 --- /dev/null +++ b/fs/aufs/dir.h @@ -0,0 +1,131 @@ @@ -9869,7 +9904,7 @@ index 0000000..b107309 +#endif /* __AUFS_DIR_H__ */ diff --git a/fs/aufs/dynop.c b/fs/aufs/dynop.c new file mode 100644 -index 0000000..443791a +index 000000000000..443791a76239 --- /dev/null +++ b/fs/aufs/dynop.c @@ -0,0 +1,371 @@ @@ -10246,7 +10281,7 @@ index 0000000..443791a +} diff --git a/fs/aufs/dynop.h b/fs/aufs/dynop.h new file mode 100644 -index 0000000..c19c675 +index 000000000000..c19c675d60a9 --- /dev/null +++ b/fs/aufs/dynop.h @@ -0,0 +1,74 @@ @@ -10326,7 +10361,7 @@ index 0000000..c19c675 +#endif /* __AUFS_DYNOP_H__ */ diff --git a/fs/aufs/export.c b/fs/aufs/export.c new file mode 100644 -index 0000000..34b391c +index 000000000000..34b391c0dc47 --- /dev/null +++ b/fs/aufs/export.c @@ -0,0 +1,836 @@ @@ -11168,10 +11203,10 @@ index 0000000..34b391c +} diff --git a/fs/aufs/f_op.c b/fs/aufs/f_op.c new file mode 100644 -index 0000000..97c2f19 +index 000000000000..bf9f7bb005bb --- /dev/null +++ b/fs/aufs/f_op.c -@@ -0,0 +1,816 @@ +@@ -0,0 +1,817 @@ +/* + * Copyright (C) 2005-2017 Junjiro R. 
Okajima + * @@ -11841,7 +11876,7 @@ index 0000000..97c2f19 + * au_flag_conv(vma->vm_flags)); + */ + if (!err) -+ err = h_file->f_op->mmap(h_file, vma); ++ err = call_mmap(h_file, vma); + if (!err) { + au_vm_prfile_set(vma, file); + fsstack_copy_attr_atime(inode, file_inode(h_file)); @@ -11933,7 +11968,8 @@ index 0000000..97c2f19 + if (IS_ERR(h_file)) + goto out; + -+ arg |= vfsub_file_flags(file) & FASYNC; /* stop calling h_file->fasync */ ++ /* stop calling h_file->fasync */ ++ arg |= vfsub_file_flags(file) & FASYNC; + err = setfl(/*unused fd*/-1, h_file, arg); + fput(h_file); /* instead of au_read_post() */ + @@ -11990,7 +12026,7 @@ index 0000000..97c2f19 +}; diff --git a/fs/aufs/fhsm.c b/fs/aufs/fhsm.c new file mode 100644 -index 0000000..ef6f99e +index 000000000000..ef6f99e985c8 --- /dev/null +++ b/fs/aufs/fhsm.c @@ -0,0 +1,426 @@ @@ -12422,7 +12458,7 @@ index 0000000..ef6f99e +} diff --git a/fs/aufs/file.c b/fs/aufs/file.c new file mode 100644 -index 0000000..daaecab +index 000000000000..daaecab0af72 --- /dev/null +++ b/fs/aufs/file.c @@ -0,0 +1,858 @@ @@ -13286,7 +13322,7 @@ index 0000000..daaecab +}; diff --git a/fs/aufs/file.h b/fs/aufs/file.h new file mode 100644 -index 0000000..d12fd97 +index 000000000000..d12fd97af059 --- /dev/null +++ b/fs/aufs/file.h @@ -0,0 +1,330 @@ @@ -13622,7 +13658,7 @@ index 0000000..d12fd97 +#endif /* __AUFS_FILE_H__ */ diff --git a/fs/aufs/finfo.c b/fs/aufs/finfo.c new file mode 100644 -index 0000000..3a8131d4 +index 000000000000..3a8131d43a86 --- /dev/null +++ b/fs/aufs/finfo.c @@ -0,0 +1,148 @@ @@ -13776,7 +13812,7 @@ index 0000000..3a8131d4 +} diff --git a/fs/aufs/fstype.h b/fs/aufs/fstype.h new file mode 100644 -index 0000000..4624f1e +index 000000000000..4624f1ef222f --- /dev/null +++ b/fs/aufs/fstype.h @@ -0,0 +1,400 @@ @@ -14182,7 +14218,7 @@ index 0000000..4624f1e +#endif /* __AUFS_FSTYPE_H__ */ diff --git a/fs/aufs/hfsnotify.c b/fs/aufs/hfsnotify.c new file mode 100644 -index 0000000..7298c57 +index 000000000000..7298c575c662 --- /dev/null +++ b/fs/aufs/hfsnotify.c @@ -0,0 +1,287 @@ @@ -14475,7 +14511,7 @@ index 0000000..7298c57 +}; diff --git a/fs/aufs/hfsplus.c b/fs/aufs/hfsplus.c new file mode 100644 -index 0000000..b5b6547 +index 000000000000..b5b6547024e5 --- /dev/null +++ b/fs/aufs/hfsplus.c @@ -0,0 +1,56 @@ @@ -14537,7 +14573,7 @@ index 0000000..b5b6547 +} diff --git a/fs/aufs/hnotify.c b/fs/aufs/hnotify.c new file mode 100644 -index 0000000..773a1d2 +index 000000000000..773a1d2945c5 --- /dev/null +++ b/fs/aufs/hnotify.c @@ -0,0 +1,711 @@ @@ -15254,7 +15290,7 @@ index 0000000..773a1d2 +} diff --git a/fs/aufs/i_op.c b/fs/aufs/i_op.c new file mode 100644 -index 0000000..bfd2df9 +index 000000000000..bfd2df9af841 --- /dev/null +++ b/fs/aufs/i_op.c @@ -0,0 +1,1452 @@ @@ -16712,10 +16748,10 @@ index 0000000..bfd2df9 +}; diff --git a/fs/aufs/i_op_add.c b/fs/aufs/i_op_add.c new file mode 100644 -index 0000000..c3bd0f0 +index 000000000000..a678e723c6a0 --- /dev/null +++ b/fs/aufs/i_op_add.c -@@ -0,0 +1,928 @@ +@@ -0,0 +1,920 @@ +/* + * Copyright (C) 2005-2017 Junjiro R. 
Okajima + * @@ -17173,18 +17209,11 @@ index 0000000..c3bd0f0 + goto out_parent; + + h_parent = au_h_dptr(parent, bindex); -+ err = inode_permission(d_inode(h_parent), MAY_WRITE | MAY_EXEC); -+ if (unlikely(err)) ++ h_dentry = vfs_tmpfile(h_parent, mode, /*open_flag*/0); ++ if (IS_ERR(h_dentry)) { ++ err = PTR_ERR(h_dentry); + goto out_mnt; -+ -+ err = -ENOMEM; -+ h_dentry = d_alloc(h_parent, &dentry->d_name); -+ if (unlikely(!h_dentry)) -+ goto out_mnt; -+ -+ err = h_dir->i_op->tmpfile(h_dir, h_dentry, mode); -+ if (unlikely(err)) -+ goto out_dentry; ++ } + + au_set_dbtop(dentry, bindex); + au_set_dbbot(dentry, bindex); @@ -17205,9 +17234,8 @@ index 0000000..c3bd0f0 + if (au_ibtop(dir) == au_dbtop(dentry)) + au_cpup_attr_timesizes(dir); + } -+ -+out_dentry: + dput(h_dentry); ++ +out_mnt: + vfsub_mnt_drop_write(h_mnt); +out_parent: @@ -17646,7 +17674,7 @@ index 0000000..c3bd0f0 +} diff --git a/fs/aufs/i_op_del.c b/fs/aufs/i_op_del.c new file mode 100644 -index 0000000..f67b74b +index 000000000000..f67b74b2eb3a --- /dev/null +++ b/fs/aufs/i_op_del.c @@ -0,0 +1,511 @@ @@ -18163,7 +18191,7 @@ index 0000000..f67b74b +} diff --git a/fs/aufs/i_op_ren.c b/fs/aufs/i_op_ren.c new file mode 100644 -index 0000000..0c30670 +index 000000000000..0c3067013bc4 --- /dev/null +++ b/fs/aufs/i_op_ren.c @@ -0,0 +1,1165 @@ @@ -19334,7 +19362,7 @@ index 0000000..0c30670 +} diff --git a/fs/aufs/iinfo.c b/fs/aufs/iinfo.c new file mode 100644 -index 0000000..4d3a55c +index 000000000000..4d3a55cb196a --- /dev/null +++ b/fs/aufs/iinfo.c @@ -0,0 +1,285 @@ @@ -19625,7 +19653,7 @@ index 0000000..4d3a55c +} diff --git a/fs/aufs/inode.c b/fs/aufs/inode.c new file mode 100644 -index 0000000..d361e25 +index 000000000000..d361e25280dc --- /dev/null +++ b/fs/aufs/inode.c @@ -0,0 +1,527 @@ @@ -20158,7 +20186,7 @@ index 0000000..d361e25 +} diff --git a/fs/aufs/inode.h b/fs/aufs/inode.h new file mode 100644 -index 0000000..debe3ce +index 000000000000..aa8ab74921f9 --- /dev/null +++ b/fs/aufs/inode.h @@ -0,0 +1,686 @@ @@ -20242,7 +20270,7 @@ index 0000000..debe3ce +struct au_icntnr { + struct au_iinfo iinfo; + struct inode vfs_inode; -+ struct hlist_node plink; ++ struct hlist_node plink; +} ____cacheline_aligned_in_smp; + +/* au_pin flags */ @@ -20850,7 +20878,7 @@ index 0000000..debe3ce +#endif /* __AUFS_INODE_H__ */ diff --git a/fs/aufs/ioctl.c b/fs/aufs/ioctl.c new file mode 100644 -index 0000000..5e501c5 +index 000000000000..5e501c5d4ead --- /dev/null +++ b/fs/aufs/ioctl.c @@ -0,0 +1,219 @@ @@ -21075,7 +21103,7 @@ index 0000000..5e501c5 +#endif diff --git a/fs/aufs/loop.c b/fs/aufs/loop.c new file mode 100644 -index 0000000..1acb82f +index 000000000000..1acb82f0bf07 --- /dev/null +++ b/fs/aufs/loop.c @@ -0,0 +1,147 @@ @@ -21228,7 +21256,7 @@ index 0000000..1acb82f +} diff --git a/fs/aufs/loop.h b/fs/aufs/loop.h new file mode 100644 -index 0000000..9b02d32 +index 000000000000..9b02d32905f4 --- /dev/null +++ b/fs/aufs/loop.h @@ -0,0 +1,52 @@ @@ -21286,7 +21314,7 @@ index 0000000..9b02d32 +#endif /* __AUFS_LOOP_H__ */ diff --git a/fs/aufs/magic.mk b/fs/aufs/magic.mk new file mode 100644 -index 0000000..4f83bdf +index 000000000000..4f83bdf1dd12 --- /dev/null +++ b/fs/aufs/magic.mk @@ -0,0 +1,30 @@ @@ -21322,10 +21350,10 @@ index 0000000..4f83bdf +endif diff --git a/fs/aufs/module.c b/fs/aufs/module.c new file mode 100644 -index 0000000..35027d5 +index 000000000000..3ca7c705b2dd --- /dev/null +++ b/fs/aufs/module.c -@@ -0,0 +1,269 @@ +@@ -0,0 +1,266 @@ +/* + * Copyright (C) 2005-2017 Junjiro R. 
Okajima + * @@ -21410,7 +21438,6 @@ index 0000000..35027d5 +/* + * aufs caches + */ -+ +struct kmem_cache *au_cache[AuCache_Last]; + +static void au_cache_fin(void) @@ -21522,9 +21549,7 @@ index 0000000..35027d5 + for (i = 0; i < AuIop_Last; i++) + aufs_iop_nogetattr[i].getattr = NULL; + -+ /* First, initialize au_cache */ -+ for (i = 0; i < AuCache_Last; i++) /* including hnotify */ -+ au_cache[i] = NULL; ++ memset(au_cache, 0, sizeof(au_cache)); /* including hnotify */ + + au_sbilist_init(); + sysaufs_brs_init(); @@ -21597,10 +21622,10 @@ index 0000000..35027d5 +module_exit(aufs_exit); diff --git a/fs/aufs/module.h b/fs/aufs/module.h new file mode 100644 -index 0000000..d099cdf +index 000000000000..4f5727cb8088 --- /dev/null +++ b/fs/aufs/module.h -@@ -0,0 +1,102 @@ +@@ -0,0 +1,101 @@ +/* + * Copyright (C) 2005-2017 Junjiro R. Okajima + * @@ -21628,7 +21653,6 @@ index 0000000..d099cdf +#ifdef __KERNEL__ + +#include -+#include "debug.h" + +struct path; +struct seq_file; @@ -21705,7 +21729,7 @@ index 0000000..d099cdf +#endif /* __AUFS_MODULE_H__ */ diff --git a/fs/aufs/mvdown.c b/fs/aufs/mvdown.c new file mode 100644 -index 0000000..0fb18b8 +index 000000000000..0fb18b841e94 --- /dev/null +++ b/fs/aufs/mvdown.c @@ -0,0 +1,704 @@ @@ -22415,10 +22439,10 @@ index 0000000..0fb18b8 +} diff --git a/fs/aufs/opts.c b/fs/aufs/opts.c new file mode 100644 -index 0000000..069d0d1 +index 000000000000..717cb282868c --- /dev/null +++ b/fs/aufs/opts.c -@@ -0,0 +1,1848 @@ +@@ -0,0 +1,1846 @@ +/* + * Copyright (C) 2005-2017 Junjiro R. Okajima + * @@ -23926,7 +23950,6 @@ index 0000000..069d0d1 + } + break; + } -+ + return err; +} + @@ -24087,8 +24110,7 @@ index 0000000..069d0d1 + au_hn_inode_unlock(hdir); + + if (!err && do_free) { -+ if (wbr) -+ kfree(wbr); ++ kfree(wbr); + br->br_wbr = NULL; + } + } @@ -24269,10 +24291,10 @@ index 0000000..069d0d1 +} diff --git a/fs/aufs/opts.h b/fs/aufs/opts.h new file mode 100644 -index 0000000..1cc990e +index 000000000000..d50e65fd5e46 --- /dev/null +++ b/fs/aufs/opts.h -@@ -0,0 +1,213 @@ +@@ -0,0 +1,212 @@ +/* + * Copyright (C) 2005-2017 Junjiro R. 
Okajima + * @@ -24302,7 +24324,6 @@ index 0000000..1cc990e +#include + +struct file; -+struct super_block; + +/* ---------------------------------------------------------------------- */ + @@ -24488,7 +24509,7 @@ index 0000000..1cc990e +#endif /* __AUFS_OPTS_H__ */ diff --git a/fs/aufs/plink.c b/fs/aufs/plink.c new file mode 100644 -index 0000000..8f3dd67 +index 000000000000..8f3dd6761b04 --- /dev/null +++ b/fs/aufs/plink.c @@ -0,0 +1,514 @@ @@ -25008,7 +25029,7 @@ index 0000000..8f3dd67 +} diff --git a/fs/aufs/poll.c b/fs/aufs/poll.c new file mode 100644 -index 0000000..1aea194 +index 000000000000..1aea1948fd39 --- /dev/null +++ b/fs/aufs/poll.c @@ -0,0 +1,52 @@ @@ -25066,7 +25087,7 @@ index 0000000..1aea194 +} diff --git a/fs/aufs/posix_acl.c b/fs/aufs/posix_acl.c new file mode 100644 -index 0000000..816a47c +index 000000000000..816a47c7df84 --- /dev/null +++ b/fs/aufs/posix_acl.c @@ -0,0 +1,102 @@ @@ -25174,7 +25195,7 @@ index 0000000..816a47c +} diff --git a/fs/aufs/procfs.c b/fs/aufs/procfs.c new file mode 100644 -index 0000000..b94c003 +index 000000000000..b94c0038c0e0 --- /dev/null +++ b/fs/aufs/procfs.c @@ -0,0 +1,169 @@ @@ -25349,7 +25370,7 @@ index 0000000..b94c003 +} diff --git a/fs/aufs/rdu.c b/fs/aufs/rdu.c new file mode 100644 -index 0000000..1f0d8c6 +index 000000000000..1f0d8c65e1f7 --- /dev/null +++ b/fs/aufs/rdu.c @@ -0,0 +1,381 @@ @@ -25736,7 +25757,7 @@ index 0000000..1f0d8c6 +#endif diff --git a/fs/aufs/rwsem.h b/fs/aufs/rwsem.h new file mode 100644 -index 0000000..2abe89f +index 000000000000..2abe89fb29ff --- /dev/null +++ b/fs/aufs/rwsem.h @@ -0,0 +1,198 @@ @@ -25940,7 +25961,7 @@ index 0000000..2abe89f +#endif /* __AUFS_RWSEM_H__ */ diff --git a/fs/aufs/sbinfo.c b/fs/aufs/sbinfo.c new file mode 100644 -index 0000000..995514e +index 000000000000..995514e901b2 --- /dev/null +++ b/fs/aufs/sbinfo.c @@ -0,0 +1,304 @@ @@ -26250,7 +26271,7 @@ index 0000000..995514e +} diff --git a/fs/aufs/spl.h b/fs/aufs/spl.h new file mode 100644 -index 0000000..2845873 +index 000000000000..2845873ef250 --- /dev/null +++ b/fs/aufs/spl.h @@ -0,0 +1,113 @@ @@ -26369,7 +26390,7 @@ index 0000000..2845873 +#endif /* __AUFS_SPL_H__ */ diff --git a/fs/aufs/super.c b/fs/aufs/super.c new file mode 100644 -index 0000000..d893730 +index 000000000000..d89373028a8f --- /dev/null +++ b/fs/aufs/super.c @@ -0,0 +1,1044 @@ @@ -27419,7 +27440,7 @@ index 0000000..d893730 +}; diff --git a/fs/aufs/super.h b/fs/aufs/super.h new file mode 100644 -index 0000000..dede05b +index 000000000000..dede05b266d0 --- /dev/null +++ b/fs/aufs/super.h @@ -0,0 +1,617 @@ @@ -28042,7 +28063,7 @@ index 0000000..dede05b +#endif /* __AUFS_SUPER_H__ */ diff --git a/fs/aufs/sysaufs.c b/fs/aufs/sysaufs.c new file mode 100644 -index 0000000..3f172fd +index 000000000000..3f172fd022e4 --- /dev/null +++ b/fs/aufs/sysaufs.c @@ -0,0 +1,104 @@ @@ -28152,7 +28173,7 @@ index 0000000..3f172fd +} diff --git a/fs/aufs/sysaufs.h b/fs/aufs/sysaufs.h new file mode 100644 -index 0000000..3330733 +index 000000000000..33307336ff75 --- /dev/null +++ b/fs/aufs/sysaufs.h @@ -0,0 +1,101 @@ @@ -28259,7 +28280,7 @@ index 0000000..3330733 +#endif /* __SYSAUFS_H__ */ diff --git a/fs/aufs/sysfs.c b/fs/aufs/sysfs.c new file mode 100644 -index 0000000..096bde9 +index 000000000000..096bde996740 --- /dev/null +++ b/fs/aufs/sysfs.c @@ -0,0 +1,376 @@ @@ -28641,7 +28662,7 @@ index 0000000..096bde9 +} diff --git a/fs/aufs/sysrq.c b/fs/aufs/sysrq.c new file mode 100644 -index 0000000..98d5ad2 +index 000000000000..98d5ad2a28aa --- /dev/null +++ b/fs/aufs/sysrq.c @@ -0,0 
+1,157 @@ @@ -28804,7 +28825,7 @@ index 0000000..98d5ad2 +} diff --git a/fs/aufs/vdir.c b/fs/aufs/vdir.c new file mode 100644 -index 0000000..b7583e9 +index 000000000000..b7583e9f0d36 --- /dev/null +++ b/fs/aufs/vdir.c @@ -0,0 +1,892 @@ @@ -29702,10 +29723,10 @@ index 0000000..b7583e9 +} diff --git a/fs/aufs/vfsub.c b/fs/aufs/vfsub.c new file mode 100644 -index 0000000..2bdaf5c +index 000000000000..da07be8099fa --- /dev/null +++ b/fs/aufs/vfsub.c -@@ -0,0 +1,899 @@ +@@ -0,0 +1,900 @@ +/* + * Copyright (C) 2005-2017 Junjiro R. Okajima + * @@ -30306,6 +30327,7 @@ index 0000000..2bdaf5c + lockdep_on(); + if (err >= 0) + vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/ ++ + return err; +} + @@ -30607,7 +30629,7 @@ index 0000000..2bdaf5c +} diff --git a/fs/aufs/vfsub.h b/fs/aufs/vfsub.h new file mode 100644 -index 0000000..2216871 +index 000000000000..221687152c8d --- /dev/null +++ b/fs/aufs/vfsub.h @@ -0,0 +1,353 @@ @@ -30966,7 +30988,7 @@ index 0000000..2216871 +#endif /* __AUFS_VFSUB_H__ */ diff --git a/fs/aufs/wbr_policy.c b/fs/aufs/wbr_policy.c new file mode 100644 -index 0000000..a28296d +index 000000000000..a28296d9cd31 --- /dev/null +++ b/fs/aufs/wbr_policy.c @@ -0,0 +1,830 @@ @@ -31802,7 +31824,7 @@ index 0000000..a28296d +}; diff --git a/fs/aufs/whout.c b/fs/aufs/whout.c new file mode 100644 -index 0000000..05c069e +index 000000000000..05c069ebb7c4 --- /dev/null +++ b/fs/aufs/whout.c @@ -0,0 +1,1061 @@ @@ -32869,7 +32891,7 @@ index 0000000..05c069e +} diff --git a/fs/aufs/whout.h b/fs/aufs/whout.h new file mode 100644 -index 0000000..d06f3b2 +index 000000000000..d06f3b2d8ccf --- /dev/null +++ b/fs/aufs/whout.h @@ -0,0 +1,84 @@ @@ -32959,7 +32981,7 @@ index 0000000..d06f3b2 +#endif /* __AUFS_WHOUT_H__ */ diff --git a/fs/aufs/wkq.c b/fs/aufs/wkq.c new file mode 100644 -index 0000000..7371d91 +index 000000000000..7371d91eb911 --- /dev/null +++ b/fs/aufs/wkq.c @@ -0,0 +1,213 @@ @@ -33178,7 +33200,7 @@ index 0000000..7371d91 +} diff --git a/fs/aufs/wkq.h b/fs/aufs/wkq.h new file mode 100644 -index 0000000..0f1f42d +index 000000000000..0f1f42da519a --- /dev/null +++ b/fs/aufs/wkq.h @@ -0,0 +1,93 @@ @@ -33277,7 +33299,7 @@ index 0000000..0f1f42d +#endif /* __AUFS_WKQ_H__ */ diff --git a/fs/aufs/xattr.c b/fs/aufs/xattr.c new file mode 100644 -index 0000000..7cdf37e +index 000000000000..7cdf37e3e545 --- /dev/null +++ b/fs/aufs/xattr.c @@ -0,0 +1,357 @@ @@ -33640,7 +33662,7 @@ index 0000000..7cdf37e +} diff --git a/fs/aufs/xino.c b/fs/aufs/xino.c new file mode 100644 -index 0000000..1d41d57 +index 000000000000..1d41d5752048 --- /dev/null +++ b/fs/aufs/xino.c @@ -0,0 +1,1415 @@ @@ -35060,7 +35082,7 @@ index 0000000..1d41d57 + return err; +} diff --git a/fs/dcache.c b/fs/dcache.c -index 95d71ed..b1ff5be 100644 +index a9f995f6859e..5fd25bba1282 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1164,7 +1164,7 @@ enum d_walk_ret { @@ -35080,7 +35102,7 @@ index 95d71ed..b1ff5be 100644 struct check_mount { struct vfsmount *mnt; -@@ -2864,6 +2865,7 @@ void d_exchange(struct dentry *dentry1, struct dentry *dentry2) +@@ -2862,6 +2863,7 @@ void d_exchange(struct dentry *dentry1, struct dentry *dentry2) write_sequnlock(&rename_lock); } @@ -35089,7 +35111,7 @@ index 95d71ed..b1ff5be 100644 /** * d_ancestor - search for an ancestor diff --git a/fs/exec.c b/fs/exec.c -index 65145a3..8d35776 100644 +index 904199086490..31f14c6dd976 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -109,6 +109,7 @@ bool path_noexec(const struct path *path) @@ -35101,10 +35123,10 @@ index 65145a3..8d35776 100644 #ifdef 
CONFIG_USELIB /* diff --git a/fs/fcntl.c b/fs/fcntl.c -index be8fbe2..c671660 100644 +index f4e7267d117f..ac1dc669975a 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c -@@ -30,7 +30,7 @@ +@@ -31,7 +31,7 @@ #define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME) @@ -35113,7 +35135,7 @@ index be8fbe2..c671660 100644 { struct inode * inode = file_inode(filp); int error = 0; -@@ -61,6 +61,8 @@ static int setfl(int fd, struct file * filp, unsigned long arg) +@@ -62,6 +62,8 @@ static int setfl(int fd, struct file * filp, unsigned long arg) if (filp->f_op->check_flags) error = filp->f_op->check_flags(arg); @@ -35122,7 +35144,7 @@ index be8fbe2..c671660 100644 if (error) return error; -@@ -81,6 +83,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg) +@@ -82,6 +84,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg) out: return error; } @@ -35131,7 +35153,7 @@ index be8fbe2..c671660 100644 static void f_modown(struct file *filp, struct pid *pid, enum pid_type type, int force) diff --git a/fs/file_table.c b/fs/file_table.c -index 954d510..4fb5b10 100644 +index 954d510b765a..4fb5b10241a5 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -148,6 +148,7 @@ struct file *get_empty_filp(void) @@ -35167,10 +35189,10 @@ index 954d510..4fb5b10 100644 void __init files_init(void) { diff --git a/fs/inode.c b/fs/inode.c -index 88110fd..11789ff 100644 +index db5914783a71..365055b18c19 100644 --- a/fs/inode.c +++ b/fs/inode.c -@@ -855,6 +855,8 @@ unsigned int get_next_ino(void) +@@ -854,6 +854,8 @@ unsigned int get_next_ino(void) unsigned int *p = &get_cpu_var(last_ino); unsigned int res = *p; @@ -35179,7 +35201,7 @@ index 88110fd..11789ff 100644 #ifdef CONFIG_SMP if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) { static atomic_t shared_last_ino; -@@ -867,7 +869,7 @@ unsigned int get_next_ino(void) +@@ -866,7 +868,7 @@ unsigned int get_next_ino(void) res++; /* get_next_ino should not provide a 0 inode number */ if (unlikely(!res)) @@ -35188,7 +35210,7 @@ index 88110fd..11789ff 100644 *p = res; put_cpu_var(last_ino); return res; -@@ -1642,7 +1644,7 @@ EXPORT_SYMBOL(generic_update_time); +@@ -1640,7 +1642,7 @@ EXPORT_SYMBOL(generic_update_time); * This does the actual work of updating an inodes time or version. Must have * had called mnt_want_write() before calling this. 
*/ @@ -35197,7 +35219,7 @@ index 88110fd..11789ff 100644 { int (*update_time)(struct inode *, struct timespec *, int); -@@ -1651,6 +1653,7 @@ static int update_time(struct inode *inode, struct timespec *time, int flags) +@@ -1649,6 +1651,7 @@ static int update_time(struct inode *inode, struct timespec *time, int flags) return update_time(inode, time, flags); } @@ -35206,10 +35228,10 @@ index 88110fd..11789ff 100644 /** * touch_atime - update the access time diff --git a/fs/namespace.c b/fs/namespace.c -index cc1375ef..9b4c67c 100644 +index 5a4438445bf7..cc6f6fb1099a 100644 --- a/fs/namespace.c +++ b/fs/namespace.c -@@ -465,6 +465,7 @@ void __mnt_drop_write(struct vfsmount *mnt) +@@ -462,6 +462,7 @@ void __mnt_drop_write(struct vfsmount *mnt) mnt_dec_writers(real_mount(mnt)); preempt_enable(); } @@ -35217,7 +35239,7 @@ index cc1375ef..9b4c67c 100644 /** * mnt_drop_write - give up write access to a mount -@@ -1884,6 +1885,7 @@ int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, +@@ -1881,6 +1882,7 @@ int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, } return 0; } @@ -35226,7 +35248,7 @@ index cc1375ef..9b4c67c 100644 static void cleanup_group_ids(struct mount *mnt, struct mount *end) { diff --git a/fs/notify/group.c b/fs/notify/group.c -index fbe3cbe..bdfc61e 100644 +index 32357534de18..14a2d48f3ce2 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -22,6 +22,7 @@ @@ -35237,7 +35259,7 @@ index fbe3cbe..bdfc61e 100644 #include #include "fsnotify.h" -@@ -100,6 +101,7 @@ void fsnotify_get_group(struct fsnotify_group *group) +@@ -109,6 +110,7 @@ void fsnotify_get_group(struct fsnotify_group *group) { atomic_inc(&group->refcnt); } @@ -35245,7 +35267,7 @@ index fbe3cbe..bdfc61e 100644 /* * Drop a reference to a group. Free it if it's through. -@@ -109,6 +111,7 @@ void fsnotify_put_group(struct fsnotify_group *group) +@@ -118,6 +120,7 @@ void fsnotify_put_group(struct fsnotify_group *group) if (atomic_dec_and_test(&group->refcnt)) fsnotify_final_destroy_group(group); } @@ -35253,7 +35275,7 @@ index fbe3cbe..bdfc61e 100644 /* * Create a new fsnotify_group and hold a reference for the group returned. 
-@@ -137,6 +140,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops) +@@ -147,6 +150,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops) return group; } @@ -35262,43 +35284,43 @@ index fbe3cbe..bdfc61e 100644 int fsnotify_fasync(int fd, struct file *file, int on) { diff --git a/fs/notify/mark.c b/fs/notify/mark.c -index 6043306..fdb50e4 100644 +index 9991f8826734..77d235c92c90 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c -@@ -113,6 +113,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) - mark->free_mark(mark); - } +@@ -118,6 +118,7 @@ static bool fsnotify_get_mark_safe(struct fsnotify_mark *mark) + { + return atomic_inc_not_zero(&mark->refcnt); } +EXPORT_SYMBOL_GPL(fsnotify_put_mark); - /* Calculate mask of events for a list of marks */ - u32 fsnotify_recalc_mask(struct hlist_head *head) -@@ -230,6 +231,7 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark, - mutex_unlock(&group->mark_mutex); - fsnotify_free_mark(mark); + static void __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) + { +@@ -255,6 +256,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) + queue_delayed_work(system_unbound_wq, &reaper_work, + FSNOTIFY_REAPER_DELAY); } +EXPORT_SYMBOL_GPL(fsnotify_destroy_mark); - void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock) + bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info) { -@@ -415,6 +417,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, - - return ret; +@@ -431,6 +433,7 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b) + return 1; + return -1; } +EXPORT_SYMBOL_GPL(fsnotify_add_mark); - int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group, - struct inode *inode, struct vfsmount *mnt, int allow_dups) -@@ -521,6 +524,7 @@ void fsnotify_init_mark(struct fsnotify_mark *mark, - atomic_set(&mark->refcnt, 1); - mark->free_mark = free_mark; + static int fsnotify_attach_connector_to_object( + struct fsnotify_mark_connector __rcu **connp, +@@ -560,6 +563,7 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, + spin_unlock(&mark->lock); + return err; } +EXPORT_SYMBOL_GPL(fsnotify_init_mark); /* - * Destroy all marks in destroy_list, waits for SRCU period to finish before + * Attach an initialized mark to a given group and fs object. 
diff --git a/fs/open.c b/fs/open.c -index 949cef2..9a892fb 100644 +index cd0c5be8d012..491442ac4c1e 100644 --- a/fs/open.c +++ b/fs/open.c @@ -64,6 +64,7 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, @@ -35309,7 +35331,7 @@ index 949cef2..9a892fb 100644 long vfs_truncate(const struct path *path, loff_t length) { -@@ -693,6 +694,7 @@ int open_check_o_direct(struct file *f) +@@ -691,6 +692,7 @@ int open_check_o_direct(struct file *f) } return 0; } @@ -35318,10 +35340,10 @@ index 949cef2..9a892fb 100644 static int do_dentry_open(struct file *f, struct inode *inode, diff --git a/fs/proc/base.c b/fs/proc/base.c -index c87b6b9..6b71643 100644 +index f1e1927ccd48..7b5af0c86adf 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c -@@ -1946,7 +1946,7 @@ static int map_files_get_link(struct dentry *dentry, struct path *path) +@@ -1943,7 +1943,7 @@ static int map_files_get_link(struct dentry *dentry, struct path *path) down_read(&mm->mmap_sem); vma = find_exact_vma(mm, vm_start, vm_end); if (vma && vma->vm_file) { @@ -35331,7 +35353,7 @@ index c87b6b9..6b71643 100644 rc = 0; } diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c -index 7563437..7c0dc0f 100644 +index 75634379f82e..7c0dc0ff4882 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -45,7 +45,10 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region) @@ -35347,7 +35369,7 @@ index 7563437..7c0dc0f 100644 ino = inode->i_ino; } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c -index 3125780..519b5c3 100644 +index 520802da059c..61a037d0ceea 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -292,7 +292,10 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) @@ -35362,7 +35384,7 @@ index 3125780..519b5c3 100644 dev = inode->i_sb->s_dev; ino = inode->i_ino; pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; -@@ -1636,7 +1639,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) +@@ -1638,7 +1641,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) struct proc_maps_private *proc_priv = &numa_priv->proc_maps; struct vm_area_struct *vma = v; struct numa_maps *md = &numa_priv->md; @@ -35372,7 +35394,7 @@ index 3125780..519b5c3 100644 struct mm_walk walk = { .hugetlb_entry = gather_hugetlb_stats, diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c -index 23266694..58e59b6 100644 +index 23266694db11..58e59b66a2c8 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -157,7 +157,10 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, @@ -35388,7 +35410,7 @@ index 23266694..58e59b6 100644 ino = inode->i_ino; pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT; diff --git a/fs/read_write.c b/fs/read_write.c -index c4f88af..b69e687 100644 +index 19d4d88fa285..ce062e8ae071 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -513,6 +513,30 @@ ssize_t __vfs_write(struct file *file, const char __user *p, size_t count, @@ -35423,10 +35445,10 @@ index c4f88af..b69e687 100644 { mm_segment_t old_fs; diff --git a/fs/splice.c b/fs/splice.c -index 006ba50..0efa652 100644 +index 540c4a44756c..21d789307de7 100644 --- a/fs/splice.c +++ b/fs/splice.c -@@ -859,8 +859,8 @@ EXPORT_SYMBOL(generic_splice_sendpage); +@@ -853,8 +853,8 @@ EXPORT_SYMBOL(generic_splice_sendpage); /* * Attempt to initiate a splice from pipe to file. 
*/ @@ -35437,7 +35459,7 @@ index 006ba50..0efa652 100644 { ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); -@@ -872,13 +872,14 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, +@@ -866,13 +866,14 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, return splice_write(pipe, out, ppos, len, flags); } @@ -35455,7 +35477,7 @@ index 006ba50..0efa652 100644 { ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); -@@ -901,6 +902,7 @@ static long do_splice_to(struct file *in, loff_t *ppos, +@@ -895,6 +896,7 @@ static long do_splice_to(struct file *in, loff_t *ppos, return splice_read(in, ppos, pipe, len, flags); } @@ -35464,7 +35486,7 @@ index 006ba50..0efa652 100644 /** * splice_direct_to_actor - splices data directly between two non-pipes diff --git a/fs/sync.c b/fs/sync.c -index 11ba023..c86fe9c 100644 +index 11ba023434b1..c86fe9cdfaeb 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -27,7 +27,7 @@ @@ -35485,7 +35507,7 @@ index 11ba023..c86fe9c 100644 /* * Write out and wait upon all dirty data associated with this diff --git a/fs/xattr.c b/fs/xattr.c -index 94f49a0..243f57e 100644 +index 464c94bf65f9..0234d49d4f3a 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -296,6 +296,7 @@ vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value, @@ -35497,7 +35519,7 @@ index 94f49a0..243f57e 100644 ssize_t __vfs_getxattr(struct dentry *dentry, struct inode *inode, const char *name, diff --git a/include/linux/file.h b/include/linux/file.h -index 61eb82c..e700888 100644 +index 61eb82cbafba..e700888b4da4 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -19,6 +19,7 @@ struct dentry; @@ -35509,10 +35531,10 @@ index 61eb82c..e700888 100644 static inline void fput_light(struct file *file, int fput_needed) { diff --git a/include/linux/fs.h b/include/linux/fs.h -index 7415630..433e314 100644 +index 803e5a9b2654..22d291017b65 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h -@@ -1246,6 +1246,7 @@ extern void fasync_free(struct fasync_struct *); +@@ -1248,6 +1248,7 @@ extern void fasync_free(struct fasync_struct *); /* can be called from interrupts */ extern void kill_fasync(struct fasync_struct **, int, int); @@ -35520,7 +35542,7 @@ index 7415630..433e314 100644 extern void __f_setown(struct file *filp, struct pid *, enum pid_type, int force); extern void f_setown(struct file *filp, unsigned long arg, int force); extern void f_delown(struct file *filp); -@@ -1673,6 +1674,7 @@ struct file_operations { +@@ -1674,6 +1675,7 @@ struct file_operations { ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); int (*check_flags)(int); @@ -35528,7 +35550,7 @@ index 7415630..433e314 100644 int (*flock) (struct file *, int, struct file_lock *); ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); -@@ -1749,6 +1751,12 @@ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, +@@ -1750,6 +1752,12 @@ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, struct iovec *fast_pointer, struct iovec **ret_pointer); @@ -35541,7 +35563,7 @@ index 7415630..433e314 100644 extern ssize_t __vfs_read(struct file *, char __user *, size_t, loff_t 
*); extern ssize_t __vfs_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); -@@ -2127,6 +2135,7 @@ extern int current_umask(void); +@@ -2131,6 +2139,7 @@ extern int current_umask(void); extern void ihold(struct inode * inode); extern void iput(struct inode *); extern int generic_update_time(struct inode *, struct timespec *, int); @@ -35549,7 +35571,7 @@ index 7415630..433e314 100644 /* /sys/fs */ extern struct kobject *fs_kobj; -@@ -2407,6 +2416,7 @@ static inline bool sb_is_blkdev_sb(struct super_block *sb) +@@ -2411,6 +2420,7 @@ static inline bool sb_is_blkdev_sb(struct super_block *sb) return false; } #endif @@ -35558,10 +35580,10 @@ index 7415630..433e314 100644 extern const struct file_operations def_blk_fops; extern const struct file_operations def_chr_fops; diff --git a/include/linux/mm.h b/include/linux/mm.h -index 00a8fa7..a90742b 100644 +index 6f543a47fc92..696494b99fc5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h -@@ -1294,6 +1294,28 @@ static inline int fixup_user_fault(struct task_struct *tsk, +@@ -1306,6 +1306,28 @@ static inline int fixup_user_fault(struct task_struct *tsk, } #endif @@ -35591,7 +35613,7 @@ index 00a8fa7..a90742b 100644 unsigned int gup_flags); extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h -index f60f45f..38c5f76 100644 +index 45cdb27791a3..1a40012233ad 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -259,6 +259,7 @@ struct vm_region { @@ -35611,10 +35633,10 @@ index f60f45f..38c5f76 100644 #ifndef CONFIG_MMU diff --git a/include/linux/splice.h b/include/linux/splice.h -index 00a2116..1f0a4a2 100644 +index db42746bdfea..12f3a5a4b88d 100644 --- a/include/linux/splice.h +++ b/include/linux/splice.h -@@ -86,4 +86,10 @@ extern void spd_release_page(struct splice_pipe_desc *, unsigned int); +@@ -86,4 +86,10 @@ extern void splice_shrink_spd(struct splice_pipe_desc *); extern const struct pipe_buf_operations page_cache_pipe_buf_ops; extern const struct pipe_buf_operations default_pipe_buf_ops; @@ -35625,21 +35647,9 @@ index 00a2116..1f0a4a2 100644 + struct pipe_inode_info *pipe, size_t len, + unsigned int flags); #endif -diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild -index f8d9fed..902305e 100644 ---- a/include/uapi/linux/Kbuild -+++ b/include/uapi/linux/Kbuild -@@ -59,6 +59,7 @@ header-y += atmsvc.h - header-y += atm_tcp.h - header-y += atm_zatm.h - header-y += audit.h -+header-y += aufs_type.h - header-y += auto_fs4.h - header-y += auto_fs.h - header-y += auxvec.h diff --git a/include/uapi/linux/aufs_type.h b/include/uapi/linux/aufs_type.h new file mode 100644 -index 0000000..0e0a004 +index 000000000000..36de33d5aa1b --- /dev/null +++ b/include/uapi/linux/aufs_type.h @@ -0,0 +1,419 @@ @@ -35684,7 +35694,7 @@ index 0000000..0e0a004 + +#include + -+#define AUFS_VERSION "4.10-20170522" ++#define AUFS_VERSION "4.11.7+-20170703" + +/* todo? 
move this to linux-2.6.19/include/magic.h */ +#define AUFS_SUPER_MAGIC ('a' << 24 | 'u' << 16 | 'f' << 8 | 's') @@ -36063,10 +36073,10 @@ index 0000000..0e0a004 + +#endif /* __AUFS_TYPE_H__ */ diff --git a/kernel/fork.c b/kernel/fork.c -index 4cc564e..631a65c 100644 +index e53770d2bf95..cdf75164aa25 100644 --- a/kernel/fork.c +++ b/kernel/fork.c -@@ -641,7 +641,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, +@@ -665,7 +665,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, struct inode *inode = file_inode(file); struct address_space *mapping = file->f_mapping; @@ -36076,7 +36086,7 @@ index 4cc564e..631a65c 100644 atomic_dec(&inode->i_writecount); i_mmap_lock_write(mapping); diff --git a/kernel/task_work.c b/kernel/task_work.c -index d513051..e056d54 100644 +index d513051fcca2..e056d5429783 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -119,3 +119,4 @@ void task_work_run(void) @@ -36085,7 +36095,7 @@ index d513051..e056d54 100644 } +EXPORT_SYMBOL_GPL(task_work_run); diff --git a/mm/Makefile b/mm/Makefile -index 026f6a8..723da17 100644 +index 026f6a828a50..723da170e575 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -39,7 +39,7 @@ obj-y := filemap.o mempool.o oom_kill.o \ @@ -36098,10 +36108,10 @@ index 026f6a8..723da17 100644 obj-y += init-mm.o diff --git a/mm/filemap.c b/mm/filemap.c -index 157c047..0cf15d2 100644 +index 6f1be573a5e6..69a8d947092e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c -@@ -2406,7 +2406,7 @@ int filemap_page_mkwrite(struct vm_fault *vmf) +@@ -2408,7 +2408,7 @@ int filemap_page_mkwrite(struct vm_fault *vmf) int ret = VM_FAULT_LOCKED; sb_start_pagefault(inode->i_sb); @@ -36111,7 +36121,7 @@ index 157c047..0cf15d2 100644 if (page->mapping != inode->i_mapping) { unlock_page(page); diff --git a/mm/mmap.c b/mm/mmap.c -index bfbe885..8bd32f9 100644 +index a5e3dcd75e79..a5d908c301f8 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -170,7 +170,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) @@ -36123,7 +36133,7 @@ index bfbe885..8bd32f9 100644 mpol_put(vma_policy(vma)); kmem_cache_free(vm_area_cachep, vma); return next; -@@ -881,7 +881,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, +@@ -895,7 +895,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, if (remove_next) { if (file) { uprobe_munmap(next, next->vm_start, next->vm_end); @@ -36132,7 +36142,7 @@ index bfbe885..8bd32f9 100644 } if (next->anon_vma) anon_vma_merge(vma, next); -@@ -1731,8 +1731,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, +@@ -1745,8 +1745,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, return addr; unmap_and_free_vma: @@ -36142,7 +36152,7 @@ index bfbe885..8bd32f9 100644 /* Undo any partial mapping done by a device driver. 
*/ unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); -@@ -2537,7 +2537,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, +@@ -2571,7 +2571,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, goto out_free_mpol; if (new->vm_file) @@ -36151,7 +36161,7 @@ index bfbe885..8bd32f9 100644 if (new->vm_ops && new->vm_ops->open) new->vm_ops->open(new); -@@ -2556,7 +2556,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, +@@ -2590,7 +2590,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, if (new->vm_ops && new->vm_ops->close) new->vm_ops->close(new); if (new->vm_file) @@ -36160,7 +36170,7 @@ index bfbe885..8bd32f9 100644 unlink_anon_vmas(new); out_free_mpol: mpol_put(vma_policy(new)); -@@ -2710,7 +2710,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, +@@ -2744,7 +2744,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, struct vm_area_struct *vma; unsigned long populate = 0; unsigned long ret = -EINVAL; @@ -36169,7 +36179,7 @@ index bfbe885..8bd32f9 100644 pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/vm/remap_file_pages.txt.\n", current->comm, current->pid); -@@ -2785,10 +2785,27 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, +@@ -2819,10 +2819,27 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, } } @@ -36198,7 +36208,7 @@ index bfbe885..8bd32f9 100644 out: up_write(&mm->mmap_sem); if (populate) -@@ -3079,7 +3096,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, +@@ -3113,7 +3130,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, if (anon_vma_clone(new_vma, vma)) goto out_free_mempol; if (new_vma->vm_file) @@ -36208,10 +36218,10 @@ index bfbe885..8bd32f9 100644 new_vma->vm_ops->open(new_vma); vma_link(mm, new_vma, prev, rb_link, rb_parent); diff --git a/mm/nommu.c b/mm/nommu.c -index 2d131b9..5dc9a19 100644 +index fc184f597d59..637ea81d1f2f 100644 --- a/mm/nommu.c +++ b/mm/nommu.c -@@ -637,7 +637,7 @@ static void __put_nommu_region(struct vm_region *region) +@@ -641,7 +641,7 @@ static void __put_nommu_region(struct vm_region *region) up_write(&nommu_region_sem); if (region->vm_file) @@ -36220,7 +36230,7 @@ index 2d131b9..5dc9a19 100644 /* IO memory and memory shared directly out of the pagecache * from ramfs/tmpfs mustn't be released here */ -@@ -795,7 +795,7 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) +@@ -799,7 +799,7 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) if (vma->vm_ops && vma->vm_ops->close) vma->vm_ops->close(vma); if (vma->vm_file) @@ -36229,7 +36239,7 @@ index 2d131b9..5dc9a19 100644 put_nommu_region(vma->vm_region); kmem_cache_free(vm_area_cachep, vma); } -@@ -1322,7 +1322,7 @@ unsigned long do_mmap(struct file *file, +@@ -1326,7 +1326,7 @@ unsigned long do_mmap(struct file *file, goto error_just_free; } } @@ -36238,7 +36248,7 @@ index 2d131b9..5dc9a19 100644 kmem_cache_free(vm_region_jar, region); region = pregion; result = start; -@@ -1397,10 +1397,10 @@ unsigned long do_mmap(struct file *file, +@@ -1401,10 +1401,10 @@ unsigned long do_mmap(struct file *file, up_write(&nommu_region_sem); error: if (region->vm_file) @@ -36253,13 +36263,13 @@ index 2d131b9..5dc9a19 100644 diff --git a/mm/prfile.c b/mm/prfile.c new file mode 100644 -index 0000000..c1c8518 +index 000000000000..1ef053bf4f49 --- /dev/null +++ b/mm/prfile.c @@ -0,0 +1,85 @@ +/* -+ 
* Mainly for aufs which mmap(2) diffrent file and wants to print different path -+ * in /proc/PID/maps. ++ * Mainly for aufs which mmap(2) different file and wants to print different ++ * path in /proc/PID/maps. + * Call these functions via macros defined in linux/mm.h. + * + * See Documentation/filesystems/aufs/design/06mmap.txt @@ -36343,7 +36353,7 @@ index 0000000..c1c8518 +} +#endif /* !CONFIG_MMU */ diff --git a/security/commoncap.c b/security/commoncap.c -index 78b3783..c8b3e88 100644 +index 7abebd782d5e..c079ce4d392b 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -1062,12 +1062,14 @@ int cap_mmap_addr(unsigned long addr) @@ -36362,7 +36372,7 @@ index 78b3783..c8b3e88 100644 #ifdef CONFIG_SECURITY diff --git a/security/device_cgroup.c b/security/device_cgroup.c -index 03c1652..f88c84b 100644 +index 03c1652c9a1f..f88c84bf1b61 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -7,6 +7,7 @@ @@ -36382,10 +36392,10 @@ index 03c1652..f88c84b 100644 int devcgroup_inode_mknod(int mode, dev_t dev) { diff --git a/security/security.c b/security/security.c -index d0e07f2..5e323b0 100644 +index b9fea3999cf8..afa97dded1f7 100644 --- a/security/security.c +++ b/security/security.c -@@ -481,6 +481,7 @@ int security_path_rmdir(const struct path *dir, struct dentry *dentry) +@@ -492,6 +492,7 @@ int security_path_rmdir(const struct path *dir, struct dentry *dentry) return 0; return call_int_hook(path_rmdir, 0, dir, dentry); } @@ -36393,7 +36403,7 @@ index d0e07f2..5e323b0 100644 int security_path_unlink(const struct path *dir, struct dentry *dentry) { -@@ -497,6 +498,7 @@ int security_path_symlink(const struct path *dir, struct dentry *dentry, +@@ -508,6 +509,7 @@ int security_path_symlink(const struct path *dir, struct dentry *dentry, return 0; return call_int_hook(path_symlink, 0, dir, dentry, old_name); } @@ -36401,7 +36411,7 @@ index d0e07f2..5e323b0 100644 int security_path_link(struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry) -@@ -505,6 +507,7 @@ int security_path_link(struct dentry *old_dentry, const struct path *new_dir, +@@ -516,6 +518,7 @@ int security_path_link(struct dentry *old_dentry, const struct path *new_dir, return 0; return call_int_hook(path_link, 0, old_dentry, new_dir, new_dentry); } @@ -36409,7 +36419,7 @@ index d0e07f2..5e323b0 100644 int security_path_rename(const struct path *old_dir, struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry, -@@ -532,6 +535,7 @@ int security_path_truncate(const struct path *path) +@@ -543,6 +546,7 @@ int security_path_truncate(const struct path *path) return 0; return call_int_hook(path_truncate, 0, path); } @@ -36417,7 +36427,7 @@ index d0e07f2..5e323b0 100644 int security_path_chmod(const struct path *path, umode_t mode) { -@@ -539,6 +543,7 @@ int security_path_chmod(const struct path *path, umode_t mode) +@@ -550,6 +554,7 @@ int security_path_chmod(const struct path *path, umode_t mode) return 0; return call_int_hook(path_chmod, 0, path, mode); } @@ -36425,7 +36435,7 @@ index d0e07f2..5e323b0 100644 int security_path_chown(const struct path *path, kuid_t uid, kgid_t gid) { -@@ -546,6 +551,7 @@ int security_path_chown(const struct path *path, kuid_t uid, kgid_t gid) +@@ -557,6 +562,7 @@ int security_path_chown(const struct path *path, kuid_t uid, kgid_t gid) return 0; return call_int_hook(path_chown, 0, path, uid, gid); } @@ -36433,7 +36443,7 @@ index d0e07f2..5e323b0 100644 int security_path_chroot(const struct path *path) { -@@ -631,6 +637,7 @@ 
int security_inode_readlink(struct dentry *dentry) +@@ -642,6 +648,7 @@ int security_inode_readlink(struct dentry *dentry) return 0; return call_int_hook(inode_readlink, 0, dentry); } @@ -36441,7 +36451,7 @@ index d0e07f2..5e323b0 100644 int security_inode_follow_link(struct dentry *dentry, struct inode *inode, bool rcu) -@@ -646,6 +653,7 @@ int security_inode_permission(struct inode *inode, int mask) +@@ -657,6 +664,7 @@ int security_inode_permission(struct inode *inode, int mask) return 0; return call_int_hook(inode_permission, 0, inode, mask); } @@ -36449,7 +36459,7 @@ index d0e07f2..5e323b0 100644 int security_inode_setattr(struct dentry *dentry, struct iattr *attr) { -@@ -817,6 +825,7 @@ int security_file_permission(struct file *file, int mask) +@@ -828,6 +836,7 @@ int security_file_permission(struct file *file, int mask) return fsnotify_perm(file, mask); } @@ -36457,7 +36467,7 @@ index d0e07f2..5e323b0 100644 int security_file_alloc(struct file *file) { -@@ -876,6 +885,7 @@ int security_mmap_file(struct file *file, unsigned long prot, +@@ -887,6 +896,7 @@ int security_mmap_file(struct file *file, unsigned long prot, return ret; return ima_file_mmap(file, prot); } diff --git a/include-kbuild-export-pci_ids.patch b/include-kbuild-export-pci_ids.patch deleted file mode 100644 index ebcf983..0000000 --- a/include-kbuild-export-pci_ids.patch +++ /dev/null @@ -1,19 +0,0 @@ -From Thierry Vignaud (Mandriva) - -We now lacks /usr/include/linux/pci_ids.h which break ldetect build... -Can you readd it please? -Thanks ---- - include/linux/Kbuild | 1 + - 1 file changed, 1 insertion(+) - ---- linux/include/uapi/linux/Kbuild.include-kbuild-export-pci_ids.orig -+++ linux/include/uapi/linux/Kbuild -@@ -277,6 +277,7 @@ header-y += param.h - header-y += parport.h - header-y += patchkey.h - header-y += pci.h -+header-y += pci_ids.h - header-y += pci_regs.h - header-y += perf_event.h - header-y += personality.h diff --git a/kernel-i586.config b/kernel-i586.config index 3c74cb7..f9fef5b 100644 --- a/kernel-i586.config +++ b/kernel-i586.config @@ -1,6 +1,6 @@ # # Automatically generated file; DO NOT EDIT. 
-# Linux/x86 4.11.x-nrj-desktop Kernel Configuration +# Linux/x86 4.12.x-nrj-desktop Kernel Configuration # # CONFIG_64BIT is not set CONFIG_X86_32=y @@ -264,6 +264,7 @@ CONFIG_BLK_DEV_BSGLIB=y CONFIG_BLK_DEV_INTEGRITY=y CONFIG_BLK_DEV_ZONED=y CONFIG_BLK_DEV_THROTTLING=y +# CONFIG_BLK_DEV_THROTTLING_LOW is not set # CONFIG_BLK_CMDLINE_PARSER is not set CONFIG_BLK_WBT=y # CONFIG_BLK_WBT_SQ is not set @@ -305,14 +306,14 @@ CONFIG_IOSCHED_NOOP=y CONFIG_IOSCHED_DEADLINE=y CONFIG_IOSCHED_CFQ=y CONFIG_CFQ_GROUP_IOSCHED=y +# CONFIG_DEFAULT_DEADLINE is not set +CONFIG_DEFAULT_CFQ=y +# CONFIG_DEFAULT_NOOP is not set +CONFIG_DEFAULT_IOSCHED="cfq" +CONFIG_MQ_IOSCHED_DEADLINE=y +CONFIG_MQ_IOSCHED_KYBER=y CONFIG_IOSCHED_BFQ=y CONFIG_BFQ_GROUP_IOSCHED=y -# CONFIG_DEFAULT_DEADLINE is not set -# CONFIG_DEFAULT_CFQ is not set -CONFIG_DEFAULT_BFQ=y -# CONFIG_DEFAULT_NOOP is not set -CONFIG_DEFAULT_IOSCHED="bfq" -CONFIG_MQ_IOSCHED_DEADLINE=y CONFIG_PREEMPT_NOTIFIERS=y CONFIG_PADATA=y CONFIG_ASN1=y @@ -417,11 +418,11 @@ CONFIG_X86_LOCAL_APIC=y CONFIG_X86_IO_APIC=y CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y CONFIG_X86_MCE=y +# CONFIG_X86_MCELOG_LEGACY is not set CONFIG_X86_MCE_INTEL=y CONFIG_X86_MCE_AMD=y # CONFIG_X86_ANCIENT_MCE is not set CONFIG_X86_MCE_THRESHOLD=y -# CONFIG_X86_MCE_INJECT is not set CONFIG_X86_THERMAL_VECTOR=y # @@ -603,6 +604,7 @@ CONFIG_ACPI_WATCHDOG=y CONFIG_ACPI_EXTLOG=m CONFIG_PMIC_OPREGION=y CONFIG_CRC_PMIC_OPREGION=y +# CONFIG_XPOWER_PMIC_OPREGION is not set # CONFIG_BXT_WC_PMIC_OPREGION is not set CONFIG_ACPI_CONFIGFS=m CONFIG_SFI=y @@ -722,9 +724,23 @@ CONFIG_HOTPLUG_PCI_CPCI_GENERIC=m CONFIG_HOTPLUG_PCI_SHPC=m # -# PCI host controller drivers +# DesignWare PCI Core Support # # CONFIG_PCIE_DW_PLAT is not set + +# +# PCI host controller drivers +# + +# +# PCI Endpoint +# +# CONFIG_PCI_ENDPOINT is not set + +# +# PCI switch controller drivers +# +CONFIG_PCI_SW_SWITCHTEC=m # CONFIG_ISA_BUS is not set CONFIG_ISA_DMA_API=y CONFIG_ISA=y @@ -1300,11 +1316,12 @@ CONFIG_BRIDGE=m CONFIG_BRIDGE_IGMP_SNOOPING=y CONFIG_BRIDGE_VLAN_FILTERING=y CONFIG_NET_DSA=m -CONFIG_NET_DSA_TAG_BRCM=y CONFIG_NET_DSA_TAG_DSA=y CONFIG_NET_DSA_TAG_EDSA=y CONFIG_NET_DSA_TAG_TRAILER=y CONFIG_NET_DSA_TAG_QCA=y +CONFIG_NET_DSA_TAG_MTK=y +CONFIG_NET_DSA_TAG_LAN9303=y CONFIG_VLAN_8021Q=m CONFIG_VLAN_8021Q_GVRP=y CONFIG_VLAN_8021Q_MVRP=y @@ -1375,6 +1392,7 @@ CONFIG_NET_SCH_HHF=m CONFIG_NET_SCH_PIE=m CONFIG_NET_SCH_INGRESS=m CONFIG_NET_SCH_PLUG=m +# CONFIG_NET_SCH_DEFAULT is not set # # Classification @@ -1491,6 +1509,7 @@ CONFIG_CAN_GW=m # CAN Device Drivers # CONFIG_CAN_VCAN=m +CONFIG_CAN_VXCAN=m CONFIG_CAN_SLCAN=m CONFIG_CAN_DEV=m # CONFIG_CAN_CALC_BITTIMING is not set @@ -1505,6 +1524,7 @@ CONFIG_CAN_CC770_ISA=m CONFIG_CAN_CC770_PLATFORM=m CONFIG_CAN_IFI_CANFD=m CONFIG_CAN_M_CAN=m +CONFIG_CAN_PEAK_PCIEFD=m CONFIG_CAN_SJA1000=m CONFIG_CAN_SJA1000_ISA=m # CONFIG_CAN_SJA1000_PLATFORM is not set @@ -1522,6 +1542,7 @@ CONFIG_CAN_SOFTING_CS=m # # CAN SPI interfaces # +CONFIG_CAN_HI311X=m CONFIG_CAN_MCP251X=m # @@ -1533,6 +1554,7 @@ CONFIG_CAN_GS_USB=m CONFIG_CAN_KVASER_USB=m CONFIG_CAN_PEAK_USB=m CONFIG_CAN_8DEV_USB=m +CONFIG_CAN_MCBA_USB=m # CONFIG_CAN_DEBUG_DEVICES is not set CONFIG_IRDA=m @@ -1619,7 +1641,9 @@ CONFIG_BT_HCIBTUSB_BCM=y CONFIG_BT_HCIBTUSB_RTL=y CONFIG_BT_HCIBTSDIO=m CONFIG_BT_HCIUART=m +CONFIG_BT_HCIUART_SERDEV=y CONFIG_BT_HCIUART_H4=y +CONFIG_BT_HCIUART_NOKIA=m CONFIG_BT_HCIUART_BCSP=y CONFIG_BT_HCIUART_ATH3K=y CONFIG_BT_HCIUART_LL=y @@ -1710,7 +1734,6 @@ CONFIG_NFC_SHDLC=y # # Near Field Communication 
(NFC) devices # -CONFIG_NFC_WILINK=m CONFIG_NFC_TRF7970A=m CONFIG_NFC_MEI_PHY=m CONFIG_NFC_SIM=m @@ -1890,10 +1913,8 @@ CONFIG_MTD_NAND_ECC_BCH=y CONFIG_MTD_SM_COMMON=m CONFIG_MTD_NAND_DENALI=m CONFIG_MTD_NAND_DENALI_PCI=m -CONFIG_MTD_NAND_DENALI_SCRATCH_REG_ADDR=0xFF108018 CONFIG_MTD_NAND_GPIO=m # CONFIG_MTD_NAND_OMAP_BCH_BUILD is not set -CONFIG_MTD_NAND_IDS=m CONFIG_MTD_NAND_RICOH=m CONFIG_MTD_NAND_DISKONCHIP=m # CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADVANCED is not set @@ -2047,10 +2068,7 @@ CONFIG_PCH_PHUB=m CONFIG_USB_SWITCH_FSA9480=m CONFIG_LATTICE_ECP3_CONFIG=m CONFIG_SRAM=y -CONFIG_PANEL=m -CONFIG_PANEL_PARPORT=0 -CONFIG_PANEL_PROFILE=5 -# CONFIG_PANEL_CHANGE_MESSAGE is not set +# CONFIG_PCI_ENDPOINT_TEST is not set CONFIG_C2PORT=m CONFIG_C2PORT_DURAMAR_2150=m @@ -2416,7 +2434,6 @@ CONFIG_DM_SNAPSHOT=m CONFIG_DM_THIN_PROVISIONING=m CONFIG_DM_CACHE=m CONFIG_DM_CACHE_SMQ=m -CONFIG_DM_CACHE_CLEANER=m CONFIG_DM_ERA=m CONFIG_DM_MIRROR=m CONFIG_DM_LOG_USERSPACE=m @@ -2432,6 +2449,7 @@ CONFIG_DM_VERITY=m # CONFIG_DM_VERITY_FEC is not set CONFIG_DM_SWITCH=m CONFIG_DM_LOG_WRITES=m +CONFIG_DM_INTEGRITY=m CONFIG_TARGET_CORE=m CONFIG_TCM_IBLOCK=m CONFIG_TCM_FILEIO=m @@ -2490,6 +2508,7 @@ CONFIG_VETH=m CONFIG_VIRTIO_NET=m CONFIG_NLMON=m CONFIG_NET_VRF=m +CONFIG_VSOCKMON=m CONFIG_SUNGEM_PHY=m # @@ -2505,7 +2524,6 @@ CONFIG_CAIF_VIRTIO=m # Distributed Switch Architecture drivers # CONFIG_NET_DSA_MV88E6060=m -CONFIG_NET_DSA_BCM_SF2=m CONFIG_B53=m CONFIG_B53_SPI_DRIVER=m CONFIG_B53_MDIO_DRIVER=m @@ -2514,7 +2532,11 @@ CONFIG_B53_SRAB_DRIVER=m CONFIG_NET_DSA_MV88E6XXX=m CONFIG_NET_DSA_MV88E6XXX_GLOBAL2=y CONFIG_NET_DSA_QCA8K=m - +CONFIG_NET_DSA_LOOP=m +CONFIG_NET_DSA_MT7530=m +CONFIG_NET_DSA_SMSC_LAN9303=m +CONFIG_NET_DSA_SMSC_LAN9303_I2C=m +CONFIG_NET_DSA_SMSC_LAN9303_MDIO=m CONFIG_ETHERNET=y CONFIG_MDIO=m CONFIG_NET_VENDOR_3COM=y @@ -2566,6 +2588,7 @@ CONFIG_BCMGENET=m CONFIG_BNX2=m CONFIG_CNIC=m CONFIG_TIGON3=m +CONFIG_TIGON3_HWMON=y CONFIG_BNX2X=m CONFIG_BNX2X_SRIOV=y CONFIG_BNXT=m @@ -2634,7 +2657,6 @@ CONFIG_IXGBE_DCB=y CONFIG_IXGBEVF=m CONFIG_I40E=m CONFIG_I40E_DCB=y -CONFIG_I40E_FCOE=y CONFIG_I40EVF=m CONFIG_FM10K=m CONFIG_NET_VENDOR_I825XX=y @@ -2655,6 +2677,7 @@ CONFIG_MLX4_CORE=m CONFIG_MLX5_CORE=m CONFIG_MLX5_CORE_EN=y CONFIG_MLX5_CORE_EN_DCB=y +# CONFIG_MLX5_CORE_IPOIB is not set CONFIG_MLXSW_CORE=m CONFIG_MLXSW_CORE_HWMON=y CONFIG_MLXSW_CORE_THERMAL=y @@ -2761,7 +2784,6 @@ CONFIG_HAPPYMEAL=m CONFIG_SUNGEM=m CONFIG_CASSINI=m CONFIG_NIU=m -CONFIG_NET_VENDOR_SYNOPSYS=y CONFIG_NET_VENDOR_TEHUTI=y CONFIG_TEHUTI=m CONFIG_NET_VENDOR_TI=y @@ -2780,18 +2802,17 @@ CONFIG_WIZNET_BUS_ANY=y CONFIG_WIZNET_W5100_SPI=m CONFIG_NET_VENDOR_XIRCOM=y CONFIG_PCMCIA_XIRC2PS=m +CONFIG_NET_VENDOR_SYNOPSYS=y +CONFIG_DWC_XLGMAC=m +CONFIG_DWC_XLGMAC_PCI=m CONFIG_NET_SB1000=m +CONFIG_MDIO_DEVICE=y +CONFIG_MDIO_BITBANG=m +CONFIG_MDIO_GPIO=m CONFIG_PHYLIB=y CONFIG_SWPHY=y CONFIG_LED_TRIGGER_PHY=y -# -# MDIO bus device drivers -# -CONFIG_MDIO_BCM_UNIMAC=m -CONFIG_MDIO_BITBANG=m -CONFIG_MDIO_GPIO=m - # # MII PHY device drivers # @@ -3147,6 +3168,8 @@ CONFIG_IEEE802154_MRF24J40=m CONFIG_IEEE802154_CC2520=m CONFIG_IEEE802154_ATUSB=m CONFIG_IEEE802154_ADF7242=m +CONFIG_IEEE802154_CA8210=m +# CONFIG_IEEE802154_CA8210_DEBUGFS is not set CONFIG_VMXNET3=m CONFIG_FUJITSU_ES=m CONFIG_HYPERV_NET=m @@ -3276,6 +3299,7 @@ CONFIG_ISDN_HDLC=m CONFIG_NVM=y # CONFIG_NVM_DEBUG is not set CONFIG_NVM_RRPC=m +CONFIG_NVM_PBLK=m # # Input device support @@ -3335,6 +3359,7 @@ CONFIG_MOUSE_PS2_ALPS=y CONFIG_MOUSE_PS2_BYD=y 
CONFIG_MOUSE_PS2_LOGIPS2PP=y CONFIG_MOUSE_PS2_SYNAPTICS=y +CONFIG_MOUSE_PS2_SYNAPTICS_SMBUS=y CONFIG_MOUSE_PS2_CYPRESS=y CONFIG_MOUSE_PS2_LIFEBOOK=y CONFIG_MOUSE_PS2_TRACKPOINT=y @@ -3343,6 +3368,7 @@ CONFIG_MOUSE_PS2_SENTELIC=y CONFIG_MOUSE_PS2_TOUCHKIT=y CONFIG_MOUSE_PS2_FOCALTECH=y CONFIG_MOUSE_PS2_VMMOUSE=y +CONFIG_MOUSE_PS2_SMBUS=y CONFIG_MOUSE_SERIAL=m CONFIG_MOUSE_APPLETOUCH=m CONFIG_MOUSE_BCM5974=m @@ -3389,6 +3415,8 @@ CONFIG_JOYSTICK_XPAD=m CONFIG_JOYSTICK_XPAD_FF=y CONFIG_JOYSTICK_XPAD_LEDS=y CONFIG_JOYSTICK_WALKERA0701=m +CONFIG_JOYSTICK_PSXPAD_SPI=m +# CONFIG_JOYSTICK_PSXPAD_SPI_FF is not set CONFIG_INPUT_TABLET=y CONFIG_TABLET_USB_ACECAD=m CONFIG_TABLET_USB_AIPTEK=m @@ -3478,6 +3506,7 @@ CONFIG_TOUCHSCREEN_TSC200X_CORE=m CONFIG_TOUCHSCREEN_TSC2004=m CONFIG_TOUCHSCREEN_TSC2005=m CONFIG_TOUCHSCREEN_TSC2007=m +# CONFIG_TOUCHSCREEN_TSC2007_IIO is not set CONFIG_TOUCHSCREEN_PCAP=m CONFIG_TOUCHSCREEN_RM_TS=m CONFIG_TOUCHSCREEN_SILEAD=m @@ -3651,6 +3680,7 @@ CONFIG_SERIAL_KGDB_NMI=y CONFIG_SERIAL_MAX3100=m CONFIG_SERIAL_MAX310X=y CONFIG_SERIAL_UARTLITE=m +CONFIG_SERIAL_UARTLITE_NR_UARTS=1 CONFIG_SERIAL_CORE=y CONFIG_SERIAL_CORE_CONSOLE=y CONFIG_CONSOLE_POLL=y @@ -3754,6 +3784,7 @@ CONFIG_I2C_MUX=m # Multiplexer I2C Chip support # CONFIG_I2C_MUX_GPIO=m +CONFIG_I2C_MUX_LTC4306=m CONFIG_I2C_MUX_PCA9541=m CONFIG_I2C_MUX_PCA954x=m CONFIG_I2C_MUX_PINCTRL=m @@ -3952,11 +3983,7 @@ CONFIG_GPIO_LYNXPOINT=y # # Port-mapped I/O GPIO drivers # -CONFIG_GPIO_104_DIO_48E=m -CONFIG_GPIO_104_IDIO_16=m -CONFIG_GPIO_104_IDI_48=m CONFIG_GPIO_F7188X=m -# CONFIG_GPIO_GPIO_MM is not set CONFIG_GPIO_IT87=m CONFIG_GPIO_SCH=m CONFIG_GPIO_SCH311X=m @@ -4051,6 +4078,7 @@ CONFIG_W1_SLAVE_DS2423=m CONFIG_W1_SLAVE_DS2431=m CONFIG_W1_SLAVE_DS2433=m # CONFIG_W1_SLAVE_DS2433_CRC is not set +CONFIG_W1_SLAVE_DS2438=m # CONFIG_W1_SLAVE_DS2760 is not set CONFIG_W1_SLAVE_DS2780=m CONFIG_W1_SLAVE_DS2781=m @@ -4078,6 +4106,9 @@ CONFIG_BATTERY_BQ27XXX=m CONFIG_BATTERY_BQ27XXX_I2C=m CONFIG_BATTERY_DA9030=m CONFIG_BATTERY_DA9052=m +CONFIG_CHARGER_AXP20X=m +CONFIG_BATTERY_AXP20X=m +CONFIG_AXP20X_POWER=m CONFIG_AXP288_CHARGER=m # CONFIG_AXP288_FUEL_GAUGE is not set CONFIG_BATTERY_MAX17040=m @@ -4104,7 +4135,6 @@ CONFIG_CHARGER_TPS65090=m CONFIG_CHARGER_TPS65217=m CONFIG_BATTERY_GAUGE_LTC2941=m CONFIG_CHARGER_RT9455=m -CONFIG_AXP20X_POWER=m CONFIG_HWMON=y CONFIG_HWMON_VID=m # CONFIG_HWMON_DEBUG_CHIP is not set @@ -4136,6 +4166,7 @@ CONFIG_SENSORS_K10TEMP=m CONFIG_SENSORS_FAM15H_POWER=m CONFIG_SENSORS_APPLESMC=m CONFIG_SENSORS_ASB100=m +CONFIG_SENSORS_ASPEED=m CONFIG_SENSORS_ATXP1=m CONFIG_SENSORS_DS620=m CONFIG_SENSORS_DS1621=m @@ -4257,7 +4288,6 @@ CONFIG_SENSORS_TMP103=m CONFIG_SENSORS_TMP108=m CONFIG_SENSORS_TMP401=m CONFIG_SENSORS_TMP421=m -CONFIG_SENSORS_TWL4030_MADC=m CONFIG_SENSORS_VIA_CPUTEMP=m CONFIG_SENSORS_VIA686A=m CONFIG_SENSORS_VT1211=m @@ -4282,6 +4312,7 @@ CONFIG_SENSORS_XGENE=m CONFIG_SENSORS_ACPI_POWER=m CONFIG_SENSORS_ATK0110=m CONFIG_THERMAL=y +CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS=0 CONFIG_THERMAL_HWMON=y CONFIG_THERMAL_WRITABLE_TRIPS=y CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE=y @@ -4470,6 +4501,7 @@ CONFIG_MFD_INTEL_QUARK_I2C_GPIO=m CONFIG_LPC_ICH=m CONFIG_LPC_SCH=m CONFIG_INTEL_SOC_PMIC=y +CONFIG_INTEL_SOC_PMIC_BXTWC=m CONFIG_MFD_INTEL_LPSS=m CONFIG_MFD_INTEL_LPSS_ACPI=m CONFIG_MFD_INTEL_LPSS_PCI=m @@ -4509,6 +4541,7 @@ CONFIG_MFD_SYSCON=y CONFIG_MFD_TI_AM335X_TSCADC=m CONFIG_MFD_LP3943=m CONFIG_MFD_LP8788=y +CONFIG_MFD_TI_LMU=m CONFIG_MFD_PALMAS=y # CONFIG_TPS6105X is not set # CONFIG_TPS65010 is not set 
@@ -4548,6 +4581,7 @@ CONFIG_MFD_WM8350=y CONFIG_MFD_WM8350_I2C=y CONFIG_MFD_WM8994=y # CONFIG_REGULATOR is not set +CONFIG_CEC_CORE=m CONFIG_MEDIA_SUPPORT=m # @@ -4560,8 +4594,7 @@ CONFIG_MEDIA_RADIO_SUPPORT=y CONFIG_MEDIA_SDR_SUPPORT=y CONFIG_MEDIA_RC_SUPPORT=y CONFIG_MEDIA_CEC_SUPPORT=y -# CONFIG_MEDIA_CEC_DEBUG is not set -CONFIG_MEDIA_CEC_EDID=y +# CONFIG_MEDIA_CEC_RC is not set CONFIG_MEDIA_CONTROLLER=y # CONFIG_MEDIA_CONTROLLER_DVB is not set CONFIG_VIDEO_DEV=m @@ -4627,6 +4660,7 @@ CONFIG_RC_LOOPBACK=m CONFIG_IR_GPIO_CIR=m CONFIG_IR_SERIAL=m CONFIG_IR_SERIAL_TRANSMITTER=y +CONFIG_IR_SIR=m CONFIG_MEDIA_USB_SUPPORT=y # @@ -4795,6 +4829,7 @@ CONFIG_USB_MSI2500=m # USB HDMI CEC adapters # CONFIG_USB_PULSE8_CEC=m +CONFIG_USB_RAINSHADOW_CEC=m CONFIG_MEDIA_PCI_SUPPORT=y # @@ -4884,11 +4919,13 @@ CONFIG_V4L_MEM2MEM_DRIVERS=y CONFIG_VIDEO_MEM2MEM_DEINTERLACE=m CONFIG_VIDEO_SH_VEU=m CONFIG_V4L_TEST_DRIVERS=y +CONFIG_VIDEO_VIMC=m CONFIG_VIDEO_VIVID=m CONFIG_VIDEO_VIVID_CEC=y CONFIG_VIDEO_VIVID_MAX_DEVS=64 CONFIG_VIDEO_VIM2M=m CONFIG_DVB_PLATFORM_DRIVERS=y +# CONFIG_CEC_PLATFORM_DRIVERS is not set # # Supported MMC/SDIO adapters @@ -5016,6 +5053,7 @@ CONFIG_VIDEO_ADV7175=m # # Camera sensor devices # +CONFIG_VIDEO_OV2640=m CONFIG_VIDEO_OV7640=m CONFIG_VIDEO_OV7670=m CONFIG_VIDEO_MT9M111=m @@ -5054,7 +5092,6 @@ CONFIG_SOC_CAMERA_MT9M111=m CONFIG_SOC_CAMERA_MT9T031=m CONFIG_SOC_CAMERA_MT9T112=m CONFIG_SOC_CAMERA_MT9V022=m -CONFIG_SOC_CAMERA_OV2640=m CONFIG_SOC_CAMERA_OV5642=m CONFIG_SOC_CAMERA_OV6650=m CONFIG_SOC_CAMERA_OV772X=m @@ -5275,6 +5312,7 @@ CONFIG_DRM_MIPI_DSI=y CONFIG_DRM_KMS_HELPER=m CONFIG_DRM_KMS_FB_HELPER=y CONFIG_DRM_FBDEV_EMULATION=y +CONFIG_DRM_FBDEV_OVERALLOC=100 CONFIG_DRM_LOAD_EDID_FIRMWARE=y CONFIG_DRM_TTM=m CONFIG_DRM_GEM_CMA_HELPER=y @@ -5315,6 +5353,9 @@ CONFIG_DRM_I915_USERPTR=y # CONFIG_DRM_I915_WERROR is not set # CONFIG_DRM_I915_DEBUG is not set # CONFIG_DRM_I915_SW_FENCE_DEBUG_OBJECTS is not set +# CONFIG_DRM_I915_SELFTEST is not set +# CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS is not set +# CONFIG_DRM_I915_DEBUG_VBLANK_EVADE is not set CONFIG_DRM_VGEM=m CONFIG_DRM_VMWGFX=m CONFIG_DRM_VMWGFX_FBCON=y @@ -5508,6 +5549,7 @@ CONFIG_BACKLIGHT_AS3711=m CONFIG_BACKLIGHT_GPIO=m CONFIG_BACKLIGHT_LV5207LP=m CONFIG_BACKLIGHT_BD6107=m +CONFIG_BACKLIGHT_ARCXCNN=m CONFIG_VGASTATE=m CONFIG_HDMI=y @@ -5746,6 +5788,8 @@ CONFIG_SND_FIREWORKS=m CONFIG_SND_BEBOB=m CONFIG_SND_FIREWIRE_DIGI00X=m CONFIG_SND_FIREWIRE_TASCAM=m +CONFIG_SND_FIREWIRE_MOTU=m +CONFIG_SND_FIREFACE=m CONFIG_SND_PCMCIA=y CONFIG_SND_VXPOCKET=m CONFIG_SND_PDAUDIOCF=m @@ -5757,7 +5801,7 @@ CONFIG_SND_SOC_TOPOLOGY=y CONFIG_SND_SOC_AMD_ACP=m CONFIG_SND_ATMEL_SOC=m CONFIG_SND_DESIGNWARE_I2S=m -CONFIG_SND_DESIGNWARE_PCM=m +# CONFIG_SND_DESIGNWARE_PCM is not set # # SoC Audio for Freescale CPUs @@ -5772,6 +5816,7 @@ CONFIG_SND_SOC_FSL_SSI=m CONFIG_SND_SOC_FSL_SPDIF=m CONFIG_SND_SOC_FSL_ESAI=m CONFIG_SND_SOC_IMX_AUDMUX=m +CONFIG_SND_I2S_HI6210_I2S=m CONFIG_SND_SOC_IMG=y CONFIG_SND_SOC_IMG_I2S_IN=m CONFIG_SND_SOC_IMG_I2S_OUT=m @@ -5797,18 +5842,26 @@ CONFIG_SND_SOC_INTEL_BYTCR_RT5651_MACH=m CONFIG_SND_SOC_INTEL_CHT_BSW_RT5672_MACH=m CONFIG_SND_SOC_INTEL_CHT_BSW_RT5645_MACH=m CONFIG_SND_SOC_INTEL_CHT_BSW_MAX98090_TI_MACH=m +CONFIG_SND_SOC_INTEL_BYT_CHT_DA7213_MACH=m +CONFIG_SND_SOC_INTEL_BYT_CHT_NOCODEC_MACH=m CONFIG_SND_SOC_INTEL_SKYLAKE=m CONFIG_SND_SOC_INTEL_SKL_RT286_MACH=m CONFIG_SND_SOC_INTEL_SKL_NAU88L25_SSM4567_MACH=m CONFIG_SND_SOC_INTEL_SKL_NAU88L25_MAX98357A_MACH=m CONFIG_SND_SOC_XTFPGA_I2S=m +CONFIG_ZX_TDM=m 
CONFIG_SND_SOC_I2C_AND_SPI=m # # CODEC drivers # CONFIG_SND_SOC_AC97_CODEC=m +CONFIG_SND_SOC_ADAU_UTILS=m CONFIG_SND_SOC_ADAU1701=m +CONFIG_SND_SOC_ADAU17X1=m +CONFIG_SND_SOC_ADAU1761=m +CONFIG_SND_SOC_ADAU1761_I2C=m +CONFIG_SND_SOC_ADAU1761_SPI=m CONFIG_SND_SOC_ADAU7002=m CONFIG_SND_SOC_AK4104=m CONFIG_SND_SOC_AK4554=m @@ -5820,6 +5873,7 @@ CONFIG_SND_SOC_BT_SCO=m CONFIG_SND_SOC_CS35L32=m CONFIG_SND_SOC_CS35L33=m CONFIG_SND_SOC_CS35L34=m +CONFIG_SND_SOC_CS35L35=m CONFIG_SND_SOC_CS42L42=m CONFIG_SND_SOC_CS42L51=m CONFIG_SND_SOC_CS42L51_I2C=m @@ -5835,9 +5889,12 @@ CONFIG_SND_SOC_CS42XX8=m CONFIG_SND_SOC_CS42XX8_I2C=m CONFIG_SND_SOC_CS4349=m CONFIG_SND_SOC_CS53L30=m +CONFIG_SND_SOC_DA7213=m CONFIG_SND_SOC_DA7219=m +CONFIG_SND_SOC_DIO2125=m CONFIG_SND_SOC_DMIC=m CONFIG_SND_SOC_HDMI_CODEC=m +CONFIG_SND_SOC_ES7134=m CONFIG_SND_SOC_ES8328=m CONFIG_SND_SOC_ES8328_I2C=m CONFIG_SND_SOC_ES8328_SPI=m @@ -5847,6 +5904,7 @@ CONFIG_SND_SOC_INNO_RK3036=m CONFIG_SND_SOC_MAX98090=m CONFIG_SND_SOC_MAX98357A=m CONFIG_SND_SOC_MAX98504=m +CONFIG_SND_SOC_MAX98927=m CONFIG_SND_SOC_MAX9860=m CONFIG_SND_SOC_MSM8916_WCD_ANALOG=m CONFIG_SND_SOC_MSM8916_WCD_DIGITAL=m @@ -5875,6 +5933,7 @@ CONFIG_SND_SOC_RT5677_SPI=m CONFIG_SND_SOC_SGTL5000=m CONFIG_SND_SOC_SIGMADSP=m CONFIG_SND_SOC_SIGMADSP_I2C=m +CONFIG_SND_SOC_SIGMADSP_REGMAP=m CONFIG_SND_SOC_SIRF_AUDIO_CODEC=m CONFIG_SND_SOC_SPDIF=m CONFIG_SND_SOC_SSM2602=m @@ -5918,13 +5977,13 @@ CONFIG_SND_SOC_WM8978=m CONFIG_SND_SOC_WM8985=m CONFIG_SND_SOC_NAU8540=m CONFIG_SND_SOC_NAU8810=m +CONFIG_SND_SOC_NAU8824=m CONFIG_SND_SOC_NAU8825=m CONFIG_SND_SOC_TPA6130A2=m CONFIG_SND_SIMPLE_CARD_UTILS=m CONFIG_SND_SIMPLE_CARD=m CONFIG_SND_X86=y CONFIG_HDMI_LPE_AUDIO=m -# CONFIG_SOUND_PRIME is not set CONFIG_AC97_BUS=m # @@ -5940,6 +5999,7 @@ CONFIG_HID_GENERIC=m # Special HID drivers # CONFIG_HID_A4TECH=m +CONFIG_HID_ACCUTOUCH=m CONFIG_HID_ACRUX=m CONFIG_HID_ACRUX_FF=y CONFIG_HID_APPLE=m @@ -5989,6 +6049,7 @@ CONFIG_HID_MAYFLASH=m CONFIG_HID_MICROSOFT=m CONFIG_HID_MONTEREY=m CONFIG_HID_MULTITOUCH=m +CONFIG_HID_NTI=m CONFIG_HID_NTRIG=m CONFIG_HID_ORTEK=m CONFIG_HID_PANTHERLORD=m @@ -6055,6 +6116,7 @@ CONFIG_USB_SUPPORT=y CONFIG_USB_COMMON=y CONFIG_USB_ARCH_HAS_HCD=y CONFIG_USB=y +CONFIG_USB_PCI=y CONFIG_USB_ANNOUNCE_NEW_DEVICES=y # @@ -6319,6 +6381,7 @@ CONFIG_USB_R8A66597=m CONFIG_USB_PXA27X=m CONFIG_USB_MV_UDC=m CONFIG_USB_MV_U3D=m +CONFIG_USB_SNP_CORE=m # CONFIG_USB_M66592 is not set CONFIG_USB_BDC_UDC=m @@ -6401,6 +6464,11 @@ CONFIG_USB_G_DBGP=m # CONFIG_USB_G_DBGP_PRINTK is not set CONFIG_USB_G_DBGP_SERIAL=y CONFIG_USB_G_WEBCAM=m + +# +# USB Power Delivery and Type-C drivers +# +CONFIG_TYPEC=m CONFIG_USB_LED_TRIG=y CONFIG_USB_ULPI_BUS=m CONFIG_UWB=m @@ -6435,6 +6503,7 @@ CONFIG_MMC_REALTEK_PCI=m CONFIG_MMC_REALTEK_USB=m CONFIG_MMC_TOSHIBA_PCI=m CONFIG_MMC_MTK=m +CONFIG_MMC_SDHCI_XENON=m CONFIG_MEMSTICK=m # CONFIG_MEMSTICK_DEBUG is not set @@ -6464,6 +6533,7 @@ CONFIG_LEDS_BRIGHTNESS_HW_CHANGED=y CONFIG_LEDS_88PM860X=m CONFIG_LEDS_LM3530=m CONFIG_LEDS_LM3642=m +CONFIG_LEDS_MT6323=m CONFIG_LEDS_NET48XX=m CONFIG_LEDS_WRAP=m CONFIG_LEDS_PCA9532=m @@ -6491,7 +6561,6 @@ CONFIG_LEDS_PWM=m CONFIG_LEDS_INTEL_SS4200=m CONFIG_LEDS_LT3593=m CONFIG_LEDS_ADP5520=m -CONFIG_LEDS_DELL_NETBOOKS=m CONFIG_LEDS_MC13783=m CONFIG_LEDS_TCA6507=m CONFIG_LEDS_TLC591XX=m @@ -6534,7 +6603,7 @@ CONFIG_EDAC=y CONFIG_EDAC_LEGACY_SYSFS=y # CONFIG_EDAC_DEBUG is not set CONFIG_EDAC_DECODE_MCE=y -CONFIG_EDAC_MM_EDAC=m +# CONFIG_EDAC_GHES is not set CONFIG_EDAC_AMD64=m # CONFIG_EDAC_AMD64_ERROR_INJECTION is not 
set CONFIG_EDAC_AMD76X=m @@ -6703,7 +6772,6 @@ CONFIG_DW_DMAC_CORE=m CONFIG_DW_DMAC=m CONFIG_DW_DMAC_PCI=m CONFIG_HSU_DMA=m -CONFIG_HSU_DMA_PCI=m # # DMA Clients @@ -6717,12 +6785,18 @@ CONFIG_ASYNC_TX_DMA=y CONFIG_SYNC_FILE=y CONFIG_SW_SYNC=y CONFIG_AUXDISPLAY=y +CONFIG_CHARLCD=m +CONFIG_HD44780=m CONFIG_KS0108=m CONFIG_KS0108_PORT=0x378 CONFIG_KS0108_DELAY=2 CONFIG_CFAG12864B=m CONFIG_CFAG12864B_RATE=20 # CONFIG_IMG_ASCII_LCD is not set +CONFIG_PANEL=m +CONFIG_PANEL_PARPORT=0 +CONFIG_PANEL_PROFILE=5 +# CONFIG_PANEL_CHANGE_MESSAGE is not set CONFIG_UIO=m CONFIG_UIO_CIF=m # CONFIG_UIO_PDRV_GENIRQ is not set @@ -6747,7 +6821,7 @@ CONFIG_VFIO_MDEV=m CONFIG_VFIO_MDEV_DEVICE=m CONFIG_IRQ_BYPASS_MANAGER=m CONFIG_VIRT_DRIVERS=y -CONFIG_VIRTIO=y +CONFIG_VIRTIO=m # # Virtio drivers @@ -6912,6 +6986,7 @@ CONFIG_RTLLIB_CRYPTO_CCMP=m CONFIG_RTLLIB_CRYPTO_TKIP=m CONFIG_RTLLIB_CRYPTO_WEP=m CONFIG_RTL8192E=m +CONFIG_RTL8723BS=m CONFIG_R8712U=m CONFIG_R8188EU=m CONFIG_88EU_AP_MODE=y @@ -7022,14 +7097,23 @@ CONFIG_SPEAKUP_SYNTH_SPKOUT=m CONFIG_SPEAKUP_SYNTH_TXPRT=m CONFIG_SPEAKUP_SYNTH_DUMMY=m CONFIG_STAGING_MEDIA=y +CONFIG_INTEL_ATOMISP=y +CONFIG_VIDEO_ATOMISP=m +CONFIG_VIDEO_OV5693=m +CONFIG_VIDEO_OV2722=m +CONFIG_VIDEO_GC2235=m +CONFIG_VIDEO_OV8858=m +CONFIG_VIDEO_MSRLIST_HELPER=m +CONFIG_VIDEO_MT9M114=m +CONFIG_VIDEO_AP1302=m +CONFIG_VIDEO_GC0310=m +CONFIG_VIDEO_OV2680=m +CONFIG_VIDEO_LM3554=m +CONFIG_VIDEO_IMX=m + CONFIG_I2C_BCM2048=m CONFIG_DVB_CXD2099=m CONFIG_LIRC_STAGING=y -CONFIG_LIRC_BT829=m -CONFIG_LIRC_IMON=m -CONFIG_LIRC_PARALLEL=m -CONFIG_LIRC_SASEM=m -CONFIG_LIRC_SIR=m CONFIG_LIRC_ZILOG=m # @@ -7065,6 +7149,13 @@ CONFIG_HDM_I2C=m CONFIG_HDM_USB=m # CONFIG_KS7010 is not set # CONFIG_GREYBUS is not set + +# +# USB Power Delivery and Type-C drivers +# +CONFIG_TYPEC_TCPM=m +CONFIG_TYPEC_TCPCI=m +CONFIG_TYPEC_FUSB302=m CONFIG_X86_PLATFORM_DEVICES=y CONFIG_ACER_WMI=m CONFIG_ACERHDF=m @@ -7074,6 +7165,7 @@ CONFIG_DELL_SMBIOS=m CONFIG_DELL_LAPTOP=m CONFIG_DELL_WMI=m CONFIG_DELL_WMI_AIO=m +CONFIG_DELL_WMI_LED=m CONFIG_DELL_SMO8800=m CONFIG_DELL_RBTN=m CONFIG_FUJITSU_LAPTOP=m @@ -7113,6 +7205,7 @@ CONFIG_TOSHIBA_BT_RFKILL=m CONFIG_TOSHIBA_HAPS=m CONFIG_TOSHIBA_WMI=m CONFIG_ACPI_CMPC=m +CONFIG_INTEL_CHT_INT33FE=m CONFIG_INTEL_HID_EVENT=m CONFIG_INTEL_VBTN=m CONFIG_INTEL_IPS=m @@ -7207,6 +7300,10 @@ CONFIG_REMOTEPROC=m # # Broadcom SoC drivers # + +# +# i.MX SoC drivers +# # CONFIG_SUNXI_SRAM is not set CONFIG_SOC_TI=y CONFIG_SOC_ZTE=y @@ -7262,6 +7359,9 @@ CONFIG_IIO_TRIGGERED_EVENT=m # # Accelerometers # +CONFIG_ADXL345=m +CONFIG_ADXL345_I2C=m +CONFIG_ADXL345_SPI=m CONFIG_BMA180=m CONFIG_BMA220=m CONFIG_BMC150_ACCEL=m @@ -7305,22 +7405,25 @@ CONFIG_AD7793=m CONFIG_AD7887=m CONFIG_AD7923=m CONFIG_AD799X=m +CONFIG_AXP20X_ADC=m CONFIG_AXP288_ADC=m CONFIG_HI8435=m CONFIG_HX711=m CONFIG_INA2XX_ADC=m CONFIG_LP8788_ADC=m CONFIG_LTC2485=m +CONFIG_LTC2497=m CONFIG_MAX1027=m CONFIG_MAX11100=m +CONFIG_MAX1118=m CONFIG_MAX1363=m +CONFIG_MAX9611=m CONFIG_MCP320X=m CONFIG_MCP3422=m CONFIG_NAU7802=m CONFIG_PALMAS_GPADC=m CONFIG_QCOM_SPMI_IADC=m # CONFIG_QCOM_SPMI_VADC is not set -CONFIG_STX104=m CONFIG_TI_ADC081C=m CONFIG_TI_ADC0832=m CONFIG_TI_ADC12138=m @@ -7367,7 +7470,6 @@ CONFIG_IIO_ST_SENSORS_CORE=m # # Counters # -CONFIG_104_QUAD_8=m # # Digital to analog converters @@ -7383,6 +7485,7 @@ CONFIG_AD5592R=m CONFIG_AD5593R=m CONFIG_AD5504=m CONFIG_AD5624R_SPI=m +CONFIG_LTC2632=m CONFIG_AD5686=m CONFIG_AD5755=m CONFIG_AD5761=m @@ -7443,6 +7546,7 @@ CONFIG_ITG3200=m CONFIG_AFE4403=m CONFIG_AFE4404=m 
CONFIG_MAX30100=m +CONFIG_MAX30102=m # # Humidity sensors @@ -7450,6 +7554,7 @@ CONFIG_MAX30100=m CONFIG_AM2315=m CONFIG_DHT11=m CONFIG_HDC100X=m +CONFIG_HID_SENSOR_HUMIDITY=m CONFIG_HTS221=m CONFIG_HTS221_I2C=m CONFIG_HTS221_SPI=m @@ -7489,6 +7594,7 @@ CONFIG_CM32181=m CONFIG_CM3232=m CONFIG_CM3323=m CONFIG_CM36651=m +CONFIG_IIO_CROS_EC_LIGHT_PROX=m CONFIG_GP2AP020A00F=m CONFIG_SENSORS_ISL29018=m CONFIG_ISL29125=m @@ -7510,6 +7616,7 @@ CONFIG_TSL4531=m CONFIG_US5182D=m CONFIG_VCNL4000=m CONFIG_VEML6070=m +CONFIG_VL6180=m # # Magnetometer sensors @@ -7585,6 +7692,7 @@ CONFIG_IIO_CROS_EC_BARO=m # Proximity and distance sensors # # CONFIG_LIDAR_LITE_V2 is not set +CONFIG_SRF04=m # CONFIG_SX9500 is not set CONFIG_SRF08=m @@ -7592,6 +7700,7 @@ CONFIG_SRF08=m # Temperature sensors # # CONFIG_MAXIM_THERMOCOUPLE is not set +CONFIG_HID_SENSOR_TEMP=m CONFIG_MLX90614=m CONFIG_TMP006=m CONFIG_TMP007=m @@ -7639,6 +7748,7 @@ CONFIG_IPACK_BUS=m CONFIG_RESET_CONTROLLER=y # CONFIG_RESET_ATH79 is not set # CONFIG_RESET_BERLIN is not set +# CONFIG_RESET_IMX7 is not set # CONFIG_RESET_LPC18XX is not set # CONFIG_RESET_MESON is not set # CONFIG_RESET_PISTACHIO is not set @@ -7677,6 +7787,7 @@ CONFIG_INTEL_RAPL=m # CONFIG_RAS=y CONFIG_MCE_AMD_INJ=m +# CONFIG_RAS_CEC is not set CONFIG_THUNDERBOLT=m # @@ -7689,8 +7800,8 @@ CONFIG_ND_BLK=y CONFIG_ND_CLAIM=y CONFIG_ND_BTT=y CONFIG_BTT=y +CONFIG_DAX=y CONFIG_DEV_DAX=m -CONFIG_NR_DEV_DAX=32768 CONFIG_NVMEM=m CONFIG_STM=m CONFIG_STM_DUMMY=m @@ -7709,6 +7820,8 @@ CONFIG_INTEL_TH_PTI=m # FPGA Configuration Support # CONFIG_FPGA=m +CONFIG_FPGA_MGR_XILINX_SPI=m +CONFIG_ALTERA_PR_IP_CORE=m # # FSI support @@ -7926,6 +8039,7 @@ CONFIG_UBIFS_FS_LZO=y CONFIG_UBIFS_FS_ZLIB=y # CONFIG_UBIFS_ATIME_SUPPORT is not set CONFIG_UBIFS_FS_ENCRYPTION=y +CONFIG_UBIFS_FS_SECURITY=y CONFIG_CRAMFS=y CONFIG_SQUASHFS=y # CONFIG_SQUASHFS_FILE_CACHE is not set @@ -7998,7 +8112,6 @@ CONFIG_NFS_V4_1=y CONFIG_NFS_V4_2=y CONFIG_PNFS_FILE_LAYOUT=m CONFIG_PNFS_BLOCK=m -CONFIG_PNFS_OBJLAYOUT=m CONFIG_PNFS_FLEXFILE_LAYOUT=m CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN="kernel.org" CONFIG_NFS_V4_1_MIGRATION=y @@ -8335,6 +8448,7 @@ CONFIG_STRICT_DEVMEM=y CONFIG_EARLY_PRINTK=y # CONFIG_EARLY_PRINTK_DBGP is not set # CONFIG_EARLY_PRINTK_EFI is not set +# CONFIG_EARLY_PRINTK_USB_XDBC is not set # CONFIG_X86_PTDUMP_CORE is not set # CONFIG_X86_PTDUMP is not set # CONFIG_EFI_PGT_DUMP is not set @@ -8596,6 +8710,7 @@ CONFIG_PKCS7_MESSAGE_PARSER=y # Certificates for signature checking # # CONFIG_SYSTEM_TRUSTED_KEYRING is not set +# CONFIG_SYSTEM_BLACKLIST_KEYRING is not set CONFIG_KVM_MMIO=y CONFIG_KVM_ASYNC_PF=y CONFIG_KVM_VFIO=y @@ -8605,7 +8720,6 @@ CONFIG_KVM=m CONFIG_KVM_INTEL=m CONFIG_KVM_AMD=m CONFIG_KVM_MMU_AUDIT=y -CONFIG_KVM_DEVICE_ASSIGNMENT=y CONFIG_VHOST_NET=m CONFIG_VHOST_SCSI=m CONFIG_VHOST_VSOCK=m diff --git a/kernel-nrj-desktop-x86_64.config b/kernel-nrj-desktop-x86_64.config index 3ddfb93..98006be 100644 --- a/kernel-nrj-desktop-x86_64.config +++ b/kernel-nrj-desktop-x86_64.config @@ -29,6 +29,7 @@ CONFIG_INFINIBAND_SRP=m CONFIG_INFINIBAND_SRPT=m CONFIG_INFINIBAND_ISER=m CONFIG_INFINIBAND_ISERT=m +CONFIG_INFINIBAND_OPA_VNIC=m CONFIG_INFINIBAND_RDMAVT=m # CONFIG_INFINIBAND_QEDR is not set CONFIG_INFINIBAND_BNXT_RE=m diff --git a/kernel-x86_64.config b/kernel-x86_64.config index 618278e..2966941 100644 --- a/kernel-x86_64.config +++ b/kernel-x86_64.config @@ -1,6 +1,6 @@ # # Automatically generated file; DO NOT EDIT. 
-# Linux/x86 4.11.x-nrj-desktop Kernel Configuration +# Linux/x86 4.12.x-nrj-desktop Kernel Configuration # CONFIG_64BIT=y CONFIG_X86_64=y @@ -269,6 +269,7 @@ CONFIG_BLK_DEV_BSGLIB=y CONFIG_BLK_DEV_INTEGRITY=y CONFIG_BLK_DEV_ZONED=y CONFIG_BLK_DEV_THROTTLING=y +# CONFIG_BLK_DEV_THROTTLING_LOW is not set # CONFIG_BLK_CMDLINE_PARSER is not set CONFIG_BLK_WBT=y # CONFIG_BLK_WBT_SQ is not set @@ -311,14 +312,14 @@ CONFIG_IOSCHED_NOOP=y CONFIG_IOSCHED_DEADLINE=y CONFIG_IOSCHED_CFQ=y CONFIG_CFQ_GROUP_IOSCHED=y +# CONFIG_DEFAULT_DEADLINE is not set +CONFIG_DEFAULT_CFQ=y +# CONFIG_DEFAULT_NOOP is not set +CONFIG_DEFAULT_IOSCHED="cfq" +CONFIG_MQ_IOSCHED_DEADLINE=y +CONFIG_MQ_IOSCHED_KYBER=y CONFIG_IOSCHED_BFQ=y CONFIG_BFQ_GROUP_IOSCHED=y -# CONFIG_DEFAULT_DEADLINE is not set -# CONFIG_DEFAULT_CFQ is not set -CONFIG_DEFAULT_BFQ=y -# CONFIG_DEFAULT_NOOP is not set -CONFIG_DEFAULT_IOSCHED="bfq" -CONFIG_MQ_IOSCHED_DEADLINE=y CONFIG_PREEMPT_NOTIFIERS=y CONFIG_PADATA=y CONFIG_ASN1=y @@ -358,8 +359,11 @@ CONFIG_PARAVIRT=y # CONFIG_PARAVIRT_DEBUG is not set # CONFIG_PARAVIRT_SPINLOCKS is not set CONFIG_XEN=y +CONFIG_XEN_PV=y +CONFIG_XEN_PV_SMP=y CONFIG_XEN_DOM0=y CONFIG_XEN_PVHVM=y +CONFIG_XEN_PVHVM_SMP=y CONFIG_XEN_512GB=y CONFIG_XEN_SAVE_RESTORE=y # CONFIG_XEN_DEBUG_FS is not set @@ -405,6 +409,7 @@ CONFIG_X86_LOCAL_APIC=y CONFIG_X86_IO_APIC=y CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y CONFIG_X86_MCE=y +# CONFIG_X86_MCELOG_LEGACY is not set CONFIG_X86_MCE_INTEL=y CONFIG_X86_MCE_AMD=y CONFIG_X86_MCE_THRESHOLD=y @@ -698,10 +703,24 @@ CONFIG_HOTPLUG_PCI_CPCI_GENERIC=m CONFIG_HOTPLUG_PCI_SHPC=m # -# PCI host controller drivers +# DesignWare PCI Core Support # # CONFIG_PCIE_DW_PLAT is not set + +# +# PCI host controller drivers +# # CONFIG_VMD is not set + +# +# PCI Endpoint +# +# CONFIG_PCI_ENDPOINT is not set + +# +# PCI switch controller drivers +# +CONFIG_PCI_SW_SWITCHTEC=m # CONFIG_ISA_BUS is not set CONFIG_ISA_DMA_API=y CONFIG_AMD_NB=y @@ -760,7 +779,6 @@ CONFIG_COMPAT_32=y CONFIG_COMPAT=y CONFIG_COMPAT_FOR_U64_ALIGNMENT=y CONFIG_SYSVIPC_COMPAT=y -CONFIG_KEYS_COMPAT=y CONFIG_X86_DEV_DMA_OPS=y CONFIG_NET=y CONFIG_COMPAT_NETLINK_MESSAGES=y @@ -1282,6 +1300,8 @@ CONFIG_NET_DSA_TAG_DSA=y CONFIG_NET_DSA_TAG_EDSA=y CONFIG_NET_DSA_TAG_TRAILER=y CONFIG_NET_DSA_TAG_QCA=y +CONFIG_NET_DSA_TAG_MTK=y +CONFIG_NET_DSA_TAG_LAN9303=y CONFIG_VLAN_8021Q=m CONFIG_VLAN_8021Q_GVRP=y CONFIG_VLAN_8021Q_MVRP=y @@ -1348,6 +1368,7 @@ CONFIG_NET_SCH_HHF=m CONFIG_NET_SCH_PIE=m CONFIG_NET_SCH_INGRESS=m CONFIG_NET_SCH_PLUG=m +# CONFIG_NET_SCH_DEFAULT is not set # # Classification @@ -1461,6 +1482,7 @@ CONFIG_CAN_GW=m # CAN Device Drivers # CONFIG_CAN_VCAN=m +CONFIG_CAN_VXCAN=m CONFIG_CAN_SLCAN=m CONFIG_CAN_DEV=m # CONFIG_CAN_CALC_BITTIMING is not set @@ -1474,6 +1496,7 @@ CONFIG_CAN_CC770_ISA=m CONFIG_CAN_CC770_PLATFORM=m CONFIG_CAN_IFI_CANFD=m CONFIG_CAN_M_CAN=m +CONFIG_CAN_PEAK_PCIEFD=m CONFIG_CAN_SJA1000=m CONFIG_CAN_SJA1000_ISA=m CONFIG_CAN_SJA1000_PLATFORM=m @@ -1490,6 +1513,7 @@ CONFIG_CAN_SOFTING_CS=m # # CAN SPI interfaces # +CONFIG_CAN_HI311X=m CONFIG_CAN_MCP251X=m # @@ -1501,6 +1525,7 @@ CONFIG_CAN_GS_USB=m CONFIG_CAN_KVASER_USB=m CONFIG_CAN_PEAK_USB=m CONFIG_CAN_8DEV_USB=m +CONFIG_CAN_MCBA_USB=m # CONFIG_CAN_DEBUG_DEVICES is not set CONFIG_IRDA=m @@ -1586,7 +1611,9 @@ CONFIG_BT_HCIBTUSB_BCM=y CONFIG_BT_HCIBTUSB_RTL=y CONFIG_BT_HCIBTSDIO=m CONFIG_BT_HCIUART=m +CONFIG_BT_HCIUART_SERDEV=y CONFIG_BT_HCIUART_H4=y +CONFIG_BT_HCIUART_NOKIA=m CONFIG_BT_HCIUART_BCSP=y CONFIG_BT_HCIUART_ATH3K=y CONFIG_BT_HCIUART_LL=y @@ -1659,6 
+1686,7 @@ CONFIG_RFKILL_INPUT=y CONFIG_RFKILL_GPIO=m CONFIG_NET_9P=m CONFIG_NET_9P_VIRTIO=m +CONFIG_NET_9P_XEN=m # CONFIG_NET_9P_DEBUG is not set CONFIG_CAIF=m # CONFIG_CAIF_DEBUG is not set @@ -2010,10 +2038,7 @@ CONFIG_VMWARE_BALLOON=m CONFIG_USB_SWITCH_FSA9480=m CONFIG_LATTICE_ECP3_CONFIG=m CONFIG_SRAM=y -CONFIG_PANEL=m -CONFIG_PANEL_PARPORT=0 -CONFIG_PANEL_PROFILE=5 -# CONFIG_PANEL_CHANGE_MESSAGE is not set +# CONFIG_PCI_ENDPOINT_TEST is not set CONFIG_C2PORT=m CONFIG_C2PORT_DURAMAR_2150=m @@ -2375,7 +2400,6 @@ CONFIG_DM_SNAPSHOT=m CONFIG_DM_THIN_PROVISIONING=m CONFIG_DM_CACHE=m CONFIG_DM_CACHE_SMQ=m -CONFIG_DM_CACHE_CLEANER=m CONFIG_DM_ERA=m CONFIG_DM_MIRROR=m CONFIG_DM_LOG_USERSPACE=m @@ -2391,6 +2415,7 @@ CONFIG_DM_VERITY=m # CONFIG_DM_VERITY_FEC is not set CONFIG_DM_SWITCH=m CONFIG_DM_LOG_WRITES=m +CONFIG_DM_INTEGRITY=m CONFIG_TARGET_CORE=m CONFIG_TCM_IBLOCK=m CONFIG_TCM_FILEIO=m @@ -2449,6 +2474,7 @@ CONFIG_VETH=m CONFIG_VIRTIO_NET=m CONFIG_NLMON=m CONFIG_NET_VRF=m +CONFIG_VSOCKMON=m CONFIG_SUNGEM_PHY=m # @@ -2473,6 +2499,11 @@ CONFIG_B53_SRAB_DRIVER=m CONFIG_NET_DSA_MV88E6XXX=m CONFIG_NET_DSA_MV88E6XXX_GLOBAL2=y CONFIG_NET_DSA_QCA8K=m +CONFIG_NET_DSA_LOOP=m +CONFIG_NET_DSA_MT7530=m +CONFIG_NET_DSA_SMSC_LAN9303=m +CONFIG_NET_DSA_SMSC_LAN9303_I2C=m +CONFIG_NET_DSA_SMSC_LAN9303_MDIO=m CONFIG_ETHERNET=y CONFIG_MDIO=m CONFIG_NET_VENDOR_3COM=y @@ -2521,6 +2552,7 @@ CONFIG_BCMGENET=m CONFIG_BNX2=m CONFIG_CNIC=m CONFIG_TIGON3=m +CONFIG_TIGON3_HWMON=y CONFIG_BNX2X=m CONFIG_BNX2X_SRIOV=y CONFIG_BNXT=m @@ -2614,6 +2646,7 @@ CONFIG_MLX4_CORE=m CONFIG_MLX5_CORE=m CONFIG_MLX5_CORE_EN=y CONFIG_MLX5_CORE_EN_DCB=y +# CONFIG_MLX5_CORE_IPOIB is not set CONFIG_MLXSW_CORE=m CONFIG_MLXSW_CORE_HWMON=y CONFIG_MLXSW_CORE_THERMAL=y @@ -2734,6 +2767,9 @@ CONFIG_WIZNET_BUS_ANY=y CONFIG_WIZNET_W5100_SPI=m CONFIG_NET_VENDOR_XIRCOM=y CONFIG_PCMCIA_XIRC2PS=m +CONFIG_NET_VENDOR_SYNOPSYS=y +CONFIG_DWC_XLGMAC=m +CONFIG_DWC_XLGMAC_PCI=m # CONFIG_HIPPI is not set CONFIG_NET_SB1000=m CONFIG_PHYLIB=y @@ -2743,7 +2779,7 @@ CONFIG_LED_TRIGGER_PHY=y # # MDIO bus device drivers # -CONFIG_MDIO_BCM_UNIMAC=m +CONFIG_MDIO_DEVICE=y CONFIG_MDIO_BITBANG=m CONFIG_MDIO_CAVIUM=m CONFIG_MDIO_GPIO=m @@ -3105,6 +3141,8 @@ CONFIG_IEEE802154_MRF24J40=m CONFIG_IEEE802154_CC2520=m CONFIG_IEEE802154_ATUSB=m CONFIG_IEEE802154_ADF7242=m +CONFIG_IEEE802154_CA8210=m +# CONFIG_IEEE802154_CA8210_DEBUGFS is not set CONFIG_XEN_NETDEV_FRONTEND=y CONFIG_XEN_NETDEV_BACKEND=m CONFIG_VMXNET3=m @@ -3226,6 +3264,7 @@ CONFIG_ISDN_HDLC=m CONFIG_NVM=y # CONFIG_NVM_DEBUG is not set CONFIG_NVM_RRPC=m +CONFIG_NVM_PBLK=m # # Input device support @@ -3285,6 +3324,7 @@ CONFIG_MOUSE_PS2_ALPS=y CONFIG_MOUSE_PS2_BYD=y CONFIG_MOUSE_PS2_LOGIPS2PP=y CONFIG_MOUSE_PS2_SYNAPTICS=y +CONFIG_MOUSE_PS2_SYNAPTICS_SMBUS=y CONFIG_MOUSE_PS2_CYPRESS=y CONFIG_MOUSE_PS2_LIFEBOOK=y CONFIG_MOUSE_PS2_TRACKPOINT=y @@ -3293,6 +3333,7 @@ CONFIG_MOUSE_PS2_SENTELIC=y CONFIG_MOUSE_PS2_TOUCHKIT=y CONFIG_MOUSE_PS2_FOCALTECH=y CONFIG_MOUSE_PS2_VMMOUSE=y +CONFIG_MOUSE_PS2_SMBUS=y CONFIG_MOUSE_SERIAL=m CONFIG_MOUSE_APPLETOUCH=m CONFIG_MOUSE_BCM5974=m @@ -3335,6 +3376,8 @@ CONFIG_JOYSTICK_XPAD=m CONFIG_JOYSTICK_XPAD_FF=y CONFIG_JOYSTICK_XPAD_LEDS=y CONFIG_JOYSTICK_WALKERA0701=m +CONFIG_JOYSTICK_PSXPAD_SPI=m +# CONFIG_JOYSTICK_PSXPAD_SPI_FF is not set CONFIG_INPUT_TABLET=y CONFIG_TABLET_USB_ACECAD=m CONFIG_TABLET_USB_AIPTEK=m @@ -3423,6 +3466,7 @@ CONFIG_TOUCHSCREEN_TSC200X_CORE=m CONFIG_TOUCHSCREEN_TSC2004=m CONFIG_TOUCHSCREEN_TSC2005=m CONFIG_TOUCHSCREEN_TSC2007=m +# 
CONFIG_TOUCHSCREEN_TSC2007_IIO is not set CONFIG_TOUCHSCREEN_PCAP=m CONFIG_TOUCHSCREEN_RM_TS=m CONFIG_TOUCHSCREEN_SILEAD=m @@ -3592,6 +3636,7 @@ CONFIG_SERIAL_KGDB_NMI=y CONFIG_SERIAL_MAX3100=m CONFIG_SERIAL_MAX310X=y CONFIG_SERIAL_UARTLITE=m +CONFIG_SERIAL_UARTLITE_NR_UARTS=1 CONFIG_SERIAL_CORE=y CONFIG_SERIAL_CORE_CONSOLE=y CONFIG_CONSOLE_POLL=y @@ -3690,6 +3735,7 @@ CONFIG_I2C_MUX=m # Multiplexer I2C Chip support # CONFIG_I2C_MUX_GPIO=m +CONFIG_I2C_MUX_LTC4306=m CONFIG_I2C_MUX_PCA9541=m CONFIG_I2C_MUX_PCA954x=m CONFIG_I2C_MUX_PINCTRL=m @@ -3977,6 +4023,7 @@ CONFIG_W1_SLAVE_DS2423=m CONFIG_W1_SLAVE_DS2431=m CONFIG_W1_SLAVE_DS2433=m # CONFIG_W1_SLAVE_DS2433_CRC is not set +CONFIG_W1_SLAVE_DS2438=m CONFIG_W1_SLAVE_DS2760=m CONFIG_W1_SLAVE_DS2780=m CONFIG_W1_SLAVE_DS2781=m @@ -4007,6 +4054,9 @@ CONFIG_BATTERY_DA9030=m CONFIG_BATTERY_DA9052=m CONFIG_CHARGER_DA9150=m CONFIG_BATTERY_DA9150=m +CONFIG_CHARGER_AXP20X=m +CONFIG_BATTERY_AXP20X=m +CONFIG_AXP20X_POWER=m CONFIG_AXP288_CHARGER=m CONFIG_AXP288_FUEL_GAUGE=m CONFIG_BATTERY_MAX17040=m @@ -4034,7 +4084,6 @@ CONFIG_CHARGER_TPS65217=m CONFIG_BATTERY_GAUGE_LTC2941=m CONFIG_BATTERY_RT5033=m CONFIG_CHARGER_RT9455=m -CONFIG_AXP20X_POWER=m CONFIG_HWMON=y CONFIG_HWMON_VID=m # CONFIG_HWMON_DEBUG_CHIP is not set @@ -4066,6 +4115,7 @@ CONFIG_SENSORS_K10TEMP=m CONFIG_SENSORS_FAM15H_POWER=m CONFIG_SENSORS_APPLESMC=m CONFIG_SENSORS_ASB100=m +CONFIG_SENSORS_ASPEED=m CONFIG_SENSORS_ATXP1=m CONFIG_SENSORS_DS620=m CONFIG_SENSORS_DS1621=m @@ -4212,6 +4262,7 @@ CONFIG_SENSORS_XGENE=m CONFIG_SENSORS_ACPI_POWER=m CONFIG_SENSORS_ATK0110=m CONFIG_THERMAL=y +CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS=0 CONFIG_THERMAL_HWMON=y CONFIG_THERMAL_WRITABLE_TRIPS=y CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE=y @@ -4388,6 +4439,7 @@ CONFIG_MFD_INTEL_QUARK_I2C_GPIO=m CONFIG_LPC_ICH=m CONFIG_LPC_SCH=m CONFIG_INTEL_SOC_PMIC=y +CONFIG_INTEL_SOC_PMIC_BXTWC=m CONFIG_MFD_INTEL_LPSS=m CONFIG_MFD_INTEL_LPSS_ACPI=m CONFIG_MFD_INTEL_LPSS_PCI=m @@ -4430,6 +4482,7 @@ CONFIG_MFD_SYSCON=y CONFIG_MFD_TI_AM335X_TSCADC=m CONFIG_MFD_LP3943=m CONFIG_MFD_LP8788=y +CONFIG_MFD_TI_LMU=m CONFIG_MFD_PALMAS=y # CONFIG_TPS6105X is not set # CONFIG_TPS65010 is not set @@ -4468,6 +4521,7 @@ CONFIG_MFD_WM8350=y CONFIG_MFD_WM8350_I2C=y CONFIG_MFD_WM8994=y # CONFIG_REGULATOR is not set +CONFIG_CEC_CORE=m CONFIG_MEDIA_SUPPORT=m # @@ -4480,8 +4534,7 @@ CONFIG_MEDIA_RADIO_SUPPORT=y CONFIG_MEDIA_SDR_SUPPORT=y CONFIG_MEDIA_RC_SUPPORT=y CONFIG_MEDIA_CEC_SUPPORT=y -# CONFIG_MEDIA_CEC_DEBUG is not set -CONFIG_MEDIA_CEC_EDID=y +# CONFIG_MEDIA_CEC_RC is not set CONFIG_MEDIA_CONTROLLER=y # CONFIG_MEDIA_CONTROLLER_DVB is not set CONFIG_VIDEO_DEV=m @@ -4547,6 +4600,7 @@ CONFIG_RC_LOOPBACK=m CONFIG_IR_GPIO_CIR=m CONFIG_IR_SERIAL=m CONFIG_IR_SERIAL_TRANSMITTER=y +CONFIG_IR_SIR=m CONFIG_MEDIA_USB_SUPPORT=y # @@ -4715,6 +4769,7 @@ CONFIG_USB_MSI2500=m # USB HDMI CEC adapters # CONFIG_USB_PULSE8_CEC=m +CONFIG_USB_RAINSHADOW_CEC=m CONFIG_MEDIA_PCI_SUPPORT=y # @@ -4797,11 +4852,13 @@ CONFIG_V4L_MEM2MEM_DRIVERS=y CONFIG_VIDEO_MEM2MEM_DEINTERLACE=m CONFIG_VIDEO_SH_VEU=m CONFIG_V4L_TEST_DRIVERS=y +CONFIG_VIDEO_VIMC=m CONFIG_VIDEO_VIVID=m CONFIG_VIDEO_VIVID_CEC=y CONFIG_VIDEO_VIVID_MAX_DEVS=64 CONFIG_VIDEO_VIM2M=m CONFIG_DVB_PLATFORM_DRIVERS=y +# CONFIG_CEC_PLATFORM_DRIVERS is not set # # Supported MMC/SDIO adapters @@ -4906,6 +4963,7 @@ CONFIG_VIDEO_SAA7127=m # # Camera sensor devices # +CONFIG_VIDEO_OV2640=m CONFIG_VIDEO_OV7640=m CONFIG_VIDEO_OV7670=m CONFIG_VIDEO_MT9M111=m @@ -4944,7 +5002,6 @@ CONFIG_SOC_CAMERA_MT9M111=m 
CONFIG_SOC_CAMERA_MT9T031=m CONFIG_SOC_CAMERA_MT9T112=m CONFIG_SOC_CAMERA_MT9V022=m -CONFIG_SOC_CAMERA_OV2640=m CONFIG_SOC_CAMERA_OV5642=m CONFIG_SOC_CAMERA_OV6650=m CONFIG_SOC_CAMERA_OV772X=m @@ -5159,6 +5216,7 @@ CONFIG_DRM_MIPI_DSI=y CONFIG_DRM_KMS_HELPER=m CONFIG_DRM_KMS_FB_HELPER=y CONFIG_DRM_FBDEV_EMULATION=y +CONFIG_DRM_FBDEV_OVERALLOC=100 CONFIG_DRM_LOAD_EDID_FIRMWARE=y CONFIG_DRM_TTM=m CONFIG_DRM_GEM_CMA_HELPER=y @@ -5200,6 +5258,9 @@ CONFIG_DRM_I915_USERPTR=y # CONFIG_DRM_I915_WERROR is not set # CONFIG_DRM_I915_DEBUG is not set # CONFIG_DRM_I915_SW_FENCE_DEBUG_OBJECTS is not set +# CONFIG_DRM_I915_SELFTEST is not set +# CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS is not set +# CONFIG_DRM_I915_DEBUG_VBLANK_EVADE is not set CONFIG_DRM_VGEM=m CONFIG_DRM_VMWGFX=m CONFIG_DRM_VMWGFX_FBCON=y @@ -5394,6 +5455,7 @@ CONFIG_BACKLIGHT_AS3711=m CONFIG_BACKLIGHT_GPIO=m CONFIG_BACKLIGHT_LV5207LP=m CONFIG_BACKLIGHT_BD6107=m +CONFIG_BACKLIGHT_ARCXCNN=m CONFIG_VGASTATE=m CONFIG_HDMI=y @@ -5590,6 +5652,8 @@ CONFIG_SND_FIREWORKS=m CONFIG_SND_BEBOB=m CONFIG_SND_FIREWIRE_DIGI00X=m CONFIG_SND_FIREWIRE_TASCAM=m +CONFIG_SND_FIREWIRE_MOTU=m +CONFIG_SND_FIREFACE=m CONFIG_SND_PCMCIA=y CONFIG_SND_VXPOCKET=m CONFIG_SND_PDAUDIOCF=m @@ -5601,7 +5665,7 @@ CONFIG_SND_SOC_TOPOLOGY=y CONFIG_SND_SOC_AMD_ACP=m CONFIG_SND_ATMEL_SOC=m CONFIG_SND_DESIGNWARE_I2S=m -CONFIG_SND_DESIGNWARE_PCM=m +# CONFIG_SND_DESIGNWARE_PCM is not set # # SoC Audio for Freescale CPUs @@ -5616,6 +5680,7 @@ CONFIG_SND_SOC_FSL_SSI=m CONFIG_SND_SOC_FSL_SPDIF=m CONFIG_SND_SOC_FSL_ESAI=m CONFIG_SND_SOC_IMX_AUDMUX=m +CONFIG_SND_I2S_HI6210_I2S=m CONFIG_SND_SOC_IMG=y CONFIG_SND_SOC_IMG_I2S_IN=m CONFIG_SND_SOC_IMG_I2S_OUT=m @@ -5641,18 +5706,26 @@ CONFIG_SND_SOC_INTEL_BYTCR_RT5651_MACH=m CONFIG_SND_SOC_INTEL_CHT_BSW_RT5672_MACH=m CONFIG_SND_SOC_INTEL_CHT_BSW_RT5645_MACH=m CONFIG_SND_SOC_INTEL_CHT_BSW_MAX98090_TI_MACH=m +CONFIG_SND_SOC_INTEL_BYT_CHT_DA7213_MACH=m +CONFIG_SND_SOC_INTEL_BYT_CHT_NOCODEC_MACH=m CONFIG_SND_SOC_INTEL_SKYLAKE=m CONFIG_SND_SOC_INTEL_SKL_RT286_MACH=m CONFIG_SND_SOC_INTEL_SKL_NAU88L25_SSM4567_MACH=m CONFIG_SND_SOC_INTEL_SKL_NAU88L25_MAX98357A_MACH=m CONFIG_SND_SOC_XTFPGA_I2S=m +CONFIG_ZX_TDM=m CONFIG_SND_SOC_I2C_AND_SPI=m # # CODEC drivers # CONFIG_SND_SOC_AC97_CODEC=m +CONFIG_SND_SOC_ADAU_UTILS=m CONFIG_SND_SOC_ADAU1701=m +CONFIG_SND_SOC_ADAU17X1=m +CONFIG_SND_SOC_ADAU1761=m +CONFIG_SND_SOC_ADAU1761_I2C=m +CONFIG_SND_SOC_ADAU1761_SPI=m CONFIG_SND_SOC_ADAU7002=m CONFIG_SND_SOC_AK4104=m CONFIG_SND_SOC_AK4554=m @@ -5664,6 +5737,7 @@ CONFIG_SND_SOC_BT_SCO=m CONFIG_SND_SOC_CS35L32=m CONFIG_SND_SOC_CS35L33=m CONFIG_SND_SOC_CS35L34=m +CONFIG_SND_SOC_CS35L35=m CONFIG_SND_SOC_CS42L42=m CONFIG_SND_SOC_CS42L51=m CONFIG_SND_SOC_CS42L51_I2C=m @@ -5679,9 +5753,12 @@ CONFIG_SND_SOC_CS42XX8=m CONFIG_SND_SOC_CS42XX8_I2C=m CONFIG_SND_SOC_CS4349=m CONFIG_SND_SOC_CS53L30=m +CONFIG_SND_SOC_DA7213=m CONFIG_SND_SOC_DA7219=m +CONFIG_SND_SOC_DIO2125=m CONFIG_SND_SOC_DMIC=m CONFIG_SND_SOC_HDMI_CODEC=m +CONFIG_SND_SOC_ES7134=m CONFIG_SND_SOC_ES8328=m CONFIG_SND_SOC_ES8328_I2C=m CONFIG_SND_SOC_ES8328_SPI=m @@ -5691,6 +5768,7 @@ CONFIG_SND_SOC_INNO_RK3036=m CONFIG_SND_SOC_MAX98090=m CONFIG_SND_SOC_MAX98357A=m CONFIG_SND_SOC_MAX98504=m +CONFIG_SND_SOC_MAX98927=m CONFIG_SND_SOC_MAX9860=m CONFIG_SND_SOC_MSM8916_WCD_ANALOG=m CONFIG_SND_SOC_MSM8916_WCD_DIGITAL=m @@ -5719,6 +5797,7 @@ CONFIG_SND_SOC_RT5677_SPI=m CONFIG_SND_SOC_SGTL5000=m CONFIG_SND_SOC_SIGMADSP=m CONFIG_SND_SOC_SIGMADSP_I2C=m +CONFIG_SND_SOC_SIGMADSP_REGMAP=m CONFIG_SND_SOC_SIRF_AUDIO_CODEC=m 
CONFIG_SND_SOC_SPDIF=m CONFIG_SND_SOC_SSM2602=m @@ -5762,6 +5841,7 @@ CONFIG_SND_SOC_WM8978=m CONFIG_SND_SOC_WM8985=m CONFIG_SND_SOC_NAU8540=m CONFIG_SND_SOC_NAU8810=m +CONFIG_SND_SOC_NAU8824=m CONFIG_SND_SOC_NAU8825=m CONFIG_SND_SOC_TPA6130A2=m CONFIG_SND_SIMPLE_CARD_UTILS=m @@ -5784,6 +5864,7 @@ CONFIG_HID_GENERIC=m # Special HID drivers # CONFIG_HID_A4TECH=m +CONFIG_HID_ACCUTOUCH=m CONFIG_HID_ACRUX=m CONFIG_HID_ACRUX_FF=y CONFIG_HID_APPLE=m @@ -5833,6 +5914,7 @@ CONFIG_HID_MAYFLASH=m CONFIG_HID_MICROSOFT=m CONFIG_HID_MONTEREY=m CONFIG_HID_MULTITOUCH=m +CONFIG_HID_NTI=m CONFIG_HID_NTRIG=m CONFIG_HID_ORTEK=m CONFIG_HID_PANTHERLORD=m @@ -5904,6 +5986,7 @@ CONFIG_USB_SUPPORT=y CONFIG_USB_COMMON=y CONFIG_USB_ARCH_HAS_HCD=y CONFIG_USB=y +CONFIG_USB_PCI=y CONFIG_USB_ANNOUNCE_NEW_DEVICES=y # @@ -6170,6 +6253,7 @@ CONFIG_USB_R8A66597=m CONFIG_USB_PXA27X=m CONFIG_USB_MV_UDC=m CONFIG_USB_MV_U3D=m +CONFIG_USB_SNP_CORE=m # CONFIG_USB_M66592 is not set CONFIG_USB_BDC_UDC=m @@ -6252,6 +6336,11 @@ CONFIG_USB_G_DBGP=m # CONFIG_USB_G_DBGP_PRINTK is not set CONFIG_USB_G_DBGP_SERIAL=y CONFIG_USB_G_WEBCAM=m + +# +# USB Power Delivery and Type-C drivers +# +CONFIG_TYPEC=m CONFIG_USB_LED_TRIG=y CONFIG_USB_ULPI_BUS=m CONFIG_UWB=m @@ -6287,6 +6376,7 @@ CONFIG_MMC_REALTEK_PCI=m CONFIG_MMC_REALTEK_USB=m CONFIG_MMC_TOSHIBA_PCI=m CONFIG_MMC_MTK=m +CONFIG_MMC_SDHCI_XENON=m CONFIG_MEMSTICK=m # CONFIG_MEMSTICK_DEBUG is not set @@ -6317,6 +6407,7 @@ CONFIG_LEDS_88PM860X=m CONFIG_LEDS_LM3530=m CONFIG_LEDS_LM3533=m CONFIG_LEDS_LM3642=m +CONFIG_LEDS_MT6323=m CONFIG_LEDS_PCA9532=m CONFIG_LEDS_PCA9532_GPIO=y CONFIG_LEDS_GPIO=m @@ -6342,7 +6433,6 @@ CONFIG_LEDS_PWM=m CONFIG_LEDS_INTEL_SS4200=m CONFIG_LEDS_LT3593=m CONFIG_LEDS_ADP5520=m -CONFIG_LEDS_DELL_NETBOOKS=m CONFIG_LEDS_MC13783=m CONFIG_LEDS_TCA6507=m CONFIG_LEDS_TLC591XX=m @@ -6385,7 +6475,7 @@ CONFIG_EDAC=y # CONFIG_EDAC_LEGACY_SYSFS is not set # CONFIG_EDAC_DEBUG is not set CONFIG_EDAC_DECODE_MCE=m -CONFIG_EDAC_MM_EDAC=m +# CONFIG_EDAC_GHES is not set CONFIG_EDAC_AMD64=m # CONFIG_EDAC_AMD64_ERROR_INJECTION is not set CONFIG_EDAC_E752X=m @@ -6569,12 +6659,18 @@ CONFIG_SYNC_FILE=y CONFIG_SW_SYNC=y CONFIG_DCA=m CONFIG_AUXDISPLAY=y +CONFIG_CHARLCD=m +CONFIG_HD44780=m CONFIG_KS0108=m CONFIG_KS0108_PORT=0x378 CONFIG_KS0108_DELAY=2 CONFIG_CFAG12864B=m CONFIG_CFAG12864B_RATE=20 # CONFIG_IMG_ASCII_LCD is not set +CONFIG_PANEL=m +CONFIG_PANEL_PARPORT=0 +CONFIG_PANEL_PROFILE=5 +# CONFIG_PANEL_CHANGE_MESSAGE is not set CONFIG_UIO=m CONFIG_UIO_CIF=m # CONFIG_UIO_PDRV_GENIRQ is not set @@ -6615,6 +6711,7 @@ CONFIG_VIRTIO_MMIO=m # Microsoft Hyper-V guest support # CONFIG_HYPERV=m +CONFIG_HYPERV_TSCPAGE=y CONFIG_HYPERV_UTILS=m CONFIG_HYPERV_BALLOON=m @@ -6792,6 +6889,7 @@ CONFIG_RTLLIB_CRYPTO_CCMP=m CONFIG_RTLLIB_CRYPTO_TKIP=m CONFIG_RTLLIB_CRYPTO_WEP=m CONFIG_RTL8192E=m +CONFIG_RTL8723BS=m CONFIG_R8712U=m CONFIG_R8188EU=m CONFIG_88EU_AP_MODE=y @@ -6898,6 +6996,19 @@ CONFIG_SPEAKUP_SYNTH_SPKOUT=m CONFIG_SPEAKUP_SYNTH_TXPRT=m CONFIG_SPEAKUP_SYNTH_DUMMY=m CONFIG_STAGING_MEDIA=y +CONFIG_INTEL_ATOMISP=y +CONFIG_VIDEO_ATOMISP=m +CONFIG_VIDEO_OV5693=m +CONFIG_VIDEO_OV2722=m +CONFIG_VIDEO_GC2235=m +CONFIG_VIDEO_OV8858=m +CONFIG_VIDEO_MSRLIST_HELPER=m +CONFIG_VIDEO_MT9M114=m +CONFIG_VIDEO_AP1302=m +CONFIG_VIDEO_GC0310=m +CONFIG_VIDEO_OV2680=m +CONFIG_VIDEO_LM3554=m +CONFIG_VIDEO_IMX=m CONFIG_I2C_BCM2048=m CONFIG_DVB_CXD2099=m CONFIG_LIRC_STAGING=y @@ -6944,6 +7055,13 @@ CONFIG_HDM_I2C=m CONFIG_HDM_USB=m # CONFIG_KS7010 is not set # CONFIG_GREYBUS is not set + +# +# USB Power Delivery and 
Type-C drivers +# +CONFIG_TYPEC_TCPM=m +CONFIG_TYPEC_TCPCI=m +CONFIG_TYPEC_FUSB302=m CONFIG_X86_PLATFORM_DEVICES=y CONFIG_ACER_WMI=m CONFIG_ACERHDF=m @@ -6953,6 +7071,7 @@ CONFIG_DELL_SMBIOS=m CONFIG_DELL_LAPTOP=m CONFIG_DELL_WMI=m CONFIG_DELL_WMI_AIO=m +CONFIG_DELL_WMI_LED=m CONFIG_DELL_SMO8800=m CONFIG_DELL_RBTN=m CONFIG_FUJITSU_LAPTOP=m @@ -6991,6 +7110,7 @@ CONFIG_TOSHIBA_BT_RFKILL=m CONFIG_TOSHIBA_HAPS=m CONFIG_TOSHIBA_WMI=m CONFIG_ACPI_CMPC=m +CONFIG_INTEL_CHT_INT33FE=m CONFIG_INTEL_HID_EVENT=m CONFIG_INTEL_VBTN=m CONFIG_INTEL_IPS=m @@ -7146,6 +7266,9 @@ CONFIG_IIO_TRIGGERED_EVENT=m # # Accelerometers # +CONFIG_ADXL345=m +CONFIG_ADXL345_I2C=m +CONFIG_ADXL345_SPI=m CONFIG_BMA180=m CONFIG_BMA220=m CONFIG_BMC150_ACCEL=m @@ -7189,6 +7312,7 @@ CONFIG_AD7793=m CONFIG_AD7887=m CONFIG_AD7923=m CONFIG_AD799X=m +CONFIG_AXP20X_ADC=m CONFIG_AXP288_ADC=m CONFIG_DA9150_GPADC=m CONFIG_HI8435=m @@ -7196,9 +7320,12 @@ CONFIG_HX711=m CONFIG_INA2XX_ADC=m CONFIG_LP8788_ADC=m CONFIG_LTC2485=m +CONFIG_LTC2497=m CONFIG_MAX1027=m CONFIG_MAX11100=m +CONFIG_MAX1118=m CONFIG_MAX1363=m +CONFIG_MAX9611=m CONFIG_MCP320X=m CONFIG_MCP3422=m CONFIG_NAU7802=m @@ -7266,6 +7393,7 @@ CONFIG_AD5592R=m CONFIG_AD5593R=m CONFIG_AD5504=m CONFIG_AD5624R_SPI=m +CONFIG_LTC2632=m CONFIG_AD5686=m CONFIG_AD5755=m CONFIG_AD5761=m @@ -7326,6 +7454,7 @@ CONFIG_ITG3200=m CONFIG_AFE4403=m CONFIG_AFE4404=m CONFIG_MAX30100=m +CONFIG_MAX30102=m # # Humidity sensors @@ -7333,6 +7462,7 @@ CONFIG_MAX30100=m CONFIG_AM2315=m CONFIG_DHT11=m CONFIG_HDC100X=m +CONFIG_HID_SENSOR_HUMIDITY=m CONFIG_HTS221=m CONFIG_HTS221_I2C=m CONFIG_HTS221_SPI=m @@ -7372,6 +7502,7 @@ CONFIG_CM32181=m CONFIG_CM3232=m CONFIG_CM3323=m CONFIG_CM36651=m +CONFIG_IIO_CROS_EC_LIGHT_PROX=m CONFIG_GP2AP020A00F=m CONFIG_SENSORS_ISL29018=m CONFIG_ISL29125=m @@ -7394,6 +7525,7 @@ CONFIG_TSL4531=m CONFIG_US5182D=m CONFIG_VCNL4000=m CONFIG_VEML6070=m +CONFIG_VL6180=m # # Magnetometer sensors @@ -7469,6 +7601,7 @@ CONFIG_IIO_CROS_EC_BARO=m # Proximity and distance sensors # # CONFIG_LIDAR_LITE_V2 is not set +CONFIG_SRF04=m # CONFIG_SX9500 is not set CONFIG_SRF08=m @@ -7476,6 +7609,7 @@ CONFIG_SRF08=m # Temperature sensors # # CONFIG_MAXIM_THERMOCOUPLE is not set +CONFIG_HID_SENSOR_TEMP=m CONFIG_MLX90614=m CONFIG_TMP006=m CONFIG_TMP007=m @@ -7523,6 +7657,7 @@ CONFIG_ARM_GIC_MAX_NR=1 CONFIG_RESET_CONTROLLER=y # CONFIG_RESET_ATH79 is not set # CONFIG_RESET_BERLIN is not set +# CONFIG_RESET_IMX7 is not set # CONFIG_RESET_LPC18XX is not set # CONFIG_RESET_MESON is not set # CONFIG_RESET_PISTACHIO is not set @@ -7561,6 +7696,7 @@ CONFIG_INTEL_RAPL=m # CONFIG_RAS=y CONFIG_MCE_AMD_INJ=m +# CONFIG_RAS_CEC is not set CONFIG_THUNDERBOLT=m # @@ -7573,8 +7709,8 @@ CONFIG_ND_BLK=y CONFIG_ND_CLAIM=y CONFIG_ND_BTT=y CONFIG_BTT=y +CONFIG_DAX=y CONFIG_DEV_DAX=m -CONFIG_NR_DEV_DAX=32768 CONFIG_NVMEM=m CONFIG_STM=m CONFIG_STM_DUMMY=m @@ -7593,6 +7729,8 @@ CONFIG_INTEL_TH_PTI=m # FPGA Configuration Support # CONFIG_FPGA=m +CONFIG_FPGA_MGR_XILINX_SPI=m +CONFIG_ALTERA_PR_IP_CORE=m # # FSI support @@ -7811,6 +7949,7 @@ CONFIG_UBIFS_FS_LZO=y CONFIG_UBIFS_FS_ZLIB=y # CONFIG_UBIFS_ATIME_SUPPORT is not set CONFIG_UBIFS_FS_ENCRYPTION=y +CONFIG_UBIFS_FS_SECURITY=y CONFIG_CRAMFS=y CONFIG_SQUASHFS=y # CONFIG_SQUASHFS_FILE_CACHE is not set @@ -7883,7 +8022,6 @@ CONFIG_NFS_V4_1=y CONFIG_NFS_V4_2=y CONFIG_PNFS_FILE_LAYOUT=m CONFIG_PNFS_BLOCK=m -CONFIG_PNFS_OBJLAYOUT=m CONFIG_PNFS_FLEXFILE_LAYOUT=m CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN="kernel.org" CONFIG_NFS_V4_1_MIGRATION=y @@ -8222,6 +8360,7 @@ 
CONFIG_STRICT_DEVMEM=y CONFIG_EARLY_PRINTK=y # CONFIG_EARLY_PRINTK_DBGP is not set # CONFIG_EARLY_PRINTK_EFI is not set +# CONFIG_EARLY_PRINTK_USB_XDBC is not set # CONFIG_X86_PTDUMP_CORE is not set # CONFIG_X86_PTDUMP is not set # CONFIG_EFI_PGT_DUMP is not set @@ -8252,6 +8391,7 @@ CONFIG_PUNIT_ATOM_DEBUG=m # Security options # CONFIG_KEYS=y +CONFIG_KEYS_COMPAT=y # CONFIG_PERSISTENT_KEYRINGS is not set # CONFIG_BIG_KEYS is not set CONFIG_TRUSTED_KEYS=y @@ -8505,6 +8645,7 @@ CONFIG_PKCS7_MESSAGE_PARSER=y # Certificates for signature checking # # CONFIG_SYSTEM_TRUSTED_KEYRING is not set +# CONFIG_SYSTEM_BLACKLIST_KEYRING is not set CONFIG_KVM_MMIO=y CONFIG_KVM_ASYNC_PF=y CONFIG_KVM_VFIO=y diff --git a/kernel.spec b/kernel.spec index 91948ce..c76de08 100644 --- a/kernel.spec +++ b/kernel.spec @@ -1,7 +1,7 @@ %define kernelversion 4 -%define patchlevel 11 +%define patchlevel 12 # sublevel is now used for -stable patches -%define sublevel 9 +%define sublevel 2 # Release number. Increase this before a rebuild. %define rpmrel 1 @@ -181,10 +181,6 @@ Patch100: perf-python-ext-link-with-dl.patch # this. Patch101: perf-xmlto-skip-validation.patch -# Export pci_ids.h to user space, needed by ldetect -# TODO: is it really needed now? -Patch103: include-kbuild-export-pci_ids.patch - # http://bugs.rosalinux.ru/show_bug.cgi?id=6235 # http://bugs.rosalinux.ru/show_bug.cgi?id=6459 Patch104: audit-make-it-less-verbose.patch @@ -204,12 +200,6 @@ Patch108: ata-prefer-ata-drivers-over-ide-drivers-when-both-are-built.patch # AUFS from http://aufs.sourceforge.net/ Patch109: fs-aufs4.patch -# BFQ IO scheduler, http://algogroup.unimore.it/people/paolo/disk_sched/ -Patch111: 0001-block-cgroups-kconfig-build-bits-for-BFQ-v7r11-4.11..patch -Patch112: 0002-block-introduce-the-BFQ-v7r11-I-O-sched-for-4.11.0.patch -Patch113: 0003-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r11-for.patch -Patch114: 0004-blk-bfq-turn-BFQ-v7r11-for-4.11.0-into-BFQ-v8r11-for.patch - # https://bugs.freedesktop.org/show_bug.cgi?id=97822 # http://bugs.rosalinux.ru/show_bug.cgi?id=7533 Patch200: i915_hack_bug_97822.patch diff --git a/linux-4.11.tar.sign b/linux-4.11.tar.sign deleted file mode 100644 index dbd5e00..0000000 --- a/linux-4.11.tar.sign +++ /dev/null @@ -1,10 +0,0 @@ ------BEGIN PGP SIGNATURE----- - -iQEcBAABAgAGBQJZBqIqAAoJEHm+PkMAQRiG8+gH/iazF2SZxnZH9URKLpHGkT5D -fz+O4rnWjB9M8JmOUH3MRhukS720/qLmlwROM4Cghxigmv9RMcqds7JlHGxUnphh -Qv083nmDy3IWRGeN4FUDqFo0u1CB/BMZwrgEMf1FKT6vbmlzVroK7Xj6iGKflJH9 -BZZHt3YkSwuutXff6LjsA8eTiNrRLzNStCxBHuC3fv9hTDEFfZLrQAITg+XbdAgJ -U4tEKQ+EwaHi261LMLmDd3TjzB2SLrfckBICDrmQSk0Bvc/p0ZtWcjjaIZQm1jfk -RpMBbav5uVPoskbGNVDUwRhTraqZ9Lg0Gvb9RQYDeYZJCWGu62EOPS4KP76OmDc= -=j6oI ------END PGP SIGNATURE----- diff --git a/linux-4.12.tar.sign b/linux-4.12.tar.sign new file mode 100644 index 0000000..04d1393 --- /dev/null +++ b/linux-4.12.tar.sign @@ -0,0 +1,10 @@ +-----BEGIN PGP SIGNATURE----- + +iQEcBAABAgAGBQJZWYAgAAoJEHm+PkMAQRiGMeQH/2qxxgnUkkbgST1WWAlWVENU +nXU9p+1bO4uF5xXvTyATXKcc8jfUfZ60GBUGvYVIpQWuivaRVBFgIZNzl8kn5J6V +uSX7x9AfzVzo6YSTCrXZvsCf1gImkTCVE3pWLPzALjiXC4esx+1p85GjJ3jWym6i +Mx6yErG9L9StDnUVqKLBDHPXM6N04uShzLSaxh93opFrxNx8YhROGln5KjJ6KRKS +hv85x/kmgS7sQTxKK2LH/2NJefqBc3rviFVe/pPEy8GAc1KpsLerDWazlac5/gxt +CkTwzAHtLOpi2QhGD2oKrR0nvUFAlIOtU1Q5HdwstFMOwfStsYfsEPF4bhzU9c4= +=dZzI +-----END PGP SIGNATURE----- diff --git a/patch-4.11.9.sign b/patch-4.11.9.sign deleted file mode 100644 index 5d4e346..0000000 --- a/patch-4.11.9.sign +++ /dev/null @@ -1,16 +0,0 @@ ------BEGIN PGP SIGNATURE----- - 
-iQIzBAABCAAdFiEEZH8oZUiU471FcZm+ONu9yGCSaT4FAllc3qYACgkQONu9yGCS -aT7VRhAAin+SouSf/im62RamDlQbZHof2YDwi1DCKr8CR6IXCfHvZgNq2HDQiDkM -Tx6S+cTiVwNBU6MQrkFkp7EwwsPguz/OjGlcKdvdEovCIQ5MAycXE+ZmfDkBFn1N -+EKNQbnpzWfwZcqNI15YpIoLpOPAV5tURYOuFOKZLglumd5WcTsBwVyZeUJJsJcp -y0h7FNf0C8LhSvwWnkTx45giiv+ot/Jg00+2QpooBZ2RaF9ZYFKZnDnCn7dmjBMh -eU/Zp4y8GceG79jC0AbMJK52dusS6nDcPW9dedKRW3WPvNDMvU78fXMJjjuLnzv/ -IpEd+9vBEGzC6sBvx5gTKEToJI2lYIqGfsANST6+yNQaJSdhrdt8eBbLhqW2DLju -+8EGCEDP9UicRUYvuaUOaOhp9Wfmg7qHaFaNJF4yykrbYAy+AJSCVVtqkJBHNxBf -BsivJ3VRAjYDTR/2E7rxpFfOjprZVmpmGyZ7IREbb8cNwK//VX+NvK6qgEbZmkAU -3L9UI+p9KX6VSc43v7GYBQp2SISXGSURsu/WikqM6FqQmNxy44ergRC1Z0VIZr9t -PHiSubcCFoJSw5M+aoPgYUsUV+K1p97BCJOkcY9uu23HD0asRZGtYGp6CcPaKwNm -/Rznvn1uLEJh9ixv63NGeEwfgrIlfuStCBFn1BLgc+li/U0CEyg= -=iA2J ------END PGP SIGNATURE----- diff --git a/patch-4.12.2.sign b/patch-4.12.2.sign new file mode 100644 index 0000000..0a913dd --- /dev/null +++ b/patch-4.12.2.sign @@ -0,0 +1,16 @@ +-----BEGIN PGP SIGNATURE----- + +iQIzBAABCAAdFiEEZH8oZUiU471FcZm+ONu9yGCSaT4FAllp9/gACgkQONu9yGCS +aT6zSBAApwifKW45v3Zt4oYOkZBmCuVgCJrH8tyX9OIniJT67bF52ZaWBHzYtLlL +5J5TkpP1Qz/Pd6/rCI3KCaj6M4+cX5lLarOMq0kIHpuKPq4lppQPoAtiJzX9U+w9 +Nod3rFswNuAoUoN1i9lIs1jom+ydviCsZdL56SOpW32locsdmL9bDkSEJLM1oCdD +xO5W8rK6+mKR/M+CbPicI/ZKrKieHuRhRNAW0iTCh91XsjOL4Il2GM32Va44Fc1l +6t8zaiixsqGIOv+JceIOaMQna2tm1888QF8naIRaoTgzjYZs6gu09MJpGDnLmLF/ +nyHpxVoNhAHv+tpCWy+Wj/7S9xivmmRJ215M+iaEOV9vr96YwOkiX0vKfNrGsPCy +wJ4VLkYpp9aItg4RotXNu1FC677qjsvqRMyPNUEC9M+2ok/bbNLAJzKYGlWEUqJG +O3oIG2AkOsm7UNrSW7ysPNjcfgd6zp3yTX1AaJI7/4Ldd/ScdelDvoBbBI3wC6WB +NmvhFOdIqbrGNf6b9rgEne4lgm7e8JmbQ9Gno/mBgwHeaM4jEx/nTj+C9+ZQFmmw +gx4eSeg4+V0CWqpx7xi/cLA4HsvsYaazJYyaCbL1guexcerPozv+bV9A2wSJ1uFb +BPT+/0u/zuz1m96R3qWu5Vop7K7/a8syWulao85hiaJYafEqjcA= +=KSdU +-----END PGP SIGNATURE----- diff --git a/sanitize-memory.patch b/sanitize-memory.patch index 7ffbabe..d19ba88 100644 --- a/sanitize-memory.patch +++ b/sanitize-memory.patch @@ -1,8 +1,8 @@ diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index facc20a..db7ec30 100644 +index 7737ab5d04b2..f4a031d8cb37 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -2808,6 +2808,10 @@ +@@ -2826,6 +2826,10 @@ the specified number of seconds. This is to be used if your oopses keep scrolling off the screen. @@ -14,10 +14,10 @@ index facc20a..db7ec30 100644 pcd. 
diff --git a/fs/buffer.c b/fs/buffer.c
-index 9196f2a..6215406 100644
+index 161be58c5cb0..9dbaa32ec7d5 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
-@@ -3512,7 +3512,7 @@ void __init buffer_init(void)
+@@ -3500,7 +3500,7 @@ void __init buffer_init(void)
bh_cachep = kmem_cache_create("buffer_head",
sizeof(struct buffer_head), 0,
(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
@@ -27,10 +27,10 @@ index 9196f2a..6215406 100644
/*
diff --git a/fs/dcache.c b/fs/dcache.c
-index b1ff5be..e869514 100644
+index 5fd25bba1282..b7121d7442f0 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
-@@ -3619,7 +3619,8 @@ void __init vfs_caches_init_early(void)
+@@ -3617,7 +3617,8 @@ void __init vfs_caches_init_early(void)
void __init vfs_caches_init(void)
{
names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0,
@@ -41,7 +41,7 @@ index b1ff5be..e869514 100644
dcache_init();
inode_init();
diff --git a/include/linux/slab.h b/include/linux/slab.h
-index 3c37a8c..341210a 100644
+index 04a7f7993e67..fd08756a7869 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -23,6 +23,13 @@
#define SLAB_CACHE_DMA 0x00004000UL /* Use GFP_DMA memory */
#define SLAB_STORE_USER 0x00010000UL /* DEBUG: Store the last owner for bug hunting */
diff --git a/kernel/fork.c b/kernel/fork.c
-index 631a65c..d4cd8d4 100644
+index cdf75164aa25..0302999e1270 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
-@@ -2173,7 +2173,7 @@ void __init proc_caches_init(void)
+@@ -2212,7 +2212,7 @@ void __init proc_caches_init(void)
sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL);
nsproxy_cache_init();
}
diff --git a/mm/rmap.c b/mm/rmap.c
-index f683801..d863239 100644
+index d405f0e0ee96..fc37f2d91905 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -430,10 +430,10 @@ static void anon_vma_ctor(void *data)
void __init anon_vma_init(void)
{
anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
-- 0, SLAB_TYPESAFE_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT,
-+ 0, SLAB_TYPESAFE_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT|SLAB_NO_SANITIZE,
anon_vma_ctor);
anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain,
- SLAB_PANIC|SLAB_ACCOUNT);
/*
diff --git a/mm/slab.c b/mm/slab.c
-index 807d86c..6cac198 100644
+index 2a31ee3c5814..f11da46c2722 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3520,6 +3520,17 @@ void ___cache_free(struct kmem_cache *cachep, void *objp,
objp = cache_free_debugcheck(cachep, objp, caller);
diff --git a/mm/slab.h b/mm/slab.h
-index 65e7c3f..fc7e00e 100644
+index 9cfcf099709c..130e9e250738 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -77,6 +77,15 @@ extern const struct kmalloc_info_struct {
unsigned long align, unsigned long size);
diff --git a/mm/slab_common.c b/mm/slab_common.c
-index 09d0e84..d87c631 100644
+index 01a0fe2eb332..794220498bf5 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -49,7 +49,11 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
return NULL;
if (ctor)
-@@ -453,6 +479,11 @@ kmem_cache_create(const char *name, size_t size, size_t align,
- * passed flags.
+@@ -454,6 +480,11 @@ kmem_cache_create(const char *name, size_t size, size_t align,
*/
flags &= CACHE_CREATE_MASK;
+
+#ifdef CONFIG_PAX_MEMORY_SANITIZE
+ if (flags & SLAB_DESTROY_BY_RCU)
+ flags |= SLAB_NO_SANITIZE;
+#endif
+
s = __kmem_cache_alias(name, size, align, flags, ctor);
if (s)
goto out_unlock;
diff --git a/mm/slob.c b/mm/slob.c
-index eac04d43..f455845 100644
+index 1bae78d71096..a76c903104c9 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -365,6 +365,11 @@ static void slob_free(void *block, int size)
/* This slob page is about to become partially free. Easy! */
sp->units = units;
diff --git a/mm/slub.c b/mm/slub.c
-index 7f4bc70..a913874 100644
+index 8addc535bcdc..8932b7d6d324 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2959,6 +2959,23 @@ static __always_inline void slab_free(struct kmem_cache *s, struct page *page,
@@ -3456,6 +3473,9 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
s->inuse = size;
if (((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ||
+#ifdef CONFIG_PAX_MEMORY_SANITIZE
+ (pax_sanitize_slab && !(flags & SLAB_NO_SANITIZE)) ||
+#endif
s->ctor)) {
/*
* Relocate free pointer after the object if it is not
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
-index f1d0459..dc2922f3 100644
+index b1be7c01efe2..167c2c1123f5 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
-@@ -3474,12 +3474,14 @@ void __init skb_init(void)
+@@ -3473,12 +3473,14 @@ void __init skb_init(void)
skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
sizeof(struct sk_buff), 0,
}
diff --git a/security/Kconfig b/security/Kconfig
-index d900f47..fba9613 100644
+index 93027fdf47d1..95f02d5134b4 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -6,6 +6,37 @@ menu "Security options"