From e7a7e32ae0a216abccb2146c8173ab55996491ec Mon Sep 17 00:00:00 2001 From: Evgenii Shatokhin Date: Tue, 23 Aug 2016 16:24:13 +0300 Subject: [PATCH] Revisited the patches and configs for the kernel 4.7.x --- .abf.yml | 4 +- ...onfig-build-bits-for-BFQ-v7r11-4.7.0.patch | 10 +- ...ce-the-BFQ-v7r11-I-O-sched-for-4.7.0.patch | 6 +- ...rly-Queue-Merge-EQM-to-BFQ-v7r11-for.patch | 8 +- ...FQ-v7r11-for-4.7.0-into-BFQ-v8r2-for.patch | 6491 +++++++++++++++++ README.BFQ | 539 ++ fs-aufs4.patch | 1377 ++-- hp-wmi-rfkill-fix.patch | 31 - kernel-i586.config | 182 +- kernel-x86_64.config | 165 +- kernel.spec | 12 +- linux-4.6.tar.sign | 11 - linux-4.7.tar.sign | 11 + patch-4.6.7.sign | 17 - patch-4.7.2.sign | 17 + sanitize-memory.patch | 36 +- 16 files changed, 8152 insertions(+), 765 deletions(-) rename 0001-block-cgroups-kconfig-build-bits-for-BFQ-v7r11-4.5.0.patch => 0001-block-cgroups-kconfig-build-bits-for-BFQ-v7r11-4.7.0.patch (93%) rename 0002-block-introduce-the-BFQ-v7r11-I-O-sched-for-4.5.0.patch => 0002-block-introduce-the-BFQ-v7r11-I-O-sched-for-4.7.0.patch (99%) create mode 100644 0004-block-bfq-turn-BFQ-v7r11-for-4.7.0-into-BFQ-v8r2-for.patch create mode 100644 README.BFQ delete mode 100644 hp-wmi-rfkill-fix.patch delete mode 100644 linux-4.6.tar.sign create mode 100644 linux-4.7.tar.sign delete mode 100644 patch-4.6.7.sign create mode 100644 patch-4.7.2.sign diff --git a/.abf.yml b/.abf.yml index 15cbd07..0de7b6c 100644 --- a/.abf.yml +++ b/.abf.yml @@ -1,3 +1,3 @@ sources: - "linux-4.6.tar.xz": 4192d2873e630d902da050415f82f763e4b744d3 - "patch-4.6.7.xz": a309e0a8cf684840d100b51c80e1ef28ce7f9e4d + linux-4.7.tar.xz: 99551524779bf05382e363f4879101227664dd55 + patch-4.7.2.xz: 9b67591349ae2e719c477cdb8a1dd2e8fc3d3646 diff --git a/0001-block-cgroups-kconfig-build-bits-for-BFQ-v7r11-4.5.0.patch b/0001-block-cgroups-kconfig-build-bits-for-BFQ-v7r11-4.7.0.patch similarity index 93% rename from 0001-block-cgroups-kconfig-build-bits-for-BFQ-v7r11-4.5.0.patch rename to 0001-block-cgroups-kconfig-build-bits-for-BFQ-v7r11-4.7.0.patch index 17ee130..ff75a8b 100644 --- a/0001-block-cgroups-kconfig-build-bits-for-BFQ-v7r11-4.5.0.patch +++ b/0001-block-cgroups-kconfig-build-bits-for-BFQ-v7r11-4.7.0.patch @@ -1,7 +1,7 @@ -From f54f3003586bf00ba0ee5974a92b732477b834e3 Mon Sep 17 00:00:00 2001 +From 22ee35ec82fa543b65c1b6d516a086a21f723846 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Tue, 7 Apr 2015 13:39:12 +0200 -Subject: [PATCH 1/3] block: cgroups, kconfig, build bits for BFQ-v7r11-4.5.0 +Subject: [PATCH 1/4] block: cgroups, kconfig, build bits for BFQ-v7r11-4.7.0 Update Kconfig.iosched and do the related Makefile changes to include kernel configuration options for BFQ. Also increase the number of @@ -74,7 +74,7 @@ index 421bef9..0ee5f0f 100644 endmenu diff --git a/block/Makefile b/block/Makefile -index 00ecc97..1ed86d5 100644 +index 9eda232..4a36683 100644 --- a/block/Makefile +++ b/block/Makefile @@ -18,6 +18,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o @@ -86,10 +86,10 @@ index 00ecc97..1ed86d5 100644 obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h -index c70e358..ae43492 100644 +index 3d9cf32..8d862a0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h -@@ -44,7 +44,7 @@ struct pr_ops; +@@ -45,7 +45,7 @@ struct pr_ops; * Maximum number of blkcg policies allowed to be registered concurrently. * Defined here to simplify include dependency. 
*/ diff --git a/0002-block-introduce-the-BFQ-v7r11-I-O-sched-for-4.5.0.patch b/0002-block-introduce-the-BFQ-v7r11-I-O-sched-for-4.7.0.patch similarity index 99% rename from 0002-block-introduce-the-BFQ-v7r11-I-O-sched-for-4.5.0.patch rename to 0002-block-introduce-the-BFQ-v7r11-I-O-sched-for-4.7.0.patch index 6095fa9..368a4ff 100644 --- a/0002-block-introduce-the-BFQ-v7r11-I-O-sched-for-4.5.0.patch +++ b/0002-block-introduce-the-BFQ-v7r11-I-O-sched-for-4.7.0.patch @@ -1,7 +1,7 @@ -From 03d30cc06a5436c05ee338bd21903802181bafe9 Mon Sep 17 00:00:00 2001 +From 2aae32be2a18a7d0da104ae42c08cb9bce9d9c7c Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Thu, 9 May 2013 19:10:02 +0200 -Subject: [PATCH 2/3] block: introduce the BFQ-v7r11 I/O sched for 4.5.0 +Subject: [PATCH 2/4] block: introduce the BFQ-v7r11 I/O sched for 4.7.0 The general structure is borrowed from CFQ, as much of the code for handling I/O contexts. Over time, several useful features have been @@ -6287,7 +6287,7 @@ index 0000000..a64fec1 +} diff --git a/block/bfq.h b/block/bfq.h new file mode 100644 -index 0000000..3bb7df2 +index 0000000..485d0c9 --- /dev/null +++ b/block/bfq.h @@ -0,0 +1,801 @@ diff --git a/0003-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r11-for.patch b/0003-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r11-for.patch index c2abd3a..a9876aa 100644 --- a/0003-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r11-for.patch +++ b/0003-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r11-for.patch @@ -1,8 +1,8 @@ -From d3deade9dc903f58c2bf79e316b785f6eaf2441f Mon Sep 17 00:00:00 2001 +From 47de1e46ef5f462e9694e5b0607aec6ad658f1e0 Mon Sep 17 00:00:00 2001 From: Mauro Andreolini Date: Sun, 6 Sep 2015 16:09:05 +0200 -Subject: [PATCH 3/3] block, bfq: add Early Queue Merge (EQM) to BFQ-v7r11 for - 4.5.0 +Subject: [PATCH 3/4] block, bfq: add Early Queue Merge (EQM) to BFQ-v7r11 for + 4.7.0 A set of processes may happen to perform interleaved reads, i.e.,requests whose union would give rise to a sequential read pattern. There are two @@ -964,7 +964,7 @@ index f9787a6..d1f648d 100644 bfqd->bfq_large_burst_thresh = 11; diff --git a/block/bfq.h b/block/bfq.h -index 3bb7df2..32dfcee 100644 +index 485d0c9..f73c942 100644 --- a/block/bfq.h +++ b/block/bfq.h @@ -183,6 +183,8 @@ struct bfq_group; diff --git a/0004-block-bfq-turn-BFQ-v7r11-for-4.7.0-into-BFQ-v8r2-for.patch b/0004-block-bfq-turn-BFQ-v7r11-for-4.7.0-into-BFQ-v8r2-for.patch new file mode 100644 index 0000000..cbc051f --- /dev/null +++ b/0004-block-bfq-turn-BFQ-v7r11-for-4.7.0-into-BFQ-v8r2-for.patch @@ -0,0 +1,6491 @@ +From 0061399c3c07fb8d119c0d581b613b870e63b165 Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Tue, 17 May 2016 08:28:04 +0200 +Subject: [PATCH 4/4] block, bfq: turn BFQ-v7r11 for 4.7.0 into BFQ-v8r2 for + 4.7.0 + +--- + block/Kconfig.iosched | 2 +- + block/bfq-cgroup.c | 480 +++++---- + block/bfq-iosched.c | 2601 +++++++++++++++++++++++++++++-------------------- + block/bfq-sched.c | 441 +++++++-- + block/bfq.h | 708 +++++++------- + 5 files changed, 2483 insertions(+), 1749 deletions(-) + +diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched +index f78cd1a..6d92579 100644 +--- a/block/Kconfig.iosched ++++ b/block/Kconfig.iosched +@@ -53,7 +53,7 @@ config IOSCHED_BFQ + + config BFQ_GROUP_IOSCHED + bool "BFQ hierarchical scheduling support" +- depends on CGROUPS && IOSCHED_BFQ=y ++ depends on IOSCHED_BFQ && BLK_CGROUP + default n + ---help--- + Enable hierarchical scheduling in BFQ, using the blkio controller. 
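For readers following these build bits: CONFIG_IOSCHED_BFQ links bfq-iosched.o into the block layer (see the Makefile hunk in patch 0001 above), and BFQ then announces itself to the elevator core at init time, which is what makes "bfq" selectable through /sys/block/<dev>/queue/scheduler. Below is a minimal, illustrative sketch of that registration pattern under the stock elevator API of this kernel series; the ops table is elided and the names are abridged, so this is not BFQ's actual initializer (that lives in bfq-iosched.c).

#include <linux/elevator.h>
#include <linux/module.h>

/* Abridged: BFQ's real elevator_type wires up many more callbacks. */
static struct elevator_type iosched_bfq = {
	.ops = {
		/* init_queue, add_req_fn, dispatch_fn, ... elided */
	},
	.elevator_name	= "bfq",
	.elevator_owner	= THIS_MODULE,
};

static int __init bfq_init(void)
{
	/* Register "bfq" with the block layer's elevator core. */
	return elv_register(&iosched_bfq);
}

static void __exit bfq_exit(void)
{
	elv_unregister(&iosched_bfq);
}

module_init(bfq_init);
module_exit(bfq_exit);
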
+diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c +index 5ee99ec..c83d90c 100644 +--- a/block/bfq-cgroup.c ++++ b/block/bfq-cgroup.c +@@ -162,7 +162,6 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg) + static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg) + { + struct blkg_policy_data *pd = blkg_to_pd(blkg, &blkcg_policy_bfq); +- BUG_ON(!pd); + return pd_to_bfqg(pd); + } + +@@ -224,14 +223,6 @@ static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, int rw) + blkg_rwstat_add(&bfqg->stats.merged, rw, 1); + } + +-static void bfqg_stats_update_dispatch(struct bfq_group *bfqg, +- uint64_t bytes, int rw) +-{ +- blkg_stat_add(&bfqg->stats.sectors, bytes >> 9); +- blkg_rwstat_add(&bfqg->stats.serviced, rw, 1); +- blkg_rwstat_add(&bfqg->stats.service_bytes, rw, bytes); +-} +- + static void bfqg_stats_update_completion(struct bfq_group *bfqg, + uint64_t start_time, uint64_t io_start_time, int rw) + { +@@ -248,17 +239,11 @@ static void bfqg_stats_update_completion(struct bfq_group *bfqg, + /* @stats = 0 */ + static void bfqg_stats_reset(struct bfqg_stats *stats) + { +- if (!stats) +- return; +- + /* queued stats shouldn't be cleared */ +- blkg_rwstat_reset(&stats->service_bytes); +- blkg_rwstat_reset(&stats->serviced); + blkg_rwstat_reset(&stats->merged); + blkg_rwstat_reset(&stats->service_time); + blkg_rwstat_reset(&stats->wait_time); + blkg_stat_reset(&stats->time); +- blkg_stat_reset(&stats->unaccounted_time); + blkg_stat_reset(&stats->avg_queue_size_sum); + blkg_stat_reset(&stats->avg_queue_size_samples); + blkg_stat_reset(&stats->dequeue); +@@ -268,21 +253,19 @@ static void bfqg_stats_reset(struct bfqg_stats *stats) + } + + /* @to += @from */ +-static void bfqg_stats_merge(struct bfqg_stats *to, struct bfqg_stats *from) ++static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from) + { + if (!to || !from) + return; + + /* queued stats shouldn't be cleared */ +- blkg_rwstat_add_aux(&to->service_bytes, &from->service_bytes); +- blkg_rwstat_add_aux(&to->serviced, &from->serviced); + blkg_rwstat_add_aux(&to->merged, &from->merged); + blkg_rwstat_add_aux(&to->service_time, &from->service_time); + blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); + blkg_stat_add_aux(&from->time, &from->time); +- blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time); + blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); +- blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples); ++ blkg_stat_add_aux(&to->avg_queue_size_samples, ++ &from->avg_queue_size_samples); + blkg_stat_add_aux(&to->dequeue, &from->dequeue); + blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time); + blkg_stat_add_aux(&to->idle_time, &from->idle_time); +@@ -308,10 +291,8 @@ static void bfqg_stats_xfer_dead(struct bfq_group *bfqg) + if (unlikely(!parent)) + return; + +- bfqg_stats_merge(&parent->dead_stats, &bfqg->stats); +- bfqg_stats_merge(&parent->dead_stats, &bfqg->dead_stats); ++ bfqg_stats_add_aux(&parent->stats, &bfqg->stats); + bfqg_stats_reset(&bfqg->stats); +- bfqg_stats_reset(&bfqg->dead_stats); + } + + static void bfq_init_entity(struct bfq_entity *entity, +@@ -332,15 +313,11 @@ static void bfq_init_entity(struct bfq_entity *entity, + + static void bfqg_stats_exit(struct bfqg_stats *stats) + { +- blkg_rwstat_exit(&stats->service_bytes); +- blkg_rwstat_exit(&stats->serviced); + blkg_rwstat_exit(&stats->merged); + blkg_rwstat_exit(&stats->service_time); + blkg_rwstat_exit(&stats->wait_time); + 
blkg_rwstat_exit(&stats->queued); +- blkg_stat_exit(&stats->sectors); + blkg_stat_exit(&stats->time); +- blkg_stat_exit(&stats->unaccounted_time); + blkg_stat_exit(&stats->avg_queue_size_sum); + blkg_stat_exit(&stats->avg_queue_size_samples); + blkg_stat_exit(&stats->dequeue); +@@ -351,15 +328,11 @@ static void bfqg_stats_exit(struct bfqg_stats *stats) + + static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) + { +- if (blkg_rwstat_init(&stats->service_bytes, gfp) || +- blkg_rwstat_init(&stats->serviced, gfp) || +- blkg_rwstat_init(&stats->merged, gfp) || ++ if (blkg_rwstat_init(&stats->merged, gfp) || + blkg_rwstat_init(&stats->service_time, gfp) || + blkg_rwstat_init(&stats->wait_time, gfp) || + blkg_rwstat_init(&stats->queued, gfp) || +- blkg_stat_init(&stats->sectors, gfp) || + blkg_stat_init(&stats->time, gfp) || +- blkg_stat_init(&stats->unaccounted_time, gfp) || + blkg_stat_init(&stats->avg_queue_size_sum, gfp) || + blkg_stat_init(&stats->avg_queue_size_samples, gfp) || + blkg_stat_init(&stats->dequeue, gfp) || +@@ -374,20 +347,36 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) + } + + static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd) +- { ++{ + return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL; +- } ++} + + static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg) + { + return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq)); + } + ++static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) ++{ ++ struct bfq_group_data *bgd; ++ ++ bgd = kzalloc(sizeof(*bgd), GFP_KERNEL); ++ if (!bgd) ++ return NULL; ++ return &bgd->pd; ++} ++ + static void bfq_cpd_init(struct blkcg_policy_data *cpd) + { + struct bfq_group_data *d = cpd_to_bfqgd(cpd); + +- d->weight = BFQ_DEFAULT_GRP_WEIGHT; ++ d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ? 
++ CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL; ++} ++ ++static void bfq_cpd_free(struct blkcg_policy_data *cpd) ++{ ++ kfree(cpd_to_bfqgd(cpd)); + } + + static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) +@@ -398,8 +387,7 @@ static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) + if (!bfqg) + return NULL; + +- if (bfqg_stats_init(&bfqg->stats, gfp) || +- bfqg_stats_init(&bfqg->dead_stats, gfp)) { ++ if (bfqg_stats_init(&bfqg->stats, gfp)) { + kfree(bfqg); + return NULL; + } +@@ -407,27 +395,20 @@ static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) + return &bfqg->pd; + } + +-static void bfq_group_set_parent(struct bfq_group *bfqg, +- struct bfq_group *parent) ++static void bfq_pd_init(struct blkg_policy_data *pd) + { ++ struct blkcg_gq *blkg; ++ struct bfq_group *bfqg; ++ struct bfq_data *bfqd; + struct bfq_entity *entity; ++ struct bfq_group_data *d; + +- BUG_ON(!parent); +- BUG_ON(!bfqg); +- BUG_ON(bfqg == parent); +- ++ blkg = pd_to_blkg(pd); ++ BUG_ON(!blkg); ++ bfqg = blkg_to_bfqg(blkg); ++ bfqd = blkg->q->elevator->elevator_data; + entity = &bfqg->entity; +- entity->parent = parent->my_entity; +- entity->sched_data = &parent->sched_data; +-} +- +-static void bfq_pd_init(struct blkg_policy_data *pd) +-{ +- struct blkcg_gq *blkg = pd_to_blkg(pd); +- struct bfq_group *bfqg = blkg_to_bfqg(blkg); +- struct bfq_data *bfqd = blkg->q->elevator->elevator_data; +- struct bfq_entity *entity = &bfqg->entity; +- struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg); ++ d = blkcg_to_bfqgd(blkg->blkcg); + + entity->orig_weight = entity->weight = entity->new_weight = d->weight; + entity->my_sched_data = &bfqg->sched_data; +@@ -445,70 +426,53 @@ static void bfq_pd_free(struct blkg_policy_data *pd) + struct bfq_group *bfqg = pd_to_bfqg(pd); + + bfqg_stats_exit(&bfqg->stats); +- bfqg_stats_exit(&bfqg->dead_stats); +- + return kfree(bfqg); + } + +-/* offset delta from bfqg->stats to bfqg->dead_stats */ +-static const int dead_stats_off_delta = offsetof(struct bfq_group, dead_stats) - +- offsetof(struct bfq_group, stats); +- +-/* to be used by recursive prfill, sums live and dead stats recursively */ +-static u64 bfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off) ++static void bfq_pd_reset_stats(struct blkg_policy_data *pd) + { +- u64 sum = 0; ++ struct bfq_group *bfqg = pd_to_bfqg(pd); + +- sum += blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off); +- sum += blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, +- off + dead_stats_off_delta); +- return sum; ++ bfqg_stats_reset(&bfqg->stats); + } + +-/* to be used by recursive prfill, sums live and dead rwstats recursively */ +-static struct blkg_rwstat bfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd, +- int off) ++static void bfq_group_set_parent(struct bfq_group *bfqg, ++ struct bfq_group *parent) + { +- struct blkg_rwstat a, b; ++ struct bfq_entity *entity; ++ ++ BUG_ON(!parent); ++ BUG_ON(!bfqg); ++ BUG_ON(bfqg == parent); + +- a = blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off); +- b = blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, +- off + dead_stats_off_delta); +- blkg_rwstat_add_aux(&a, &b); +- return a; ++ entity = &bfqg->entity; ++ entity->parent = parent->my_entity; ++ entity->sched_data = &parent->sched_data; + } + +-static void bfq_pd_reset_stats(struct blkg_policy_data *pd) ++static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd, ++ struct blkcg *blkcg) + { +- struct bfq_group *bfqg = pd_to_bfqg(pd); ++ struct blkcg_gq 
*blkg; + +- bfqg_stats_reset(&bfqg->stats); +- bfqg_stats_reset(&bfqg->dead_stats); ++ blkg = blkg_lookup(blkcg, bfqd->queue); ++ if (likely(blkg)) ++ return blkg_to_bfqg(blkg); ++ return NULL; + } + +-static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, +- struct blkcg *blkcg) ++static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, ++ struct blkcg *blkcg) + { +- struct request_queue *q = bfqd->queue; +- struct bfq_group *bfqg = NULL, *parent; +- struct bfq_entity *entity = NULL; ++ struct bfq_group *bfqg, *parent; ++ struct bfq_entity *entity; + + assert_spin_locked(bfqd->queue->queue_lock); + +- /* avoid lookup for the common case where there's no blkcg */ +- if (blkcg == &blkcg_root) { +- bfqg = bfqd->root_group; +- } else { +- struct blkcg_gq *blkg; +- +- blkg = blkg_lookup_create(blkcg, q); +- if (!IS_ERR(blkg)) +- bfqg = blkg_to_bfqg(blkg); +- else /* fallback to root_group */ +- bfqg = bfqd->root_group; +- } ++ bfqg = bfq_lookup_bfqg(bfqd, blkcg); + +- BUG_ON(!bfqg); ++ if (unlikely(!bfqg)) ++ return NULL; + + /* + * Update chain of bfq_groups as we might be handling a leaf group +@@ -531,13 +495,18 @@ static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, + return bfqg; + } + +-static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq); ++static void bfq_pos_tree_add_move(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq); ++ ++static void bfq_bfqq_expire(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ bool compensate, ++ enum bfqq_expiration reason); + + /** + * bfq_bfqq_move - migrate @bfqq to @bfqg. + * @bfqd: queue descriptor. + * @bfqq: the queue to move. +- * @entity: @bfqq's entity. + * @bfqg: the group to move to. + * + * Move @bfqq to @bfqg, deactivating it from its old group and reactivating +@@ -548,26 +517,40 @@ static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) + * rcu_read_lock()). + */ + static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, +- struct bfq_entity *entity, struct bfq_group *bfqg) ++ struct bfq_group *bfqg) + { +- int busy, resume; +- +- busy = bfq_bfqq_busy(bfqq); +- resume = !RB_EMPTY_ROOT(&bfqq->sort_list); ++ struct bfq_entity *entity = &bfqq->entity; + +- BUG_ON(resume && !entity->on_st); +- BUG_ON(busy && !resume && entity->on_st && ++ BUG_ON(!bfq_bfqq_busy(bfqq) && !RB_EMPTY_ROOT(&bfqq->sort_list)); ++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list) && !entity->on_st); ++ BUG_ON(bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) ++ && entity->on_st && + bfqq != bfqd->in_service_queue); ++ BUG_ON(!bfq_bfqq_busy(bfqq) && bfqq == bfqd->in_service_queue); ++ ++ /* If bfqq is empty, then bfq_bfqq_expire also invokes ++ * bfq_del_bfqq_busy, thereby removing bfqq and its entity ++ * from data structures related to current group. Otherwise we ++ * need to remove bfqq explicitly with bfq_deactivate_bfqq, as ++ * we do below. 
++ */ ++ if (bfqq == bfqd->in_service_queue) ++ bfq_bfqq_expire(bfqd, bfqd->in_service_queue, ++ false, BFQ_BFQQ_PREEMPTED); ++ ++ BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq) ++ && &bfq_entity_service_tree(entity)->idle != ++ entity->tree); + +- if (busy) { +- BUG_ON(atomic_read(&bfqq->ref) < 2); ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq)); + +- if (!resume) +- bfq_del_bfqq_busy(bfqd, bfqq, 0); +- else +- bfq_deactivate_bfqq(bfqd, bfqq, 0); +- } else if (entity->on_st) ++ if (bfq_bfqq_busy(bfqq)) ++ bfq_deactivate_bfqq(bfqd, bfqq, 0); ++ else if (entity->on_st) { ++ BUG_ON(&bfq_entity_service_tree(entity)->idle != ++ entity->tree); + bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); ++ } + bfqg_put(bfqq_group(bfqq)); + + /* +@@ -579,14 +562,17 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, + entity->sched_data = &bfqg->sched_data; + bfqg_get(bfqg); + +- if (busy) { ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq)); ++ if (bfq_bfqq_busy(bfqq)) { + bfq_pos_tree_add_move(bfqd, bfqq); +- if (resume) +- bfq_activate_bfqq(bfqd, bfqq); ++ bfq_activate_bfqq(bfqd, bfqq); + } + + if (!bfqd->in_service_queue && !bfqd->rq_in_driver) + bfq_schedule_dispatch(bfqd); ++ BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq) ++ && &bfq_entity_service_tree(entity)->idle != ++ entity->tree); + } + + /** +@@ -613,7 +599,7 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, + + lockdep_assert_held(bfqd->queue->queue_lock); + +- bfqg = bfq_find_alloc_group(bfqd, blkcg); ++ bfqg = bfq_find_set_group(bfqd, blkcg); + if (async_bfqq) { + entity = &async_bfqq->entity; + +@@ -621,7 +607,8 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, + bic_set_bfqq(bic, NULL, 0); + bfq_log_bfqq(bfqd, async_bfqq, + "bic_change_group: %p %d", +- async_bfqq, atomic_read(&async_bfqq->ref)); ++ async_bfqq, ++ async_bfqq->ref); + bfq_put_queue(async_bfqq); + } + } +@@ -629,7 +616,7 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, + if (sync_bfqq) { + entity = &sync_bfqq->entity; + if (entity->sched_data != &bfqg->sched_data) +- bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg); ++ bfq_bfqq_move(bfqd, sync_bfqq, bfqg); + } + + return bfqg; +@@ -638,25 +625,23 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, + static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) + { + struct bfq_data *bfqd = bic_to_bfqd(bic); +- struct blkcg *blkcg; + struct bfq_group *bfqg = NULL; +- uint64_t id; ++ uint64_t serial_nr; + + rcu_read_lock(); +- blkcg = bio_blkcg(bio); +- id = blkcg->css.serial_nr; +- rcu_read_unlock(); ++ serial_nr = bio_blkcg(bio)->css.serial_nr; + + /* + * Check whether blkcg has changed. The condition may trigger + * spuriously on a newly created cic but there's no harm. 
+ */ +- if (unlikely(!bfqd) || likely(bic->blkcg_id == id)) +- return; ++ if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr)) ++ goto out; + +- bfqg = __bfq_bic_change_cgroup(bfqd, bic, blkcg); +- BUG_ON(!bfqg); +- bic->blkcg_id = id; ++ bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio)); ++ bic->blkcg_serial_nr = serial_nr; ++out: ++ rcu_read_unlock(); + } + + /** +@@ -682,8 +667,7 @@ static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + BUG_ON(!bfqq); +- bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group); +- return; ++ bfq_bfqq_move(bfqd, bfqq, bfqd->root_group); + } + + /** +@@ -711,16 +695,15 @@ static void bfq_reparent_active_entities(struct bfq_data *bfqd, + if (bfqg->sched_data.in_service_entity) + bfq_reparent_leaf_entity(bfqd, + bfqg->sched_data.in_service_entity); +- +- return; + } + + /** +- * bfq_destroy_group - destroy @bfqg. +- * @bfqg: the group being destroyed. ++ * bfq_pd_offline - deactivate the entity associated with @pd, ++ * and reparent its children entities. ++ * @pd: descriptor of the policy going offline. + * +- * Destroy @bfqg, making sure that it is not referenced from its parent. +- * blkio already grabs the queue_lock for us, so no need to use RCU-based magic ++ * blkio already grabs the queue_lock for us, so no need to use ++ * RCU-based magic + */ + static void bfq_pd_offline(struct blkg_policy_data *pd) + { +@@ -779,6 +762,12 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) + bfq_put_async_queues(bfqd, bfqg); + BUG_ON(entity->tree); + ++ /* ++ * @blkg is going offline and will be ignored by ++ * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so ++ * that they don't get lost. If IOs complete after this point, the ++ * stats for them will be lost. Oh well... 
++ */ + bfqg_stats_xfer_dead(bfqg); + } + +@@ -788,46 +777,35 @@ static void bfq_end_wr_async(struct bfq_data *bfqd) + + list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) { + struct bfq_group *bfqg = blkg_to_bfqg(blkg); ++ BUG_ON(!bfqg); + + bfq_end_wr_async_queues(bfqd, bfqg); + } + bfq_end_wr_async_queues(bfqd, bfqd->root_group); + } + +-static u64 bfqio_cgroup_weight_read(struct cgroup_subsys_state *css, +- struct cftype *cftype) +-{ +- struct blkcg *blkcg = css_to_blkcg(css); +- struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); +- int ret = -EINVAL; +- +- spin_lock_irq(&blkcg->lock); +- ret = bfqgd->weight; +- spin_unlock_irq(&blkcg->lock); +- +- return ret; +-} +- +-static int bfqio_cgroup_weight_read_dfl(struct seq_file *sf, void *v) ++static int bfq_io_show_weight(struct seq_file *sf, void *v) + { + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); ++ unsigned int val = 0; + +- spin_lock_irq(&blkcg->lock); +- seq_printf(sf, "%u\n", bfqgd->weight); +- spin_unlock_irq(&blkcg->lock); ++ if (bfqgd) ++ val = bfqgd->weight; ++ ++ seq_printf(sf, "%u\n", val); + + return 0; + } + +-static int bfqio_cgroup_weight_write(struct cgroup_subsys_state *css, +- struct cftype *cftype, +- u64 val) ++static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css, ++ struct cftype *cftype, ++ u64 val) + { + struct blkcg *blkcg = css_to_blkcg(css); + struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); + struct blkcg_gq *blkg; +- int ret = -EINVAL; ++ int ret = -ERANGE; + + if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT) + return ret; +@@ -837,6 +815,7 @@ static int bfqio_cgroup_weight_write(struct cgroup_subsys_state *css, + bfqgd->weight = (unsigned short)val; + hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { + struct bfq_group *bfqg = blkg_to_bfqg(blkg); ++ + if (!bfqg) + continue; + /* +@@ -871,13 +850,18 @@ static int bfqio_cgroup_weight_write(struct cgroup_subsys_state *css, + return ret; + } + +-static ssize_t bfqio_cgroup_weight_write_dfl(struct kernfs_open_file *of, +- char *buf, size_t nbytes, +- loff_t off) ++static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, ++ char *buf, size_t nbytes, ++ loff_t off) + { ++ u64 weight; + /* First unsigned long found in the file is used */ +- return bfqio_cgroup_weight_write(of_css(of), NULL, +- simple_strtoull(strim(buf), NULL, 0)); ++ int ret = kstrtoull(strim(buf), 0, &weight); ++ ++ if (ret) ++ return ret; ++ ++ return bfq_io_set_weight_legacy(of_css(of), NULL, weight); + } + + static int bfqg_print_stat(struct seq_file *sf, void *v) +@@ -897,16 +881,17 @@ static int bfqg_print_rwstat(struct seq_file *sf, void *v) + static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, int off) + { +- u64 sum = bfqg_stat_pd_recursive_sum(pd, off); +- ++ u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd), ++ &blkcg_policy_bfq, off); + return __blkg_prfill_u64(sf, pd, sum); + } + + static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, int off) + { +- struct blkg_rwstat sum = bfqg_rwstat_pd_recursive_sum(pd, off); +- ++ struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd), ++ &blkcg_policy_bfq, ++ off); + return __blkg_prfill_rwstat(sf, pd, &sum); + } + +@@ -926,6 +911,41 @@ static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) + return 0; + } + ++static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, ++ int off) ++{ ++ u64 sum = 
blkg_rwstat_total(&pd->blkg->stat_bytes); ++ ++ return __blkg_prfill_u64(sf, pd, sum >> 9); ++} ++ ++static int bfqg_print_stat_sectors(struct seq_file *sf, void *v) ++{ ++ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), ++ bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false); ++ return 0; ++} ++ ++static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf, ++ struct blkg_policy_data *pd, int off) ++{ ++ struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL, ++ offsetof(struct blkcg_gq, stat_bytes)); ++ u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + ++ atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); ++ ++ return __blkg_prfill_u64(sf, pd, sum >> 9); ++} ++ ++static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v) ++{ ++ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), ++ bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0, ++ false); ++ return 0; ++} ++ ++ + static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf, + struct blkg_policy_data *pd, int off) + { +@@ -950,7 +970,8 @@ static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v) + return 0; + } + +-static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) ++static struct bfq_group * ++bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) + { + int ret; + +@@ -958,41 +979,18 @@ static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int n + if (ret) + return NULL; + +- return blkg_to_bfqg(bfqd->queue->root_blkg); ++ return blkg_to_bfqg(bfqd->queue->root_blkg); + } + +-static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) +-{ +- struct bfq_group_data *bgd; +- +- bgd = kzalloc(sizeof(*bgd), GFP_KERNEL); +- if (!bgd) +- return NULL; +- return &bgd->pd; +-} +- +-static void bfq_cpd_free(struct blkcg_policy_data *cpd) +-{ +- kfree(cpd_to_bfqgd(cpd)); +-} +- +-static struct cftype bfqio_files_dfl[] = { ++static struct cftype bfq_blkcg_legacy_files[] = { + { +- .name = "weight", ++ .name = "bfq.weight", + .flags = CFTYPE_NOT_ON_ROOT, +- .seq_show = bfqio_cgroup_weight_read_dfl, +- .write = bfqio_cgroup_weight_write_dfl, ++ .seq_show = bfq_io_show_weight, ++ .write_u64 = bfq_io_set_weight_legacy, + }, +- {} /* terminate */ +-}; + +-static struct cftype bfqio_files[] = { +- { +- .name = "bfq.weight", +- .read_u64 = bfqio_cgroup_weight_read, +- .write_u64 = bfqio_cgroup_weight_write, +- }, +- /* statistics, cover only the tasks in the bfqg */ ++ /* statistics, covers only the tasks in the bfqg */ + { + .name = "bfq.time", + .private = offsetof(struct bfq_group, stats.time), +@@ -1000,18 +998,17 @@ static struct cftype bfqio_files[] = { + }, + { + .name = "bfq.sectors", +- .private = offsetof(struct bfq_group, stats.sectors), +- .seq_show = bfqg_print_stat, ++ .seq_show = bfqg_print_stat_sectors, + }, + { + .name = "bfq.io_service_bytes", +- .private = offsetof(struct bfq_group, stats.service_bytes), +- .seq_show = bfqg_print_rwstat, ++ .private = (unsigned long)&blkcg_policy_bfq, ++ .seq_show = blkg_print_stat_bytes, + }, + { + .name = "bfq.io_serviced", +- .private = offsetof(struct bfq_group, stats.serviced), +- .seq_show = bfqg_print_rwstat, ++ .private = (unsigned long)&blkcg_policy_bfq, ++ .seq_show = blkg_print_stat_ios, + }, + { + .name = "bfq.io_service_time", +@@ -1042,18 +1039,17 @@ static struct cftype bfqio_files[] = { + }, + { + .name = "bfq.sectors_recursive", +- .private = offsetof(struct bfq_group, stats.sectors), +- .seq_show = bfqg_print_stat_recursive, ++ .seq_show = bfqg_print_stat_sectors_recursive, + }, + { + .name = 
"bfq.io_service_bytes_recursive", +- .private = offsetof(struct bfq_group, stats.service_bytes), +- .seq_show = bfqg_print_rwstat_recursive, ++ .private = (unsigned long)&blkcg_policy_bfq, ++ .seq_show = blkg_print_stat_bytes_recursive, + }, + { + .name = "bfq.io_serviced_recursive", +- .private = offsetof(struct bfq_group, stats.serviced), +- .seq_show = bfqg_print_rwstat_recursive, ++ .private = (unsigned long)&blkcg_policy_bfq, ++ .seq_show = blkg_print_stat_ios_recursive, + }, + { + .name = "bfq.io_service_time_recursive", +@@ -1099,32 +1095,35 @@ static struct cftype bfqio_files[] = { + .private = offsetof(struct bfq_group, stats.dequeue), + .seq_show = bfqg_print_stat, + }, +- { +- .name = "bfq.unaccounted_time", +- .private = offsetof(struct bfq_group, stats.unaccounted_time), +- .seq_show = bfqg_print_stat, +- }, + { } /* terminate */ + }; + +-static struct blkcg_policy blkcg_policy_bfq = { +- .dfl_cftypes = bfqio_files_dfl, +- .legacy_cftypes = bfqio_files, +- +- .pd_alloc_fn = bfq_pd_alloc, +- .pd_init_fn = bfq_pd_init, +- .pd_offline_fn = bfq_pd_offline, +- .pd_free_fn = bfq_pd_free, +- .pd_reset_stats_fn = bfq_pd_reset_stats, +- +- .cpd_alloc_fn = bfq_cpd_alloc, +- .cpd_init_fn = bfq_cpd_init, +- .cpd_bind_fn = bfq_cpd_init, +- .cpd_free_fn = bfq_cpd_free, +- ++static struct cftype bfq_blkg_files[] = { ++ { ++ .name = "bfq.weight", ++ .flags = CFTYPE_NOT_ON_ROOT, ++ .seq_show = bfq_io_show_weight, ++ .write = bfq_io_set_weight, ++ }, ++ {} /* terminate */ + }; + +-#else ++#else /* CONFIG_BFQ_GROUP_IOSCHED */ ++ ++static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg, ++ struct bfq_queue *bfqq, int rw) { } ++static inline void bfqg_stats_update_io_remove(struct bfq_group *bfqg, int rw) { } ++static inline void bfqg_stats_update_io_merged(struct bfq_group *bfqg, int rw) { } ++static inline void bfqg_stats_update_completion(struct bfq_group *bfqg, ++ uint64_t start_time, uint64_t io_start_time, int rw) { } ++static inline void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, ++struct bfq_group *curr_bfqg) { } ++static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { } ++static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } ++static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { } ++static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } ++static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } ++static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } + + static void bfq_init_entity(struct bfq_entity *entity, + struct bfq_group *bfqg) +@@ -1146,27 +1145,20 @@ bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) + return bfqd->root_group; + } + +-static void bfq_bfqq_move(struct bfq_data *bfqd, +- struct bfq_queue *bfqq, +- struct bfq_entity *entity, +- struct bfq_group *bfqg) +-{ +-} +- + static void bfq_end_wr_async(struct bfq_data *bfqd) + { + bfq_end_wr_async_queues(bfqd, bfqd->root_group); + } + +-static void bfq_disconnect_groups(struct bfq_data *bfqd) ++static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, ++ struct blkcg *blkcg) + { +- bfq_put_async_queues(bfqd, bfqd->root_group); ++ return bfqd->root_group; + } + +-static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, +- struct blkcg *blkcg) ++static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) + { +- return bfqd->root_group; ++ return bfqq->bfqd->root_group; + } + + static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data 
*bfqd, int node) +diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c +index d1f648d..5bff378 100644 +--- a/block/bfq-iosched.c ++++ b/block/bfq-iosched.c +@@ -7,25 +7,26 @@ + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * +- * Copyright (C) 2010 Paolo Valente ++ * Copyright (C) 2016 Paolo Valente + * + * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ + * file. + * +- * BFQ is a proportional-share storage-I/O scheduling algorithm based on +- * the slice-by-slice service scheme of CFQ. But BFQ assigns budgets, +- * measured in number of sectors, to processes instead of time slices. The +- * device is not granted to the in-service process for a given time slice, +- * but until it has exhausted its assigned budget. This change from the time +- * to the service domain allows BFQ to distribute the device throughput +- * among processes as desired, without any distortion due to ZBR, workload +- * fluctuations or other factors. BFQ uses an ad hoc internal scheduler, +- * called B-WF2Q+, to schedule processes according to their budgets. More +- * precisely, BFQ schedules queues associated to processes. Thanks to the +- * accurate policy of B-WF2Q+, BFQ can afford to assign high budgets to +- * I/O-bound processes issuing sequential requests (to boost the +- * throughput), and yet guarantee a low latency to interactive and soft +- * real-time applications. ++ * BFQ is a proportional-share storage-I/O scheduling algorithm based ++ * on the slice-by-slice service scheme of CFQ. But BFQ assigns ++ * budgets, measured in number of sectors, to processes instead of ++ * time slices. The device is not granted to the in-service process ++ * for a given time slice, but until it has exhausted its assigned ++ * budget. This change from the time to the service domain enables BFQ ++ * to distribute the device throughput among processes as desired, ++ * without any distortion due to throughput fluctuations, or to device ++ * internal queueing. BFQ uses an ad hoc internal scheduler, called ++ * B-WF2Q+, to schedule processes according to their budgets. More ++ * precisely, BFQ schedules queues associated with processes. Thanks to ++ * the accurate policy of B-WF2Q+, BFQ can afford to assign high ++ * budgets to I/O-bound processes issuing sequential requests (to ++ * boost the throughput), and yet guarantee a low latency to ++ * interactive and soft real-time applications. + * + * BFQ is described in [1], where also a reference to the initial, more + * theoretical paper on BFQ can be found. The interested reader can find +@@ -87,7 +88,6 @@ static const int bfq_stats_min_budgets = 194; + + /* Default maximum budget values, in sectors and number of requests. */ + static const int bfq_default_max_budget = 16 * 1024; +-static const int bfq_max_budget_async_rq = 4; + + /* + * Async to sync throughput distribution is controlled as follows: +@@ -97,8 +97,7 @@ static const int bfq_max_budget_async_rq = 4; + static const int bfq_async_charge_factor = 10; + + /* Default timeout values, in jiffies, approximating CFQ defaults. 
*/ +-static const int bfq_timeout_sync = HZ / 8; +-static int bfq_timeout_async = HZ / 25; ++static const int bfq_timeout = HZ / 8; + + struct kmem_cache *bfq_pool; + +@@ -109,8 +108,9 @@ struct kmem_cache *bfq_pool; + #define BFQ_HW_QUEUE_THRESHOLD 4 + #define BFQ_HW_QUEUE_SAMPLES 32 + +-#define BFQQ_SEEK_THR (sector_t)(8 * 1024) +-#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR) ++#define BFQQ_SEEK_THR (sector_t)(8 * 100) ++#define BFQQ_CLOSE_THR (sector_t)(8 * 1024) ++#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8) + + /* Min samples used for peak rate estimation (for autotuning). */ + #define BFQ_PEAK_RATE_SAMPLES 32 +@@ -141,16 +141,24 @@ struct kmem_cache *bfq_pool; + * The device's speed class is dynamically (re)detected in + * bfq_update_peak_rate() every time the estimated peak rate is updated. + * +- * In the following definitions, R_slow[0]/R_fast[0] and T_slow[0]/T_fast[0] +- * are the reference values for a slow/fast rotational device, whereas +- * R_slow[1]/R_fast[1] and T_slow[1]/T_fast[1] are the reference values for +- * a slow/fast non-rotational device. Finally, device_speed_thresh are the +- * thresholds used to switch between speed classes. ++ * In the following definitions, R_slow[0]/R_fast[0] and ++ * T_slow[0]/T_fast[0] are the reference values for a slow/fast ++ * rotational device, whereas R_slow[1]/R_fast[1] and ++ * T_slow[1]/T_fast[1] are the reference values for a slow/fast ++ * non-rotational device. Finally, device_speed_thresh are the ++ * thresholds used to switch between speed classes. The reference ++ * rates are not the actual peak rates of the devices used as a ++ * reference, but slightly lower values. The reason for using these ++ * slightly lower values is that the peak-rate estimator tends to ++ * yield slightly lower values than the actual peak rate (it can yield ++ * the actual peak rate only if there is only one process doing I/O, ++ * and the process does sequential I/O). ++ * + * Both the reference peak rates and the thresholds are measured in + * sectors/usec, left-shifted by BFQ_RATE_SHIFT. + */ +-static int R_slow[2] = {1536, 10752}; +-static int R_fast[2] = {17415, 34791}; ++static int R_slow[2] = {1000, 10700}; ++static int R_fast[2] = {14000, 33000}; + /* + * To improve readability, a conversion function is used to initialize the + * following arrays, which entails that they can be initialized only in a +@@ -410,11 +418,7 @@ static bool bfq_differentiated_weights(struct bfq_data *bfqd) + */ + static bool bfq_symmetric_scenario(struct bfq_data *bfqd) + { +- return +-#ifdef CONFIG_BFQ_GROUP_IOSCHED +- !bfqd->active_numerous_groups && +-#endif +- !bfq_differentiated_weights(bfqd); ++ return !bfq_differentiated_weights(bfqd); + } + + /* +@@ -534,9 +538,19 @@ static struct request *bfq_find_next_rq(struct bfq_data *bfqd, + static unsigned long bfq_serv_to_charge(struct request *rq, + struct bfq_queue *bfqq) + { +- return blk_rq_sectors(rq) * +- (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->wr_coeff == 1) * +- bfq_async_charge_factor)); ++ if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1) ++ return blk_rq_sectors(rq); ++ ++ /* ++ * If there are no weight-raised queues, then amplify service ++ * by just the async charge factor; otherwise amplify service ++ * by twice the async charge factor, to further reduce latency ++ * for weight-raised queues. 
++ */ ++ if (bfqq->bfqd->wr_busy_queues == 0) ++ return blk_rq_sectors(rq) * bfq_async_charge_factor; ++ ++ return blk_rq_sectors(rq) * 2 * bfq_async_charge_factor; + } + + /** +@@ -591,12 +605,23 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd) + dur = bfqd->RT_prod; + do_div(dur, bfqd->peak_rate); + +- return dur; +-} ++ /* ++ * Limit duration between 3 and 13 seconds. Tests show that ++ * higher values than 13 seconds often yield the opposite of ++ * the desired result, i.e., worsen responsiveness by letting ++ * non-interactive and non-soft-real-time applications ++ * preserve weight raising for a too long time interval. ++ * ++ * On the other end, lower values than 3 seconds make it ++ * difficult for most interactive tasks to complete their jobs ++ * before weight-raising finishes. ++ */ ++ if (dur > msecs_to_jiffies(13000)) ++ dur = msecs_to_jiffies(13000); ++ else if (dur < msecs_to_jiffies(3000)) ++ dur = msecs_to_jiffies(3000); + +-static unsigned bfq_bfqq_cooperations(struct bfq_queue *bfqq) +-{ +- return bfqq->bic ? bfqq->bic->cooperations : 0; ++ return dur; + } + + static void +@@ -606,31 +631,11 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic) + bfq_mark_bfqq_idle_window(bfqq); + else + bfq_clear_bfqq_idle_window(bfqq); ++ + if (bic->saved_IO_bound) + bfq_mark_bfqq_IO_bound(bfqq); + else + bfq_clear_bfqq_IO_bound(bfqq); +- /* Assuming that the flag in_large_burst is already correctly set */ +- if (bic->wr_time_left && bfqq->bfqd->low_latency && +- !bfq_bfqq_in_large_burst(bfqq) && +- bic->cooperations < bfqq->bfqd->bfq_coop_thresh) { +- /* +- * Start a weight raising period with the duration given by +- * the raising_time_left snapshot. +- */ +- if (bfq_bfqq_busy(bfqq)) +- bfqq->bfqd->wr_busy_queues++; +- bfqq->wr_coeff = bfqq->bfqd->bfq_wr_coeff; +- bfqq->wr_cur_max_time = bic->wr_time_left; +- bfqq->last_wr_start_finish = jiffies; +- bfqq->entity.prio_changed = 1; +- } +- /* +- * Clear wr_time_left to prevent bfq_bfqq_save_state() from +- * getting confused about the queue's need of a weight-raising +- * period. +- */ +- bic->wr_time_left = 0; + } + + static int bfqq_process_refs(struct bfq_queue *bfqq) +@@ -640,7 +645,7 @@ static int bfqq_process_refs(struct bfq_queue *bfqq) + lockdep_assert_held(bfqq->bfqd->queue->queue_lock); + + io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; +- process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st; ++ process_refs = bfqq->ref - io_refs - bfqq->entity.on_st; + BUG_ON(process_refs < 0); + return process_refs; + } +@@ -655,6 +660,7 @@ static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq) + hlist_del_init(&item->burst_list_node); + hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); + bfqd->burst_size = 1; ++ bfqd->burst_parent_entity = bfqq->entity.parent; + } + + /* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */ +@@ -663,6 +669,10 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) + /* Increment burst size to take into account also bfqq */ + bfqd->burst_size++; + ++ bfq_log_bfqq(bfqd, bfqq, "add_to_burst %d", bfqd->burst_size); ++ ++ BUG_ON(bfqd->burst_size > bfqd->bfq_large_burst_thresh); ++ + if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) { + struct bfq_queue *pos, *bfqq_item; + struct hlist_node *n; +@@ -672,15 +682,19 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) + * other to consider this burst as large. 
+ */ + bfqd->large_burst = true; ++ bfq_log_bfqq(bfqd, bfqq, "add_to_burst: large burst started"); + + /* + * We can now mark all queues in the burst list as + * belonging to a large burst. + */ + hlist_for_each_entry(bfqq_item, &bfqd->burst_list, +- burst_list_node) ++ burst_list_node) { + bfq_mark_bfqq_in_large_burst(bfqq_item); ++ bfq_log_bfqq(bfqd, bfqq_item, "marked in large burst"); ++ } + bfq_mark_bfqq_in_large_burst(bfqq); ++ bfq_log_bfqq(bfqd, bfqq, "marked in large burst"); + + /* + * From now on, and until the current burst finishes, any +@@ -692,67 +706,79 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) + hlist_for_each_entry_safe(pos, n, &bfqd->burst_list, + burst_list_node) + hlist_del_init(&pos->burst_list_node); +- } else /* burst not yet large: add bfqq to the burst list */ ++ } else /* ++ * Burst not yet large: add bfqq to the burst list. Do ++ * not increment the ref counter for bfqq, because bfqq ++ * is removed from the burst list before freeing bfqq ++ * in put_queue. ++ */ + hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); + } + + /* +- * If many queues happen to become active shortly after each other, then, +- * to help the processes associated to these queues get their job done as +- * soon as possible, it is usually better to not grant either weight-raising +- * or device idling to these queues. In this comment we describe, firstly, +- * the reasons why this fact holds, and, secondly, the next function, which +- * implements the main steps needed to properly mark these queues so that +- * they can then be treated in a different way. ++ * If many queues belonging to the same group happen to be created ++ * shortly after each other, then the processes associated with these ++ * queues have typically a common goal. In particular, bursts of queue ++ * creations are usually caused by services or applications that spawn ++ * many parallel threads/processes. Examples are systemd during boot, ++ * or git grep. To help these processes get their job done as soon as ++ * possible, it is usually better to not grant either weight-raising ++ * or device idling to their queues. + * +- * As for the terminology, we say that a queue becomes active, i.e., +- * switches from idle to backlogged, either when it is created (as a +- * consequence of the arrival of an I/O request), or, if already existing, +- * when a new request for the queue arrives while the queue is idle. +- * Bursts of activations, i.e., activations of different queues occurring +- * shortly after each other, are typically caused by services or applications +- * that spawn or reactivate many parallel threads/processes. Examples are +- * systemd during boot or git grep. ++ * In this comment we describe, firstly, the reasons why this fact ++ * holds, and, secondly, the next function, which implements the main ++ * steps needed to properly mark these queues so that they can then be ++ * treated in a different way. + * +- * These services or applications benefit mostly from a high throughput: +- * the quicker the requests of the activated queues are cumulatively served, +- * the sooner the target job of these queues gets completed. As a consequence, +- * weight-raising any of these queues, which also implies idling the device +- * for it, is almost always counterproductive: in most cases it just lowers +- * throughput. 
++ * The above services or applications benefit mostly from a high ++ * throughput: the quicker the requests of the activated queues are ++ * cumulatively served, the sooner the target job of these queues gets ++ * completed. As a consequence, weight-raising any of these queues, ++ * which also implies idling the device for it, is almost always ++ * counterproductive. In most cases it just lowers throughput. + * +- * On the other hand, a burst of activations may be also caused by the start +- * of an application that does not consist in a lot of parallel I/O-bound +- * threads. In fact, with a complex application, the burst may be just a +- * consequence of the fact that several processes need to be executed to +- * start-up the application. To start an application as quickly as possible, +- * the best thing to do is to privilege the I/O related to the application +- * with respect to all other I/O. Therefore, the best strategy to start as +- * quickly as possible an application that causes a burst of activations is +- * to weight-raise all the queues activated during the burst. This is the ++ * On the other hand, a burst of queue creations may be caused also by ++ * the start of an application that does not consist of a lot of ++ * parallel I/O-bound threads. In fact, with a complex application, ++ * several short processes may need to be executed to start-up the ++ * application. In this respect, to start an application as quickly as ++ * possible, the best thing to do is in any case to privilege the I/O ++ * related to the application with respect to all other ++ * I/O. Therefore, the best strategy to start as quickly as possible ++ * an application that causes a burst of queue creations is to ++ * weight-raise all the queues created during the burst. This is the + * exact opposite of the best strategy for the other type of bursts. + * +- * In the end, to take the best action for each of the two cases, the two +- * types of bursts need to be distinguished. Fortunately, this seems +- * relatively easy to do, by looking at the sizes of the bursts. In +- * particular, we found a threshold such that bursts with a larger size +- * than that threshold are apparently caused only by services or commands +- * such as systemd or git grep. For brevity, hereafter we call just 'large' +- * these bursts. BFQ *does not* weight-raise queues whose activations occur +- * in a large burst. In addition, for each of these queues BFQ performs or +- * does not perform idling depending on which choice boosts the throughput +- * most. The exact choice depends on the device and request pattern at ++ * In the end, to take the best action for each of the two cases, the ++ * two types of bursts need to be distinguished. Fortunately, this ++ * seems relatively easy, by looking at the sizes of the bursts. In ++ * particular, we found a threshold such that only bursts with a ++ * larger size than that threshold are apparently caused by ++ * services or commands such as systemd or git grep. For brevity, ++ * hereafter we call just 'large' these bursts. BFQ *does not* ++ * weight-raise queues whose creation occurs in a large burst. In ++ * addition, for each of these queues BFQ performs or does not perform ++ * idling depending on which choice boosts the throughput more. The ++ * exact choice depends on the device and request pattern at + * hand. 
+ * +- * Turning back to the next function, it implements all the steps needed +- * to detect the occurrence of a large burst and to properly mark all the +- * queues belonging to it (so that they can then be treated in a different +- * way). This goal is achieved by maintaining a special "burst list" that +- * holds, temporarily, the queues that belong to the burst in progress. The +- * list is then used to mark these queues as belonging to a large burst if +- * the burst does become large. The main steps are the following. ++ * Unfortunately, false positives may occur while an interactive task ++ * is starting (e.g., an application is being started). The ++ * consequence is that the queues associated with the task do not ++ * enjoy weight raising as expected. Fortunately these false positives ++ * are very rare. They typically occur if some service happens to ++ * start doing I/O exactly when the interactive task starts. ++ * ++ * Turning back to the next function, it implements all the steps ++ * needed to detect the occurrence of a large burst and to properly ++ * mark all the queues belonging to it (so that they can then be ++ * treated in a different way). This goal is achieved by maintaining a ++ * "burst list" that holds, temporarily, the queues that belong to the ++ * burst in progress. The list is then used to mark these queues as ++ * belonging to a large burst if the burst does become large. The main ++ * steps are the following. + * +- * . when the very first queue is activated, the queue is inserted into the ++ * . when the very first queue is created, the queue is inserted into the + * list (as it could be the first queue in a possible burst) + * + * . if the current burst has not yet become large, and a queue Q that does +@@ -773,13 +799,13 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) + * + * . the device enters a large-burst mode + * +- * . if a queue Q that does not belong to the burst is activated while ++ * . if a queue Q that does not belong to the burst is created while + * the device is in large-burst mode and shortly after the last time + * at which a queue either entered the burst list or was marked as + * belonging to the current large burst, then Q is immediately marked + * as belonging to a large burst. + * +- * . if a queue Q that does not belong to the burst is activated a while ++ * . if a queue Q that does not belong to the burst is created a while + * later, i.e., not shortly after, than the last time at which a queue + * either entered the burst list or was marked as belonging to the + * current large burst, then the current burst is deemed as finished and: +@@ -792,52 +818,44 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) + * in a possible new burst (then the burst list contains just Q + * after this step). + */ +-static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq, +- bool idle_for_long_time) ++static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) + { + /* +- * If bfqq happened to be activated in a burst, but has been idle +- * for at least as long as an interactive queue, then we assume +- * that, in the overall I/O initiated in the burst, the I/O +- * associated to bfqq is finished. So bfqq does not need to be +- * treated as a queue belonging to a burst anymore. Accordingly, +- * we reset bfqq's in_large_burst flag if set, and remove bfqq +- * from the burst list if it's there. 
We do not decrement instead +- * burst_size, because the fact that bfqq does not need to belong +- * to the burst list any more does not invalidate the fact that +- * bfqq may have been activated during the current burst. +- */ +- if (idle_for_long_time) { +- hlist_del_init(&bfqq->burst_list_node); +- bfq_clear_bfqq_in_large_burst(bfqq); +- } +- +- /* + * If bfqq is already in the burst list or is part of a large +- * burst, then there is nothing else to do. ++ * burst, or finally has just been split, then there is ++ * nothing else to do. + */ + if (!hlist_unhashed(&bfqq->burst_list_node) || +- bfq_bfqq_in_large_burst(bfqq)) ++ bfq_bfqq_in_large_burst(bfqq) || ++ time_is_after_eq_jiffies(bfqq->split_time + ++ msecs_to_jiffies(10))) + return; + + /* +- * If bfqq's activation happens late enough, then the current +- * burst is finished, and related data structures must be reset. ++ * If bfqq's creation happens late enough, or bfqq belongs to ++ * a different group than the burst group, then the current ++ * burst is finished, and related data structures must be ++ * reset. + * +- * In this respect, consider the special case where bfqq is the very +- * first queue being activated. In this case, last_ins_in_burst is +- * not yet significant when we get here. But it is easy to verify +- * that, whether or not the following condition is true, bfqq will +- * end up being inserted into the burst list. In particular the +- * list will happen to contain only bfqq. And this is exactly what +- * has to happen, as bfqq may be the first queue in a possible ++ * In this respect, consider the special case where bfqq is ++ * the very first queue created after BFQ is selected for this ++ * device. In this case, last_ins_in_burst and ++ * burst_parent_entity are not yet significant when we get ++ * here. But it is easy to verify that, whether or not the ++ * following condition is true, bfqq will end up being ++ * inserted into the burst list. In particular the list will ++ * happen to contain only bfqq. And this is exactly what has ++ * to happen, as bfqq may be the first queue of the first + * burst. + */ + if (time_is_before_jiffies(bfqd->last_ins_in_burst + +- bfqd->bfq_burst_interval)) { ++ bfqd->bfq_burst_interval) || ++ bfqq->entity.parent != bfqd->burst_parent_entity) { + bfqd->large_burst = false; + bfq_reset_burst_list(bfqd, bfqq); +- return; ++ bfq_log_bfqq(bfqd, bfqq, ++ "handle_burst: late activation or different group"); ++ goto end; + } + + /* +@@ -846,8 +864,9 @@ static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq, + * bfqq as belonging to this large burst immediately. + */ + if (bfqd->large_burst) { ++ bfq_log_bfqq(bfqd, bfqq, "handle_burst: marked in burst"); + bfq_mark_bfqq_in_large_burst(bfqq); +- return; ++ goto end; + } + + /* +@@ -856,25 +875,497 @@ static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq, + * queue. Then we add bfqq to the burst. + */ + bfq_add_to_burst(bfqd, bfqq); ++end: ++ /* ++ * At this point, bfqq either has been added to the current ++ * burst or has caused the current burst to terminate and a ++ * possible new burst to start. In particular, in the second ++ * case, bfqq has become the first queue in the possible new ++ * burst. In both cases last_ins_in_burst needs to be moved ++ * forward. 
++ */ ++ bfqd->last_ins_in_burst = jiffies; ++ ++} ++ ++static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ return entity->budget - entity->service; ++} ++ ++/* ++ * If enough samples have been computed, return the current max budget ++ * stored in bfqd, which is dynamically updated according to the ++ * estimated disk peak rate; otherwise return the default max budget ++ */ ++static int bfq_max_budget(struct bfq_data *bfqd) ++{ ++ if (bfqd->budgets_assigned < bfq_stats_min_budgets) ++ return bfq_default_max_budget; ++ else ++ return bfqd->bfq_max_budget; ++} ++ ++/* ++ * Return min budget, which is a fraction of the current or default ++ * max budget (trying with 1/32) ++ */ ++static int bfq_min_budget(struct bfq_data *bfqd) ++{ ++ if (bfqd->budgets_assigned < bfq_stats_min_budgets) ++ return bfq_default_max_budget / 32; ++ else ++ return bfqd->bfq_max_budget / 32; ++} ++ ++static void bfq_bfqq_expire(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ bool compensate, ++ enum bfqq_expiration reason); ++ ++/* ++ * The next function, invoked after the input queue bfqq switches from ++ * idle to busy, updates the budget of bfqq. The function also tells ++ * whether the in-service queue should be expired, by returning ++ * true. The purpose of expiring the in-service queue is to give bfqq ++ * the chance to possibly preempt the in-service queue, and the reason ++ * for preempting the in-service queue is to achieve one of the two ++ * goals below. ++ * ++ * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has ++ * expired because it has remained idle. In particular, bfqq may have ++ * expired for one of the following two reasons: ++ * ++ * - BFQ_BFQQ_NO_MORE_REQUEST bfqq did not enjoy any device idling and ++ * did not make it to issue a new request before its last request ++ * was served; ++ * ++ * - BFQ_BFQQ_TOO_IDLE bfqq did enjoy device idling, but did not issue ++ * a new request before the expiration of the idling-time. ++ * ++ * Even if bfqq has expired for one of the above reasons, the process ++ * associated with the queue may be however issuing requests greedily, ++ * and thus be sensitive to the bandwidth it receives (bfqq may have ++ * remained idle for other reasons: CPU high load, bfqq not enjoying ++ * idling, I/O throttling somewhere in the path from the process to ++ * the I/O scheduler, ...). But if, after every expiration for one of ++ * the above two reasons, bfqq has to wait for the service of at least ++ * one full budget of another queue before being served again, then ++ * bfqq is likely to get a much lower bandwidth or resource time than ++ * its reserved ones. To address this issue, two countermeasures need ++ * to be taken. ++ * ++ * First, the budget and the timestamps of bfqq need to be updated in ++ * a special way on bfqq reactivation: they need to be updated as if ++ * bfqq did not remain idle and did not expire. In fact, if they are ++ * computed as if bfqq expired and remained idle until reactivation, ++ * then the process associated with bfqq is treated as if, instead of ++ * being greedy, it stopped issuing requests when bfqq remained idle, ++ * and restarts issuing requests only on this reactivation. In other ++ * words, the scheduler does not help the process recover the "service ++ * hole" between bfqq expiration and reactivation. As a consequence, ++ * the process receives a lower bandwidth than its reserved one. 
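/*
 * The budget helpers introduced above (bfq_bfqq_budget_left,
 * bfq_max_budget, bfq_min_budget) reduce to simple arithmetic. A
 * user-space sketch follows; BFQ_DEFAULT_MAX_BUDGET and
 * BFQ_STATS_MIN_BUDGETS are illustrative stand-ins for
 * bfq_default_max_budget and bfq_stats_min_budgets, not the tuned
 * kernel values.
 */
#define BFQ_DEFAULT_MAX_BUDGET	16384	/* sectors; illustrative */
#define BFQ_STATS_MIN_BUDGETS	194	/* samples; illustrative */

struct budget_state {
	int budgets_assigned;	/* budget samples collected so far */
	int max_budget;		/* autotuned estimate */
};

static int model_max_budget(const struct budget_state *b)
{
	/* trust the autotuned value only after enough samples */
	return b->budgets_assigned < BFQ_STATS_MIN_BUDGETS ?
		BFQ_DEFAULT_MAX_BUDGET : b->max_budget;
}

static int model_min_budget(const struct budget_state *b)
{
	/* the min budget is 1/32 of the current or default max */
	return model_max_budget(b) / 32;
}

static int model_budget_left(int budget, int service)
{
	/* what bfq_bfqq_budget_left() returns for a queue */
	return budget - service;
}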
In ++ * contrast, to recover this hole, the budget must be updated as if ++ * bfqq was not expired at all before this reactivation, i.e., it must ++ * be set to the value of the remaining budget when bfqq was ++ * expired. Along the same line, timestamps need to be assigned the ++ * value they had the last time bfqq was selected for service, i.e., ++ * before last expiration. Thus timestamps need to be back-shifted ++ * with respect to their normal computation (see [1] for more details ++ * on this tricky aspect). ++ * ++ * Secondly, to allow the process to recover the hole, the in-service ++ * queue must be expired too, to give bfqq the chance to preempt it ++ * immediately. In fact, if bfqq has to wait for a full budget of the ++ * in-service queue to be completed, then it may become impossible to ++ * let the process recover the hole, even if the back-shifted ++ * timestamps of bfqq are lower than those of the in-service queue. If ++ * this happens for most or all of the holes, then the process may not ++ * receive its reserved bandwidth. In this respect, it is worth noting ++ * that, being the service of outstanding requests unpreemptible, a ++ * little fraction of the holes may however be unrecoverable, thereby ++ * causing a little loss of bandwidth. ++ * ++ * The last important point is detecting whether bfqq does need this ++ * bandwidth recovery. In this respect, the next function deems the ++ * process associated with bfqq greedy, and thus allows it to recover ++ * the hole, if: 1) the process is waiting for the arrival of a new ++ * request (which implies that bfqq expired for one of the above two ++ * reasons), and 2) such a request has arrived soon. The first ++ * condition is controlled through the flag non_blocking_wait_rq, ++ * while the second through the flag arrived_in_time. If both ++ * conditions hold, then the function computes the budget in the ++ * above-described special way, and signals that the in-service queue ++ * should be expired. Timestamp back-shifting is done later in ++ * __bfq_activate_entity. ++ * ++ * 2. Reduce latency. Even if timestamps are not backshifted to let ++ * the process associated with bfqq recover a service hole, bfqq may ++ * however happen to have, after being (re)activated, a lower finish ++ * timestamp than the in-service queue. That is, the next budget of ++ * bfqq may have to be completed before the one of the in-service ++ * queue. If this is the case, then preempting the in-service queue ++ * allows this goal to be achieved, apart from the unpreemptible, ++ * outstanding requests mentioned above. ++ * ++ * Unfortunately, regardless of which of the above two goals one wants ++ * to achieve, service trees need first to be updated to know whether ++ * the in-service queue must be preempted. To have service trees ++ * correctly updated, the in-service queue must be expired and ++ * rescheduled, and bfqq must be scheduled too. This is one of the ++ * most costly operations (in future versions, the scheduling ++ * mechanism may be re-designed in such a way to make it possible to ++ * know whether preemption is needed without needing to update service ++ * trees). In addition, queue preemptions almost always cause random ++ * I/O, and thus loss of throughput. 
Because of these facts, the next ++ * function adopts the following simple scheme to avoid both costly ++ * operations and too frequent preemptions: it requests the expiration ++ * of the in-service queue (unconditionally) only for queues that need ++ * to recover a hole, or that either are weight-raised or deserve to ++ * be weight-raised. ++ */ ++static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ bool arrived_in_time, ++ bool wr_or_deserves_wr) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) { ++ /* ++ * We do not clear the flag non_blocking_wait_rq here, as ++ * the latter is used in bfq_activate_bfqq to signal ++ * that timestamps need to be back-shifted (and is ++ * cleared right after). ++ */ ++ ++ /* ++ * In next assignment we rely on that either ++ * entity->service or entity->budget are not updated ++ * on expiration if bfqq is empty (see ++ * __bfq_bfqq_recalc_budget). Thus both quantities ++ * remain unchanged after such an expiration, and the ++ * following statement therefore assigns to ++ * entity->budget the remaining budget on such an ++ * expiration. For clarity, entity->service is not ++ * updated on expiration in any case, and, in normal ++ * operation, is reset only when bfqq is selected for ++ * service (see bfq_get_next_queue). ++ */ ++ entity->budget = min_t(unsigned long, ++ bfq_bfqq_budget_left(bfqq), ++ bfqq->max_budget); ++ ++ BUG_ON(entity->budget < 0); ++ return true; ++ } ++ ++ entity->budget = max_t(unsigned long, bfqq->max_budget, ++ bfq_serv_to_charge(bfqq->next_rq,bfqq)); ++ BUG_ON(entity->budget < 0); ++ ++ bfq_clear_bfqq_non_blocking_wait_rq(bfqq); ++ return wr_or_deserves_wr; ++} ++ ++static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ unsigned int old_wr_coeff, ++ bool wr_or_deserves_wr, ++ bool interactive, ++ bool in_burst, ++ bool soft_rt) ++{ ++ if (old_wr_coeff == 1 && wr_or_deserves_wr) { ++ /* start a weight-raising period */ ++ if (interactive) { ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff; ++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); ++ } else { ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff * ++ BFQ_SOFTRT_WEIGHT_FACTOR; ++ bfqq->wr_cur_max_time = ++ bfqd->bfq_wr_rt_max_time; ++ } ++ /* ++ * If needed, further reduce budget to make sure it is ++ * close to bfqq's backlog, so as to reduce the ++ * scheduling-error component due to a too large ++ * budget. Do not care about throughput consequences, ++ * but only about latency. Finally, do not assign a ++ * too small budget either, to avoid increasing ++ * latency by causing too frequent expirations. 
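/*
 * Sketch, in plain C, of the weight-raising start performed above.
 * SOFTRT_WEIGHT_FACTOR mirrors BFQ_SOFTRT_WEIGHT_FACTOR, and the
 * wr_params values are illustrative rather than the tuned defaults.
 */
#include <stdbool.h>

#define SOFTRT_WEIGHT_FACTOR	100	/* illustrative */

struct wr_params {
	unsigned int wr_coeff;		/* base raising coefficient */
	unsigned long wr_duration;	/* interactive wr duration */
	unsigned long wr_rt_max_time;	/* soft real-time wr duration */
};

static void start_weight_raising(const struct wr_params *p, bool interactive,
				 unsigned int *coeff, unsigned long *max_time)
{
	if (interactive) {
		*coeff = p->wr_coeff;
		*max_time = p->wr_duration;
	} else {	/* soft real-time */
		*coeff = p->wr_coeff * SOFTRT_WEIGHT_FACTOR;
		*max_time = p->wr_rt_max_time;
	}
}

static int clamp_wr_budget(int budget, int min_budget)
{
	/* keep the budget close to the backlog: latency over throughput */
	return budget < 2 * min_budget ? budget : 2 * min_budget;
}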
++ */ ++ bfqq->entity.budget = min_t(unsigned long, ++ bfqq->entity.budget, ++ 2 * bfq_min_budget(bfqd)); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "wrais starting at %lu, rais_max_time %u", ++ jiffies, ++ jiffies_to_msecs(bfqq->wr_cur_max_time)); ++ } else if (old_wr_coeff > 1) { ++ if (interactive) /* update wr duration */ ++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); ++ else if (in_burst) { ++ bfqq->wr_coeff = 1; ++ bfq_log_bfqq(bfqd, bfqq, ++ "wrais ending at %lu, rais_max_time %u", ++ jiffies, ++ jiffies_to_msecs(bfqq-> ++ wr_cur_max_time)); ++ } else if (time_before( ++ bfqq->last_wr_start_finish + ++ bfqq->wr_cur_max_time, ++ jiffies + ++ bfqd->bfq_wr_rt_max_time) && ++ soft_rt) { ++ /* ++ * The remaining weight-raising time is lower ++ * than bfqd->bfq_wr_rt_max_time, which means ++ * that the application is enjoying weight ++ * raising either because deemed soft-rt in ++ * the near past, or because deemed interactive ++ * a long ago. ++ * In both cases, resetting now the current ++ * remaining weight-raising time for the ++ * application to the weight-raising duration ++ * for soft rt applications would not cause any ++ * latency increase for the application (as the ++ * new duration would be higher than the ++ * remaining time). ++ * ++ * In addition, the application is now meeting ++ * the requirements for being deemed soft rt. ++ * In the end we can correctly and safely ++ * (re)charge the weight-raising duration for ++ * the application with the weight-raising ++ * duration for soft rt applications. ++ * ++ * In particular, doing this recharge now, i.e., ++ * before the weight-raising period for the ++ * application finishes, reduces the probability ++ * of the following negative scenario: ++ * 1) the weight of a soft rt application is ++ * raised at startup (as for any newly ++ * created application), ++ * 2) since the application is not interactive, ++ * at a certain time weight-raising is ++ * stopped for the application, ++ * 3) at that time the application happens to ++ * still have pending requests, and hence ++ * is destined to not have a chance to be ++ * deemed soft rt before these requests are ++ * completed (see the comments to the ++ * function bfq_bfqq_softrt_next_start() ++ * for details on soft rt detection), ++ * 4) these pending requests experience a high ++ * latency because the application is not ++ * weight-raised while they are pending. ++ */ ++ bfqq->last_wr_start_finish = jiffies; ++ bfqq->wr_cur_max_time = ++ bfqd->bfq_wr_rt_max_time; ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff * ++ BFQ_SOFTRT_WEIGHT_FACTOR; ++ bfq_log_bfqq(bfqd, bfqq, ++ "switching to soft_rt wr, or " ++ " just moving forward duration"); ++ } ++ } ++} ++ ++static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ return bfqq->dispatched == 0 && ++ time_is_before_jiffies( ++ bfqq->budget_timeout + ++ bfqd->bfq_wr_min_idle_time); ++} ++ ++static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ int old_wr_coeff, ++ struct request *rq, ++ bool *interactive) ++{ ++ bool soft_rt, in_burst, wr_or_deserves_wr, ++ bfqq_wants_to_preempt, ++ idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq), ++ /* ++ * See the comments on ++ * bfq_bfqq_update_budg_for_activation for ++ * details on the usage of the next variable. 
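/*
 * Sketch of the soft-rt recharge test used above: recharging is safe
 * only while the remaining weight-raising time is below the soft-rt
 * duration, so that the new period can only extend the raising. Plain
 * integers stand in for jiffies; the kernel uses time_before() to stay
 * correct across jiffies wraparound, which this model ignores.
 */
#include <stdbool.h>

static bool should_recharge_soft_rt(unsigned long now,
				    unsigned long last_wr_start_finish,
				    unsigned long wr_cur_max_time,
				    unsigned long wr_rt_max_time,
				    bool soft_rt)
{
	/* assumes the raising period has not already expired */
	unsigned long remaining = last_wr_start_finish + wr_cur_max_time - now;

	return soft_rt && remaining < wr_rt_max_time;
}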
++ */ ++ arrived_in_time = time_is_after_jiffies( ++ RQ_BIC(rq)->ttime.last_end_request + ++ bfqd->bfq_slice_idle * 3); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "bfq_add_request non-busy: " ++ "jiffies %lu, in_time %d, idle_long %d busyw %d " ++ "wr_coeff %u", ++ jiffies, arrived_in_time, ++ idle_for_long_time, ++ bfq_bfqq_non_blocking_wait_rq(bfqq), ++ old_wr_coeff); ++ ++ BUG_ON(bfqq->entity.budget < bfqq->entity.service); ++ ++ BUG_ON(bfqq == bfqd->in_service_queue); ++ bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, ++ rq->cmd_flags); ++ ++ /* ++ * bfqq deserves to be weight-raised if: ++ * - it is sync, ++ * - it does not belong to a large burst, ++ * - it has been idle for enough time or is soft real-time, ++ * - is linked to a bfq_io_cq (it is not shared in any sense) ++ */ ++ in_burst = bfq_bfqq_in_large_burst(bfqq); ++ soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && ++ !in_burst && ++ time_is_before_jiffies(bfqq->soft_rt_next_start); ++ *interactive = ++ !in_burst && ++ idle_for_long_time; ++ wr_or_deserves_wr = bfqd->low_latency && ++ (bfqq->wr_coeff > 1 || ++ (bfq_bfqq_sync(bfqq) && ++ bfqq->bic && (*interactive || soft_rt))); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "bfq_add_request: " ++ "in_burst %d, " ++ "soft_rt %d (next %lu), inter %d, bic %p", ++ bfq_bfqq_in_large_burst(bfqq), soft_rt, ++ bfqq->soft_rt_next_start, ++ *interactive, ++ bfqq->bic); ++ ++ /* ++ * Using the last flag, update budget and check whether bfqq ++ * may want to preempt the in-service queue. ++ */ ++ bfqq_wants_to_preempt = ++ bfq_bfqq_update_budg_for_activation(bfqd, bfqq, ++ arrived_in_time, ++ wr_or_deserves_wr); ++ ++ /* ++ * If bfqq happened to be activated in a burst, but has been ++ * idle for much more than an interactive queue, then we ++ * assume that, in the overall I/O initiated in the burst, the ++ * I/O associated with bfqq is finished. So bfqq does not need ++ * to be treated as a queue belonging to a burst ++ * anymore. Accordingly, we reset bfqq's in_large_burst flag ++ * if set, and remove bfqq from the burst list if it's ++ * there. We do not decrement burst_size, because the fact ++ * that bfqq does not need to belong to the burst list any ++ * more does not invalidate the fact that bfqq was created in ++ * a burst. 
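/*
 * The eligibility tests computed above condense to the predicate
 * below. This is a user-space model: the struct gathers, as plain
 * booleans, quantities that the patch derives from bfqq and bfqd.
 */
#include <stdbool.h>

struct arrival_ctx {
	bool sync;			/* queue carries sync I/O */
	bool has_bic;			/* linked to a bfq_io_cq: not shared */
	bool in_large_burst;
	bool idle_for_long_time;
	bool soft_rt_window_open;	/* soft_rt_next_start already passed */
	unsigned int wr_coeff;
	unsigned int softrt_max_rate;	/* bfq_wr_max_softrt_rate */
	bool low_latency;
};

static bool deserves_weight_raising(const struct arrival_ctx *c)
{
	bool soft_rt = c->softrt_max_rate > 0 && !c->in_large_burst &&
		       c->soft_rt_window_open;
	bool interactive = !c->in_large_burst && c->idle_for_long_time;

	return c->low_latency &&
	       (c->wr_coeff > 1 ||
		(c->sync && c->has_bic && (interactive || soft_rt)));
}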
++ */ ++ if (likely(!bfq_bfqq_just_created(bfqq)) && ++ idle_for_long_time && ++ time_is_before_jiffies( ++ bfqq->budget_timeout + ++ msecs_to_jiffies(10000))) { ++ hlist_del_init(&bfqq->burst_list_node); ++ bfq_clear_bfqq_in_large_burst(bfqq); ++ } ++ ++ bfq_clear_bfqq_just_created(bfqq); ++ ++ if (!bfq_bfqq_IO_bound(bfqq)) { ++ if (arrived_in_time) { ++ bfqq->requests_within_timer++; ++ if (bfqq->requests_within_timer >= ++ bfqd->bfq_requests_within_timer) ++ bfq_mark_bfqq_IO_bound(bfqq); ++ } else ++ bfqq->requests_within_timer = 0; ++ bfq_log_bfqq(bfqd, bfqq, "requests in time %d", ++ bfqq->requests_within_timer); ++ } ++ ++ if (bfqd->low_latency) { ++ if (unlikely(time_is_after_jiffies(bfqq->split_time))) ++ /* wraparound */ ++ bfqq->split_time = ++ jiffies - bfqd->bfq_wr_min_idle_time - 1; ++ ++ if (time_is_before_jiffies(bfqq->split_time + ++ bfqd->bfq_wr_min_idle_time)) { ++ bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq, ++ old_wr_coeff, ++ wr_or_deserves_wr, ++ *interactive, ++ in_burst, ++ soft_rt); ++ ++ if (old_wr_coeff != bfqq->wr_coeff) ++ bfqq->entity.prio_changed = 1; ++ } ++ } ++ ++ bfqq->last_idle_bklogged = jiffies; ++ bfqq->service_from_backlogged = 0; ++ bfq_clear_bfqq_softrt_update(bfqq); ++ ++ bfq_add_bfqq_busy(bfqd, bfqq); ++ ++ /* ++ * Expire in-service queue only if preemption may be needed ++ * for guarantees. In this respect, the function ++ * next_queue_may_preempt just checks a simple, necessary ++ * condition, and not a sufficient condition based on ++ * timestamps. In fact, for the latter condition to be ++ * evaluated, timestamps would need first to be updated, and ++ * this operation is quite costly (see the comments on the ++ * function bfq_bfqq_update_budg_for_activation). ++ */ ++ if (bfqd->in_service_queue && bfqq_wants_to_preempt && ++ bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff && ++ next_queue_may_preempt(bfqd)) { ++ struct bfq_queue *in_serv = ++ bfqd->in_service_queue; ++ BUG_ON(in_serv == bfqq); ++ ++ bfq_bfqq_expire(bfqd, bfqd->in_service_queue, ++ false, BFQ_BFQQ_PREEMPTED); ++ BUG_ON(in_serv->entity.budget < 0); ++ } + } + + static void bfq_add_request(struct request *rq) + { + struct bfq_queue *bfqq = RQ_BFQQ(rq); +- struct bfq_entity *entity = &bfqq->entity; + struct bfq_data *bfqd = bfqq->bfqd; + struct request *next_rq, *prev; +- unsigned long old_wr_coeff = bfqq->wr_coeff; ++ unsigned int old_wr_coeff = bfqq->wr_coeff; + bool interactive = false; + +- bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq)); ++ bfq_log_bfqq(bfqd, bfqq, "add_request: size %u %s", ++ blk_rq_sectors(rq), rq_is_sync(rq) ? "S" : "A"); ++ ++ if (bfqq->wr_coeff > 1) /* queue is being weight-raised */ ++ bfq_log_bfqq(bfqd, bfqq, ++ "raising period dur %u/%u msec, old coeff %u, w %d(%d)", ++ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), ++ jiffies_to_msecs(bfqq->wr_cur_max_time), ++ bfqq->wr_coeff, ++ bfqq->entity.weight, bfqq->entity.orig_weight); ++ + bfqq->queued[rq_is_sync(rq)]++; + bfqd->queued++; + + elv_rb_add(&bfqq->sort_list, rq); + + /* +- * Check if this request is a better next-serve candidate. ++ * Check if this request is a better next-to-serve candidate. 
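/*
 * Sketch of the IO-bound promotion above: a queue whose requests keep
 * arriving "in time" is eventually flagged IO-bound, while a single
 * late arrival resets the count. The threshold is illustrative and
 * plays the role of bfqd->bfq_requests_within_timer.
 */
#include <stdbool.h>

#define REQUESTS_WITHIN_TIMER	120	/* illustrative */

struct io_bound_state {
	unsigned int requests_within_timer;
	bool io_bound;
};

static void account_arrival(struct io_bound_state *s, bool arrived_in_time)
{
	if (s->io_bound)
		return;
	if (!arrived_in_time) {
		s->requests_within_timer = 0;
		return;
	}
	if (++s->requests_within_timer >= REQUESTS_WITHIN_TIMER)
		s->io_bound = true;
}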
+ */ + prev = bfqq->next_rq; + next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); +@@ -887,160 +1378,10 @@ static void bfq_add_request(struct request *rq) + if (prev != bfqq->next_rq) + bfq_pos_tree_add_move(bfqd, bfqq); + +- if (!bfq_bfqq_busy(bfqq)) { +- bool soft_rt, coop_or_in_burst, +- idle_for_long_time = time_is_before_jiffies( +- bfqq->budget_timeout + +- bfqd->bfq_wr_min_idle_time); +- +-#ifdef CONFIG_BFQ_GROUP_IOSCHED +- bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, +- rq->cmd_flags); +-#endif +- if (bfq_bfqq_sync(bfqq)) { +- bool already_in_burst = +- !hlist_unhashed(&bfqq->burst_list_node) || +- bfq_bfqq_in_large_burst(bfqq); +- bfq_handle_burst(bfqd, bfqq, idle_for_long_time); +- /* +- * If bfqq was not already in the current burst, +- * then, at this point, bfqq either has been +- * added to the current burst or has caused the +- * current burst to terminate. In particular, in +- * the second case, bfqq has become the first +- * queue in a possible new burst. +- * In both cases last_ins_in_burst needs to be +- * moved forward. +- */ +- if (!already_in_burst) +- bfqd->last_ins_in_burst = jiffies; +- } +- +- coop_or_in_burst = bfq_bfqq_in_large_burst(bfqq) || +- bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh; +- soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && +- !coop_or_in_burst && +- time_is_before_jiffies(bfqq->soft_rt_next_start); +- interactive = !coop_or_in_burst && idle_for_long_time; +- entity->budget = max_t(unsigned long, bfqq->max_budget, +- bfq_serv_to_charge(next_rq, bfqq)); +- +- if (!bfq_bfqq_IO_bound(bfqq)) { +- if (time_before(jiffies, +- RQ_BIC(rq)->ttime.last_end_request + +- bfqd->bfq_slice_idle)) { +- bfqq->requests_within_timer++; +- if (bfqq->requests_within_timer >= +- bfqd->bfq_requests_within_timer) +- bfq_mark_bfqq_IO_bound(bfqq); +- } else +- bfqq->requests_within_timer = 0; +- } +- +- if (!bfqd->low_latency) +- goto add_bfqq_busy; +- +- if (bfq_bfqq_just_split(bfqq)) +- goto set_prio_changed; +- +- /* +- * If the queue: +- * - is not being boosted, +- * - has been idle for enough time, +- * - is not a sync queue or is linked to a bfq_io_cq (it is +- * shared "for its nature" or it is not shared and its +- * requests have not been redirected to a shared queue) +- * start a weight-raising period. +- */ +- if (old_wr_coeff == 1 && (interactive || soft_rt) && +- (!bfq_bfqq_sync(bfqq) || bfqq->bic)) { +- bfqq->wr_coeff = bfqd->bfq_wr_coeff; +- if (interactive) +- bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); +- else +- bfqq->wr_cur_max_time = +- bfqd->bfq_wr_rt_max_time; +- bfq_log_bfqq(bfqd, bfqq, +- "wrais starting at %lu, rais_max_time %u", +- jiffies, +- jiffies_to_msecs(bfqq->wr_cur_max_time)); +- } else if (old_wr_coeff > 1) { +- if (interactive) +- bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); +- else if (coop_or_in_burst || +- (bfqq->wr_cur_max_time == +- bfqd->bfq_wr_rt_max_time && +- !soft_rt)) { +- bfqq->wr_coeff = 1; +- bfq_log_bfqq(bfqd, bfqq, +- "wrais ending at %lu, rais_max_time %u", +- jiffies, +- jiffies_to_msecs(bfqq-> +- wr_cur_max_time)); +- } else if (time_before( +- bfqq->last_wr_start_finish + +- bfqq->wr_cur_max_time, +- jiffies + +- bfqd->bfq_wr_rt_max_time) && +- soft_rt) { +- /* +- * +- * The remaining weight-raising time is lower +- * than bfqd->bfq_wr_rt_max_time, which means +- * that the application is enjoying weight +- * raising either because deemed soft-rt in +- * the near past, or because deemed interactive +- * a long ago. 
+- * In both cases, resetting now the current +- * remaining weight-raising time for the +- * application to the weight-raising duration +- * for soft rt applications would not cause any +- * latency increase for the application (as the +- * new duration would be higher than the +- * remaining time). +- * +- * In addition, the application is now meeting +- * the requirements for being deemed soft rt. +- * In the end we can correctly and safely +- * (re)charge the weight-raising duration for +- * the application with the weight-raising +- * duration for soft rt applications. +- * +- * In particular, doing this recharge now, i.e., +- * before the weight-raising period for the +- * application finishes, reduces the probability +- * of the following negative scenario: +- * 1) the weight of a soft rt application is +- * raised at startup (as for any newly +- * created application), +- * 2) since the application is not interactive, +- * at a certain time weight-raising is +- * stopped for the application, +- * 3) at that time the application happens to +- * still have pending requests, and hence +- * is destined to not have a chance to be +- * deemed soft rt before these requests are +- * completed (see the comments to the +- * function bfq_bfqq_softrt_next_start() +- * for details on soft rt detection), +- * 4) these pending requests experience a high +- * latency because the application is not +- * weight-raised while they are pending. +- */ +- bfqq->last_wr_start_finish = jiffies; +- bfqq->wr_cur_max_time = +- bfqd->bfq_wr_rt_max_time; +- } +- } +-set_prio_changed: +- if (old_wr_coeff != bfqq->wr_coeff) +- entity->prio_changed = 1; +-add_bfqq_busy: +- bfqq->last_idle_bklogged = jiffies; +- bfqq->service_from_backlogged = 0; +- bfq_clear_bfqq_softrt_update(bfqq); +- bfq_add_bfqq_busy(bfqd, bfqq); +- } else { ++ if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */ ++ bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff, ++ rq, &interactive); ++ else { + if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) && + time_is_before_jiffies( + bfqq->last_wr_start_finish + +@@ -1049,16 +1390,43 @@ add_bfqq_busy: + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + + bfqd->wr_busy_queues++; +- entity->prio_changed = 1; ++ bfqq->entity.prio_changed = 1; + bfq_log_bfqq(bfqd, bfqq, +- "non-idle wrais starting at %lu, rais_max_time %u", +- jiffies, +- jiffies_to_msecs(bfqq->wr_cur_max_time)); ++ "non-idle wrais starting, " ++ "wr_max_time %u wr_busy %d", ++ jiffies_to_msecs(bfqq->wr_cur_max_time), ++ bfqd->wr_busy_queues); + } + if (prev != bfqq->next_rq) + bfq_updated_next_req(bfqd, bfqq); + } + ++ /* ++ * Assign jiffies to last_wr_start_finish in the following ++ * cases: ++ * ++ * . if bfqq is not going to be weight-raised, because, for ++ * non weight-raised queues, last_wr_start_finish stores the ++ * arrival time of the last request; as of now, this piece ++ * of information is used only for deciding whether to ++ * weight-raise async queues ++ * ++ * . if bfqq is not weight-raised, because, if bfqq is now ++ * switching to weight-raised, then last_wr_start_finish ++ * stores the time when weight-raising starts ++ * ++ * . 
if bfqq is interactive, because, regardless of whether ++ * bfqq is currently weight-raised, the weight-raising ++ * period must start or restart (this case is considered ++ * separately because it is not detected by the above ++ * conditions, if bfqq is already weight-raised) ++ * ++ * last_wr_start_finish has to be updated also if bfqq is soft ++ * real-time, because the weight-raising period is constantly ++ * restarted on idle-to-busy transitions for these queues, but ++ * this is already done in bfq_bfqq_handle_idle_busy_switch if ++ * needed. ++ */ + if (bfqd->low_latency && + (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive)) + bfqq->last_wr_start_finish = jiffies; +@@ -1106,6 +1474,9 @@ static void bfq_remove_request(struct request *rq) + struct bfq_data *bfqd = bfqq->bfqd; + const int sync = rq_is_sync(rq); + ++ BUG_ON(bfqq->entity.service > bfqq->entity.budget && ++ bfqq == bfqd->in_service_queue); ++ + if (bfqq->next_rq == rq) { + bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); + bfq_updated_next_req(bfqd, bfqq); +@@ -1119,8 +1490,25 @@ static void bfq_remove_request(struct request *rq) + elv_rb_del(&bfqq->sort_list, rq); + + if (RB_EMPTY_ROOT(&bfqq->sort_list)) { +- if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) ++ BUG_ON(bfqq->entity.budget < 0); ++ ++ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) { + bfq_del_bfqq_busy(bfqd, bfqq, 1); ++ ++ /* bfqq emptied. In normal operation, when ++ * bfqq is empty, bfqq->entity.service and ++ * bfqq->entity.budget must contain, ++ * respectively, the service received and the ++ * budget used last time bfqq emptied. These ++ * facts do not hold in this case, as at least ++ * this last removal occurred while bfqq is ++ * not in service. To avoid inconsistencies, ++ * reset both bfqq->entity.service and ++ * bfqq->entity.budget. ++ */ ++ bfqq->entity.budget = bfqq->entity.service = 0; ++ } ++ + /* + * Remove queue from request-position tree as it is empty. + */ +@@ -1134,9 +1522,7 @@ static void bfq_remove_request(struct request *rq) + BUG_ON(bfqq->meta_pending == 0); + bfqq->meta_pending--; + } +-#ifdef CONFIG_BFQ_GROUP_IOSCHED + bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); +-#endif + } + + static int bfq_merge(struct request_queue *q, struct request **req, +@@ -1221,21 +1607,25 @@ static void bfq_merged_requests(struct request_queue *q, struct request *rq, + bfqq->next_rq = rq; + + bfq_remove_request(next); +-#ifdef CONFIG_BFQ_GROUP_IOSCHED + bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); +-#endif + } + + /* Must be called with bfqq != NULL */ + static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) + { + BUG_ON(!bfqq); ++ + if (bfq_bfqq_busy(bfqq)) + bfqq->bfqd->wr_busy_queues--; + bfqq->wr_coeff = 1; + bfqq->wr_cur_max_time = 0; +- /* Trigger a weight change on the next activation of the queue */ ++ /* ++ * Trigger a weight change on the next invocation of ++ * __bfq_entity_update_weight_prio. ++ */ + bfqq->entity.prio_changed = 1; ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "end_wr: wr_busy %d", ++ bfqq->bfqd->wr_busy_queues); + } + + static void bfq_end_wr_async_queues(struct bfq_data *bfqd, +@@ -1278,7 +1668,7 @@ static int bfq_rq_close_to_sector(void *io_struct, bool request, + sector_t sector) + { + return abs(bfq_io_struct_pos(io_struct, request) - sector) <= +- BFQQ_SEEK_THR; ++ BFQQ_CLOSE_THR; + } + + static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, +@@ -1400,7 +1790,7 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) + * throughput. 
+ */ + bfqq->new_bfqq = new_bfqq; +- atomic_add(process_refs, &new_bfqq->ref); ++ new_bfqq->ref += process_refs; + return new_bfqq; + } + +@@ -1431,9 +1821,23 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, + } + + /* +- * Attempt to schedule a merge of bfqq with the currently in-service queue +- * or with a close queue among the scheduled queues. +- * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue ++ * If this function returns true, then bfqq cannot be merged. The idea ++ * is that true cooperation happens very early after processes start ++ * to do I/O. Usually, late cooperations are just accidental false ++ * positives. In case bfqq is weight-raised, such false positives ++ * would evidently degrade latency guarantees for bfqq. ++ */ ++bool wr_from_too_long(struct bfq_queue *bfqq) ++{ ++ return bfqq->wr_coeff > 1 && ++ time_is_before_jiffies(bfqq->last_wr_start_finish + ++ msecs_to_jiffies(100)); ++} ++ ++/* ++ * Attempt to schedule a merge of bfqq with the currently in-service ++ * queue or with a close queue among the scheduled queues. Return ++ * NULL if no merge was scheduled, a pointer to the shared bfq_queue + * structure otherwise. + * + * The OOM queue is not allowed to participate to cooperation: in fact, since +@@ -1442,6 +1846,18 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, + * handle merging with the OOM queue would be quite complex and expensive + * to maintain. Besides, in such a critical condition as an out of memory, + * the benefits of queue merging may be little relevant, or even negligible. ++ * ++ * Weight-raised queues can be merged only if their weight-raising ++ * period has just started. In fact cooperating processes are usually ++ * started together. Thus, with this filter we avoid false positives ++ * that would jeopardize low-latency guarantees. ++ * ++ * WARNING: queue merging may impair fairness among non-weight raised ++ * queues, for at least two reasons: 1) the original weight of a ++ * merged queue may change during the merged state, 2) even being the ++ * weight the same, a merged queue may be bloated with many more ++ * requests than the ones produced by its originally-associated ++ * process. + */ + static struct bfq_queue * + bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, +@@ -1451,16 +1867,32 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, + + if (bfqq->new_bfqq) + return bfqq->new_bfqq; +- if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) ++ ++ if (io_struct && wr_from_too_long(bfqq) && ++ likely(bfqq != &bfqd->oom_bfqq)) ++ bfq_log_bfqq(bfqd, bfqq, ++ "would have looked for coop, but bfq%d wr", ++ bfqq->pid); ++ ++ if (!io_struct || ++ wr_from_too_long(bfqq) || ++ unlikely(bfqq == &bfqd->oom_bfqq)) + return NULL; +- /* If device has only one backlogged bfq_queue, don't search. */ ++ ++ /* If there is only one backlogged queue, don't search. 
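/*
 * Sketch of the merge filter introduced above: a weight-raised queue
 * is a merge candidate only within a short window (100 ms in the
 * patch) after weight raising started; later cooperations are treated
 * as accidental false positives. Milliseconds stand in for jiffies.
 */
#include <stdbool.h>

static bool model_wr_from_too_long(unsigned int wr_coeff,
				   unsigned long now_ms,
				   unsigned long wr_start_ms)
{
	return wr_coeff > 1 && now_ms - wr_start_ms > 100;
}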
*/ + if (bfqd->busy_queues == 1) + return NULL; + + in_service_bfqq = bfqd->in_service_queue; + ++ if (in_service_bfqq && in_service_bfqq != bfqq && ++ bfqd->in_service_bic && wr_from_too_long(in_service_bfqq) ++ && likely(in_service_bfqq == &bfqd->oom_bfqq)) ++ bfq_log_bfqq(bfqd, bfqq, ++ "would have tried merge with in-service-queue, but wr"); ++ + if (!in_service_bfqq || in_service_bfqq == bfqq || +- !bfqd->in_service_bic || ++ !bfqd->in_service_bic || wr_from_too_long(in_service_bfqq) || + unlikely(in_service_bfqq == &bfqd->oom_bfqq)) + goto check_scheduled; + +@@ -1482,7 +1914,15 @@ check_scheduled: + + BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent); + +- if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) && ++ if (new_bfqq && wr_from_too_long(new_bfqq) && ++ likely(new_bfqq != &bfqd->oom_bfqq) && ++ bfq_may_be_close_cooperator(bfqq, new_bfqq)) ++ bfq_log_bfqq(bfqd, bfqq, ++ "would have merged with bfq%d, but wr", ++ new_bfqq->pid); ++ ++ if (new_bfqq && !wr_from_too_long(new_bfqq) && ++ likely(new_bfqq != &bfqd->oom_bfqq) && + bfq_may_be_close_cooperator(bfqq, new_bfqq)) + return bfq_setup_merge(bfqq, new_bfqq); + +@@ -1498,46 +1938,11 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) + */ + if (!bfqq->bic) + return; +- if (bfqq->bic->wr_time_left) +- /* +- * This is the queue of a just-started process, and would +- * deserve weight raising: we set wr_time_left to the full +- * weight-raising duration to trigger weight-raising when +- * and if the queue is split and the first request of the +- * queue is enqueued. +- */ +- bfqq->bic->wr_time_left = bfq_wr_duration(bfqq->bfqd); +- else if (bfqq->wr_coeff > 1) { +- unsigned long wr_duration = +- jiffies - bfqq->last_wr_start_finish; +- /* +- * It may happen that a queue's weight raising period lasts +- * longer than its wr_cur_max_time, as weight raising is +- * handled only when a request is enqueued or dispatched (it +- * does not use any timer). If the weight raising period is +- * about to end, don't save it. +- */ +- if (bfqq->wr_cur_max_time <= wr_duration) +- bfqq->bic->wr_time_left = 0; +- else +- bfqq->bic->wr_time_left = +- bfqq->wr_cur_max_time - wr_duration; +- /* +- * The bfq_queue is becoming shared or the requests of the +- * process owning the queue are being redirected to a shared +- * queue. Stop the weight raising period of the queue, as in +- * both cases it should not be owned by an interactive or +- * soft real-time application. +- */ +- bfq_bfqq_end_wr(bfqq); +- } else +- bfqq->bic->wr_time_left = 0; ++ + bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); + bfqq->bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); + bfqq->bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); + bfqq->bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); +- bfqq->bic->cooperations++; +- bfqq->bic->failed_cooperations = 0; + } + + static void bfq_get_bic_reference(struct bfq_queue *bfqq) +@@ -1562,6 +1967,40 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, + if (bfq_bfqq_IO_bound(bfqq)) + bfq_mark_bfqq_IO_bound(new_bfqq); + bfq_clear_bfqq_IO_bound(bfqq); ++ ++ /* ++ * If bfqq is weight-raised, then let new_bfqq inherit ++ * weight-raising. To reduce false positives, neglect the case ++ * where bfqq has just been created, but has not yet made it ++ * to be weight-raised (which may happen because EQM may merge ++ * bfqq even before bfq_add_request is executed for the first ++ * time for bfqq). 
Handling this case would however be very ++ * easy, thanks to the flag just_created. ++ */ ++ if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) { ++ new_bfqq->wr_coeff = bfqq->wr_coeff; ++ new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time; ++ new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish; ++ if (bfq_bfqq_busy(new_bfqq)) ++ bfqd->wr_busy_queues++; ++ new_bfqq->entity.prio_changed = 1; ++ bfq_log_bfqq(bfqd, new_bfqq, ++ "wr starting after merge with %d, " ++ "rais_max_time %u", ++ bfqq->pid, ++ jiffies_to_msecs(bfqq->wr_cur_max_time)); ++ } ++ ++ if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */ ++ bfqq->wr_coeff = 1; ++ bfqq->entity.prio_changed = 1; ++ if (bfq_bfqq_busy(bfqq)) ++ bfqd->wr_busy_queues--; ++ } ++ ++ bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d", ++ bfqd->wr_busy_queues); ++ + /* + * Grab a reference to the bic, to prevent it from being destroyed + * before being possibly touched by a bfq_split_bfqq(). +@@ -1588,18 +2027,6 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, + bfq_put_queue(bfqq); + } + +-static void bfq_bfqq_increase_failed_cooperations(struct bfq_queue *bfqq) +-{ +- struct bfq_io_cq *bic = bfqq->bic; +- struct bfq_data *bfqd = bfqq->bfqd; +- +- if (bic && bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh) { +- bic->failed_cooperations++; +- if (bic->failed_cooperations >= bfqd->bfq_failed_cooperations) +- bic->cooperations = 0; +- } +-} +- + static int bfq_allow_merge(struct request_queue *q, struct request *rq, + struct bio *bio) + { +@@ -1637,30 +2064,86 @@ static int bfq_allow_merge(struct request_queue *q, struct request *rq, + * to decide whether bio and rq can be merged. + */ + bfqq = new_bfqq; +- } else +- bfq_bfqq_increase_failed_cooperations(bfqq); ++ } + } + + return bfqq == RQ_BFQQ(rq); + } + ++/* ++ * Set the maximum time for the in-service queue to consume its ++ * budget. This prevents seeky processes from lowering the throughput. ++ * In practice, a time-slice service scheme is used with seeky ++ * processes. ++ */ ++static void bfq_set_budget_timeout(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ unsigned int timeout_coeff; ++ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) ++ timeout_coeff = 1; ++ else ++ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; ++ ++ bfqd->last_budget_start = ktime_get(); ++ ++ bfqq->budget_timeout = jiffies + ++ bfqd->bfq_timeout * timeout_coeff; ++ ++ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", ++ jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff)); ++} ++ + static void __bfq_set_in_service_queue(struct bfq_data *bfqd, + struct bfq_queue *bfqq) + { + if (bfqq) { +-#ifdef CONFIG_BFQ_GROUP_IOSCHED + bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); +-#endif + bfq_mark_bfqq_must_alloc(bfqq); +- bfq_mark_bfqq_budget_new(bfqq); + bfq_clear_bfqq_fifo_expire(bfqq); + + bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; + ++ BUG_ON(bfqq == bfqd->in_service_queue); ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); ++ ++ if (bfqq->wr_coeff > 1 && ++ bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && ++ time_is_before_jiffies(bfqq->budget_timeout)) { ++ /* ++ * For soft real-time queues, move the start ++ * of the weight-raising period forward by the ++ * time the queue has not received any ++ * service. Otherwise, a relatively long ++ * service delay is likely to cause the ++ * weight-raising period of the queue to end, ++ * because of the short duration of the ++ * weight-raising period of a soft real-time ++ * queue. 
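/*
 * Sketch of the weight-raising hand-off performed above on a queue
 * merge: the surviving queue inherits the raising state, the merged
 * queue gives it up, and the count of busy weight-raised queues is
 * kept consistent. Simplified user-space types.
 */
#include <stdbool.h>

struct wr_state {
	unsigned int wr_coeff;
	unsigned long wr_cur_max_time;
	unsigned long last_wr_start_finish;
	bool busy;
};

static void hand_off_wr(struct wr_state *winner, struct wr_state *loser,
			int *wr_busy_queues)
{
	if (winner->wr_coeff == 1 && loser->wr_coeff > 1) {
		winner->wr_coeff = loser->wr_coeff;
		winner->wr_cur_max_time = loser->wr_cur_max_time;
		winner->last_wr_start_finish = loser->last_wr_start_finish;
		if (winner->busy)
			(*wr_busy_queues)++;
	}
	if (loser->wr_coeff > 1) {	/* loser has given its wr away */
		loser->wr_coeff = 1;
		if (loser->busy)
			(*wr_busy_queues)--;
	}
}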
It is worth noting that this move ++ * is not so dangerous for the other queues, ++ * because soft real-time queues are not ++ * greedy. ++ * ++ * To not add a further variable, we use the ++ * overloaded field budget_timeout to ++ * determine for how long the queue has not ++ * received service, i.e., how much time has ++ * elapsed since the queue expired. However, ++ * this is a little imprecise, because ++ * budget_timeout is set to jiffies if bfqq ++ * not only expires, but also remains with no ++ * request. ++ */ ++ bfqq->last_wr_start_finish += jiffies - ++ bfqq->budget_timeout; ++ } ++ ++ bfq_set_budget_timeout(bfqd, bfqq); + bfq_log_bfqq(bfqd, bfqq, + "set_in_service_queue, cur-budget = %d", + bfqq->entity.budget); +- } ++ } else ++ bfq_log(bfqd, "set_in_service_queue: NULL"); + + bfqd->in_service_queue = bfqq; + } +@@ -1676,31 +2159,6 @@ static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) + return bfqq; + } + +-/* +- * If enough samples have been computed, return the current max budget +- * stored in bfqd, which is dynamically updated according to the +- * estimated disk peak rate; otherwise return the default max budget +- */ +-static int bfq_max_budget(struct bfq_data *bfqd) +-{ +- if (bfqd->budgets_assigned < bfq_stats_min_budgets) +- return bfq_default_max_budget; +- else +- return bfqd->bfq_max_budget; +-} +- +-/* +- * Return min budget, which is a fraction of the current or default +- * max budget (trying with 1/32) +- */ +-static int bfq_min_budget(struct bfq_data *bfqd) +-{ +- if (bfqd->budgets_assigned < bfq_stats_min_budgets) +- return bfq_default_max_budget / 32; +- else +- return bfqd->bfq_max_budget / 32; +-} +- + static void bfq_arm_slice_timer(struct bfq_data *bfqd) + { + struct bfq_queue *bfqq = bfqd->in_service_queue; +@@ -1725,62 +2183,34 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd) + * being too ill-treated, grant them a small fraction of the + * assigned budget before reducing the waiting time to + * BFQ_MIN_TT. This happened to help reduce latency. +- */ +- sl = bfqd->bfq_slice_idle; +- /* +- * Unless the queue is being weight-raised or the scenario is +- * asymmetric, grant only minimum idle time if the queue either +- * has been seeky for long enough or has already proved to be +- * constantly seeky. +- */ +- if (bfq_sample_valid(bfqq->seek_samples) && +- ((BFQQ_SEEKY(bfqq) && bfqq->entity.service > +- bfq_max_budget(bfqq->bfqd) / 8) || +- bfq_bfqq_constantly_seeky(bfqq)) && bfqq->wr_coeff == 1 && +- bfq_symmetric_scenario(bfqd)) +- sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT)); +- else if (bfqq->wr_coeff > 1) +- sl = sl * 3; +- bfqd->last_idling_start = ktime_get(); +- mod_timer(&bfqd->idle_slice_timer, jiffies + sl); +-#ifdef CONFIG_BFQ_GROUP_IOSCHED +- bfqg_stats_set_start_idle_time(bfqq_group(bfqq)); +-#endif +- bfq_log(bfqd, "arm idle: %u/%u ms", +- jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle)); +-} +- +-/* +- * Set the maximum time for the in-service queue to consume its +- * budget. This prevents seeky processes from lowering the disk +- * throughput (always guaranteed with a time slice scheme as in CFQ). 
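/*
 * Sketch of bfq_set_budget_timeout() above: weight-raised queues,
 * except soft real-time ones (those whose wr_cur_max_time equals
 * bfq_wr_rt_max_time), get a slice stretched in proportion to their
 * raised weight before the budget timeout fires. Plain integers model
 * jiffies.
 */
#include <stdbool.h>

static unsigned long budget_deadline(unsigned long now,
				     unsigned long base_timeout,
				     unsigned int weight,
				     unsigned int orig_weight,
				     bool soft_rt)
{
	unsigned int coeff = soft_rt ? 1 : weight / orig_weight;

	return now + base_timeout * coeff;
}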
+- */ +-static void bfq_set_budget_timeout(struct bfq_data *bfqd) +-{ +- struct bfq_queue *bfqq = bfqd->in_service_queue; +- unsigned int timeout_coeff; +- if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) +- timeout_coeff = 1; +- else +- timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; +- +- bfqd->last_budget_start = ktime_get(); +- +- bfq_clear_bfqq_budget_new(bfqq); +- bfqq->budget_timeout = jiffies + +- bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff; ++ */ ++ sl = bfqd->bfq_slice_idle; ++ /* ++ * Unless the queue is being weight-raised or the scenario is ++ * asymmetric, grant only minimum idle time if the queue ++ * is seeky. A long idling is preserved for a weight-raised ++ * queue, or, more in general, in an asymemtric scenario, ++ * because a long idling is needed for guaranteeing to a queue ++ * its reserved share of the throughput (in particular, it is ++ * needed if the queue has a higher weight than some other ++ * queue). ++ */ ++ if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 && ++ bfq_symmetric_scenario(bfqd)) ++ sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT)); + +- bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", +- jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * +- timeout_coeff)); ++ bfqd->last_idling_start = ktime_get(); ++ mod_timer(&bfqd->idle_slice_timer, jiffies + sl); ++ bfqg_stats_set_start_idle_time(bfqq_group(bfqq)); ++ bfq_log(bfqd, "arm idle: %u/%u ms", ++ jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle)); + } + + /* +- * Move request from internal lists to the request queue dispatch list. ++ * Move request from internal lists to the dispatch list of the request queue + */ + static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) + { +- struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_queue *bfqq = RQ_BFQQ(rq); + + /* +@@ -1794,15 +2224,9 @@ static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) + * incrementing bfqq->dispatched. + */ + bfqq->dispatched++; ++ + bfq_remove_request(rq); + elv_dispatch_sort(q, rq); +- +- if (bfq_bfqq_sync(bfqq)) +- bfqd->sync_flight++; +-#ifdef CONFIG_BFQ_GROUP_IOSCHED +- bfqg_stats_update_dispatch(bfqq_group(bfqq), blk_rq_bytes(rq), +- rq->cmd_flags); +-#endif + } + + /* +@@ -1822,18 +2246,12 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq) + + rq = rq_entry_fifo(bfqq->fifo.next); + +- if (time_before(jiffies, rq->fifo_time)) ++ if (time_is_after_jiffies(rq->fifo_time)) + return NULL; + + return rq; + } + +-static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) +-{ +- struct bfq_entity *entity = &bfqq->entity; +- return entity->budget - entity->service; +-} +- + static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) + { + BUG_ON(bfqq != bfqd->in_service_queue); +@@ -1850,12 +2268,15 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) + bfq_mark_bfqq_split_coop(bfqq); + + if (RB_EMPTY_ROOT(&bfqq->sort_list)) { +- /* +- * Overloading budget_timeout field to store the time +- * at which the queue remains with no backlog; used by +- * the weight-raising mechanism. +- */ +- bfqq->budget_timeout = jiffies; ++ if (bfqq->dispatched == 0) ++ /* ++ * Overloading budget_timeout field to store ++ * the time at which the queue remains with no ++ * backlog and no outstanding request; used by ++ * the weight-raising mechanism. 
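/*
 * Sketch of the idling-duration choice above: a seeky, non-weight-
 * raised queue in a symmetric scenario gets only the minimum idle
 * time; any other queue keeps the full idle slice, which protects its
 * reserved share of the throughput. The millisecond values are
 * illustrative.
 */
#include <stdbool.h>

#define SLICE_IDLE_MS	8	/* illustrative bfq_slice_idle */
#define BFQ_MIN_TT_MS	2	/* illustrative BFQ_MIN_TT */

static unsigned int idle_duration_ms(bool seeky, unsigned int wr_coeff,
				     bool symmetric_scenario)
{
	if (seeky && wr_coeff == 1 && symmetric_scenario)
		return BFQ_MIN_TT_MS;
	return SLICE_IDLE_MS;
}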
++ */ ++ bfqq->budget_timeout = jiffies; ++ + bfq_del_bfqq_busy(bfqd, bfqq, 1); + } else { + bfq_activate_bfqq(bfqd, bfqq); +@@ -1882,10 +2303,19 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, + struct request *next_rq; + int budget, min_budget; + +- budget = bfqq->max_budget; ++ BUG_ON(bfqq != bfqd->in_service_queue); ++ + min_budget = bfq_min_budget(bfqd); + +- BUG_ON(bfqq != bfqd->in_service_queue); ++ if (bfqq->wr_coeff == 1) ++ budget = bfqq->max_budget; ++ else /* ++ * Use a constant, low budget for weight-raised queues, ++ * to help achieve a low latency. Keep it slightly higher ++ * than the minimum possible budget, to cause a little ++ * bit fewer expirations. ++ */ ++ budget = 2 * min_budget; + + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d", + bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); +@@ -1894,7 +2324,7 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", + bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); + +- if (bfq_bfqq_sync(bfqq)) { ++ if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) { + switch (reason) { + /* + * Caveat: in all the following cases we trade latency +@@ -1936,14 +2366,10 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, + break; + case BFQ_BFQQ_BUDGET_TIMEOUT: + /* +- * We double the budget here because: 1) it +- * gives the chance to boost the throughput if +- * this is not a seeky process (which may have +- * bumped into this timeout because of, e.g., +- * ZBR), 2) together with charge_full_budget +- * it helps give seeky processes higher +- * timestamps, and hence be served less +- * frequently. ++ * We double the budget here because it gives ++ * the chance to boost the throughput if this ++ * is not a seeky process (and has bumped into ++ * this timeout because of, e.g., ZBR). + */ + budget = min(budget * 2, bfqd->bfq_max_budget); + break; +@@ -1960,17 +2386,49 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, + budget = min(budget * 4, bfqd->bfq_max_budget); + break; + case BFQ_BFQQ_NO_MORE_REQUESTS: +- /* +- * Leave the budget unchanged. +- */ ++ /* ++ * For queues that expire for this reason, it ++ * is particularly important to keep the ++ * budget close to the actual service they ++ * need. Doing so reduces the timestamp ++ * misalignment problem described in the ++ * comments in the body of ++ * __bfq_activate_entity. In fact, suppose ++ * that a queue systematically expires for ++ * BFQ_BFQQ_NO_MORE_REQUESTS and presents a ++ * new request in time to enjoy timestamp ++ * back-shifting. The larger the budget of the ++ * queue is with respect to the service the ++ * queue actually requests in each service ++ * slot, the more times the queue can be ++ * reactivated with the same virtual finish ++ * time. It follows that, even if this finish ++ * time is pushed to the system virtual time ++ * to reduce the consequent timestamp ++ * misalignment, the queue unjustly enjoys for ++ * many re-activations a lower finish time ++ * than all newly activated queues. ++ * ++ * The service needed by bfqq is measured ++ * quite precisely by bfqq->entity.service. ++ * Since bfqq does not enjoy device idling, ++ * bfqq->entity.service is equal to the number ++ * of sectors that the process associated with ++ * bfqq requested to read/write before waiting ++ * for request completions, or blocking for ++ * other reasons. 
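/*
 * Condensed model of the budget update on expiration discussed above.
 * The reason names mirror the patch's enum and the arithmetic follows
 * the visible cases; reasons not shown around here (e.g. TOO_IDLE)
 * leave the budget unchanged in this sketch.
 */
#include <stdbool.h>

enum expiration { TOO_IDLE, BUDGET_TIMEOUT, BUDGET_EXHAUSTED,
		  NO_MORE_REQUESTS };

static int min_int(int a, int b) { return a < b ? a : b; }
static int max_int(int a, int b) { return a > b ? a : b; }

static int next_budget(enum expiration reason, int budget, int service,
		       int min_budget, int max_budget, bool sync,
		       unsigned int wr_coeff)
{
	if (wr_coeff > 1)	/* weight-raised: constant, low budget */
		return 2 * min_budget;
	if (!sync)		/* async: maximum possible budget */
		return max_budget;

	switch (reason) {
	case BUDGET_TIMEOUT:	/* maybe not seeky: give it a chance */
		return min_int(budget * 2, max_budget);
	case BUDGET_EXHAUSTED:	/* clearly greedy: grow faster */
		return min_int(budget * 4, max_budget);
	case NO_MORE_REQUESTS:	/* track the actual service closely */
		return max_int(service, min_budget);
	default:
		return budget;
	}
}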
++ */ ++ budget = max_t(int, bfqq->entity.service, min_budget); ++ break; + default: + return; + } +- } else ++ } else if (!bfq_bfqq_sync(bfqq)) + /* +- * Async queues get always the maximum possible budget +- * (their ability to dispatch is limited by +- * @bfqd->bfq_max_budget_async_rq). ++ * Async queues get always the maximum possible ++ * budget, as for them we do not care about latency ++ * (in addition, their ability to dispatch is limited ++ * by the charging factor). + */ + budget = bfqd->bfq_max_budget; + +@@ -1981,65 +2439,105 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, + bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget); + + /* +- * Make sure that we have enough budget for the next request. +- * Since the finish time of the bfqq must be kept in sync with +- * the budget, be sure to call __bfq_bfqq_expire() after the ++ * If there is still backlog, then assign a new budget, making ++ * sure that it is large enough for the next request. Since ++ * the finish time of bfqq must be kept in sync with the ++ * budget, be sure to call __bfq_bfqq_expire() *after* this + * update. ++ * ++ * If there is no backlog, then no need to update the budget; ++ * it will be updated on the arrival of a new request. + */ + next_rq = bfqq->next_rq; +- if (next_rq) ++ if (next_rq) { ++ BUG_ON(reason == BFQ_BFQQ_TOO_IDLE || ++ reason == BFQ_BFQQ_NO_MORE_REQUESTS); + bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, + bfq_serv_to_charge(next_rq, bfqq)); +- else +- bfqq->entity.budget = bfqq->max_budget; ++ BUG_ON(!bfq_bfqq_busy(bfqq)); ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); ++ } + + bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d", + next_rq ? blk_rq_sectors(next_rq) : 0, + bfqq->entity.budget); + } + +-static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout) ++static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd) + { +- unsigned long max_budget; +- + /* + * The max_budget calculated when autotuning is equal to the +- * amount of sectors transfered in timeout_sync at the ++ * amount of sectors transfered in timeout at the + * estimated peak rate. + */ +- max_budget = (unsigned long)(peak_rate * 1000 * +- timeout >> BFQ_RATE_SHIFT); +- +- return max_budget; ++ return bfqd->peak_rate * 1000 * jiffies_to_msecs(bfqd->bfq_timeout) >> ++ BFQ_RATE_SHIFT; + } + + /* +- * In addition to updating the peak rate, checks whether the process +- * is "slow", and returns 1 if so. This slow flag is used, in addition +- * to the budget timeout, to reduce the amount of service provided to +- * seeky processes, and hence reduce their chances to lower the +- * throughput. See the code for more details. ++ * Update the read peak rate (quantity used for auto-tuning) as a ++ * function of the rate at which bfqq has been served, and check ++ * whether the process associated with bfqq is "slow". Return true if ++ * the process is slow. The slow flag is used, in addition to the ++ * budget timeout, to reduce the amount of service provided to seeky ++ * processes, and hence reduce their chances to lower the ++ * throughput. More details in the body of the function. ++ * ++ * An important observation is in order: with devices with internal ++ * queues, it is hard if ever possible to know when and for how long ++ * an I/O request is processed by the device (apart from the trivial ++ * I/O pattern where a new request is dispatched only after the ++ * previous one has been completed). 
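/*
 * Sketch of bfq_calc_max_budget() above: the autotuned max budget is
 * the number of sectors transferred, at the estimated peak rate, in
 * one budget timeout. BFQ_RATE_SHIFT mirrors the fixed-point shift
 * used by the patch; peak_rate is assumed to be in
 * (sectors/usec) << BFQ_RATE_SHIFT, as in the bandwidth computation.
 */
#include <stdint.h>

#define BFQ_RATE_SHIFT	16

static unsigned long model_calc_max_budget(uint64_t peak_rate,
					   unsigned int timeout_ms)
{
	return (unsigned long)((peak_rate * 1000 * timeout_ms)
			       >> BFQ_RATE_SHIFT);
}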
This makes it hard to evaluate ++ * the real rate at which the I/O requests of each bfq_queue are ++ * served. In fact, for an I/O scheduler like BFQ, serving a ++ * bfq_queue means just dispatching its requests during its service ++ * slot, i.e., until the budget of the queue is exhausted, or the ++ * queue remains idle, or, finally, a timeout fires. But, during the ++ * service slot of a bfq_queue, the device may be still processing ++ * requests of bfq_queues served in previous service slots. On the ++ * opposite end, the requests of the in-service bfq_queue may be ++ * completed after the service slot of the queue finishes. Anyway, ++ * unless more sophisticated solutions are used (where possible), the ++ * sum of the sizes of the requests dispatched during the service slot ++ * of a bfq_queue is probably the only approximation available for ++ * the service received by the bfq_queue during its service slot. And, ++ * as written above, this sum is the quantity used in this function to ++ * evaluate the peak rate. + */ + static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, +- bool compensate, enum bfqq_expiration reason) ++ bool compensate, enum bfqq_expiration reason, ++ unsigned long *delta_ms) + { +- u64 bw, usecs, expected, timeout; +- ktime_t delta; ++ u64 bw, bwdiv10, delta_usecs, delta_ms_tmp; ++ ktime_t delta_ktime; + int update = 0; ++ bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */ + +- if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq)) ++ if (!bfq_bfqq_sync(bfqq)) + return false; + + if (compensate) +- delta = bfqd->last_idling_start; ++ delta_ktime = bfqd->last_idling_start; + else +- delta = ktime_get(); +- delta = ktime_sub(delta, bfqd->last_budget_start); +- usecs = ktime_to_us(delta); ++ delta_ktime = ktime_get(); ++ delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start); ++ delta_usecs = ktime_to_us(delta_ktime); + + /* Don't trust short/unrealistic values. */ +- if (usecs < 100 || usecs >= LONG_MAX) +- return false; ++ if (delta_usecs < 1000 || delta_usecs >= LONG_MAX) { ++ if (blk_queue_nonrot(bfqd->queue)) ++ *delta_ms = BFQ_MIN_TT; /* give same worst-case ++ guarantees as ++ idling for seeky ++ */ ++ else /* Charge at least one seek */ ++ *delta_ms = jiffies_to_msecs(bfq_slice_idle); ++ return slow; ++ } ++ ++ delta_ms_tmp = delta_usecs; ++ do_div(delta_ms_tmp, 1000); ++ *delta_ms = delta_ms_tmp; + + /* + * Calculate the bandwidth for the last slice. We use a 64 bit +@@ -2048,32 +2546,51 @@ static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, + * and to avoid overflows. + */ + bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT; +- do_div(bw, (unsigned long)usecs); +- +- timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); ++ do_div(bw, (unsigned long)delta_usecs); + ++ bfq_log(bfqd, "measured bw = %llu sects/sec", ++ (1000000*bw)>>BFQ_RATE_SHIFT); + /* + * Use only long (> 20ms) intervals to filter out spikes for + * the peak rate estimation. + */ +- if (usecs > 20000) { ++ if (delta_usecs > 20000) { ++ bool fully_sequential = bfqq->seek_history == 0; ++ /* ++ * Soft real-time queues are not good candidates for ++ * evaluating bw, as they are likely to be slow even ++ * if sequential. 
++ */ ++ bool non_soft_rt = bfqq->wr_coeff == 1 || ++ bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time; ++ bool consumed_large_budget = ++ reason == BFQ_BFQQ_BUDGET_EXHAUSTED && ++ bfqq->entity.budget >= bfqd->bfq_max_budget * 2 / 3; ++ bool served_for_long_time = ++ reason == BFQ_BFQQ_BUDGET_TIMEOUT || ++ consumed_large_budget; ++ ++ BUG_ON(bfqq->seek_history == 0 && ++ hweight32(bfqq->seek_history) != 0); ++ + if (bw > bfqd->peak_rate || +- (!BFQQ_SEEKY(bfqq) && +- reason == BFQ_BFQQ_BUDGET_TIMEOUT)) { +- bfq_log(bfqd, "measured bw =%llu", bw); ++ (bfq_bfqq_sync(bfqq) && fully_sequential && non_soft_rt && ++ served_for_long_time)) { + /* + * To smooth oscillations use a low-pass filter with +- * alpha=7/8, i.e., +- * new_rate = (7/8) * old_rate + (1/8) * bw ++ * alpha=9/10, i.e., ++ * new_rate = (9/10) * old_rate + (1/10) * bw + */ +- do_div(bw, 8); +- if (bw == 0) +- return 0; +- bfqd->peak_rate *= 7; +- do_div(bfqd->peak_rate, 8); +- bfqd->peak_rate += bw; ++ bwdiv10 = bw; ++ do_div(bwdiv10, 10); ++ if (bwdiv10 == 0) ++ return false; /* bw too low to be used */ ++ bfqd->peak_rate *= 9; ++ do_div(bfqd->peak_rate, 10); ++ bfqd->peak_rate += bwdiv10; + update = 1; +- bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate); ++ bfq_log(bfqd, "new peak_rate = %llu sects/sec", ++ (1000000*bfqd->peak_rate)>>BFQ_RATE_SHIFT); + } + + update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1; +@@ -2086,9 +2603,8 @@ static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, + int dev_type = blk_queue_nonrot(bfqd->queue); + if (bfqd->bfq_user_max_budget == 0) { + bfqd->bfq_max_budget = +- bfq_calc_max_budget(bfqd->peak_rate, +- timeout); +- bfq_log(bfqd, "new max_budget=%d", ++ bfq_calc_max_budget(bfqd); ++ bfq_log(bfqd, "new max_budget = %d", + bfqd->bfq_max_budget); + } + if (bfqd->device_speed == BFQ_BFQD_FAST && +@@ -2102,38 +2618,35 @@ static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, + bfqd->RT_prod = R_fast[dev_type] * + T_fast[dev_type]; + } ++ bfq_log(bfqd, "dev_speed_class = %d (%d sects/sec), " ++ "thresh %d setcs/sec", ++ bfqd->device_speed, ++ bfqd->device_speed == BFQ_BFQD_FAST ? ++ (1000000*R_fast[dev_type])>>BFQ_RATE_SHIFT : ++ (1000000*R_slow[dev_type])>>BFQ_RATE_SHIFT, ++ (1000000*device_speed_thresh[dev_type])>> ++ BFQ_RATE_SHIFT); + } ++ /* ++ * Caveat: processes doing IO in the slower disk zones ++ * tend to be slow(er) even if not seeky. In this ++ * respect, the estimated peak rate is likely to be an ++ * average over the disk surface. Accordingly, to not ++ * be too harsh with unlucky processes, a process is ++ * deemed slow only if its bw has been lower than half ++ * of the estimated peak rate. ++ */ ++ slow = bw < bfqd->peak_rate / 2; + } + +- /* +- * If the process has been served for a too short time +- * interval to let its possible sequential accesses prevail on +- * the initial seek time needed to move the disk head on the +- * first sector it requested, then give the process a chance +- * and for the moment return false. +- */ +- if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8) +- return false; +- +- /* +- * A process is considered ``slow'' (i.e., seeky, so that we +- * cannot treat it fairly in the service domain, as it would +- * slow down too much the other processes) if, when a slice +- * ends for whatever reason, it has received service at a +- * rate that would not be high enough to complete the budget +- * before the budget timeout expiration. 
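/*
 * Sketch of the peak-rate low-pass filter above: each accepted sample
 * moves the estimate one tenth of the way toward the sample
 * (alpha = 9/10). Integer division mirrors the do_div() calls.
 */
#include <stdint.h>

static uint64_t filter_peak_rate(uint64_t peak_rate, uint64_t bw)
{
	uint64_t bwdiv10 = bw / 10;

	if (bwdiv10 == 0)
		return peak_rate;	/* sample too low to be used */
	return (peak_rate * 9) / 10 + bwdiv10;
}

/*
 * Example: starting from 0, repeated samples of 1000 converge toward
 * the sample value: 0 -> 100 -> 190 -> 271 -> 343 -> ...
 */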
+- */
+-	expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
++	bfq_log_bfqq(bfqd, bfqq,
++		"update_peak_rate: bw %llu sect/s, peak rate %llu, "
++		"slow %d",
++		(1000000*bw)>>BFQ_RATE_SHIFT,
++		(1000000*bfqd->peak_rate)>>BFQ_RATE_SHIFT,
++		bw < bfqd->peak_rate / 2);
+
+-	/*
+-	 * Caveat: processes doing IO in the slower disk zones will
+-	 * tend to be slow(er) even if not seeky. And the estimated
+-	 * peak rate will actually be an average over the disk
+-	 * surface. Hence, to not be too harsh with unlucky processes,
+-	 * we keep a budget/3 margin of safety before declaring a
+-	 * process slow.
+-	 */
+-	return expected > (4 * bfqq->entity.budget) / 3;
++	return slow;
+ }
+
+ /*
+@@ -2191,6 +2704,15 @@ static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
+ 						struct bfq_queue *bfqq)
+ {
++	bfq_log_bfqq(bfqd, bfqq,
++		"softrt_next_start: service_blkg %lu "
++		"soft_rate %u sects/sec "
++		"interval %u",
++		bfqq->service_from_backlogged,
++		bfqd->bfq_wr_max_softrt_rate,
++		jiffies_to_msecs(HZ * bfqq->service_from_backlogged /
++				 bfqd->bfq_wr_max_softrt_rate));
++
+ 	return max(bfqq->last_idle_bklogged +
+ 		   HZ * bfqq->service_from_backlogged /
+ 		   bfqd->bfq_wr_max_softrt_rate,
+@@ -2198,13 +2720,21 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
+ }
+
+ /*
+- * Return the largest-possible time instant such that, for as long as possible,
+- * the current time will be lower than this time instant according to the macro
+- * time_is_before_jiffies().
++ * Return the farthest future time instant according to jiffies
++ * macros.
++ */
++static unsigned long bfq_greatest_from_now(void)
++{
++	return jiffies + MAX_JIFFY_OFFSET;
++}
++
++/*
++ * Return the farthest past time instant according to jiffies
++ * macros.
+  */
+-static unsigned long bfq_infinity_from_now(unsigned long now)
++static unsigned long bfq_smallest_from_now(void)
+ {
+-	return now + ULONG_MAX / 2;
++	return jiffies - MAX_JIFFY_OFFSET;
+ }
+
+ /**
+@@ -2214,28 +2744,24 @@ static unsigned long bfq_infinity_from_now(unsigned long now)
+  * @compensate: if true, compensate for the time spent idling.
+  * @reason: the reason causing the expiration.
+  *
++ * If the process associated with bfqq does slow I/O (e.g., because it
++ * issues random requests), we charge bfqq with the time it has been
++ * in service instead of the service it has received (see
++ * bfq_bfqq_charge_time for details on how this goal is achieved). As
++ * a consequence, bfqq will typically get higher timestamps upon
++ * reactivation, and hence it will be rescheduled as if it had
++ * received more service than what it has actually received. In the
++ * end, bfqq receives less service in proportion to how slowly its
++ * associated process consumes its budgets (and hence how seriously it
++ * tends to lower the throughput). In addition, this time-charging
++ * strategy guarantees time fairness among slow processes. In
++ * contrast, if the process associated with bfqq is not slow, we
++ * charge bfqq exactly with the service it has received.
+  *
+- * If the process associated to the queue is slow (i.e., seeky), or in
+- * case of budget timeout, or, finally, if it is async, we
+- * artificially charge it an entire budget (independently of the
+- * actual service it received).
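bfq_greatest_from_now() and bfq_smallest_from_now() lean on the kernel's wrap-safe jiffies comparisons, which reduce to a single signed subtraction; MAX_JIFFY_OFFSET is the largest offset such a comparison can still represent. A self-contained sketch, modeled on include/linux/jiffies.h (the static jiffies variable is only a stand-in for the kernel counter):

#include <limits.h>
#include <stdio.h>

/* Modeled on include/linux/jiffies.h: wrap-safe comparison via
 * signed subtraction; offsets must stay below ~LONG_MAX/2. */
#define MAX_JIFFY_OFFSET ((LONG_MAX >> 1) - 1)
#define time_after(a, b) ((long)((b) - (a)) < 0)
#define time_is_before_jiffies(a) time_after(jiffies, a)

static unsigned long jiffies = ULONG_MAX - 10; /* about to wrap */

static unsigned long greatest_from_now(void)
{
	return jiffies + MAX_JIFFY_OFFSET; /* farthest representable future */
}

static unsigned long smallest_from_now(void)
{
	return jiffies - MAX_JIFFY_OFFSET; /* farthest representable past */
}

int main(void)
{
	unsigned long future = greatest_from_now();
	unsigned long past = smallest_from_now();

	/* Both results hold even though the raw values wrapped past 0. */
	printf("future already past? %d\n", time_is_before_jiffies(future)); /* 0 */
	printf("past   already past? %d\n", time_is_before_jiffies(past));   /* 1 */
	return 0;
}

This is why the old "now + ULONG_MAX / 2" helper could be dropped: jiffies plus or minus MAX_JIFFY_OFFSET already yields the extreme instants that the time_is_*() macros can order correctly.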
As a consequence, the queue will get +- * higher timestamps than the correct ones upon reactivation, and +- * hence it will be rescheduled as if it had received more service +- * than what it actually received. In the end, this class of processes +- * will receive less service in proportion to how slowly they consume +- * their budgets (and hence how seriously they tend to lower the +- * throughput). +- * +- * In contrast, when a queue expires because it has been idling for +- * too much or because it exhausted its budget, we do not touch the +- * amount of service it has received. Hence when the queue will be +- * reactivated and its timestamps updated, the latter will be in sync +- * with the actual service received by the queue until expiration. +- * +- * Charging a full budget to the first type of queues and the exact +- * service to the others has the effect of using the WF2Q+ policy to +- * schedule the former on a timeslice basis, without violating the +- * service domain guarantees of the latter. ++ * Charging time to the first type of queues and the exact service to ++ * the other has the effect of using the WF2Q+ policy to schedule the ++ * former on a timeslice basis, without violating service domain ++ * guarantees among the latter. + */ + static void bfq_bfqq_expire(struct bfq_data *bfqd, + struct bfq_queue *bfqq, +@@ -2243,40 +2769,53 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, + enum bfqq_expiration reason) + { + bool slow; ++ unsigned long delta = 0; ++ struct bfq_entity *entity = &bfqq->entity; ++ + BUG_ON(bfqq != bfqd->in_service_queue); + + /* +- * Update disk peak rate for autotuning and check whether the ++ * Update device peak rate for autotuning and check whether the + * process is slow (see bfq_update_peak_rate). + */ +- slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason); ++ slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason, &delta); + + /* +- * As above explained, 'punish' slow (i.e., seeky), timed-out +- * and async queues, to favor sequential sync workloads. +- * +- * Processes doing I/O in the slower disk zones will tend to be +- * slow(er) even if not seeky. Hence, since the estimated peak +- * rate is actually an average over the disk surface, these +- * processes may timeout just for bad luck. To avoid punishing +- * them we do not charge a full budget to a process that +- * succeeded in consuming at least 2/3 of its budget. ++ * Increase service_from_backlogged before next statement, ++ * because the possible next invocation of ++ * bfq_bfqq_charge_time would likely inflate ++ * entity->service. In contrast, service_from_backlogged must ++ * contain real service, to enable the soft real-time ++ * heuristic to correctly compute the bandwidth consumed by ++ * bfqq. + */ +- if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT && +- bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)) +- bfq_bfqq_charge_full_budget(bfqq); ++ bfqq->service_from_backlogged += entity->service; + +- bfqq->service_from_backlogged += bfqq->entity.service; ++ /* ++ * As above explained, charge slow (typically seeky) and ++ * timed-out queues with the time and not the service ++ * received, to favor sequential workloads. ++ * ++ * Processes doing I/O in the slower disk zones will tend to ++ * be slow(er) even if not seeky. Therefore, since the ++ * estimated peak rate is actually an average over the disk ++ * surface, these processes may timeout just for bad luck. 
To ++ * avoid punishing them, do not charge time to processes that ++ * succeeded in consuming at least 2/3 of their budget. This ++ * allows BFQ to preserve enough elasticity to still perform ++ * bandwidth, and not time, distribution with little unlucky ++ * or quasi-sequential processes. ++ */ ++ if (bfqq->wr_coeff == 1 && ++ (slow || ++ (reason == BFQ_BFQQ_BUDGET_TIMEOUT && ++ bfq_bfqq_budget_left(bfqq) >= entity->budget / 3))) ++ bfq_bfqq_charge_time(bfqd, bfqq, delta); + +- if (BFQQ_SEEKY(bfqq) && reason == BFQ_BFQQ_BUDGET_TIMEOUT && +- !bfq_bfqq_constantly_seeky(bfqq)) { +- bfq_mark_bfqq_constantly_seeky(bfqq); +- if (!blk_queue_nonrot(bfqd->queue)) +- bfqd->const_seeky_busy_in_flight_queues++; +- } ++ BUG_ON(bfqq->entity.budget < bfqq->entity.service); + + if (reason == BFQ_BFQQ_TOO_IDLE && +- bfqq->entity.service <= 2 * bfqq->entity.budget / 10 ) ++ entity->service <= 2 * entity->budget / 10 ) + bfq_clear_bfqq_IO_bound(bfqq); + + if (bfqd->low_latency && bfqq->wr_coeff == 1) +@@ -2285,19 +2824,23 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, + if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 && + RB_EMPTY_ROOT(&bfqq->sort_list)) { + /* +- * If we get here, and there are no outstanding requests, +- * then the request pattern is isochronous (see the comments +- * to the function bfq_bfqq_softrt_next_start()). Hence we +- * can compute soft_rt_next_start. If, instead, the queue +- * still has outstanding requests, then we have to wait +- * for the completion of all the outstanding requests to ++ * If we get here, and there are no outstanding ++ * requests, then the request pattern is isochronous ++ * (see the comments on the function ++ * bfq_bfqq_softrt_next_start()). Thus we can compute ++ * soft_rt_next_start. If, instead, the queue still ++ * has outstanding requests, then we have to wait for ++ * the completion of all the outstanding requests to + * discover whether the request pattern is actually + * isochronous. + */ +- if (bfqq->dispatched == 0) ++ BUG_ON(bfqd->busy_queues < 1); ++ if (bfqq->dispatched == 0) { + bfqq->soft_rt_next_start = + bfq_bfqq_softrt_next_start(bfqd, bfqq); +- else { ++ bfq_log_bfqq(bfqd, bfqq, "new soft_rt_next %lu", ++ bfqq->soft_rt_next_start); ++ } else { + /* + * The application is still waiting for the + * completion of one or more requests: +@@ -2314,7 +2857,7 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, + * happened to be in the past. + */ + bfqq->soft_rt_next_start = +- bfq_infinity_from_now(jiffies); ++ bfq_greatest_from_now(); + /* + * Schedule an update of soft_rt_next_start to when + * the task may be discovered to be isochronous. +@@ -2324,15 +2867,27 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, + } + + bfq_log_bfqq(bfqd, bfqq, +- "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, +- slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq)); ++ "expire (%d, slow %d, num_disp %d, idle_win %d, weight %d)", ++ reason, slow, bfqq->dispatched, ++ bfq_bfqq_idle_window(bfqq), entity->weight); + + /* + * Increase, decrease or leave budget unchanged according to + * reason. 
+ */ ++ BUG_ON(bfqq->entity.budget < bfqq->entity.service); + __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); ++ BUG_ON(bfqq->next_rq == NULL && ++ bfqq->entity.budget < bfqq->entity.service); + __bfq_bfqq_expire(bfqd, bfqq); ++ ++ BUG_ON(!bfq_bfqq_busy(bfqq) && reason == BFQ_BFQQ_BUDGET_EXHAUSTED && ++ !bfq_class_idle(bfqq)); ++ ++ if (!bfq_bfqq_busy(bfqq) && ++ reason != BFQ_BFQQ_BUDGET_TIMEOUT && ++ reason != BFQ_BFQQ_BUDGET_EXHAUSTED) ++ bfq_mark_bfqq_non_blocking_wait_rq(bfqq); + } + + /* +@@ -2342,20 +2897,17 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, + */ + static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) + { +- if (bfq_bfqq_budget_new(bfqq) || +- time_before(jiffies, bfqq->budget_timeout)) +- return false; +- return true; ++ return time_is_before_eq_jiffies(bfqq->budget_timeout); + } + + /* +- * If we expire a queue that is waiting for the arrival of a new +- * request, we may prevent the fictitious timestamp back-shifting that +- * allows the guarantees of the queue to be preserved (see [1] for +- * this tricky aspect). Hence we return true only if this condition +- * does not hold, or if the queue is slow enough to deserve only to be +- * kicked off for preserving a high throughput. +-*/ ++ * If we expire a queue that is actively waiting (i.e., with the ++ * device idled) for the arrival of a new request, then we may incur ++ * the timestamp misalignment problem described in the body of the ++ * function __bfq_activate_entity. Hence we return true only if this ++ * condition does not hold, or if the queue is slow enough to deserve ++ * only to be kicked off for preserving a high throughput. ++ */ + static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) + { + bfq_log_bfqq(bfqq->bfqd, bfqq, +@@ -2397,10 +2949,12 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) + { + struct bfq_data *bfqd = bfqq->bfqd; + bool idling_boosts_thr, idling_boosts_thr_without_issues, +- all_queues_seeky, on_hdd_and_not_all_queues_seeky, + idling_needed_for_service_guarantees, + asymmetric_scenario; + ++ if (bfqd->strict_guarantees) ++ return true; ++ + /* + * The next variable takes into account the cases where idling + * boosts the throughput. +@@ -2422,7 +2976,7 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) + */ + idling_boosts_thr = !bfqd->hw_tag || + (!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq) && +- bfq_bfqq_idle_window(bfqq)) ; ++ bfq_bfqq_idle_window(bfqq)); + + /* + * The value of the next variable, +@@ -2463,74 +3017,27 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) + bfqd->wr_busy_queues == 0; + + /* +- * There are then two cases where idling must be performed not ++ * There is then a case where idling must be performed not + * for throughput concerns, but to preserve service +- * guarantees. In the description of these cases, we say, for +- * short, that a queue is sequential/random if the process +- * associated to the queue issues sequential/random requests +- * (in the second case the queue may be tagged as seeky or +- * even constantly_seeky). +- * +- * To introduce the first case, we note that, since +- * bfq_bfqq_idle_window(bfqq) is false if the device is +- * NCQ-capable and bfqq is random (see +- * bfq_update_idle_window()), then, from the above two +- * assignments it follows that +- * idling_boosts_thr_without_issues is false if the device is +- * NCQ-capable and bfqq is random. 
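The expiration policy above boils down to a small predicate: only non-weight-raised queues (wr_coeff == 1) are ever charged time instead of service, and only when they were slow or timed out with more than a third of their budget unused. A sketch of that decision; the enum and parameter names are stand-ins for the bfq types:

#include <stdbool.h>

enum expiration_reason {
	BUDGET_TIMEOUT,   /* stands in for BFQ_BFQQ_BUDGET_TIMEOUT */
	BUDGET_EXHAUSTED, /* stands in for BFQ_BFQQ_BUDGET_EXHAUSTED */
	TOO_IDLE,         /* stands in for BFQ_BFQQ_TOO_IDLE */
};

/*
 * Mirror of the condition guarding bfq_bfqq_charge_time() above:
 * charge wall-clock time instead of received service when the
 * process is slow, or when it timed out while leaving more than
 * 1/3 of its budget unused.
 */
static bool charge_time_not_service(int wr_coeff, bool slow,
				    enum expiration_reason reason,
				    int budget, int budget_left)
{
	return wr_coeff == 1 &&
	       (slow ||
		(reason == BUDGET_TIMEOUT && budget_left >= budget / 3));
}

The 1/3 margin is what lets quasi-sequential but unlucky processes keep bandwidth-domain, rather than time-domain, service.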
Therefore, for this case, +- * device idling would never be allowed if we used just +- * idling_boosts_thr_without_issues to decide whether to allow +- * it. And, beneficially, this would imply that throughput +- * would always be boosted also with random I/O on NCQ-capable +- * HDDs. ++ * guarantees. + * +- * But we must be careful on this point, to avoid an unfair +- * treatment for bfqq. In fact, because of the same above +- * assignments, idling_boosts_thr_without_issues is, on the +- * other hand, true if 1) the device is an HDD and bfqq is +- * sequential, and 2) there are no busy weight-raised +- * queues. As a consequence, if we used just +- * idling_boosts_thr_without_issues to decide whether to idle +- * the device, then with an HDD we might easily bump into a +- * scenario where queues that are sequential and I/O-bound +- * would enjoy idling, whereas random queues would not. The +- * latter might then get a low share of the device throughput, +- * simply because the former would get many requests served +- * after being set as in service, while the latter would not. +- * +- * To address this issue, we start by setting to true a +- * sentinel variable, on_hdd_and_not_all_queues_seeky, if the +- * device is rotational and not all queues with pending or +- * in-flight requests are constantly seeky (i.e., there are +- * active sequential queues, and bfqq might then be mistreated +- * if it does not enjoy idling because it is random). +- */ +- all_queues_seeky = bfq_bfqq_constantly_seeky(bfqq) && +- bfqd->busy_in_flight_queues == +- bfqd->const_seeky_busy_in_flight_queues; +- +- on_hdd_and_not_all_queues_seeky = +- !blk_queue_nonrot(bfqd->queue) && !all_queues_seeky; +- +- /* +- * To introduce the second case where idling needs to be +- * performed to preserve service guarantees, we can note that +- * allowing the drive to enqueue more than one request at a +- * time, and hence delegating de facto final scheduling +- * decisions to the drive's internal scheduler, causes loss of +- * control on the actual request service order. In particular, +- * the critical situation is when requests from different +- * processes happens to be present, at the same time, in the +- * internal queue(s) of the drive. In such a situation, the +- * drive, by deciding the service order of the +- * internally-queued requests, does determine also the actual +- * throughput distribution among these processes. But the +- * drive typically has no notion or concern about per-process +- * throughput distribution, and makes its decisions only on a +- * per-request basis. Therefore, the service distribution +- * enforced by the drive's internal scheduler is likely to +- * coincide with the desired device-throughput distribution +- * only in a completely symmetric scenario where: ++ * To introduce this case, we can note that allowing the drive ++ * to enqueue more than one request at a time, and hence ++ * delegating de facto final scheduling decisions to the ++ * drive's internal scheduler, entails loss of control on the ++ * actual request service order. In particular, the critical ++ * situation is when requests from different processes happen ++ * to be present, at the same time, in the internal queue(s) ++ * of the drive. In such a situation, the drive, by deciding ++ * the service order of the internally-queued requests, does ++ * determine also the actual throughput distribution among ++ * these processes. 
But the drive typically has no notion or ++ * concern about per-process throughput distribution, and ++ * makes its decisions only on a per-request basis. Therefore, ++ * the service distribution enforced by the drive's internal ++ * scheduler is likely to coincide with the desired ++ * device-throughput distribution only in a completely ++ * symmetric scenario where: + * (i) each of these processes must get the same throughput as + * the others; + * (ii) all these processes have the same I/O pattern +@@ -2552,26 +3059,53 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) + * words, only if sub-condition (i) holds, then idling is + * allowed, and the device tends to be prevented from queueing + * many requests, possibly of several processes. The reason +- * for not controlling also sub-condition (ii) is that, first, +- * in the case of an HDD, the asymmetry in terms of types of +- * I/O patterns is already taken in to account in the above +- * sentinel variable +- * on_hdd_and_not_all_queues_seeky. Secondly, in the case of a +- * flash-based device, we prefer however to privilege +- * throughput (and idling lowers throughput for this type of +- * devices), for the following reasons: +- * 1) differently from HDDs, the service time of random +- * requests is not orders of magnitudes lower than the service +- * time of sequential requests; thus, even if processes doing +- * sequential I/O get a preferential treatment with respect to +- * others doing random I/O, the consequences are not as +- * dramatic as with HDDs; +- * 2) if a process doing random I/O does need strong +- * throughput guarantees, it is hopefully already being +- * weight-raised, or the user is likely to have assigned it a +- * higher weight than the other processes (and thus +- * sub-condition (i) is likely to be false, which triggers +- * idling). ++ * for not controlling also sub-condition (ii) is that we ++ * exploit preemption to preserve guarantees in case of ++ * symmetric scenarios, even if (ii) does not hold, as ++ * explained in the next two paragraphs. ++ * ++ * Even if a queue, say Q, is expired when it remains idle, Q ++ * can still preempt the new in-service queue if the next ++ * request of Q arrives soon (see the comments on ++ * bfq_bfqq_update_budg_for_activation). If all queues and ++ * groups have the same weight, this form of preemption, ++ * combined with the hole-recovery heuristic described in the ++ * comments on function bfq_bfqq_update_budg_for_activation, ++ * are enough to preserve a correct bandwidth distribution in ++ * the mid term, even without idling. In fact, even if not ++ * idling allows the internal queues of the device to contain ++ * many requests, and thus to reorder requests, we can rather ++ * safely assume that the internal scheduler still preserves a ++ * minimum of mid-term fairness. The motivation for using ++ * preemption instead of idling is that, by not idling, ++ * service guarantees are preserved without minimally ++ * sacrificing throughput. In other words, both a high ++ * throughput and its desired distribution are obtained. ++ * ++ * More precisely, this preemption-based, idleless approach ++ * provides fairness in terms of IOPS, and not sectors per ++ * second. This can be seen with a simple example. Suppose ++ * that there are two queues with the same weight, but that ++ * the first queue receives requests of 8 sectors, while the ++ * second queue receives requests of 1024 sectors. 
In
++	 * addition, suppose that each of the two queues contains at
++	 * most one request at a time, which implies that each queue
++	 * always remains idle after it is served. Finally, after
++	 * remaining idle, each queue receives very quickly a new
++	 * request. It follows that the two queues are served
++	 * alternately, preempting each other if needed. This
++	 * implies that, although both queues have the same weight,
++	 * the queue with large requests receives a service that is
++	 * 1024/8 times as high as the service received by the other
++	 * queue.
++	 *
++	 * On the other hand, device idling is performed, and thus
++	 * pure sector-domain guarantees are provided, for the
++	 * following queues, which are likely to need stronger
++	 * throughput guarantees: weight-raised queues, and queues
++	 * with a higher weight than other queues. When such queues
++	 * are active, sub-condition (i) is false, which triggers
++	 * device idling.
+ 	 *
+ 	 * According to the above considerations, the next variable is
+ 	 * true (only) if sub-condition (i) holds. To compute the
+@@ -2579,7 +3113,7 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
+ 	 * the function bfq_symmetric_scenario(), but also check
+ 	 * whether bfqq is being weight-raised, because
+ 	 * bfq_symmetric_scenario() does not take into account also
+-	 * weight-raised queues (see comments to
++	 * weight-raised queues (see comments on
+ 	 * bfq_weights_tree_add()).
+ 	 *
+ 	 * As a side note, it is worth considering that the above
+@@ -2601,17 +3135,16 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
+ 	 * bfqq. Such a case is when bfqq became active in a burst of
+ 	 * queue activations. Queues that became active during a large
+ 	 * burst benefit only from throughput, as discussed in the
+-	 * comments to bfq_handle_burst. Thus, if bfqq became active
++	 * comments on bfq_handle_burst. Thus, if bfqq became active
+ 	 * in a burst and not idling the device maximizes throughput,
+ 	 * then the device must not be idled, because not idling the
+ 	 * device provides bfqq and all other queues in the burst with
+-	 * maximum benefit. Combining this and the two cases above, we
+-	 * can now establish when idling is actually needed to
+-	 * preserve service guarantees.
++	 * maximum benefit. Combining this and the above case, we can
++	 * now establish when idling is actually needed to preserve
++	 * service guarantees.
+ 	 */
+ 	idling_needed_for_service_guarantees =
+-		(on_hdd_and_not_all_queues_seeky || asymmetric_scenario) &&
+-		!bfq_bfqq_in_large_burst(bfqq);
++		asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq);
+
+ 	/*
+ 	 * We have now all the components we need to compute the return
+@@ -2621,6 +3154,14 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
+ 	 * 2) idling either boosts the throughput (without issues), or
+ 	 * is necessary to preserve service guarantees.
+ 	 */
++	bfq_log_bfqq(bfqd, bfqq, "may_idle: sync %d idling_boosts_thr %d "
++		     "wr_busy %d boosts %d IO-bound %d guar %d",
++		     bfq_bfqq_sync(bfqq), idling_boosts_thr,
++		     bfqd->wr_busy_queues,
++		     idling_boosts_thr_without_issues,
++		     bfq_bfqq_IO_bound(bfqq),
++		     idling_needed_for_service_guarantees);
++
+ 	return bfq_bfqq_sync(bfqq) &&
+ 		(idling_boosts_thr_without_issues ||
+ 		 idling_needed_for_service_guarantees);
+@@ -2632,7 +3173,7 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
+  * 1) the queue must remain in service and cannot be expired, and
+  * 2) the device must be idled to wait for the possible arrival of a new
+  * request for the queue.
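The 8-sector/1024-sector example is easy to check numerically: with one request in flight per queue and strictly alternating service, both queues complete the same number of requests per second, so their sector throughput differs exactly by the ratio of the request sizes. A trivial check:

#include <stdio.h>

int main(void)
{
	/* Two equal-weight queues, served strictly alternately, one
	 * request in flight each, as in the example above. */
	unsigned small_rq = 8, large_rq = 1024; /* sectors per request */

	/* Equal IOPS => sector throughput scales with request size. */
	printf("throughput ratio = %u\n", large_rq / small_rq); /* 128 */
	return 0;
}

So the idleless, preemption-based scheme is IOPS-fair, and sector-domain fairness is reserved (via idling) for the weight-raised and higher-weight queues named above.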
+- * See the comments to the function bfq_bfqq_may_idle for the reasons
++ * See the comments on the function bfq_bfqq_may_idle for the reasons
+  * why performing device idling is the best choice to boost the throughput
+  * and preserve service guarantees when bfq_bfqq_may_idle itself
+  * returns true.
+ */
+@@ -2698,9 +3239,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
+ 			 */
+ 			bfq_clear_bfqq_wait_request(bfqq);
+ 			del_timer(&bfqd->idle_slice_timer);
+-#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ 			bfqg_stats_update_idle_time(bfqq_group(bfqq));
+-#endif
+ 		}
+ 		goto keep_queue;
+ 	}
+@@ -2745,14 +3284,11 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+ 			bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change");
+
+ 		/*
+-		 * If the queue was activated in a burst, or
+-		 * too much time has elapsed from the beginning
+-		 * of this weight-raising period, or the queue has
+-		 * exceeded the acceptable number of cooperations,
+-		 * then end weight raising.
++		 * If the queue was activated in a burst, or too much
++		 * time has elapsed from the beginning of this
++		 * weight-raising period, then end weight raising.
+ 		 */
+ 		if (bfq_bfqq_in_large_burst(bfqq) ||
+-		    bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh ||
+ 		    time_is_before_jiffies(bfqq->last_wr_start_finish +
+ 					   bfqq->wr_cur_max_time)) {
+ 			bfqq->last_wr_start_finish = jiffies;
+@@ -2811,13 +3347,29 @@ static int bfq_dispatch_request(struct bfq_data *bfqd,
+ 		 */
+ 		if (!bfqd->rq_in_driver)
+ 			bfq_schedule_dispatch(bfqd);
++		BUG_ON(bfqq->entity.budget < bfqq->entity.service);
+ 		goto expire;
+ 	}
+
++	BUG_ON(bfqq->entity.budget < bfqq->entity.service);
+ 	/* Finally, insert request into driver dispatch list. */
+ 	bfq_bfqq_served(bfqq, service_to_charge);
++
++	BUG_ON(bfqq->entity.budget < bfqq->entity.service);
++
+ 	bfq_dispatch_insert(bfqd->queue, rq);
++
++	/*
++	 * If weight raising has to terminate for bfqq, then the next
++	 * function causes an immediate update of bfqq's weight,
++	 * without waiting for next activation. As a consequence, on
++	 * expiration, bfqq will be timestamped as if it had never been
++	 * weight-raised during this service slot, even if it has
++	 * received part or even most of the service as a
++	 * weight-raised queue. This inflates bfqq's timestamps, which
++	 * is beneficial, as bfqq is then more willing to leave the
++	 * device immediately to possible other weight-raised queues.
++ */ + bfq_update_wr_data(bfqd, bfqq); + + bfq_log_bfqq(bfqd, bfqq, +@@ -2833,9 +3385,7 @@ static int bfq_dispatch_request(struct bfq_data *bfqd, + bfqd->in_service_bic = RQ_BIC(rq); + } + +- if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) && +- dispatched >= bfqd->bfq_max_budget_async_rq) || +- bfq_class_idle(bfqq))) ++ if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq)) + goto expire; + + return dispatched; +@@ -2881,8 +3431,8 @@ static int bfq_forced_dispatch(struct bfq_data *bfqd) + st = bfq_entity_service_tree(&bfqq->entity); + + dispatched += __bfq_forced_dispatch_bfqq(bfqq); +- bfqq->max_budget = bfq_max_budget(bfqd); + ++ bfqq->max_budget = bfq_max_budget(bfqd); + bfq_forget_idle(st); + } + +@@ -2895,9 +3445,9 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) + { + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_queue *bfqq; +- int max_dispatch; + + bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); ++ + if (bfqd->busy_queues == 0) + return 0; + +@@ -2908,21 +3458,7 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) + if (!bfqq) + return 0; + +- if (bfq_class_idle(bfqq)) +- max_dispatch = 1; +- +- if (!bfq_bfqq_sync(bfqq)) +- max_dispatch = bfqd->bfq_max_budget_async_rq; +- +- if (!bfq_bfqq_sync(bfqq) && bfqq->dispatched >= max_dispatch) { +- if (bfqd->busy_queues > 1) +- return 0; +- if (bfqq->dispatched >= 4 * max_dispatch) +- return 0; +- } +- +- if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq)) +- return 0; ++ BUG_ON(bfqq->entity.budget < bfqq->entity.service); + + bfq_clear_bfqq_wait_request(bfqq); + BUG_ON(timer_pending(&bfqd->idle_slice_timer)); +@@ -2933,6 +3469,8 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) + bfq_log_bfqq(bfqd, bfqq, "dispatched %s request", + bfq_bfqq_sync(bfqq) ? 
"sync" : "async"); + ++ BUG_ON(bfqq->next_rq == NULL && ++ bfqq->entity.budget < bfqq->entity.service); + return 1; + } + +@@ -2944,23 +3482,22 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) + */ + static void bfq_put_queue(struct bfq_queue *bfqq) + { +- struct bfq_data *bfqd = bfqq->bfqd; + #ifdef CONFIG_BFQ_GROUP_IOSCHED + struct bfq_group *bfqg = bfqq_group(bfqq); + #endif + +- BUG_ON(atomic_read(&bfqq->ref) <= 0); ++ BUG_ON(bfqq->ref <= 0); + +- bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq, +- atomic_read(&bfqq->ref)); +- if (!atomic_dec_and_test(&bfqq->ref)) ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref); ++ bfqq->ref--; ++ if (bfqq->ref) + return; + + BUG_ON(rb_first(&bfqq->sort_list)); + BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); + BUG_ON(bfqq->entity.tree); + BUG_ON(bfq_bfqq_busy(bfqq)); +- BUG_ON(bfqd->in_service_queue == bfqq); ++ BUG_ON(bfqq->bfqd->in_service_queue == bfqq); + + if (bfq_bfqq_sync(bfqq)) + /* +@@ -2973,7 +3510,7 @@ static void bfq_put_queue(struct bfq_queue *bfqq) + */ + hlist_del_init(&bfqq->burst_list_node); + +- bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq); ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); + + kmem_cache_free(bfq_pool, bfqq); + #ifdef CONFIG_BFQ_GROUP_IOSCHED +@@ -3007,8 +3544,7 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) + bfq_schedule_dispatch(bfqd); + } + +- bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, +- atomic_read(&bfqq->ref)); ++ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref); + + bfq_put_cooperator(bfqq); + +@@ -3019,26 +3555,7 @@ static void bfq_init_icq(struct io_cq *icq) + { + struct bfq_io_cq *bic = icq_to_bic(icq); + +- bic->ttime.last_end_request = jiffies; +- /* +- * A newly created bic indicates that the process has just +- * started doing I/O, and is probably mapping into memory its +- * executable and libraries: it definitely needs weight raising. +- * There is however the possibility that the process performs, +- * for a while, I/O close to some other process. EQM intercepts +- * this behavior and may merge the queue corresponding to the +- * process with some other queue, BEFORE the weight of the queue +- * is raised. Merged queues are not weight-raised (they are assumed +- * to belong to processes that benefit only from high throughput). +- * If the merge is basically the consequence of an accident, then +- * the queue will be split soon and will get back its old weight. +- * It is then important to write down somewhere that this queue +- * does need weight raising, even if it did not make it to get its +- * weight raised before being merged. To this purpose, we overload +- * the field raising_time_left and assign 1 to it, to mark the queue +- * as needing weight raising. 
+- */ +- bic->wr_time_left = 1; ++ bic->ttime.last_end_request = bfq_smallest_from_now(); + } + + static void bfq_exit_icq(struct io_cq *icq) +@@ -3046,21 +3563,21 @@ static void bfq_exit_icq(struct io_cq *icq) + struct bfq_io_cq *bic = icq_to_bic(icq); + struct bfq_data *bfqd = bic_to_bfqd(bic); + +- if (bic->bfqq[BLK_RW_ASYNC]) { +- bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]); +- bic->bfqq[BLK_RW_ASYNC] = NULL; ++ if (bic_to_bfqq(bic, false)) { ++ bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, false)); ++ bic_set_bfqq(bic, NULL, false); + } + +- if (bic->bfqq[BLK_RW_SYNC]) { ++ if (bic_to_bfqq(bic, true)) { + /* + * If the bic is using a shared queue, put the reference + * taken on the io_context when the bic started using a + * shared bfq_queue. + */ +- if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC])) ++ if (bfq_bfqq_coop(bic_to_bfqq(bic, true))) + put_io_context(icq->ioc); +- bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]); +- bic->bfqq[BLK_RW_SYNC] = NULL; ++ bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, true)); ++ bic_set_bfqq(bic, NULL, true); + } + } + +@@ -3068,7 +3585,8 @@ static void bfq_exit_icq(struct io_cq *icq) + * Update the entity prio values; note that the new values will not + * be used until the next (re)activation. + */ +-static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) ++static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, ++ struct bfq_io_cq *bic) + { + struct task_struct *tsk = current; + int ioprio_class; +@@ -3100,7 +3618,7 @@ static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *b + break; + } + +- if (bfqq->new_ioprio < 0 || bfqq->new_ioprio >= IOPRIO_BE_NR) { ++ if (bfqq->new_ioprio >= IOPRIO_BE_NR) { + printk(KERN_CRIT "bfq_set_next_ioprio_data: new_ioprio %d\n", + bfqq->new_ioprio); + BUG(); +@@ -3108,45 +3626,40 @@ static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *b + + bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); + bfqq->entity.prio_changed = 1; ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "set_next_ioprio_data: bic_class %d prio %d class %d", ++ ioprio_class, bfqq->new_ioprio, bfqq->new_ioprio_class); + } + + static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) + { +- struct bfq_data *bfqd; +- struct bfq_queue *bfqq, *new_bfqq; ++ struct bfq_data *bfqd = bic_to_bfqd(bic); ++ struct bfq_queue *bfqq; + unsigned long uninitialized_var(flags); + int ioprio = bic->icq.ioc->ioprio; + +- bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), +- &flags); + /* + * This condition may trigger on a newly created bic, be sure to + * drop the lock before returning. 
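The bic_to_bfqq()/bic_set_bfqq() calls introduced above replace v7r11's direct bic->bfqq[BLK_RW_SYNC]/bic->bfqq[BLK_RW_ASYNC] indexing. A sketch of the accessor pair; the struct name is illustrative, only the two-slot layout is taken from the hunk:

#include <stdbool.h>
#include <stddef.h>

struct bfq_queue;

/* Per-io_cq pair of queues: one async, one sync, selected by a
 * boolean instead of the BLK_RW_* array indices used in v7r11. */
struct io_cq_sketch {
	struct bfq_queue *bfqq[2]; /* [0] async, [1] sync */
};

static struct bfq_queue *bic_to_bfqq(struct io_cq_sketch *bic, bool is_sync)
{
	return bic->bfqq[is_sync];
}

static void bic_set_bfqq(struct io_cq_sketch *bic, struct bfq_queue *bfqq,
			 bool is_sync)
{
	bic->bfqq[is_sync] = bfqq;
}

Centralizing the indexing in two helpers keeps the bool-to-index conversion in one place, which is what makes mechanical conversions like the bfq_exit_icq() hunk above safe.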
+ */ + if (unlikely(!bfqd) || likely(bic->ioprio == ioprio)) +- goto out; ++ return; + + bic->ioprio = ioprio; + +- bfqq = bic->bfqq[BLK_RW_ASYNC]; ++ bfqq = bic_to_bfqq(bic, false); + if (bfqq) { +- new_bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic, +- GFP_ATOMIC); +- if (new_bfqq) { +- bic->bfqq[BLK_RW_ASYNC] = new_bfqq; +- bfq_log_bfqq(bfqd, bfqq, +- "check_ioprio_change: bfqq %p %d", +- bfqq, atomic_read(&bfqq->ref)); +- bfq_put_queue(bfqq); +- } ++ bfq_put_queue(bfqq); ++ bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); ++ bic_set_bfqq(bic, bfqq, false); ++ bfq_log_bfqq(bfqd, bfqq, ++ "check_ioprio_change: bfqq %p %d", ++ bfqq, bfqq->ref); + } + +- bfqq = bic->bfqq[BLK_RW_SYNC]; ++ bfqq = bic_to_bfqq(bic, true); + if (bfqq) + bfq_set_next_ioprio_data(bfqq, bic); +- +-out: +- bfq_put_bfqd_unlock(bfqd, &flags); + } + + static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, +@@ -3155,8 +3668,9 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, + RB_CLEAR_NODE(&bfqq->entity.rb_node); + INIT_LIST_HEAD(&bfqq->fifo); + INIT_HLIST_NODE(&bfqq->burst_list_node); ++ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); + +- atomic_set(&bfqq->ref, 0); ++ bfqq->ref = 0; + bfqq->bfqd = bfqd; + + if (bic) +@@ -3166,6 +3680,7 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, + if (!bfq_class_idle(bfqq)) + bfq_mark_bfqq_idle_window(bfqq); + bfq_mark_bfqq_sync(bfqq); ++ bfq_mark_bfqq_just_created(bfqq); + } else + bfq_clear_bfqq_sync(bfqq); + bfq_mark_bfqq_IO_bound(bfqq); +@@ -3175,72 +3690,17 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, + bfqq->pid = pid; + + bfqq->wr_coeff = 1; +- bfqq->last_wr_start_finish = 0; ++ bfqq->last_wr_start_finish = bfq_smallest_from_now(); ++ bfqq->budget_timeout = bfq_smallest_from_now(); ++ bfqq->split_time = bfq_smallest_from_now(); + /* + * Set to the value for which bfqq will not be deemed as + * soft rt when it becomes backlogged. + */ +- bfqq->soft_rt_next_start = bfq_infinity_from_now(jiffies); +-} +- +-static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd, +- struct bio *bio, int is_sync, +- struct bfq_io_cq *bic, +- gfp_t gfp_mask) +-{ +- struct bfq_group *bfqg; +- struct bfq_queue *bfqq, *new_bfqq = NULL; +- struct blkcg *blkcg; +- +-retry: +- rcu_read_lock(); +- +- blkcg = bio_blkcg(bio); +- bfqg = bfq_find_alloc_group(bfqd, blkcg); +- /* bic always exists here */ +- bfqq = bic_to_bfqq(bic, is_sync); +- +- /* +- * Always try a new alloc if we fall back to the OOM bfqq +- * originally, since it should just be a temporary situation. 
+- */ +- if (!bfqq || bfqq == &bfqd->oom_bfqq) { +- bfqq = NULL; +- if (new_bfqq) { +- bfqq = new_bfqq; +- new_bfqq = NULL; +- } else if (gfpflags_allow_blocking(gfp_mask)) { +- rcu_read_unlock(); +- spin_unlock_irq(bfqd->queue->queue_lock); +- new_bfqq = kmem_cache_alloc_node(bfq_pool, +- gfp_mask | __GFP_ZERO, +- bfqd->queue->node); +- spin_lock_irq(bfqd->queue->queue_lock); +- if (new_bfqq) +- goto retry; +- } else { +- bfqq = kmem_cache_alloc_node(bfq_pool, +- gfp_mask | __GFP_ZERO, +- bfqd->queue->node); +- } +- +- if (bfqq) { +- bfq_init_bfqq(bfqd, bfqq, bic, current->pid, +- is_sync); +- bfq_init_entity(&bfqq->entity, bfqg); +- bfq_log_bfqq(bfqd, bfqq, "allocated"); +- } else { +- bfqq = &bfqd->oom_bfqq; +- bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); +- } +- } +- +- if (new_bfqq) +- kmem_cache_free(bfq_pool, new_bfqq); +- +- rcu_read_unlock(); ++ bfqq->soft_rt_next_start = bfq_greatest_from_now(); + +- return bfqq; ++ /* first request is almost certainly seeky */ ++ bfqq->seek_history = 1; + } + + static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, +@@ -3263,44 +3723,60 @@ static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, + } + + static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, +- struct bio *bio, int is_sync, +- struct bfq_io_cq *bic, gfp_t gfp_mask) ++ struct bio *bio, bool is_sync, ++ struct bfq_io_cq *bic) + { + const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); + const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); + struct bfq_queue **async_bfqq = NULL; +- struct bfq_queue *bfqq = NULL; ++ struct bfq_queue *bfqq; ++ struct bfq_group *bfqg; + +- if (!is_sync) { +- struct blkcg *blkcg; +- struct bfq_group *bfqg; ++ rcu_read_lock(); ++ ++ bfqg = bfq_find_set_group(bfqd,bio_blkcg(bio)); ++ if (!bfqg) { ++ bfqq = &bfqd->oom_bfqq; ++ goto out; ++ } + +- rcu_read_lock(); +- blkcg = bio_blkcg(bio); +- rcu_read_unlock(); +- bfqg = bfq_find_alloc_group(bfqd, blkcg); ++ if (!is_sync) { + async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, + ioprio); + bfqq = *async_bfqq; ++ if (bfqq) ++ goto out; + } + +- if (!bfqq) +- bfqq = bfq_find_alloc_queue(bfqd, bio, is_sync, bic, gfp_mask); ++ bfqq = kmem_cache_alloc_node(bfq_pool, GFP_NOWAIT | __GFP_ZERO, ++ bfqd->queue->node); ++ ++ if (bfqq) { ++ bfq_init_bfqq(bfqd, bfqq, bic, current->pid, ++ is_sync); ++ bfq_init_entity(&bfqq->entity, bfqg); ++ bfq_log_bfqq(bfqd, bfqq, "allocated"); ++ } else { ++ bfqq = &bfqd->oom_bfqq; ++ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); ++ goto out; ++ } + + /* + * Pin the queue now that it's allocated, scheduler exit will + * prune it. 
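With bfq_find_alloc_queue() gone, bfq_get_queue() above allocates with GFP_NOWAIT and, on failure, falls back to the preallocated oom_bfqq embedded in bfq_data, so request setup never fails outright. A user-space sketch of the pattern, with calloc() standing in for kmem_cache_alloc_node() with __GFP_ZERO:

#include <stdlib.h>

struct queue { int id; };

/* Preallocated fallback, like bfqd->oom_bfqq: always available. */
static struct queue oom_queue;

/*
 * Non-blocking allocation with a guaranteed fallback: instead of
 * retrying with a blocking allocation (as the removed code did),
 * failure simply routes the request to the shared oom queue.
 */
static struct queue *get_queue(void)
{
	struct queue *q = calloc(1, sizeof(*q));

	return q ? q : &oom_queue;
}

/* Callers must never free the fallback object. */
static void put_queue(struct queue *q)
{
	if (q != &oom_queue)
		free(q);
}

This is also why the retry/unlock dance of the removed bfq_find_alloc_queue() could be dropped: GFP_NOWAIT never sleeps, so the queue_lock can stay held.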
+ */ +- if (!is_sync && !(*async_bfqq)) { +- atomic_inc(&bfqq->ref); ++ if (async_bfqq) { ++ bfqq->ref++; + bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", +- bfqq, atomic_read(&bfqq->ref)); ++ bfqq, bfqq->ref); + *async_bfqq = bfqq; + } + +- atomic_inc(&bfqq->ref); +- bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, +- atomic_read(&bfqq->ref)); ++out: ++ bfqq->ref++; ++ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref); ++ rcu_read_unlock(); + return bfqq; + } + +@@ -3316,37 +3792,21 @@ static void bfq_update_io_thinktime(struct bfq_data *bfqd, + bic->ttime.ttime_samples; + } + +-static void bfq_update_io_seektime(struct bfq_data *bfqd, +- struct bfq_queue *bfqq, +- struct request *rq) +-{ +- sector_t sdist; +- u64 total; +- +- if (bfqq->last_request_pos < blk_rq_pos(rq)) +- sdist = blk_rq_pos(rq) - bfqq->last_request_pos; +- else +- sdist = bfqq->last_request_pos - blk_rq_pos(rq); +- +- /* +- * Don't allow the seek distance to get too large from the +- * odd fragment, pagein, etc. +- */ +- if (bfqq->seek_samples == 0) /* first request, not really a seek */ +- sdist = 0; +- else if (bfqq->seek_samples <= 60) /* second & third seek */ +- sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024); +- else +- sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64); + +- bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8; +- bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8; +- total = bfqq->seek_total + (bfqq->seek_samples/2); +- do_div(total, bfqq->seek_samples); +- bfqq->seek_mean = (sector_t)total; ++static void ++bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ struct request *rq) ++{ ++ sector_t sdist = 0; ++ if (bfqq->last_request_pos) { ++ if (bfqq->last_request_pos < blk_rq_pos(rq)) ++ sdist = blk_rq_pos(rq) - bfqq->last_request_pos; ++ else ++ sdist = bfqq->last_request_pos - blk_rq_pos(rq); ++ } + +- bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist, +- (u64)bfqq->seek_mean); ++ bfqq->seek_history <<= 1; ++ bfqq->seek_history |= (sdist > BFQQ_SEEK_THR); + } + + /* +@@ -3364,7 +3824,8 @@ static void bfq_update_idle_window(struct bfq_data *bfqd, + return; + + /* Idle window just restored, statistics are meaningless. */ +- if (bfq_bfqq_just_split(bfqq)) ++ if (time_is_after_eq_jiffies(bfqq->split_time + ++ bfqd->bfq_wr_min_idle_time)) + return; + + enable_idle = bfq_bfqq_idle_window(bfqq); +@@ -3404,22 +3865,13 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, + + bfq_update_io_thinktime(bfqd, bic); + bfq_update_io_seektime(bfqd, bfqq, rq); +- if (!BFQQ_SEEKY(bfqq) && bfq_bfqq_constantly_seeky(bfqq)) { +- bfq_clear_bfqq_constantly_seeky(bfqq); +- if (!blk_queue_nonrot(bfqd->queue)) { +- BUG_ON(!bfqd->const_seeky_busy_in_flight_queues); +- bfqd->const_seeky_busy_in_flight_queues--; +- } +- } + if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || + !BFQQ_SEEKY(bfqq)) + bfq_update_idle_window(bfqd, bfqq, bic); +- bfq_clear_bfqq_just_split(bfqq); + + bfq_log_bfqq(bfqd, bfqq, +- "rq_enqueued: idle_window=%d (seeky %d, mean %llu)", +- bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq), +- (long long unsigned)bfqq->seek_mean); ++ "rq_enqueued: idle_window=%d (seeky %d)", ++ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq)); + + bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); + +@@ -3433,14 +3885,15 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, + * is small and the queue is not to be expired, then + * just exit. 
+ * +- * In this way, if the disk is being idled to wait for +- * a new request from the in-service queue, we avoid +- * unplugging the device and committing the disk to serve +- * just a small request. On the contrary, we wait for +- * the block layer to decide when to unplug the device: +- * hopefully, new requests will be merged to this one +- * quickly, then the device will be unplugged and +- * larger requests will be dispatched. ++ * In this way, if the device is being idled to wait ++ * for a new request from the in-service queue, we ++ * avoid unplugging the device and committing the ++ * device to serve just a small request. On the ++ * contrary, we wait for the block layer to decide ++ * when to unplug the device: hopefully, new requests ++ * will be merged to this one quickly, then the device ++ * will be unplugged and larger requests will be ++ * dispatched. + */ + if (small_req && !budget_timeout) + return; +@@ -3453,9 +3906,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, + */ + bfq_clear_bfqq_wait_request(bfqq); + del_timer(&bfqd->idle_slice_timer); +-#ifdef CONFIG_BFQ_GROUP_IOSCHED + bfqg_stats_update_idle_time(bfqq_group(bfqq)); +-#endif + + /* + * The queue is not empty, because a new request just +@@ -3499,27 +3950,19 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq) + */ + new_bfqq->allocated[rq_data_dir(rq)]++; + bfqq->allocated[rq_data_dir(rq)]--; +- atomic_inc(&new_bfqq->ref); ++ new_bfqq->ref++; ++ bfq_clear_bfqq_just_created(bfqq); + bfq_put_queue(bfqq); + if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) + bfq_merge_bfqqs(bfqd, RQ_BIC(rq), + bfqq, new_bfqq); + rq->elv.priv[1] = new_bfqq; + bfqq = new_bfqq; +- } else +- bfq_bfqq_increase_failed_cooperations(bfqq); ++ } + } + + bfq_add_request(rq); + +- /* +- * Here a newly-created bfq_queue has already started a weight-raising +- * period: clear raising_time_left to prevent bfq_bfqq_save_state() +- * from assigning it a full weight-raising period. See the detailed +- * comments about this field in bfq_init_icq(). 
+- */ +- if (bfqq->bic) +- bfqq->bic->wr_time_left = 0; + rq->fifo_time = jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; + list_add_tail(&rq->queuelist, &bfqq->fifo); + +@@ -3528,8 +3971,8 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq) + + static void bfq_update_hw_tag(struct bfq_data *bfqd) + { +- bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver, +- bfqd->rq_in_driver); ++ bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver, ++ bfqd->rq_in_driver); + + if (bfqd->hw_tag == 1) + return; +@@ -3555,48 +3998,45 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) + { + struct bfq_queue *bfqq = RQ_BFQQ(rq); + struct bfq_data *bfqd = bfqq->bfqd; +- bool sync = bfq_bfqq_sync(bfqq); + +- bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left (%d)", +- blk_rq_sectors(rq), sync); ++ bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left", ++ blk_rq_sectors(rq)); + ++ assert_spin_locked(bfqd->queue->queue_lock); + bfq_update_hw_tag(bfqd); + + BUG_ON(!bfqd->rq_in_driver); + BUG_ON(!bfqq->dispatched); + bfqd->rq_in_driver--; + bfqq->dispatched--; +-#ifdef CONFIG_BFQ_GROUP_IOSCHED + bfqg_stats_update_completion(bfqq_group(bfqq), + rq_start_time_ns(rq), + rq_io_start_time_ns(rq), rq->cmd_flags); +-#endif + + if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { ++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); ++ /* ++ * Set budget_timeout (which we overload to store the ++ * time at which the queue remains with no backlog and ++ * no outstanding request; used by the weight-raising ++ * mechanism). ++ */ ++ bfqq->budget_timeout = jiffies; ++ + bfq_weights_tree_remove(bfqd, &bfqq->entity, + &bfqd->queue_weights_tree); +- if (!blk_queue_nonrot(bfqd->queue)) { +- BUG_ON(!bfqd->busy_in_flight_queues); +- bfqd->busy_in_flight_queues--; +- if (bfq_bfqq_constantly_seeky(bfqq)) { +- BUG_ON(!bfqd-> +- const_seeky_busy_in_flight_queues); +- bfqd->const_seeky_busy_in_flight_queues--; +- } +- } + } + +- if (sync) { +- bfqd->sync_flight--; +- RQ_BIC(rq)->ttime.last_end_request = jiffies; +- } ++ RQ_BIC(rq)->ttime.last_end_request = jiffies; + + /* +- * If we are waiting to discover whether the request pattern of the +- * task associated with the queue is actually isochronous, and +- * both requisites for this condition to hold are satisfied, then +- * compute soft_rt_next_start (see the comments to the function +- * bfq_bfqq_softrt_next_start()). ++ * If we are waiting to discover whether the request pattern ++ * of the task associated with the queue is actually ++ * isochronous, and both requisites for this condition to hold ++ * are now satisfied, then compute soft_rt_next_start (see the ++ * comments on the function bfq_bfqq_softrt_next_start()). We ++ * schedule this delayed check when bfqq expires, if it still ++ * has in-flight requests. + */ + if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 && + RB_EMPTY_ROOT(&bfqq->sort_list)) +@@ -3608,10 +4048,7 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) + * or if we want to idle in case it has no pending requests. 
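The completion path above overloads budget_timeout to record the instant at which the queue becomes fully idle, and lets the soft real-time check conclude only when nothing is queued or in flight. A sketch of that check; field names are illustrative:

#include <stdbool.h>

/* Illustrative queue state, loosely following the hunk. */
struct qstate {
	int dispatched;         /* requests still in flight */
	bool has_backlog;       /* requests still queued */
	unsigned long budget_timeout; /* overloaded: instant of last idling */
};

/*
 * Only when the queue has neither queued nor in-flight requests is
 * the idling instant recorded and may soft_rt_next_start be computed:
 * before that, the request pattern cannot be seen to be isochronous.
 */
static bool fully_idle_on_completion(struct qstate *q, unsigned long now)
{
	if (q->dispatched == 0 && !q->has_backlog) {
		q->budget_timeout = now;
		return true; /* safe to compute soft_rt_next_start */
	}
	return false;
}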
+ */ + if (bfqd->in_service_queue == bfqq) { +- if (bfq_bfqq_budget_new(bfqq)) +- bfq_set_budget_timeout(bfqd); +- +- if (bfq_bfqq_must_idle(bfqq)) { ++ if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) { + bfq_arm_slice_timer(bfqd); + goto out; + } else if (bfq_may_expire_for_budg_timeout(bfqq)) +@@ -3682,14 +4119,14 @@ static void bfq_put_request(struct request *rq) + rq->elv.priv[1] = NULL; + + bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", +- bfqq, atomic_read(&bfqq->ref)); ++ bfqq, bfqq->ref); + bfq_put_queue(bfqq); + } + } + + /* + * Returns NULL if a new bfqq should be allocated, or the old bfqq if this +- * was the last process referring to said bfqq. ++ * was the last process referring to that bfqq. + */ + static struct bfq_queue * + bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) +@@ -3727,11 +4164,8 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, + unsigned long flags; + bool split = false; + +- might_sleep_if(gfpflags_allow_blocking(gfp_mask)); +- +- bfq_check_ioprio_change(bic, bio); +- + spin_lock_irqsave(q->queue_lock, flags); ++ bfq_check_ioprio_change(bic, bio); + + if (!bic) + goto queue_fail; +@@ -3741,23 +4175,47 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, + new_queue: + bfqq = bic_to_bfqq(bic, is_sync); + if (!bfqq || bfqq == &bfqd->oom_bfqq) { +- bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, gfp_mask); ++ if (bfqq) ++ bfq_put_queue(bfqq); ++ bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); ++ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); ++ + bic_set_bfqq(bic, bfqq, is_sync); + if (split && is_sync) { ++ bfq_log_bfqq(bfqd, bfqq, ++ "set_request: was_in_list %d " ++ "was_in_large_burst %d " ++ "large burst in progress %d", ++ bic->was_in_burst_list, ++ bic->saved_in_large_burst, ++ bfqd->large_burst); ++ + if ((bic->was_in_burst_list && bfqd->large_burst) || +- bic->saved_in_large_burst) ++ bic->saved_in_large_burst) { ++ bfq_log_bfqq(bfqd, bfqq, ++ "set_request: marking in " ++ "large burst"); + bfq_mark_bfqq_in_large_burst(bfqq); +- else { +- bfq_clear_bfqq_in_large_burst(bfqq); +- if (bic->was_in_burst_list) +- hlist_add_head(&bfqq->burst_list_node, +- &bfqd->burst_list); ++ } else { ++ bfq_log_bfqq(bfqd, bfqq, ++ "set_request: clearing in " ++ "large burst"); ++ bfq_clear_bfqq_in_large_burst(bfqq); ++ if (bic->was_in_burst_list) ++ hlist_add_head(&bfqq->burst_list_node, ++ &bfqd->burst_list); + } ++ bfqq->split_time = jiffies; + } + } else { + /* If the queue was seeky for too long, break it apart. 
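The bfqq->ref manipulations above (atomic_read()/atomic_inc()/atomic_dec_and_test() replaced by a plain int throughout this patch) are safe only because every get/put already runs under the device queue_lock. A sketch of the pattern, with illustrative names:

#include <stdlib.h>

/* The ref field is a plain int because all get/put calls run under
 * one lock (the queue_lock in the patch); atomics would only add cost. */
struct queue {
	int ref; /* protected by the caller-held queue lock */
};

static void queue_get(struct queue *q)
{
	q->ref++; /* caller holds the lock */
}

static void queue_put(struct queue *q)
{
	if (--q->ref == 0) /* caller holds the lock */
		free(q);
}

The invariant is that no reference is ever taken or dropped outside the lock, which the assert_spin_locked() added in bfq_completed_request() above helps to enforce.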
*/
+ 		if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
+ 			bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
++
++			/* Update bic before losing reference to bfqq */
++			if (bfq_bfqq_in_large_burst(bfqq))
++				bic->saved_in_large_burst = true;
++
+ 			bfqq = bfq_split_bfqq(bic, bfqq);
+ 			split = true;
+ 			if (!bfqq)
+@@ -3766,9 +4224,8 @@ new_queue:
+ 	}
+
+ 	bfqq->allocated[rw]++;
+-	atomic_inc(&bfqq->ref);
+-	bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq,
+-		     atomic_read(&bfqq->ref));
++	bfqq->ref++;
++	bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, bfqq->ref);
+
+ 	rq->elv.priv[0] = bic;
+ 	rq->elv.priv[1] = bfqq;
+@@ -3783,7 +4240,6 @@ new_queue:
+ 	if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) {
+ 		bfqq->bic = bic;
+ 		if (split) {
+-			bfq_mark_bfqq_just_split(bfqq);
+ 			/*
+ 			 * If the queue has just been split from a shared
+ 			 * queue, restore the idle window and the possible
+@@ -3793,6 +4249,9 @@ new_queue:
+ 		}
+ 	}
+
++	if (unlikely(bfq_bfqq_just_created(bfqq)))
++		bfq_handle_burst(bfqd, bfqq);
++
+ 	spin_unlock_irqrestore(q->queue_lock, flags);
+
+ 	return 0;
+@@ -3872,6 +4331,7 @@ static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
+ 	cancel_work_sync(&bfqd->unplug_work);
+ }
+
++#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ static void __bfq_put_async_bfqq(struct bfq_data *bfqd,
+ 				 struct bfq_queue **bfqq_ptr)
+ {
+@@ -3880,9 +4340,9 @@ static void __bfq_put_async_bfqq(struct bfq_data *bfqd,
+
+ 	bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
+ 	if (bfqq) {
+-		bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);
++		bfq_bfqq_move(bfqd, bfqq, root_group);
+ 		bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
+-			     bfqq, atomic_read(&bfqq->ref));
++			     bfqq, bfqq->ref);
+ 		bfq_put_queue(bfqq);
+ 		*bfqq_ptr = NULL;
+ 	}
+@@ -3904,6 +4364,7 @@ static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
+
+ 	__bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
+ }
++#endif
+
+ static void bfq_exit_queue(struct elevator_queue *e)
+ {
+@@ -3923,8 +4384,6 @@ static void bfq_exit_queue(struct elevator_queue *e)
+
+ 	bfq_shutdown_timer_wq(bfqd);
+
+-	synchronize_rcu();
+-
+ 	BUG_ON(timer_pending(&bfqd->idle_slice_timer));
+
+ #ifdef CONFIG_BFQ_GROUP_IOSCHED
+@@ -3973,11 +4432,14 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
+ 	 * will not attempt to free it.
+ 	 */
+ 	bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);
+-	atomic_inc(&bfqd->oom_bfqq.ref);
++	bfqd->oom_bfqq.ref++;
+ 	bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
+ 	bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;
+ 	bfqd->oom_bfqq.entity.new_weight =
+ 		bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio);
++
++	/* oom_bfqq does not participate in bursts */
++	bfq_clear_bfqq_just_created(&bfqd->oom_bfqq);
+ 	/*
+ 	 * Trigger weight initialization, according to ioprio, at the
+ 	 * oom_bfqq's first activation.
The oom_bfqq's ioprio and ioprio +@@ -3996,9 +4458,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) + goto out_free; + bfq_init_root_group(bfqd->root_group, bfqd); + bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); +-#ifdef CONFIG_BFQ_GROUP_IOSCHED +- bfqd->active_numerous_groups = 0; +-#endif + + init_timer(&bfqd->idle_slice_timer); + bfqd->idle_slice_timer.function = bfq_idle_slice_timer; +@@ -4023,20 +4482,19 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) + bfqd->bfq_back_penalty = bfq_back_penalty; + bfqd->bfq_slice_idle = bfq_slice_idle; + bfqd->bfq_class_idle_last_service = 0; +- bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq; +- bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async; +- bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync; ++ bfqd->bfq_timeout = bfq_timeout; + +- bfqd->bfq_coop_thresh = 2; +- bfqd->bfq_failed_cooperations = 7000; + bfqd->bfq_requests_within_timer = 120; + +- bfqd->bfq_large_burst_thresh = 11; +- bfqd->bfq_burst_interval = msecs_to_jiffies(500); ++ bfqd->bfq_large_burst_thresh = 8; ++ bfqd->bfq_burst_interval = msecs_to_jiffies(180); + + bfqd->low_latency = true; + +- bfqd->bfq_wr_coeff = 20; ++ /* ++ * Trade-off between responsiveness and fairness. ++ */ ++ bfqd->bfq_wr_coeff = 30; + bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); + bfqd->bfq_wr_max_time = 0; + bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); +@@ -4048,16 +4506,15 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) + * video. + */ + bfqd->wr_busy_queues = 0; +- bfqd->busy_in_flight_queues = 0; +- bfqd->const_seeky_busy_in_flight_queues = 0; + + /* +- * Begin by assuming, optimistically, that the device peak rate is +- * equal to the highest reference rate. ++ * Begin by assuming, optimistically, that the device is a ++ * high-speed one, and that its peak rate is equal to 2/3 of ++ * the highest reference rate. 
+ */ + bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] * + T_fast[blk_queue_nonrot(bfqd->queue)]; +- bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)]; ++ bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3; + bfqd->device_speed = BFQ_BFQD_FAST; + + return 0; +@@ -4161,10 +4618,8 @@ SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); + SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); + SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1); + SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); +-SHOW_FUNCTION(bfq_max_budget_async_rq_show, +- bfqd->bfq_max_budget_async_rq, 0); +-SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1); +-SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1); ++SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1); ++SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0); + SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); + SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0); + SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1); +@@ -4199,10 +4654,6 @@ STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); + STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, + INT_MAX, 0); + STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1); +-STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq, +- 1, INT_MAX, 0); +-STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0, +- INT_MAX, 1); + STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0); + STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1); + STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX, +@@ -4224,10 +4675,8 @@ static ssize_t bfq_weights_store(struct elevator_queue *e, + + static unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd) + { +- u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); +- + if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES) +- return bfq_calc_max_budget(bfqd->peak_rate, timeout); ++ return bfq_calc_max_budget(bfqd); + else + return bfq_default_max_budget; + } +@@ -4252,6 +4701,10 @@ static ssize_t bfq_max_budget_store(struct elevator_queue *e, + return ret; + } + ++/* ++ * Leaving this name to preserve name compatibility with cfq ++ * parameters, but this timeout is used for both sync and async. 
++ */
+ static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
+ const char *page, size_t count)
+ {
+@@ -4264,13 +4717,31 @@ static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
+ else if (__data > INT_MAX)
+ __data = INT_MAX;
+
+- bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data);
++ bfqd->bfq_timeout = msecs_to_jiffies(__data);
+ if (bfqd->bfq_user_max_budget == 0)
+ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
+
+ return ret;
+ }
+
++static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e,
++ const char *page, size_t count)
++{
++ struct bfq_data *bfqd = e->elevator_data;
++ unsigned long uninitialized_var(__data);
++ int ret = bfq_var_store(&__data, (page), count);
++
++ if (__data > 1)
++ __data = 1;
++ if (!bfqd->strict_guarantees && __data == 1
++ && bfqd->bfq_slice_idle < msecs_to_jiffies(8))
++ bfqd->bfq_slice_idle = msecs_to_jiffies(8);
++
++ bfqd->strict_guarantees = __data;
++
++ return ret;
++}
++
+ static ssize_t bfq_low_latency_store(struct elevator_queue *e,
+ const char *page, size_t count)
+ {
+@@ -4297,9 +4768,8 @@ static struct elv_fs_entry bfq_attrs[] = {
+ BFQ_ATTR(back_seek_penalty),
+ BFQ_ATTR(slice_idle),
+ BFQ_ATTR(max_budget),
+- BFQ_ATTR(max_budget_async_rq),
+ BFQ_ATTR(timeout_sync),
+- BFQ_ATTR(timeout_async),
++ BFQ_ATTR(strict_guarantees),
+ BFQ_ATTR(low_latency),
+ BFQ_ATTR(wr_coeff),
+ BFQ_ATTR(wr_max_time),
+@@ -4342,9 +4812,28 @@ static struct elevator_type iosched_bfq = {
+ .elevator_owner = THIS_MODULE,
+ };
+
++#ifdef CONFIG_BFQ_GROUP_IOSCHED
++static struct blkcg_policy blkcg_policy_bfq = {
++ .dfl_cftypes = bfq_blkg_files,
++ .legacy_cftypes = bfq_blkcg_legacy_files,
++
++ .cpd_alloc_fn = bfq_cpd_alloc,
++ .cpd_init_fn = bfq_cpd_init,
++ .cpd_bind_fn = bfq_cpd_init,
++ .cpd_free_fn = bfq_cpd_free,
++
++ .pd_alloc_fn = bfq_pd_alloc,
++ .pd_init_fn = bfq_pd_init,
++ .pd_offline_fn = bfq_pd_offline,
++ .pd_free_fn = bfq_pd_free,
++ .pd_reset_stats_fn = bfq_pd_reset_stats,
++};
++#endif
++
+ static int __init bfq_init(void)
+ {
+ int ret;
++ char msg[50] = "BFQ I/O-scheduler: v8r2";
+
+ /*
+ * Can be 0 on HZ < 1000 setups.
+@@ -4352,9 +4841,6 @@ static int __init bfq_init(void)
+ if (bfq_slice_idle == 0)
+ bfq_slice_idle = 1;
+
+- if (bfq_timeout_async == 0)
+- bfq_timeout_async = 1;
+-
+ #ifdef CONFIG_BFQ_GROUP_IOSCHED
+ ret = blkcg_policy_register(&blkcg_policy_bfq);
+ if (ret)
+@@ -4370,23 +4856,34 @@ static int __init bfq_init(void)
+ * installed on the reference devices (see the comments before the
+ * definitions of the two arrays).
+ */
+- T_slow[0] = msecs_to_jiffies(2600);
+- T_slow[1] = msecs_to_jiffies(1000);
+- T_fast[0] = msecs_to_jiffies(5500);
+- T_fast[1] = msecs_to_jiffies(2000);
++ T_slow[0] = msecs_to_jiffies(3500);
++ T_slow[1] = msecs_to_jiffies(1500);
++ T_fast[0] = msecs_to_jiffies(8000);
++ T_fast[1] = msecs_to_jiffies(3000);
+
+ /*
+- * Thresholds that determine the switch between speed classes (see
+- * the comments before the definition of the array).
++ * Thresholds that determine the switch between speed classes
++ * (see the comments before the definition of the array
++ * device_speed_thresh). These thresholds are biased towards
++ * transitions to the fast class. This is safer than the
++ * opposite bias. In fact, a wrong transition to the slow
++ * class results in short weight-raising periods, because the
++ * speed of the device then tends to be higher than the
++ * reference peak rate. On the opposite end, a wrong
++ * transition to the fast class tends to increase
++ * weight-raising periods, because of the opposite reason.
+ */
+- device_speed_thresh[0] = (R_fast[0] + R_slow[0]) / 2;
+- device_speed_thresh[1] = (R_fast[1] + R_slow[1]) / 2;
++ device_speed_thresh[0] = (4 * R_slow[0]) / 3;
++ device_speed_thresh[1] = (4 * R_slow[1]) / 3;
+
+ ret = elv_register(&iosched_bfq);
+ if (ret)
+ goto err_pol_unreg;
+
+- pr_info("BFQ I/O-scheduler: v7r11");
++#ifdef CONFIG_BFQ_GROUP_IOSCHED
++ strcat(msg, " (with cgroups support)");
++#endif
++ pr_info("%s", msg);
+
+ return 0;
+
+diff --git a/block/bfq-sched.c b/block/bfq-sched.c
+index a64fec1..7d73b9d 100644
+--- a/block/bfq-sched.c
++++ b/block/bfq-sched.c
+@@ -7,9 +7,11 @@
+ * Copyright (C) 2008 Fabio Checconi
+ * Paolo Valente
+ *
+- * Copyright (C) 2010 Paolo Valente
++ * Copyright (C) 2016 Paolo Valente
+ */
+
++static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
++
+ #ifdef CONFIG_BFQ_GROUP_IOSCHED
+ #define for_each_entity(entity) \
+ for (; entity ; entity = entity->parent)
+@@ -22,8 +24,6 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
+ int extract,
+ struct bfq_data *bfqd);
+
+-static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
+-
+ static void bfq_update_budget(struct bfq_entity *next_in_service)
+ {
+ struct bfq_entity *bfqg_entity;
+@@ -48,6 +48,7 @@ static void bfq_update_budget(struct bfq_entity *next_in_service)
+ static int bfq_update_next_in_service(struct bfq_sched_data *sd)
+ {
+ struct bfq_entity *next_in_service;
++ struct bfq_queue *bfqq;
+
+ if (sd->in_service_entity)
+ /* will update/requeue at the end of service */
+@@ -65,14 +66,29 @@ static int bfq_update_next_in_service(struct bfq_sched_data *sd)
+
+ if (next_in_service)
+ bfq_update_budget(next_in_service);
++ else
++ goto exit;
+
++ bfqq = bfq_entity_to_bfqq(next_in_service);
++ if (bfqq)
++ bfq_log_bfqq(bfqq->bfqd, bfqq,
++ "update_next_in_service: chosen this queue");
++ else {
++ struct bfq_group *bfqg =
++ container_of(next_in_service,
++ struct bfq_group, entity);
++
++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
++ "update_next_in_service: chosen this entity");
++ }
++exit:
+ return 1;
+ }
+
+ static void bfq_check_next_in_service(struct bfq_sched_data *sd,
+ struct bfq_entity *entity)
+ {
+- BUG_ON(sd->next_in_service != entity);
++ WARN_ON(sd->next_in_service != entity);
+ }
+ #else
+ #define for_each_entity(entity) \
+@@ -151,20 +167,35 @@ static u64 bfq_delta(unsigned long service, unsigned long weight)
+ static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service)
+ {
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+-
++ unsigned long long start, finish, delta;
+ BUG_ON(entity->weight == 0);
+
+ entity->finish = entity->start +
+ bfq_delta(service, entity->weight);
+
++ start = ((entity->start>>10)*1000)>>12;
++ finish = ((entity->finish>>10)*1000)>>12;
++ delta = ((bfq_delta(service, entity->weight)>>10)*1000)>>12;
++
+ if (bfqq) {
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
+ "calc_finish: serv %lu, w %d",
+ service, entity->weight);
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
+ "calc_finish: start %llu, finish %llu, delta %llu",
+- entity->start, entity->finish,
+- bfq_delta(service, entity->weight));
++ start, finish, delta);
++#ifdef CONFIG_BFQ_GROUP_IOSCHED
++ } else {
++ struct bfq_group *bfqg =
++ container_of(entity, struct bfq_group, entity);
++
++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
++ "calc_finish group: serv %lu, w %d",
++ service, entity->weight);
++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, ++ "calc_finish group: start %llu, finish %llu, delta %llu", ++ start, finish, delta); ++#endif + } + } + +@@ -386,8 +417,6 @@ static void bfq_active_insert(struct bfq_service_tree *st, + BUG_ON(!bfqg); + BUG_ON(!bfqd); + bfqg->active_entities++; +- if (bfqg->active_entities == 2) +- bfqd->active_numerous_groups++; + } + #endif + } +@@ -399,7 +428,7 @@ static void bfq_active_insert(struct bfq_service_tree *st, + static unsigned short bfq_ioprio_to_weight(int ioprio) + { + BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); +- return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - ioprio; ++ return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF ; + } + + /** +@@ -422,9 +451,9 @@ static void bfq_get_entity(struct bfq_entity *entity) + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + if (bfqq) { +- atomic_inc(&bfqq->ref); ++ bfqq->ref++; + bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", +- bfqq, atomic_read(&bfqq->ref)); ++ bfqq, bfqq->ref); + } + } + +@@ -499,10 +528,6 @@ static void bfq_active_extract(struct bfq_service_tree *st, + BUG_ON(!bfqd); + BUG_ON(!bfqg->active_entities); + bfqg->active_entities--; +- if (bfqg->active_entities == 1) { +- BUG_ON(!bfqd->active_numerous_groups); +- bfqd->active_numerous_groups--; +- } + } + #endif + } +@@ -552,7 +577,7 @@ static void bfq_forget_entity(struct bfq_service_tree *st, + if (bfqq) { + sd = entity->sched_data; + bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d", +- bfqq, atomic_read(&bfqq->ref)); ++ bfqq, bfqq->ref); + bfq_put_queue(bfqq); + } + } +@@ -602,7 +627,7 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, + + if (entity->prio_changed) { + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); +- unsigned short prev_weight, new_weight; ++ unsigned int prev_weight, new_weight; + struct bfq_data *bfqd = NULL; + struct rb_root *root; + #ifdef CONFIG_BFQ_GROUP_IOSCHED +@@ -628,12 +653,14 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, + if (entity->new_weight != entity->orig_weight) { + if (entity->new_weight < BFQ_MIN_WEIGHT || + entity->new_weight > BFQ_MAX_WEIGHT) { +- printk(KERN_CRIT "update_weight_prio: " +- "new_weight %d\n", ++ pr_crit("update_weight_prio: new_weight %d\n", + entity->new_weight); +- BUG(); ++ if (entity->new_weight < BFQ_MIN_WEIGHT) ++ entity->new_weight = BFQ_MIN_WEIGHT; ++ else ++ entity->new_weight = BFQ_MAX_WEIGHT; + } +- entity->orig_weight = entity->new_weight; ++ entity->orig_weight = entity->new_weight; + if (bfqq) + bfqq->ioprio = + bfq_weight_to_ioprio(entity->orig_weight); +@@ -662,6 +689,13 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, + * associated with its new weight. + */ + if (prev_weight != new_weight) { ++ if (bfqq) ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "weight changed %d %d(%d %d)", ++ prev_weight, new_weight, ++ entity->orig_weight, ++ bfqq->wr_coeff); ++ + root = bfqq ? 
&bfqd->queue_weights_tree : + &bfqd->group_weights_tree; + bfq_weights_tree_remove(bfqd, entity, root); +@@ -708,7 +742,7 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) + st = bfq_entity_service_tree(entity); + + entity->service += served; +- BUG_ON(entity->service > entity->budget); ++ + BUG_ON(st->wsum == 0); + + st->vtime += bfq_delta(served, st->wsum); +@@ -717,31 +751,69 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) + #ifdef CONFIG_BFQ_GROUP_IOSCHED + bfqg_stats_set_start_empty_time(bfqq_group(bfqq)); + #endif +- bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served); ++ st = bfq_entity_service_tree(&bfqq->entity); ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs, vtime %llu on %p", ++ served, ((st->vtime>>10)*1000)>>12, st); + } + + /** +- * bfq_bfqq_charge_full_budget - set the service to the entity budget. ++ * bfq_bfqq_charge_time - charge an amount of service equivalent to the length ++ * of the time interval during which bfqq has been in ++ * service. ++ * @bfqd: the device + * @bfqq: the queue that needs a service update. ++ * @time_ms: the amount of time during which the queue has received service ++ * ++ * If a queue does not consume its budget fast enough, then providing ++ * the queue with service fairness may impair throughput, more or less ++ * severely. For this reason, queues that consume their budget slowly ++ * are provided with time fairness instead of service fairness. This ++ * goal is achieved through the BFQ scheduling engine, even if such an ++ * engine works in the service, and not in the time domain. The trick ++ * is charging these queues with an inflated amount of service, equal ++ * to the amount of service that they would have received during their ++ * service slot if they had been fast, i.e., if their requests had ++ * been dispatched at a rate equal to the estimated peak rate. + * +- * When it's not possible to be fair in the service domain, because +- * a queue is not consuming its budget fast enough (the meaning of +- * fast depends on the timeout parameter), we charge it a full +- * budget. In this way we should obtain a sort of time-domain +- * fairness among all the seeky/slow queues. ++ * It is worth noting that time fairness can cause important ++ * distortions in terms of bandwidth distribution, on devices with ++ * internal queueing. The reason is that I/O requests dispatched ++ * during the service slot of a queue may be served after that service ++ * slot is finished, and may have a total processing time loosely ++ * correlated with the duration of the service slot. This is ++ * especially true for short service slots. 
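
Reduced to plain arithmetic, this is the charging rule that bfq_bfqq_charge_time() below implements; the sample inputs are invented:

    #include <stdio.h>

    /* Mirror of the tot_serv_to_charge computation in
     * bfq_bfqq_charge_time(): a queue that held the device for time_ms
     * is charged the service a fast queue would have consumed in the
     * same fraction of the timeout, never less than what it used. */
    static int serv_to_charge(int service, int max_budget,
                              unsigned long time_ms, unsigned long timeout_ms)
    {
        int tot = service;

        if (time_ms > 0 && time_ms < timeout_ms)
            tot = (int)(((unsigned long)max_budget * time_ms) / timeout_ms);
        if (tot < service)
            tot = service; /* never charge less than the service received */
        return tot;
    }

    int main(void)
    {
        /* slow queue: 64 of 16384 budget sectors used in 60 of 125 ms
         * -> charged as if it had consumed 7864 sectors */
        printf("slow queue charged: %d\n", serv_to_charge(64, 16384, 60, 125));
        /* a queue that consumed more than the inflated amount keeps
         * its real service figure */
        printf("fast queue charged: %d\n", serv_to_charge(9000, 16384, 60, 125));
        return 0;
    }
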
+ */
+-static void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
++static void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq,
++ unsigned long time_ms)
+ {
+ struct bfq_entity *entity = &bfqq->entity;
++ int tot_serv_to_charge = entity->service;
++ unsigned int timeout_ms = jiffies_to_msecs(bfq_timeout);
++
++ if (time_ms > 0 && time_ms < timeout_ms)
++ tot_serv_to_charge =
++ (bfqd->bfq_max_budget * time_ms) / timeout_ms;
++
++ if (tot_serv_to_charge < entity->service)
++ tot_serv_to_charge = entity->service;
+
+- bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");
++ bfq_log_bfqq(bfqq->bfqd, bfqq,
++ "charge_time: %lu/%u ms, %d/%d/%d sectors",
++ time_ms, timeout_ms, entity->service,
++ tot_serv_to_charge, entity->budget);
+
+- bfq_bfqq_served(bfqq, entity->budget - entity->service);
++ /* Increase budget to avoid inconsistencies */
++ if (tot_serv_to_charge > entity->budget)
++ entity->budget = tot_serv_to_charge;
++
++ bfq_bfqq_served(bfqq,
++ max_t(int, 0, tot_serv_to_charge - entity->service));
+ }
+
+ /**
+ * __bfq_activate_entity - activate an entity.
+ * @entity: the entity being activated.
++ * @non_blocking_wait_rq: true if this entity was waiting for a request
+ *
+ * Called whenever an entity is activated, i.e., it is not active and one
+ * of its children receives a new request, or has to be reactivated due to
+@@ -749,11 +821,16 @@ static void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
+ * service received if @entity is active) of the queue to calculate its
+ * timestamps.
+ */
+-static void __bfq_activate_entity(struct bfq_entity *entity)
++static void __bfq_activate_entity(struct bfq_entity *entity,
++ bool non_blocking_wait_rq)
+ {
+ struct bfq_sched_data *sd = entity->sched_data;
+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
++ bool backshifted = false;
+
++ BUG_ON(!sd);
++ BUG_ON(!st);
+ if (entity == sd->in_service_entity) {
+ BUG_ON(entity->tree);
+ /*
+@@ -771,45 +848,133 @@ static void __bfq_activate_entity(struct bfq_entity *entity)
+ * old start time.
+ */
+ bfq_active_extract(st, entity);
+- } else if (entity->tree == &st->idle) {
+- /*
+- * Must be on the idle tree, bfq_idle_extract() will
+- * check for that.
+- */
+- bfq_idle_extract(st, entity);
+- entity->start = bfq_gt(st->vtime, entity->finish) ?
+- st->vtime : entity->finish;
+ } else {
+- /*
+- * The finish time of the entity may be invalid, and
+- * it is in the past for sure, otherwise the queue
+- * would have been on the idle tree.
+- */
+- entity->start = st->vtime;
+- st->wsum += entity->weight;
+- bfq_get_entity(entity);
++ unsigned long long min_vstart;
+
+- BUG_ON(entity->on_st);
+- entity->on_st = 1;
++ /* See comments on bfq_bfqq_update_budg_for_activation */
++ if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) {
++ backshifted = true;
++ min_vstart = entity->finish;
++ } else
++ min_vstart = st->vtime;
++
++ if (entity->tree == &st->idle) {
++ /*
++ * Must be on the idle tree, bfq_idle_extract() will
++ * check for that.
++ */
++ bfq_idle_extract(st, entity);
++ entity->start = bfq_gt(min_vstart, entity->finish) ?
++ min_vstart : entity->finish;
++ } else {
++ /*
++ * The finish time of the entity may be invalid, and
++ * it is in the past for sure, otherwise the queue
++ * would have been on the idle tree.
++ */ ++ entity->start = min_vstart; ++ st->wsum += entity->weight; ++ bfq_get_entity(entity); ++ ++ BUG_ON(entity->on_st); ++ entity->on_st = 1; ++ } + } + + st = __bfq_entity_update_weight_prio(st, entity); + bfq_calc_finish(entity, entity->budget); ++ ++ /* ++ * If some queues enjoy backshifting for a while, then their ++ * (virtual) finish timestamps may happen to become lower and ++ * lower than the system virtual time. In particular, if ++ * these queues often happen to be idle for short time ++ * periods, and during such time periods other queues with ++ * higher timestamps happen to be busy, then the backshifted ++ * timestamps of the former queues can become much lower than ++ * the system virtual time. In fact, to serve the queues with ++ * higher timestamps while the ones with lower timestamps are ++ * idle, the system virtual time may be pushed-up to much ++ * higher values than the finish timestamps of the idle ++ * queues. As a consequence, the finish timestamps of all new ++ * or newly activated queues may end up being much larger than ++ * those of lucky queues with backshifted timestamps. The ++ * latter queues may then monopolize the device for a lot of ++ * time. This would simply break service guarantees. ++ * ++ * To reduce this problem, push up a little bit the ++ * backshifted timestamps of the queue associated with this ++ * entity (only a queue can happen to have the backshifted ++ * flag set): just enough to let the finish timestamp of the ++ * queue be equal to the current value of the system virtual ++ * time. This may introduce a little unfairness among queues ++ * with backshifted timestamps, but it does not break ++ * worst-case fairness guarantees. ++ * ++ * As a special case, if bfqq is weight-raised, push up ++ * timestamps much less, to keep very low the probability that ++ * this push up causes the backshifted finish timestamps of ++ * weight-raised queues to become higher than the backshifted ++ * finish timestamps of non weight-raised queues. ++ */ ++ if (backshifted && bfq_gt(st->vtime, entity->finish)) { ++ unsigned long delta = st->vtime - entity->finish; ++ ++ if (bfqq) ++ delta /= bfqq->wr_coeff; ++ ++ entity->start += delta; ++ entity->finish += delta; ++ ++ if (bfqq) { ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "__activate_entity: new queue finish %llu", ++ ((entity->finish>>10)*1000)>>12); ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ } else { ++ struct bfq_group *bfqg = ++ container_of(entity, struct bfq_group, entity); ++ ++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, ++ "__activate_entity: new group finish %llu", ++ ((entity->finish>>10)*1000)>>12); ++#endif ++ } ++ } ++ + bfq_active_insert(st, entity); ++ ++ if (bfqq) { ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "__activate_entity: queue %seligible in st %p", ++ entity->start <= st->vtime ? "" : "non ", st); ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ } else { ++ struct bfq_group *bfqg = ++ container_of(entity, struct bfq_group, entity); ++ ++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, ++ "__activate_entity: group %seligible in st %p", ++ entity->start <= st->vtime ? "" : "non ", st); ++#endif ++ } + } + + /** + * bfq_activate_entity - activate an entity and its ancestors if necessary. + * @entity: the entity to activate. ++ * @non_blocking_wait_rq: true if this entity was waiting for a request + * + * Activate @entity and all the entities on the path from it to the root. 
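
The push-up described above, isolated as a sketch; timestamps and coefficients are invented sample values (wr_coeff is 1 for a queue that is not weight-raised):

    #include <stdio.h>

    struct toy_entity { unsigned long long start, finish; };

    /* Mirror of the backshift correction in __bfq_activate_entity():
     * realign a lagging finish timestamp with the system virtual time,
     * damped by wr_coeff for weight-raised queues. */
    static void push_up(struct toy_entity *e, unsigned long long vtime,
                        unsigned int wr_coeff)
    {
        if (vtime > e->finish) {
            unsigned long long delta = vtime - e->finish;

            delta /= wr_coeff;
            e->start += delta;
            e->finish += delta;
        }
    }

    int main(void)
    {
        struct toy_entity plain = { 100, 180 }, raised = { 100, 180 };

        push_up(&plain, 1180, 1);   /* pushed up by the full lag, 1000 */
        push_up(&raised, 1180, 30); /* pushed up by only 1000/30 = 33 */
        printf("plain: %llu..%llu, raised: %llu..%llu\n",
               plain.start, plain.finish, raised.start, raised.finish);
        return 0;
    }
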
+ */
+-static void bfq_activate_entity(struct bfq_entity *entity)
++static void bfq_activate_entity(struct bfq_entity *entity,
++ bool non_blocking_wait_rq)
+ {
+ struct bfq_sched_data *sd;
+
+ for_each_entity(entity) {
+- __bfq_activate_entity(entity);
++ BUG_ON(!entity);
++ __bfq_activate_entity(entity, non_blocking_wait_rq);
+
+ sd = entity->sched_data;
+ if (!bfq_update_next_in_service(sd))
+@@ -890,23 +1055,24 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
+
+ if (!__bfq_deactivate_entity(entity, requeue))
+ /*
+- * The parent entity is still backlogged, and
+- * we don't need to update it as it is still
+- * in service.
++ * next_in_service has not been changed, so
++ * no upwards update is needed
+ */
+ break;
+
+ if (sd->next_in_service)
+ /*
+- * The parent entity is still backlogged and
+- * the budgets on the path towards the root
+- * need to be updated.
++ * The parent entity is still backlogged,
++ * because next_in_service is not NULL, and
++ * next_in_service has been updated (see
++ * comment on the body of the above if):
++ * upwards update of the schedule is needed.
+ */
+ goto update;
+
+ /*
+- * If we reach there the parent is no more backlogged and
+- * we want to propagate the dequeue upwards.
++ * If we get here, then the parent is no longer backlogged and
++ * we want to propagate the deactivation upwards.
+ */
+ requeue = 1;
+ }
+@@ -916,9 +1082,23 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
+ update:
+ entity = parent;
+ for_each_entity(entity) {
+- __bfq_activate_entity(entity);
++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
++ __bfq_activate_entity(entity, false);
+
+ sd = entity->sched_data;
++ if (bfqq)
++ bfq_log_bfqq(bfqq->bfqd, bfqq,
++ "invoking update_next for this queue");
++#ifdef CONFIG_BFQ_GROUP_IOSCHED
++ else {
++ struct bfq_group *bfqg =
++ container_of(entity,
++ struct bfq_group, entity);
++
++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
++ "invoking update_next for this entity");
++ }
++#endif
+ if (!bfq_update_next_in_service(sd))
+ break;
+ }
+@@ -997,10 +1177,11 @@ left:
+ * Update the virtual time in @st and return the first eligible entity
+ * it contains.
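
The activation and deactivation loops above climb the hierarchy and stop as soon as bfq_update_next_in_service() reports no change at a level. A toy model of that early-exit walk (entities, sched_data and the lookup are stand-ins):

    #include <stdio.h>
    #include <stdbool.h>

    struct toy_sched_data;

    struct toy_entity {
        struct toy_entity *parent;
        struct toy_sched_data *sched_data;
    };

    struct toy_sched_data {
        const char *name;
        struct toy_entity *next_in_service;
        struct toy_entity *candidate; /* stand-in for the tree lookup */
    };

    /* Stand-in for bfq_update_next_in_service(): report whether the
     * head-of-line entity changed at this level. */
    static bool update_next_in_service(struct toy_sched_data *sd)
    {
        if (sd->next_in_service == sd->candidate)
            return false;
        sd->next_in_service = sd->candidate;
        printf("%s: next_in_service updated\n", sd->name);
        return true;
    }

    static void activate_entity(struct toy_entity *entity)
    {
        for (; entity; entity = entity->parent)
            if (!update_next_in_service(entity->sched_data))
                break; /* upper levels are already up to date */
    }

    int main(void)
    {
        struct toy_sched_data root_sd = { "root", NULL, NULL };
        struct toy_sched_data group_sd = { "group", NULL, NULL };
        struct toy_entity group = { NULL, &root_sd };
        struct toy_entity queue = { &group, &group_sd };

        group_sd.candidate = &queue;
        root_sd.candidate = &group;

        activate_entity(&queue); /* updates both levels */
        activate_entity(&queue); /* stops at the first level */
        return 0;
    }
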
+ */ +-static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st, +- bool force) ++static struct bfq_entity * ++__bfq_lookup_next_entity(struct bfq_service_tree *st, bool force) + { + struct bfq_entity *entity, *new_next_in_service = NULL; ++ struct bfq_queue *bfqq; + + if (RB_EMPTY_ROOT(&st->active)) + return NULL; +@@ -1009,6 +1190,24 @@ static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st, + entity = bfq_first_active_entity(st); + BUG_ON(bfq_gt(entity->start, st->vtime)); + ++ bfqq = bfq_entity_to_bfqq(entity); ++ if (bfqq) ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "__lookup_next: start %llu vtime %llu st %p", ++ ((entity->start>>10)*1000)>>12, ++ ((st->vtime>>10)*1000)>>12, st); ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ else { ++ struct bfq_group *bfqg = ++ container_of(entity, struct bfq_group, entity); ++ ++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, ++ "__lookup_next: start %llu vtime %llu st %p", ++ ((entity->start>>10)*1000)>>12, ++ ((st->vtime>>10)*1000)>>12, st); ++ } ++#endif ++ + /* + * If the chosen entity does not match with the sched_data's + * next_in_service and we are forcedly serving the IDLE priority +@@ -1045,10 +1244,28 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, + BUG_ON(sd->in_service_entity); + + if (bfqd && +- jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) { ++ jiffies - bfqd->bfq_class_idle_last_service > ++ BFQ_CL_IDLE_TIMEOUT) { + entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, + true); + if (entity) { ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ if (bfqq) ++ bfq_log_bfqq(bfqd, bfqq, ++ "idle chosen from st %p %d", ++ st + BFQ_IOPRIO_CLASSES - 1, ++ BFQ_IOPRIO_CLASSES - 1) ; ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ else { ++ struct bfq_group *bfqg = ++ container_of(entity, struct bfq_group, entity); ++ ++ bfq_log_bfqg(bfqd, bfqg, ++ "idle chosen from st %p %d", ++ st + BFQ_IOPRIO_CLASSES - 1, ++ BFQ_IOPRIO_CLASSES - 1) ; ++ } ++#endif + i = BFQ_IOPRIO_CLASSES - 1; + bfqd->bfq_class_idle_last_service = jiffies; + sd->next_in_service = entity; +@@ -1057,6 +1274,24 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, + for (; i < BFQ_IOPRIO_CLASSES; i++) { + entity = __bfq_lookup_next_entity(st + i, false); + if (entity) { ++ if (bfqd != NULL) { ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ if (bfqq) ++ bfq_log_bfqq(bfqd, bfqq, ++ "chosen from st %p %d", ++ st + i, i) ; ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ else { ++ struct bfq_group *bfqg = ++ container_of(entity, struct bfq_group, entity); ++ ++ bfq_log_bfqg(bfqd, bfqg, ++ "chosen from st %p %d", ++ st + i, i) ; ++ } ++#endif ++ } ++ + if (extract) { + bfq_check_next_in_service(sd, entity); + bfq_active_extract(st + i, entity); +@@ -1070,6 +1305,13 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, + return entity; + } + ++static bool next_queue_may_preempt(struct bfq_data *bfqd) ++{ ++ struct bfq_sched_data *sd = &bfqd->root_group->sched_data; ++ ++ return sd->next_in_service != sd->in_service_entity; ++} ++ + /* + * Get next queue for service. 
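
bfq_lookup_next_entity() above scans the per-class trees in priority order (RT, BE, IDLE), except that the IDLE class is served first when it has been starved longer than BFQ_CL_IDLE_TIMEOUT. A compressed model, with queues reduced to per-class flags and a stand-in timeout value:

    #include <stdio.h>

    #define CLASSES 3          /* 0 = RT, 1 = BE, 2 = IDLE */
    #define CL_IDLE_TIMEOUT 112 /* stand-in, in jiffies */

    /* Pick the first backlogged class in priority order, unless the
     * IDLE class has waited past the anti-starvation timeout, in which
     * case it is picked unconditionally. */
    static int lookup_next_class(const int backlogged[CLASSES],
                                 unsigned long now,
                                 unsigned long idle_last_service)
    {
        if (backlogged[CLASSES - 1] &&
            now - idle_last_service > CL_IDLE_TIMEOUT)
            return CLASSES - 1; /* also refresh idle_last_service here */
        for (int i = 0; i < CLASSES; i++)
            if (backlogged[i])
                return i;
        return -1;
    }

    int main(void)
    {
        int backlogged[CLASSES] = { 1, 1, 1 };

        printf("%d\n", lookup_next_class(backlogged, 100, 50)); /* 0: RT */
        printf("%d\n", lookup_next_class(backlogged, 300, 50)); /* 2: IDLE */
        return 0;
    }
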
+ */ +@@ -1086,7 +1328,36 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) + + sd = &bfqd->root_group->sched_data; + for (; sd ; sd = entity->my_sched_data) { ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ if (entity) { ++ struct bfq_group *bfqg = ++ container_of(entity, struct bfq_group, entity); ++ ++ bfq_log_bfqg(bfqd, bfqg, ++ "get_next_queue: lookup in this group"); ++ } else ++ bfq_log_bfqg(bfqd, bfqd->root_group, ++ "get_next_queue: lookup in root group"); ++#endif ++ + entity = bfq_lookup_next_entity(sd, 1, bfqd); ++ ++ bfqq = bfq_entity_to_bfqq(entity); ++ if (bfqq) ++ bfq_log_bfqq(bfqd, bfqq, ++ "get_next_queue: this queue, finish %llu", ++ (((entity->finish>>10)*1000)>>10)>>2); ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ else { ++ struct bfq_group *bfqg = ++ container_of(entity, struct bfq_group, entity); ++ ++ bfq_log_bfqg(bfqd, bfqg, ++ "get_next_queue: this entity, finish %llu", ++ (((entity->finish>>10)*1000)>>10)>>2); ++ } ++#endif ++ + BUG_ON(!entity); + entity->service = 0; + } +@@ -1113,9 +1384,7 @@ static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, + { + struct bfq_entity *entity = &bfqq->entity; + +- if (bfqq == bfqd->in_service_queue) +- __bfq_bfqd_reset_in_service(bfqd); +- ++ BUG_ON(bfqq == bfqd->in_service_queue); + bfq_deactivate_entity(entity, requeue); + } + +@@ -1123,12 +1392,11 @@ static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) + { + struct bfq_entity *entity = &bfqq->entity; + +- bfq_activate_entity(entity); ++ bfq_activate_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq)); ++ bfq_clear_bfqq_non_blocking_wait_rq(bfqq); + } + +-#ifdef CONFIG_BFQ_GROUP_IOSCHED + static void bfqg_stats_update_dequeue(struct bfq_group *bfqg); +-#endif + + /* + * Called when the bfqq no longer has requests pending, remove it from +@@ -1139,6 +1407,7 @@ static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, + { + BUG_ON(!bfq_bfqq_busy(bfqq)); + BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); ++ BUG_ON(bfqq == bfqd->in_service_queue); + + bfq_log_bfqq(bfqd, bfqq, "del from busy"); + +@@ -1147,27 +1416,20 @@ static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, + BUG_ON(bfqd->busy_queues == 0); + bfqd->busy_queues--; + +- if (!bfqq->dispatched) { ++ if (!bfqq->dispatched) + bfq_weights_tree_remove(bfqd, &bfqq->entity, + &bfqd->queue_weights_tree); +- if (!blk_queue_nonrot(bfqd->queue)) { +- BUG_ON(!bfqd->busy_in_flight_queues); +- bfqd->busy_in_flight_queues--; +- if (bfq_bfqq_constantly_seeky(bfqq)) { +- BUG_ON(!bfqd-> +- const_seeky_busy_in_flight_queues); +- bfqd->const_seeky_busy_in_flight_queues--; +- } +- } +- } ++ + if (bfqq->wr_coeff > 1) + bfqd->wr_busy_queues--; + +-#ifdef CONFIG_BFQ_GROUP_IOSCHED + bfqg_stats_update_dequeue(bfqq_group(bfqq)); +-#endif + ++ BUG_ON(bfqq->entity.budget < 0); ++ + bfq_deactivate_bfqq(bfqd, bfqq, requeue); ++ ++ BUG_ON(bfqq->entity.budget < 0); + } + + /* +@@ -1185,16 +1447,11 @@ static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) + bfq_mark_bfqq_busy(bfqq); + bfqd->busy_queues++; + +- if (!bfqq->dispatched) { ++ if (!bfqq->dispatched) + if (bfqq->wr_coeff == 1) + bfq_weights_tree_add(bfqd, &bfqq->entity, + &bfqd->queue_weights_tree); +- if (!blk_queue_nonrot(bfqd->queue)) { +- bfqd->busy_in_flight_queues++; +- if (bfq_bfqq_constantly_seeky(bfqq)) +- bfqd->const_seeky_busy_in_flight_queues++; +- } +- } ++ + if (bfqq->wr_coeff > 1) + bfqd->wr_busy_queues++; + } +diff --git a/block/bfq.h b/block/bfq.h +index 
f73c942..c6ba099 100644 +--- a/block/bfq.h ++++ b/block/bfq.h +@@ -1,5 +1,5 @@ + /* +- * BFQ-v7r11 for 4.5.0: data structures and common functions prototypes. ++ * BFQ-v8r2 for 4.7.0: data structures and common functions prototypes. + * + * Based on ideas and code from CFQ: + * Copyright (C) 2003 Jens Axboe +@@ -28,20 +28,21 @@ + + #define BFQ_DEFAULT_QUEUE_IOPRIO 4 + +-#define BFQ_DEFAULT_GRP_WEIGHT 10 ++#define BFQ_WEIGHT_LEGACY_DFL 100 + #define BFQ_DEFAULT_GRP_IOPRIO 0 + #define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE + ++/* ++ * Soft real-time applications are extremely more latency sensitive ++ * than interactive ones. Over-raise the weight of the former to ++ * privilege them against the latter. ++ */ ++#define BFQ_SOFTRT_WEIGHT_FACTOR 100 ++ + struct bfq_entity; + + /** + * struct bfq_service_tree - per ioprio_class service tree. +- * @active: tree for active entities (i.e., those backlogged). +- * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i). +- * @first_idle: idle entity with minimum F_i. +- * @last_idle: idle entity with maximum F_i. +- * @vtime: scheduler virtual time. +- * @wsum: scheduler weight sum; active and idle entities contribute to it. + * + * Each service tree represents a B-WF2Q+ scheduler on its own. Each + * ioprio_class has its own independent scheduler, and so its own +@@ -49,27 +50,28 @@ struct bfq_entity; + * of the containing bfqd. + */ + struct bfq_service_tree { ++ /* tree for active entities (i.e., those backlogged) */ + struct rb_root active; ++ /* tree for idle entities (i.e., not backlogged, with V <= F_i)*/ + struct rb_root idle; + +- struct bfq_entity *first_idle; +- struct bfq_entity *last_idle; ++ struct bfq_entity *first_idle; /* idle entity with minimum F_i */ ++ struct bfq_entity *last_idle; /* idle entity with maximum F_i */ + +- u64 vtime; ++ u64 vtime; /* scheduler virtual time */ ++ /* scheduler weight sum; active and idle entities contribute to it */ + unsigned long wsum; + }; + + /** + * struct bfq_sched_data - multi-class scheduler. +- * @in_service_entity: entity in service. +- * @next_in_service: head-of-the-line entity in the scheduler. +- * @service_tree: array of service trees, one per ioprio_class. + * + * bfq_sched_data is the basic scheduler queue. It supports three +- * ioprio_classes, and can be used either as a toplevel queue or as +- * an intermediate queue on a hierarchical setup. +- * @next_in_service points to the active entity of the sched_data +- * service trees that will be scheduled next. ++ * ioprio_classes, and can be used either as a toplevel queue or as an ++ * intermediate queue on a hierarchical setup. @next_in_service ++ * points to the active entity of the sched_data service trees that ++ * will be scheduled next. It is used to reduce the number of steps ++ * needed for each hierarchical-schedule update. + * + * The supported ioprio_classes are the same as in CFQ, in descending + * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. +@@ -79,48 +81,29 @@ struct bfq_service_tree { + * All the fields are protected by the queue lock of the containing bfqd. 
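
Tying these weight constants to the revised bfq_ioprio_to_weight() earlier in this patch: assuming the usual IOPRIO_BE_NR = 8 and BFQ_WEIGHT_CONVERSION_COEFF = 10, the v8r2 mapping spreads weights proportionally instead of clustering them near 80, and weight raising then multiplies the result (wr_coeff defaults to 30 per bfq_init_queue() above; composing it with BFQ_SOFTRT_WEIGHT_FACTOR for soft real-time queues is an assumption here, since that code is not in this hunk):

    #include <stdio.h>

    #define IOPRIO_BE_NR 8
    #define BFQ_WEIGHT_CONVERSION_COEFF 10 /* assumed, as in bfq.h */
    #define BFQ_SOFTRT_WEIGHT_FACTOR 100

    static unsigned int v7r11_weight(int ioprio)
    {
        return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - ioprio;
    }

    static unsigned int v8r2_weight(int ioprio)
    {
        return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF;
    }

    int main(void)
    {
        for (int p = 0; p < IOPRIO_BE_NR; p++)
            printf("ioprio %d: old w=%2u new w=%2u\n",
                   p, v7r11_weight(p), v8r2_weight(p));

        /* weight raising on top of the base weight (wr_coeff = 30) */
        printf("interactive raised: %u\n", v8r2_weight(4) * 30);
        printf("soft rt raised:     %u\n",
               v8r2_weight(4) * 30 * BFQ_SOFTRT_WEIGHT_FACTOR);
        return 0;
    }
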
+ */ + struct bfq_sched_data { +- struct bfq_entity *in_service_entity; ++ struct bfq_entity *in_service_entity; /* entity in service */ ++ /* head-of-the-line entity in the scheduler (see comments above) */ + struct bfq_entity *next_in_service; ++ /* array of service trees, one per ioprio_class */ + struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; + }; + + /** + * struct bfq_weight_counter - counter of the number of all active entities + * with a given weight. +- * @weight: weight of the entities that this counter refers to. +- * @num_active: number of active entities with this weight. +- * @weights_node: weights tree member (see bfq_data's @queue_weights_tree +- * and @group_weights_tree). + */ + struct bfq_weight_counter { +- short int weight; +- unsigned int num_active; ++ unsigned int weight; /* weight of the entities this counter refers to */ ++ unsigned int num_active; /* nr of active entities with this weight */ ++ /* ++ * Weights tree member (see bfq_data's @queue_weights_tree and ++ * @group_weights_tree) ++ */ + struct rb_node weights_node; + }; + + /** + * struct bfq_entity - schedulable entity. +- * @rb_node: service_tree member. +- * @weight_counter: pointer to the weight counter associated with this entity. +- * @on_st: flag, true if the entity is on a tree (either the active or +- * the idle one of its service_tree). +- * @finish: B-WF2Q+ finish timestamp (aka F_i). +- * @start: B-WF2Q+ start timestamp (aka S_i). +- * @tree: tree the entity is enqueued into; %NULL if not on a tree. +- * @min_start: minimum start time of the (active) subtree rooted at +- * this entity; used for O(log N) lookups into active trees. +- * @service: service received during the last round of service. +- * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight. +- * @weight: weight of the queue +- * @parent: parent entity, for hierarchical scheduling. +- * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the +- * associated scheduler queue, %NULL on leaf nodes. +- * @sched_data: the scheduler queue this entity belongs to. +- * @ioprio: the ioprio in use. +- * @new_weight: when a weight change is requested, the new weight value. +- * @orig_weight: original weight, used to implement weight boosting +- * @prio_changed: flag, true when the user requested a weight, ioprio or +- * ioprio_class change. + * + * A bfq_entity is used to represent either a bfq_queue (leaf node in the + * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each +@@ -147,27 +130,52 @@ struct bfq_weight_counter { + * containing bfqd. + */ + struct bfq_entity { +- struct rb_node rb_node; ++ struct rb_node rb_node; /* service_tree member */ ++ /* pointer to the weight counter associated with this entity */ + struct bfq_weight_counter *weight_counter; + ++ /* ++ * flag, true if the entity is on a tree (either the active or ++ * the idle one of its service_tree). 
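
The finish and start fields that follow are the B-WF2Q+ timestamps computed by bfq_calc_finish() in bfq-sched.c: F_i = S_i + service/weight in fixed point. A sketch, assuming the 22-bit service shift used by bfq_delta() (the define itself is not in this hunk):

    #include <stdio.h>

    #define WFQ_SERVICE_SHIFT 22 /* assumed fixed-point shift */

    /* Fixed-point service/weight, mirroring bfq_delta() */
    static unsigned long long toy_delta(unsigned long service,
                                        unsigned long weight)
    {
        return ((unsigned long long)service << WFQ_SERVICE_SHIFT) / weight;
    }

    int main(void)
    {
        unsigned long long start = 0;
        unsigned long service = 8192; /* sectors served in the slot */

        /* F_i = S_i + delta(service, weight): heavier entities get
         * earlier (smaller) finish timestamps. The patch's log lines
         * shift these values right by 10 and then 12 bits (22 total)
         * to print them in readable units. */
        for (unsigned long w = 10; w <= 80; w *= 2)
            printf("weight %2lu: finish %llu (logged as %llu)\n", w,
                   start + toy_delta(service, w),
                   ((toy_delta(service, w) >> 10) * 1000) >> 12);
        return 0;
    }
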
++ */ + int on_st; + +- u64 finish; +- u64 start; ++ u64 finish; /* B-WF2Q+ finish timestamp (aka F_i) */ ++ u64 start; /* B-WF2Q+ start timestamp (aka S_i) */ + ++ /* tree the entity is enqueued into; %NULL if not on a tree */ + struct rb_root *tree; + ++ /* ++ * minimum start time of the (active) subtree rooted at this ++ * entity; used for O(log N) lookups into active trees ++ */ + u64 min_start; + +- int service, budget; +- unsigned short weight, new_weight; +- unsigned short orig_weight; ++ /* amount of service received during the last service slot */ ++ int service; ++ ++ /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */ ++ int budget; ++ ++ unsigned int weight; /* weight of the queue */ ++ unsigned int new_weight; /* next weight if a change is in progress */ ++ ++ /* original weight, used to implement weight boosting */ ++ unsigned int orig_weight; + ++ /* parent entity, for hierarchical scheduling */ + struct bfq_entity *parent; + ++ /* ++ * For non-leaf nodes in the hierarchy, the associated ++ * scheduler queue, %NULL on leaf nodes. ++ */ + struct bfq_sched_data *my_sched_data; ++ /* the scheduler queue this entity belongs to */ + struct bfq_sched_data *sched_data; + ++ /* flag, set to request a weight, ioprio or ioprio_class change */ + int prio_changed; + }; + +@@ -175,56 +183,6 @@ struct bfq_group; + + /** + * struct bfq_queue - leaf schedulable entity. +- * @ref: reference counter. +- * @bfqd: parent bfq_data. +- * @new_ioprio: when an ioprio change is requested, the new ioprio value. +- * @ioprio_class: the ioprio_class in use. +- * @new_ioprio_class: when an ioprio_class change is requested, the new +- * ioprio_class value. +- * @new_bfqq: shared bfq_queue if queue is cooperating with +- * one or more other queues. +- * @pos_node: request-position tree member (see bfq_group's @rq_pos_tree). +- * @pos_root: request-position tree root (see bfq_group's @rq_pos_tree). +- * @sort_list: sorted list of pending requests. +- * @next_rq: if fifo isn't expired, next request to serve. +- * @queued: nr of requests queued in @sort_list. +- * @allocated: currently allocated requests. +- * @meta_pending: pending metadata requests. +- * @fifo: fifo list of requests in sort_list. +- * @entity: entity representing this queue in the scheduler. +- * @max_budget: maximum budget allowed from the feedback mechanism. +- * @budget_timeout: budget expiration (in jiffies). +- * @dispatched: number of requests on the dispatch list or inside driver. +- * @flags: status flags. +- * @bfqq_list: node for active/idle bfqq list inside our bfqd. +- * @burst_list_node: node for the device's burst list. +- * @seek_samples: number of seeks sampled +- * @seek_total: sum of the distances of the seeks sampled +- * @seek_mean: mean seek distance +- * @last_request_pos: position of the last request enqueued +- * @requests_within_timer: number of consecutive pairs of request completion +- * and arrival, such that the queue becomes idle +- * after the completion, but the next request arrives +- * within an idle time slice; used only if the queue's +- * IO_bound has been cleared. +- * @pid: pid of the process owning the queue, used for logging purposes. 
+- * @last_wr_start_finish: start time of the current weight-raising period if +- * the @bfq-queue is being weight-raised, otherwise +- * finish time of the last weight-raising period +- * @wr_cur_max_time: current max raising time for this queue +- * @soft_rt_next_start: minimum time instant such that, only if a new +- * request is enqueued after this time instant in an +- * idle @bfq_queue with no outstanding requests, then +- * the task associated with the queue it is deemed as +- * soft real-time (see the comments to the function +- * bfq_bfqq_softrt_next_start()) +- * @last_idle_bklogged: time of the last transition of the @bfq_queue from +- * idle to backlogged +- * @service_from_backlogged: cumulative service received from the @bfq_queue +- * since the last transition from idle to +- * backlogged +- * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the +- * queue is shared + * + * A bfq_queue is a leaf request queue; it can be associated with an + * io_context or more, if it is async or shared between cooperating +@@ -235,117 +193,163 @@ struct bfq_group; + * All the fields are protected by the queue lock of the containing bfqd. + */ + struct bfq_queue { +- atomic_t ref; ++ /* reference counter */ ++ int ref; ++ /* parent bfq_data */ + struct bfq_data *bfqd; + +- unsigned short ioprio, new_ioprio; +- unsigned short ioprio_class, new_ioprio_class; ++ /* current ioprio and ioprio class */ ++ unsigned short ioprio, ioprio_class; ++ /* next ioprio and ioprio class if a change is in progress */ ++ unsigned short new_ioprio, new_ioprio_class; + +- /* fields for cooperating queues handling */ ++ /* ++ * Shared bfq_queue if queue is cooperating with one or more ++ * other queues. ++ */ + struct bfq_queue *new_bfqq; ++ /* request-position tree member (see bfq_group's @rq_pos_tree) */ + struct rb_node pos_node; ++ /* request-position tree root (see bfq_group's @rq_pos_tree) */ + struct rb_root *pos_root; + ++ /* sorted list of pending requests */ + struct rb_root sort_list; ++ /* if fifo isn't expired, next request to serve */ + struct request *next_rq; ++ /* number of sync and async requests queued */ + int queued[2]; ++ /* number of sync and async requests currently allocated */ + int allocated[2]; ++ /* number of pending metadata requests */ + int meta_pending; ++ /* fifo list of requests in sort_list */ + struct list_head fifo; + ++ /* entity representing this queue in the scheduler */ + struct bfq_entity entity; + ++ /* maximum budget allowed from the feedback mechanism */ + int max_budget; ++ /* budget expiration (in jiffies) */ + unsigned long budget_timeout; + ++ /* number of requests on the dispatch list or inside driver */ + int dispatched; + +- unsigned int flags; ++ unsigned int flags; /* status flags.*/ + ++ /* node for active/idle bfqq list inside parent bfqd */ + struct list_head bfqq_list; + ++ /* bit vector: a 1 for each seeky requests in history */ ++ u32 seek_history; ++ ++ /* node for the device's burst list */ + struct hlist_node burst_list_node; + +- unsigned int seek_samples; +- u64 seek_total; +- sector_t seek_mean; ++ /* position of the last request enqueued */ + sector_t last_request_pos; + ++ /* Number of consecutive pairs of request completion and ++ * arrival, such that the queue becomes idle after the ++ * completion, but the next request arrives within an idle ++ * time slice; used only if the queue's IO_bound flag has been ++ * cleared. 
++ */
+ unsigned int requests_within_timer;
+
++ /* pid of the process owning the queue, used for logging purposes */
+ pid_t pid;
++
++ /*
++ * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL
++ * if the queue is shared.
++ */
+ struct bfq_io_cq *bic;
+
+- /* weight-raising fields */
++ /* current maximum weight-raising time for this queue */
+ unsigned long wr_cur_max_time;
++ /*
++ * Minimum time instant such that, only if a new request is
++ * enqueued after this time instant in an idle @bfq_queue with
++ * no outstanding requests, then the task associated with the
++ * queue is deemed as soft real-time (see the comments on
++ * the function bfq_bfqq_softrt_next_start())
++ */
+ unsigned long soft_rt_next_start;
++ /*
++ * Start time of the current weight-raising period if
++ * the @bfq_queue is being weight-raised, otherwise
++ * finish time of the last weight-raising period.
++ */
+ unsigned long last_wr_start_finish;
++ /* factor by which the weight of this queue is multiplied */
+ unsigned int wr_coeff;
++ /*
++ * Time of the last transition of the @bfq_queue from idle to
++ * backlogged.
++ */
+ unsigned long last_idle_bklogged;
++ /*
++ * Cumulative service received from the @bfq_queue since the
++ * last transition from idle to backlogged.
++ */
+ unsigned long service_from_backlogged;
++
++ unsigned long split_time; /* time of last split */
+ };
+
+ /**
+ * struct bfq_ttime - per process thinktime stats.
+- * @ttime_total: total process thinktime
+- * @ttime_samples: number of thinktime samples
+- * @ttime_mean: average process thinktime
+ */
+ struct bfq_ttime {
+- unsigned long last_end_request;
++ unsigned long last_end_request; /* completion time of last request */
++
++ unsigned long ttime_total; /* total process thinktime */
++ unsigned long ttime_samples; /* number of thinktime samples */
++ unsigned long ttime_mean; /* average process thinktime */
+
+- unsigned long ttime_total;
+- unsigned long ttime_samples;
+- unsigned long ttime_mean;
+ };
+
+ /**
+ * struct bfq_io_cq - per (request_queue, io_context) structure.
+- * @icq: associated io_cq structure
+- * @bfqq: array of two process queues, the sync and the async
+- * @ttime: associated @bfq_ttime struct
+- * @ioprio: per (request_queue, blkcg) ioprio.
+- * @blkcg_id: id of the blkcg the related io_cq belongs to.
+- * @wr_time_left: snapshot of the time left before weight raising ends +- * for the sync queue associated to this process; this +- * snapshot is taken to remember this value while the weight +- * raising is suspended because the queue is merged with a +- * shared queue, and is used to set @raising_cur_max_time +- * when the queue is split from the shared queue and its +- * weight is raised again +- * @saved_idle_window: same purpose as the previous field for the idle +- * window +- * @saved_IO_bound: same purpose as the previous two fields for the I/O +- * bound classification of a queue +- * @saved_in_large_burst: same purpose as the previous fields for the +- * value of the field keeping the queue's belonging +- * to a large burst +- * @was_in_burst_list: true if the queue belonged to a burst list +- * before its merge with another cooperating queue +- * @cooperations: counter of consecutive successful queue merges underwent +- * by any of the process' @bfq_queues +- * @failed_cooperations: counter of consecutive failed queue merges of any +- * of the process' @bfq_queues + */ + struct bfq_io_cq { ++ /* associated io_cq structure */ + struct io_cq icq; /* must be the first member */ ++ /* array of two process queues, the sync and the async */ + struct bfq_queue *bfqq[2]; ++ /* associated @bfq_ttime struct */ + struct bfq_ttime ttime; ++ /* per (request_queue, blkcg) ioprio */ + int ioprio; +- + #ifdef CONFIG_BFQ_GROUP_IOSCHED +- uint64_t blkcg_id; /* the current blkcg ID */ ++ uint64_t blkcg_serial_nr; /* the current blkcg serial */ + #endif + +- unsigned int wr_time_left; ++ /* ++ * Snapshot of the idle window before merging; taken to ++ * remember this value while the queue is merged, so as to be ++ * able to restore it in case of split. ++ */ + bool saved_idle_window; ++ /* ++ * Same purpose as the previous two fields for the I/O bound ++ * classification of a queue. ++ */ + bool saved_IO_bound; + ++ /* ++ * Same purpose as the previous fields for the value of the ++ * field keeping the queue's belonging to a large burst ++ */ + bool saved_in_large_burst; ++ /* ++ * True if the queue belonged to a burst list before its merge ++ * with another cooperating queue. ++ */ + bool was_in_burst_list; +- +- unsigned int cooperations; +- unsigned int failed_cooperations; + }; + + enum bfq_device_speed { +@@ -354,224 +358,216 @@ enum bfq_device_speed { + }; + + /** +- * struct bfq_data - per device data structure. +- * @queue: request queue for the managed device. +- * @root_group: root bfq_group for the device. +- * @active_numerous_groups: number of bfq_groups containing more than one +- * active @bfq_entity. +- * @queue_weights_tree: rbtree of weight counters of @bfq_queues, sorted by +- * weight. Used to keep track of whether all @bfq_queues +- * have the same weight. The tree contains one counter +- * for each distinct weight associated to some active +- * and not weight-raised @bfq_queue (see the comments to +- * the functions bfq_weights_tree_[add|remove] for +- * further details). +- * @group_weights_tree: rbtree of non-queue @bfq_entity weight counters, sorted +- * by weight. Used to keep track of whether all +- * @bfq_groups have the same weight. The tree contains +- * one counter for each distinct weight associated to +- * some active @bfq_group (see the comments to the +- * functions bfq_weights_tree_[add|remove] for further +- * details). +- * @busy_queues: number of bfq_queues containing requests (including the +- * queue in service, even if it is idling). 
+- * @busy_in_flight_queues: number of @bfq_queues containing pending or +- * in-flight requests, plus the @bfq_queue in +- * service, even if idle but waiting for the +- * possible arrival of its next sync request. This +- * field is updated only if the device is rotational, +- * but used only if the device is also NCQ-capable. +- * The reason why the field is updated also for non- +- * NCQ-capable rotational devices is related to the +- * fact that the value of @hw_tag may be set also +- * later than when busy_in_flight_queues may need to +- * be incremented for the first time(s). Taking also +- * this possibility into account, to avoid unbalanced +- * increments/decrements, would imply more overhead +- * than just updating busy_in_flight_queues +- * regardless of the value of @hw_tag. +- * @const_seeky_busy_in_flight_queues: number of constantly-seeky @bfq_queues +- * (that is, seeky queues that expired +- * for budget timeout at least once) +- * containing pending or in-flight +- * requests, including the in-service +- * @bfq_queue if constantly seeky. This +- * field is updated only if the device +- * is rotational, but used only if the +- * device is also NCQ-capable (see the +- * comments to @busy_in_flight_queues). +- * @wr_busy_queues: number of weight-raised busy @bfq_queues. +- * @queued: number of queued requests. +- * @rq_in_driver: number of requests dispatched and waiting for completion. +- * @sync_flight: number of sync requests in the driver. +- * @max_rq_in_driver: max number of reqs in driver in the last +- * @hw_tag_samples completed requests. +- * @hw_tag_samples: nr of samples used to calculate hw_tag. +- * @hw_tag: flag set to one if the driver is showing a queueing behavior. +- * @budgets_assigned: number of budgets assigned. +- * @idle_slice_timer: timer set when idling for the next sequential request +- * from the queue in service. +- * @unplug_work: delayed work to restart dispatching on the request queue. +- * @in_service_queue: bfq_queue in service. +- * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue. +- * @last_position: on-disk position of the last served request. +- * @last_budget_start: beginning of the last budget. +- * @last_idling_start: beginning of the last idle slice. +- * @peak_rate: peak transfer rate observed for a budget. +- * @peak_rate_samples: number of samples used to calculate @peak_rate. +- * @bfq_max_budget: maximum budget allotted to a bfq_queue before +- * rescheduling. +- * @active_list: list of all the bfq_queues active on the device. +- * @idle_list: list of all the bfq_queues idle on the device. +- * @bfq_fifo_expire: timeout for async/sync requests; when it expires +- * requests are served in fifo order. +- * @bfq_back_penalty: weight of backward seeks wrt forward ones. +- * @bfq_back_max: maximum allowed backward seek. +- * @bfq_slice_idle: maximum idling time. +- * @bfq_user_max_budget: user-configured max budget value +- * (0 for auto-tuning). +- * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to +- * async queues. +- * @bfq_timeout: timeout for bfq_queues to consume their budget; used to +- * to prevent seeky queues to impose long latencies to well +- * behaved ones (this also implies that seeky queues cannot +- * receive guarantees in the service domain; after a timeout +- * they are charged for the whole allocated budget, to try +- * to preserve a behavior reasonably fair among them, but +- * without service-domain guarantees). 
+- * @bfq_coop_thresh: number of queue merges after which a @bfq_queue is +- * no more granted any weight-raising. +- * @bfq_failed_cooperations: number of consecutive failed cooperation +- * chances after which weight-raising is restored +- * to a queue subject to more than bfq_coop_thresh +- * queue merges. +- * @bfq_requests_within_timer: number of consecutive requests that must be +- * issued within the idle time slice to set +- * again idling to a queue which was marked as +- * non-I/O-bound (see the definition of the +- * IO_bound flag for further details). +- * @last_ins_in_burst: last time at which a queue entered the current +- * burst of queues being activated shortly after +- * each other; for more details about this and the +- * following parameters related to a burst of +- * activations, see the comments to the function +- * @bfq_handle_burst. +- * @bfq_burst_interval: reference time interval used to decide whether a +- * queue has been activated shortly after +- * @last_ins_in_burst. +- * @burst_size: number of queues in the current burst of queue activations. +- * @bfq_large_burst_thresh: maximum burst size above which the current +- * queue-activation burst is deemed as 'large'. +- * @large_burst: true if a large queue-activation burst is in progress. +- * @burst_list: head of the burst list (as for the above fields, more details +- * in the comments to the function bfq_handle_burst). +- * @low_latency: if set to true, low-latency heuristics are enabled. +- * @bfq_wr_coeff: maximum factor by which the weight of a weight-raised +- * queue is multiplied. +- * @bfq_wr_max_time: maximum duration of a weight-raising period (jiffies). +- * @bfq_wr_rt_max_time: maximum duration for soft real-time processes. +- * @bfq_wr_min_idle_time: minimum idle period after which weight-raising +- * may be reactivated for a queue (in jiffies). +- * @bfq_wr_min_inter_arr_async: minimum period between request arrivals +- * after which weight-raising may be +- * reactivated for an already busy queue +- * (in jiffies). +- * @bfq_wr_max_softrt_rate: max service-rate for a soft real-time queue, +- * sectors per seconds. +- * @RT_prod: cached value of the product R*T used for computing the maximum +- * duration of the weight raising automatically. +- * @device_speed: device-speed class for the low-latency heuristic. +- * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions. ++ * struct bfq_data - per-device data structure. + * + * All the fields are protected by the @queue lock. + */ + struct bfq_data { ++ /* request queue for the device */ + struct request_queue *queue; + ++ /* root bfq_group for the device */ + struct bfq_group *root_group; + +-#ifdef CONFIG_BFQ_GROUP_IOSCHED +- int active_numerous_groups; +-#endif +- ++ /* ++ * rbtree of weight counters of @bfq_queues, sorted by ++ * weight. Used to keep track of whether all @bfq_queues have ++ * the same weight. The tree contains one counter for each ++ * distinct weight associated to some active and not ++ * weight-raised @bfq_queue (see the comments to the functions ++ * bfq_weights_tree_[add|remove] for further details). ++ */ + struct rb_root queue_weights_tree; ++ /* ++ * rbtree of non-queue @bfq_entity weight counters, sorted by ++ * weight. Used to keep track of whether all @bfq_groups have ++ * the same weight. The tree contains one counter for each ++ * distinct weight associated to some active @bfq_group (see ++ * the comments to the functions bfq_weights_tree_[add|remove] ++ * for further details). 
++ */ + struct rb_root group_weights_tree; + ++ /* ++ * Number of bfq_queues containing requests (including the ++ * queue in service, even if it is idling). ++ */ + int busy_queues; +- int busy_in_flight_queues; +- int const_seeky_busy_in_flight_queues; ++ /* number of weight-raised busy @bfq_queues */ + int wr_busy_queues; ++ /* number of queued requests */ + int queued; ++ /* number of requests dispatched and waiting for completion */ + int rq_in_driver; +- int sync_flight; + ++ /* ++ * Maximum number of requests in driver in the last ++ * @hw_tag_samples completed requests. ++ */ + int max_rq_in_driver; ++ /* number of samples used to calculate hw_tag */ + int hw_tag_samples; ++ /* flag set to one if the driver is showing a queueing behavior */ + int hw_tag; + ++ /* number of budgets assigned */ + int budgets_assigned; + ++ /* ++ * Timer set when idling (waiting) for the next request from ++ * the queue in service. ++ */ + struct timer_list idle_slice_timer; ++ /* delayed work to restart dispatching on the request queue */ + struct work_struct unplug_work; + ++ /* bfq_queue in service */ + struct bfq_queue *in_service_queue; ++ /* bfq_io_cq (bic) associated with the @in_service_queue */ + struct bfq_io_cq *in_service_bic; + ++ /* on-disk position of the last served request */ + sector_t last_position; + ++ /* beginning of the last budget */ + ktime_t last_budget_start; ++ /* beginning of the last idle slice */ + ktime_t last_idling_start; ++ /* number of samples used to calculate @peak_rate */ + int peak_rate_samples; ++ /* peak transfer rate observed for a budget */ + u64 peak_rate; ++ /* maximum budget allotted to a bfq_queue before rescheduling */ + int bfq_max_budget; + ++ /* list of all the bfq_queues active on the device */ + struct list_head active_list; ++ /* list of all the bfq_queues idle on the device */ + struct list_head idle_list; + ++ /* ++ * Timeout for async/sync requests; when it fires, requests ++ * are served in fifo order. ++ */ + unsigned int bfq_fifo_expire[2]; ++ /* weight of backward seeks wrt forward ones */ + unsigned int bfq_back_penalty; ++ /* maximum allowed backward seek */ + unsigned int bfq_back_max; ++ /* maximum idling time */ + unsigned int bfq_slice_idle; ++ /* last time CLASS_IDLE was served */ + u64 bfq_class_idle_last_service; + ++ /* user-configured max budget value (0 for auto-tuning) */ + int bfq_user_max_budget; +- int bfq_max_budget_async_rq; +- unsigned int bfq_timeout[2]; +- +- unsigned int bfq_coop_thresh; +- unsigned int bfq_failed_cooperations; ++ /* ++ * Timeout for bfq_queues to consume their budget; used to ++ * prevent seeky queues from imposing long latencies to ++ * sequential or quasi-sequential ones (this also implies that ++ * seeky queues cannot receive guarantees in the service ++ * domain; after a timeout they are charged for the time they ++ * have been in service, to preserve fairness among them, but ++ * without service-domain guarantees). ++ */ ++ unsigned int bfq_timeout; ++ ++ /* ++ * Number of consecutive requests that must be issued within ++ * the idle time slice to set again idling to a queue which ++ * was marked as non-I/O-bound (see the definition of the ++ * IO_bound flag for further details). ++ */ + unsigned int bfq_requests_within_timer; + ++ /* ++ * Force device idling whenever needed to provide accurate ++ * service guarantees, without caring about throughput ++ * issues. CAVEAT: this may even increase latencies, in case ++ * of useless idling for processes that did stop doing I/O. 
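
This flag is coupled to idling by bfq_strict_guarantees_store() shown earlier: enabling it while slice_idle is under 8 ms first raises slice_idle. The same invariant as a stand-alone check (times kept in milliseconds for simplicity):

    #include <stdio.h>
    #include <stdbool.h>

    struct toy_bfqd {
        bool strict_guarantees;
        unsigned long slice_idle_ms;
    };

    /* Mirror of bfq_strict_guarantees_store(): strict guarantees only
     * make sense with a non-negligible idling window, so enabling the
     * flag enforces a floor of 8 ms on slice_idle. */
    static void set_strict_guarantees(struct toy_bfqd *d, unsigned long v)
    {
        if (v > 1)
            v = 1;
        if (!d->strict_guarantees && v == 1 && d->slice_idle_ms < 8)
            d->slice_idle_ms = 8;
        d->strict_guarantees = v;
    }

    int main(void)
    {
        struct toy_bfqd d = { false, 0 }; /* idling initially disabled */

        set_strict_guarantees(&d, 1);
        printf("strict=%d slice_idle=%lums\n",
               d.strict_guarantees, d.slice_idle_ms);
        return 0;
    }
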
++ */ ++ bool strict_guarantees; ++ ++ /* ++ * Last time at which a queue entered the current burst of ++ * queues being activated shortly after each other; for more ++ * details about this and the following parameters related to ++ * a burst of activations, see the comments on the function ++ * bfq_handle_burst. ++ */ + unsigned long last_ins_in_burst; ++ /* ++ * Reference time interval used to decide whether a queue has ++ * been activated shortly after @last_ins_in_burst. ++ */ + unsigned long bfq_burst_interval; ++ /* number of queues in the current burst of queue activations */ + int burst_size; ++ ++ /* common parent entity for the queues in the burst */ ++ struct bfq_entity *burst_parent_entity; ++ /* Maximum burst size above which the current queue-activation ++ * burst is deemed as 'large'. ++ */ + unsigned long bfq_large_burst_thresh; ++ /* true if a large queue-activation burst is in progress */ + bool large_burst; ++ /* ++ * Head of the burst list (as for the above fields, more ++ * details in the comments on the function bfq_handle_burst). ++ */ + struct hlist_head burst_list; + ++ /* if set to true, low-latency heuristics are enabled */ + bool low_latency; +- +- /* parameters of the low_latency heuristics */ ++ /* ++ * Maximum factor by which the weight of a weight-raised queue ++ * is multiplied. ++ */ + unsigned int bfq_wr_coeff; ++ /* maximum duration of a weight-raising period (jiffies) */ + unsigned int bfq_wr_max_time; ++ ++ /* Maximum weight-raising duration for soft real-time processes */ + unsigned int bfq_wr_rt_max_time; ++ /* ++ * Minimum idle period after which weight-raising may be ++ * reactivated for a queue (in jiffies). ++ */ + unsigned int bfq_wr_min_idle_time; ++ /* ++ * Minimum period between request arrivals after which ++ * weight-raising may be reactivated for an already busy async ++ * queue (in jiffies). ++ */ + unsigned long bfq_wr_min_inter_arr_async; ++ ++ /* Max service-rate for a soft real-time queue, in sectors/sec */ + unsigned int bfq_wr_max_softrt_rate; ++ /* ++ * Cached value of the product R*T, used for computing the ++ * maximum duration of weight raising automatically. ++ */ + u64 RT_prod; ++ /* device-speed class for the low-latency heuristic */ + enum bfq_device_speed device_speed; + ++ /* fallback dummy bfqq for extreme OOM conditions */ + struct bfq_queue oom_bfqq; + }; + + enum bfqq_state_flags { +- BFQ_BFQQ_FLAG_busy = 0, /* has requests or is in service */ ++ BFQ_BFQQ_FLAG_just_created = 0, /* queue just allocated */ ++ BFQ_BFQQ_FLAG_busy, /* has requests or is in service */ + BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ ++ BFQ_BFQQ_FLAG_non_blocking_wait_rq, /* ++ * waiting for a request ++ * without idling the device ++ */ + BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ + BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ + BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */ + BFQ_BFQQ_FLAG_sync, /* synchronous queue */ +- BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */ + BFQ_BFQQ_FLAG_IO_bound, /* + * bfqq has timed-out at least once + * having consumed at most 2/10 of +@@ -581,17 +577,12 @@ enum bfqq_state_flags { + * bfqq activated in a large burst, + * see comments to bfq_handle_burst. 
+ */ +- BFQ_BFQQ_FLAG_constantly_seeky, /* +- * bfqq has proved to be slow and +- * seeky until budget timeout +- */ + BFQ_BFQQ_FLAG_softrt_update, /* + * may need softrt-next-start + * update + */ + BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ +- BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */ +- BFQ_BFQQ_FLAG_just_split, /* queue has just been split */ ++ BFQ_BFQQ_FLAG_split_coop /* shared bfqq will be split */ + }; + + #define BFQ_BFQQ_FNS(name) \ +@@ -608,25 +599,53 @@ static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ + return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ + } + ++BFQ_BFQQ_FNS(just_created); + BFQ_BFQQ_FNS(busy); + BFQ_BFQQ_FNS(wait_request); ++BFQ_BFQQ_FNS(non_blocking_wait_rq); + BFQ_BFQQ_FNS(must_alloc); + BFQ_BFQQ_FNS(fifo_expire); + BFQ_BFQQ_FNS(idle_window); + BFQ_BFQQ_FNS(sync); +-BFQ_BFQQ_FNS(budget_new); + BFQ_BFQQ_FNS(IO_bound); + BFQ_BFQQ_FNS(in_large_burst); +-BFQ_BFQQ_FNS(constantly_seeky); + BFQ_BFQQ_FNS(coop); + BFQ_BFQQ_FNS(split_coop); +-BFQ_BFQQ_FNS(just_split); + BFQ_BFQQ_FNS(softrt_update); + #undef BFQ_BFQQ_FNS + + /* Logging facilities. */ +-#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ +- blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args) ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); ++static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); ++ ++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ ++ char __pbuf[128]; \ ++ \ ++ assert_spin_locked((bfqd)->queue->queue_lock); \ ++ blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ ++ blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \ ++ (bfqq)->pid, \ ++ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ ++ __pbuf, ##args); \ ++} while (0) ++ ++#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ ++ char __pbuf[128]; \ ++ \ ++ blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ ++ blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \ ++} while (0) ++ ++#else /* CONFIG_BFQ_GROUP_IOSCHED */ ++ ++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ ++ blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \ ++ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ ++ ##args) ++#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) ++ ++#endif /* CONFIG_BFQ_GROUP_IOSCHED */ + + #define bfq_log(bfqd, fmt, args...) 
\ + blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) +@@ -640,15 +659,12 @@ enum bfqq_expiration { + BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ + BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ + BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ ++ BFQ_BFQQ_PREEMPTED /* preemption in progress */ + }; + +-#ifdef CONFIG_BFQ_GROUP_IOSCHED + + struct bfqg_stats { +- /* total bytes transferred */ +- struct blkg_rwstat service_bytes; +- /* total IOs serviced, post merge */ +- struct blkg_rwstat serviced; ++#ifdef CONFIG_BFQ_GROUP_IOSCHED + /* number of ios merged */ + struct blkg_rwstat merged; + /* total time spent on device in ns, may not be accurate w/ queueing */ +@@ -657,12 +673,8 @@ struct bfqg_stats { + struct blkg_rwstat wait_time; + /* number of IOs queued up */ + struct blkg_rwstat queued; +- /* total sectors transferred */ +- struct blkg_stat sectors; + /* total disk time and nr sectors dispatched by this group */ + struct blkg_stat time; +- /* time not charged to this cgroup */ +- struct blkg_stat unaccounted_time; + /* sum of number of ios queued across all samples */ + struct blkg_stat avg_queue_size_sum; + /* count of samples taken for average */ +@@ -680,8 +692,10 @@ struct bfqg_stats { + uint64_t start_idle_time; + uint64_t start_empty_time; + uint16_t flags; ++#endif + }; + ++#ifdef CONFIG_BFQ_GROUP_IOSCHED + /* + * struct bfq_group_data - per-blkcg storage for the blkio subsystem. + * +@@ -692,7 +706,7 @@ struct bfq_group_data { + /* must be the first member */ + struct blkcg_policy_data pd; + +- unsigned short weight; ++ unsigned int weight; + }; + + /** +@@ -712,7 +726,7 @@ struct bfq_group_data { + * unused for the root group. Used to know whether there + * are groups with more than one active @bfq_entity + * (see the comments to the function +- * bfq_bfqq_must_not_expire()). ++ * bfq_bfqq_may_idle()). + * @rq_pos_tree: rbtree sorted by next_request position, used when + * determining if two or more queues have interleaving + * requests (see bfq_find_close_cooperator()). +@@ -745,7 +759,6 @@ struct bfq_group { + struct rb_root rq_pos_tree; + + struct bfqg_stats stats; +- struct bfqg_stats dead_stats; /* stats pushed from dead children */ + }; + + #else +@@ -767,11 +780,25 @@ bfq_entity_service_tree(struct bfq_entity *entity) + struct bfq_sched_data *sched_data = entity->sched_data; + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + unsigned int idx = bfqq ? bfqq->ioprio_class - 1 : +- BFQ_DEFAULT_GRP_CLASS; ++ BFQ_DEFAULT_GRP_CLASS - 1; + + BUG_ON(idx >= BFQ_IOPRIO_CLASSES); + BUG_ON(sched_data == NULL); + ++ if (bfqq) ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "entity_service_tree %p %d", ++ sched_data->service_tree + idx, idx) ; ++#ifdef CONFIG_BFQ_GROUP_IOSCHED ++ else { ++ struct bfq_group *bfqg = ++ container_of(entity, struct bfq_group, entity); ++ ++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, ++ "entity_service_tree %p %d", ++ sched_data->service_tree + idx, idx) ; ++ } ++#endif + return sched_data->service_tree + idx; + } + +@@ -791,47 +818,6 @@ static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) + return bic->icq.q->elevator->elevator_data; + } + +-/** +- * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer. +- * @ptr: a pointer to a bfqd. +- * @flags: storage for the flags to be saved. 
+- * +- * This function allows bfqg->bfqd to be protected by the +- * queue lock of the bfqd they reference; the pointer is dereferenced +- * under RCU, so the storage for bfqd is assured to be safe as long +- * as the RCU read side critical section does not end. After the +- * bfqd->queue->queue_lock is taken the pointer is rechecked, to be +- * sure that no other writer accessed it. If we raced with a writer, +- * the function returns NULL, with the queue unlocked, otherwise it +- * returns the dereferenced pointer, with the queue locked. +- */ +-static struct bfq_data *bfq_get_bfqd_locked(void **ptr, unsigned long *flags) +-{ +- struct bfq_data *bfqd; +- +- rcu_read_lock(); +- bfqd = rcu_dereference(*(struct bfq_data **)ptr); +- +- if (bfqd != NULL) { +- spin_lock_irqsave(bfqd->queue->queue_lock, *flags); +- if (ptr == NULL) +- printk(KERN_CRIT "get_bfqd_locked pointer NULL\n"); +- else if (*ptr == bfqd) +- goto out; +- spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); +- } +- +- bfqd = NULL; +-out: +- rcu_read_unlock(); +- return bfqd; +-} +- +-static void bfq_put_bfqd_unlock(struct bfq_data *bfqd, unsigned long *flags) +-{ +- spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); +-} +- + #ifdef CONFIG_BFQ_GROUP_IOSCHED + + static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) +@@ -857,11 +843,13 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); + static void bfq_put_queue(struct bfq_queue *bfqq); + static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); + static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, +- struct bio *bio, int is_sync, +- struct bfq_io_cq *bic, gfp_t gfp_mask); ++ struct bio *bio, bool is_sync, ++ struct bfq_io_cq *bic); + static void bfq_end_wr_async_queues(struct bfq_data *bfqd, + struct bfq_group *bfqg); ++#ifdef CONFIG_BFQ_GROUP_IOSCHED + static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); ++#endif + static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); + + #endif /* _BFQ_H */ +-- +1.9.1 + diff --git a/README.BFQ b/README.BFQ new file mode 100644 index 0000000..2414a10 --- /dev/null +++ b/README.BFQ @@ -0,0 +1,539 @@ +Budget Fair Queueing I/O Scheduler +================================== + +This patchset introduces BFQ-v8r2 into Linux 4.7.0. +For further information: http://algogroup.unimore.it/people/paolo/disk_sched/. + +The overall diffstat is the following: + + block/Kconfig.iosched | 30 + + block/Makefile | 1 + + block/bfq-cgroup.c | 1178 +++++++++++++++++++++ + block/bfq-ioc.c | 36 + + block/bfq-iosched.c | 4895 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + block/bfq-sched.c | 1450 ++++++++++++++++++++++++++ + block/bfq.h | 848 +++++++++++++++ + include/linux/blkdev.h | 2 +- + 8 files changed, 8439 insertions(+), 1 deletion(-) + +CHANGELOG + +v8r2 + +. BUGFIX Removed variables that are not used if tracing is + disabled. Reported by Lee Tibbert + +. IMPROVEMENT Ported commit ae11889636: turned blkg_lookup_create into + blkg_lookup. As a side benefit, this finally enables BFQ to be used + as a module even with full hierarchical support. + +v8r1 + +. BUGFIX Fixed incorrect invariant check + +. IMPROVEMENT Privileged soft real-time applications against + interactive ones, to guarantee a lower and more stable latency to + the former + +v8 + +. BUGFIX: Fixed incorrect rcu locking in bfq_bic_update_cgroup + +. BUGFIX Fixed a few cgroups-related bugs, causing sporadic crashes + +. 
BUGFIX Fixed wrong computation of queue weights as a function of ioprios
+
+. BUGFIX Fixed wrong Kconfig.iosched dependency for BFQ_GROUP_IOSCHED
+
+. IMPROVEMENT Preemption-based, idle-less service guarantees. If
+ several processes are competing for the device at the same time, but
+ all processes and groups have the same weight, then the mechanism
+ introduced by this improvement enables BFQ to guarantee the expected
+ throughput distribution without ever idling the device. Throughput
+ is then much higher in this common scenario.
+
+. IMPROVEMENT Made burst handling more robust
+
+. IMPROVEMENT Reduced false positives in EQM
+
+. IMPROVEMENT Let queues preserve weight-raising also when shared
+
+. IMPROVEMENT Improved peak-rate estimation and autotuning of the
+ parameters related to the device rate
+
+. IMPROVEMENT Improved the weight-raising mechanism so as to further
+ reduce latency and to increase robustness
+
+. IMPROVEMENT Added a strict-guarantees tunable. If this tunable is
+ set, then device-idling is forced whenever needed to provide
+ accurate service guarantees. CAVEAT: idling unconditionally may even
+ increase latencies, in case of processes that did stop doing I/O.
+
+. IMPROVEMENT Improved handling of async (write) I/O requests
+
+. IMPROVEMENT Ported several good CFQ commits
+
+. CHANGE Changed default group weight to 100
+
+. CODE IMPROVEMENT Refactored I/O-request-insertion code
+
+v7r11:
+. BUGFIX Remove the group_list data structure, which ended up in an
+ inconsistent state if BFQ happened to be activated for some device
+ when some blkio groups already existed (these groups were not added
+ to the list). The blkg list for the request queue is now used where
+ the removed group_list was used.
+
+. BUGFIX Init and reset also dead_stats.
+
+. BUGFIX Added, in __bfq_deactivate_entity, the correct handling of the
+ case where the entity to deactivate has not yet been activated at all.
+
+. BUGFIX Added missing free of the root group for the case where full
+ hierarchical support is not activated.
+
+. IMPROVEMENT Removed the now useless bfq_disconnect_groups
+ function. The same functionality is achieved through multiple
+ invocations of bfq_pd_offline (which are in their turn guaranteed to
+ be executed, when needed, by the blk-cgroups code).
+
+v7r10:
+. BUGFIX: Fixed wrong check on whether cooperating processes belong
+ to the same cgroup.
+
+v7r9:
+. IMPROVEMENT: Changed BFQ to use the blkio controller instead of its
+ own controller. BFQ now registers itself as a policy to the blkio
+ controller and implements its hierarchical scheduling support using
+ data structures that already exist in blk-cgroup. The bfqio
+ controller's code is completely removed.
+
+. CODE IMPROVEMENTS: Applied all suggestions from Tejun Heo, received
+ on the last submission to lkml: https://lkml.org/lkml/2014/5/27/314.
+
+v7r8:
+. BUGFIX: Let weight-related fields of a bfq_entity be correctly initialized
+ (also) when the I/O priority of the entity is changed before the first
+ request is inserted into the bfq_queue associated to the entity.
+. BUGFIX: When merging requests belonging to different bfq_queues, avoid
+ repositioning the surviving request. In fact, in this case the repositioning
+ may result in the surviving request being moved across bfq_queues, which
+ would ultimately cause bfq_queues' data structures to become inconsistent.
+. 
BUGFIX: When merging requests belonging to the same bfq_queue, reposition
+ the surviving request so that it gets in the correct position, namely the
+ position of the dropped request, instead of always being moved to the head
+ of the FIFO of the bfq_queue (which means letting the request be considered
+ the eldest one).
+. BUGFIX: Reduce the idling slice for seeky queues only if the scenario is
+ symmetric. This guarantees that also processes associated to seeky queues
+ do receive their reserved share of the throughput.
+ Contributed by Riccardo Pizzetti and Samuele Zecchini.
+. IMPROVEMENT: Always perform device idling if the scenario is asymmetric in
+ terms of throughput distribution among processes.
+ This extends throughput-distribution guarantees to any process, regardless
+ of the properties of its request pattern and of the request patterns of the
+ other processes, and regardless of whether the device is NCQ-capable.
+. IMPROVEMENT: Remove the current limitation on the maximum number of in-flight
+ requests allowed for a sync queue (limitation set in place for fairness
+ issues in CFQ, inherited by the first version of BFQ, but made unnecessary
+ by the latest accurate fairness strategies added to BFQ). Removing this
+ limitation enables devices with long internal queues to fill their queues
+ as much as they deem appropriate, also with sync requests. This avoids
+ throughput losses on these devices, because, to achieve a high throughput,
+ they often need to have a high number of requests queued internally.
+. CODE IMPROVEMENT: Simplify I/O priority change logic by turning it into a
+ single-step procedure instead of a two-step one; improve readability by
+ rethinking the names of the functions involved in changing the I/O priority
+ of a bfq_queue.
+
+v7r7:
+. BUGFIX: Prevent the OOM queue from being involved in the queue
+ cooperation mechanism. In fact, since the requests temporarily
+ redirected to the OOM queue could be redirected again to dedicated
+ queues at any time, the state needed to correctly handle merging
+ with the OOM queue would be quite complex and expensive to
+ maintain. Besides, in such a critical condition as an out of
+ memory, the benefits of queue merging may be of little relevance, or
+ even negligible.
+. IMPROVEMENT: Let the OOM queue be initialized only once. Previously,
+ the OOM queue was reinitialized, at each request enqueue, with the
+ parameters related to the process that issued that request.
+ Depending on the parameters of the processes doing I/O, this could
+ easily cause the OOM queue to be moved continuously across service
+ trees, or even across groups. It also caused the parameters of the
+ OOM queue to be continuously reset in any case.
+. CODE IMPROVEMENT. Performed some minor code cleanups, and added some
+ BUG_ON()s that, if the weight of an entity becomes inconsistent,
+ should better help understand why.
+
+v7r6:
+. IMPROVEMENT: Introduced a new mechanism that helps get the job done
+ more quickly with services and applications that create or reactivate
+ many parallel I/O-bound processes. This is the case, for example, with
+ systemd at boot, or with commands like git grep.
+. CODE IMPROVEMENTS: Small code cleanups and improvements.
+
+v7r5:
+. IMPROVEMENT: Improve throughput boosting by idling the device
+ only for processes that, in addition to performing sequential I/O,
+ are I/O-bound (apart from weight-raised queues, for which idling
+ is always performed to guarantee them a low latency).
+. 
IMPROVEMENT: Improve throughput boosting by depriving processes
+ that cooperate often of weight-raising.
+. CODE IMPROVEMENT: A readability pass over both
+ comments and actual code.
+
+v7r4:
+. BUGFIX. Modified the code so as to be robust against late detection of
+ NCQ support for a rotational device.
+. BUGFIX. Removed a bug that hindered the correct throughput distribution
+ on flash-based devices when not every process had to receive the same
+ fraction of the throughput. This fix entailed also a little efficiency
+ improvement, because it implied the removal of a short function executed
+ in a hot path.
+. CODESTYLE IMPROVEMENT: removed quoted strings split across lines.
+
+v7r3:
+. IMPROVEMENT: Improved throughput boosting with NCQ-capable HDDs and
+ random workloads. The mechanism that further boosts throughput with
+ these devices and workloads is activated only in the cases where it
+ does not cause any violation of throughput-distribution and latency
+ guarantees.
+. IMPROVEMENT: Generalized the computation of the parameters of the
+ low-latency heuristic for interactive applications, so as to fit also
+ slower storage devices. The purpose of this improvement is to preserve
+ low-latency guarantees for interactive applications also on slower
+ devices, such as portable hard disks, multimedia and SD cards.
+. BUGFIX: Re-added MODULE_LICENSE macro.
+. CODE IMPROVEMENTS: Small code cleanups; introduced a coherent naming
+ scheme for all identifiers related to weight raising; refactored and
+ optimized a few hot paths.
+
+v7r2:
+. BUGFIX/IMPROVEMENT. One of the requirements for an application to be
+ deemed as soft real-time is that it issues its requests in batches, and
+ stops doing I/O for a well-defined amount of time before issuing a new
+ batch. Imposing this minimum idle time allows BFQ to filter out I/O-bound
+ applications that may otherwise be incorrectly deemed as soft real-time
+ (under the circumstances described in detail in the comments to the
+ function bfq_bfqq_softrt_next_start()). Unfortunately, BFQ could however
+ start counting this idle time from two different events: either from the
+ expiration of the queue, if all requests of the queue had also been already
+ completed when the queue expired, or, if the previous condition did not
+ hold, from the first completion of one of the still outstanding requests.
+ In the second case, an application had more chances to be deemed as soft
+ real-time.
+ Actually, there was no reason for this differentiated treatment. We
+ addressed this issue by defining more precisely the above requirement for
+ an application to be deemed as soft real-time, and changing the code
+ accordingly: a well-defined amount of time must elapse between the
+ completion of *all the requests* of the current pending batch and the
+ issuing of the first request of the next batch (this is, in the end, what
+ happens with a true soft real-time application). This change further
+ reduced false positives, and, as such, improved responsiveness and reduced
+ latency for actual soft real-time applications.
+. CODE IMPROVEMENT. We cleaned up the code a little bit and addressed
+ some issues pointed out by the checkpatch.pl script.
+
+v7r1:
+. BUGFIX. Replace the old value used to approximate 'infinity' with
+ the correct one to use in case times are compared through the macro
+ time_is_before_jiffies(). 
In fact, this macro, designed to take
+ wraparound issues into account, easily returns anomalous results if
+ its argument is equal to the value that we used as an approximation
+ of 'infinity', namely ((unsigned long) (-1)). The consequence was
+ that the logical expression used to determine whether a queue
+ belongs to a soft real-time application often yielded an incorrect
+ result. In the end, some application happened to be incorrectly
+ deemed as soft real-time and hence weight-raised. This affected both
+ throughput and latency guarantees.
+. BUGFIX. Fixed a scrivener's error made in an attempt to use the
+ above macro in a logical expression.
+. IMPROVEMENT/BUGFIX. On the expiration of a queue, use a more general
+ condition to allow a weight-raising period to start if the queue is
+ soft real-time. The previous condition could prevent an empty,
+ soft real-time queue from being correctly deemed as soft real-time.
+. IMPROVEMENT/MINOR BUGFIX. Use jiffies-comparison macros also in the
+ following cases:
+ . to establish whether an application initially deemed as interactive
+ is now meeting the requirements for being classified as soft
+ real-time;
+ . to determine if a weight-raising period must be ended.
+. CODE IMPROVEMENT. Change the type of the time quantities used in the
+ weight-raising heuristics to unsigned long, as the type of the time
+ (jiffies) is unsigned long.
+
+v7:
+- IMPROVEMENT: In the presence of weight-raised queues and if the
+ device is NCQ-enabled, device idling is now disabled for non-raised
+ readers, i.e., for their associated sync queues. Hence a sync queue
+ is expired immediately if it becomes empty, and a new queue is
+ served. As explained in detail in the papers about BFQ, not idling
+ the device for sync queues when the latter become empty causes BFQ to
+ assign higher timestamps to these queues when they get backlogged
+ again, and hence to serve these queues less frequently. This fact,
+ plus the fact that, because of the immediate expiration itself,
+ these queues get less service while they are granted access to the
+ disk, reduces the relative rate at which the processes associated to
+ these queues ask for requests from the I/O request pool. If the pool
+ is saturated, as it happens in the presence of write hogs, reducing
+ the above relative rate increases the probability that a request is
+ available (soon) in the pool when a weight-raised process needs it.
+ This change does seem to mitigate the typical starvation problems
+ that occur in the presence of write hogs and NCQ, and hence to
+ guarantee a higher application and system responsiveness in these
+ hostile scenarios.
+- IMPROVEMENT/BUGFIX: Introduced a new classification rule to the soft
+ real-time heuristic, which takes into account also the isochronous
+ nature of such applications. The computation of next_start has been
+ fixed as well. Now it is correctly done from the time of the last
+ transition from idle to backlogged; the next_start is therefore
+ computed from the service received by the queue from its last
+ transition from idle to backlogged. Finally, the code which
+ preserved weight-raising for a soft real-time queue even with no
+ idle->backlogged transition has been removed.
+- IMPROVEMENT: Add a few jiffies to the reference time interval used to
+ establish whether an application is greedy or not. 
This reference
+ interval was, by default, HZ/125 seconds, which could generate false
+ positives in the following two cases (especially if both cases occur):
+ 1) If HZ is so low that the duration of a jiffy is comparable to or
+ higher than the above reference time interval. This happens, e.g.,
+ on slow devices with HZ=100.
+ 2) If jiffies, instead of increasing at a constant rate, stop
+ increasing for some time, then suddenly 'jump' by several units to
+ recover the lost increments. This seems to happen, e.g., in virtual
+ machines.
+ The added number of jiffies has been found experimentally. In particular,
+ according to our experiments, adding this number of jiffies seems to make
+ the filter quite precise also in embedded systems and KVM/QEMU virtual
+ machines. Also contributed by
+ Alexander Spyridakis .
+- IMPROVEMENT/BUGFIX: Keep disk idling also for NCQ-provided
+ rotational devices, which boosts the throughput on NCQ-enabled
+ rotational devices.
+- BUGFIX: The budget-timeout condition in the bfq_rq_enqueued() function
+ was checked only if the request is large enough to provoke an unplug. As
+ a consequence, for a process always issuing small I/O requests the
+ budget timeout was never checked. The queue associated to the process
+ therefore expired only when its budget was exhausted, even if the
+ queue had already incurred a budget timeout a while before.
+ This fix lets a queue be checked for budget timeout at each request
+ enqueue, and, if needed, expires the queue accordingly even if the
+ request is small.
+- BUGFIX: Make sure that weight-raising is resumed for a split queue,
+ if it was merged when already weight-raised.
+- MINOR BUGFIX: Let bfq_end_raising_async() correctly end weight-raising
+ also for the queues belonging to the root group.
+- IMPROVEMENT: Get rid of the some_coop_idle flag, which in its turn
+ was used to decide whether to disable idling for an in-service
+ shared queue whose seek mean decreased. In fact, disabling idling
+ for such a queue turned out to be useless.
+- CODE IMPROVEMENT: The bfq_bfqq_must_idle() function and the
+ bfq_select_queue() function may not change the current in-service
+ queue in various cases. We have cleaned up the involved conditions,
+ by factoring out the common parts and getting rid of the useless
+ ones.
+- MINOR CODE IMPROVEMENT: The idle_for_long_time condition in the
+ bfq_add_rq_rb() function should be evaluated only on an
+ idle->backlogged transition. Now the condition is set to false
+ by default, evaluating it only if the queue was not busy on a
+ request insertion.
+- MINOR CODE IMPROVEMENT: Added a comment describing the rationale
+ behind the condition evaluated in the function
+ bfq_bfqq_must_not_expire().
+
+v6r2:
+- Fairness fix: the case of queue expiration for budget timeout is
+ now correctly handled also for sync queues, thus allowing also
+ the processes corresponding to these queues to be guaranteed their
+ reserved share of the disk throughput.
+- Fixed a bug that prevented group weights from being correctly
+ set via the sysfs interface.
+- Fixed a bug that cleared a previously-set group weight if the
+ same value was re-inserted via the sysfs interface.
+- Fixed an EQM bug that allowed a newly-started process to skip
+ its initial weight-raising period if its queue was merged before
+ its first request was inserted.
+- Fixed a bug that preserved already-started weight-raising periods
+ even if the low_latency tunable was disabled. 
+- The raising_max_time tunable now shows the maximum raising time in
+ milliseconds, which is more user-friendly.
+
+v6r1:
+- Fix use-after-free of queues in __bfq_bfqq_expire(). It may happen that
+ a call to bfq_del_bfqq_busy() puts the last reference taken on a queue
+ and frees it. Subsequent accesses to that same queue would result in a
+ use-after-free. Make sure that a queue that has just been deleted from
+ busy is no longer touched.
+- Use the uninitialized_var() macro when needed. It may happen that a
+ variable is initialized in a function that is called by the function
+ that defined it. Use the uninitialized_var() macro in these cases.
+
+v6:
+- Replacement of the cooperating-queue merging mechanism borrowed from
+ CFQ with Early Queue Merge (EQM), a unified mechanism to get a
+ sequential read pattern, and hence a high throughput, with any set of
+ processes performing interleaved I/O. EQM also preserves low latency.
+ (see http://algogroup.unimore.it/people/paolo/disk_sched/description.php
+ for more details). Contributed by Mauro Andreolini and Arianna Avanzini.
+ The code for detecting whether two queues have to be merged is a
+ slightly modified version of the CFQ code for detecting whether two
+ queues belong to cooperating processes and whether the service of a
+ queue should be preempted to boost the throughput.
+- Fix a bug that caused the peak rate of a disk to be computed as zero
+ in case of multiple I/O errors. Subsequent estimations of the weight
+ raising duration caused a division-by-zero error.
+
+v5r1:
+- BUG FIX: Fixed stall occurring when the active queue is moved to
+ a different group while idling (this caused the idling timer to be
+ cancelled and hence no new queue to be selected, and no new
+ request to be dispatched).
+- BUG FIX: Fixed wrong assignment of too high budgets to queues during
+ the first few seconds after initialization.
+- BUG FIX: Added proper locking to the function handling the "weights"
+ tunable.
+
+v5:
+- Added a heuristic that, if the tunable raising_max_time is set to
+ 0, automatically computes the duration of the weight raising
+ according to the estimated peak rate of the device. This enables
+ flash-based devices to reach maximum throughput as soon as possible,
+ without sacrificing latency.
+
+v4:
+- Throughput-boosting for flash-based devices: improved version of commits
+ a68bbdd and f7d7b7a, which boosts the throughput while still preserving
+ latency guarantees for interactive and soft real-time applications.
+- Better identification of NCQ-capable disks: port of commit e459dd0.
+
+v3-r4:
+- Bugfixes
+ * Removed an important memory leak: under some circumstances the process references
+ to a queue were not decremented correctly, which prevented unused shared bfq_queues
+ from being correctly deallocated.
+ * Fixed various errors related to hierarchical scheduling:
+ * Removed an error causing tasks to be attached to the bfqio cgroup
+ controller even when BFQ was not the active scheduler
+ * Corrected wrong update of the budgets from the leaf to the root upon
+ forced selection of a service tree or a bfq_queue
+ * Fixed the way active leaf entities are moved to the root group before
+ the group entity is deactivated when a cgroup is destroyed
+- Throughput-boosting improvement for cooperating queues: close detection is now based
+ on a fixed threshold instead of the queue's average seek. This is a port of one of
+ the changes in the CFQ commit 3dde36d by Corrado Zoccolo. 
+
+v3-r3:
+- Bugfix: removed an important error causing occasional kernel panics when
+ moving a process to a new cgroup. The panic occurred if:
+ 1) the queue associated to the process was idle when the process was moved
+ and
+ 2) a new disk request was inserted into the queue just after the move.
+- Further latency improvement through a better treatment of low-bandwidth
+ async queues.
+
+v3-r2:
+- Bugfix: added a forgotten condition that prevents weights of low-bw async
+ queues from being raised when low_latency is off.
+- Latency improvement: low-bw async queues are now better identified.
+
+v3-r1:
+- Fixed an important request-dispatch bug causing occasional IO hangs.
+- Added a new mechanism to reduce the latency of low-bw async queues.
+ This reduces the latency also of the sync queues synchronized with
+ the above async queues.
+- Fixed a minor bug in iocontext locking (port of commits 9b50902 and 3181faa
+ from CFQ).
+
+v3:
+
+- Improved low-latency mechanisms, including a more accurate criterion to
+ distinguish between greedy-but-seeky and soft real-time applications.
+ Interactive applications now enjoy noticeably lower latencies.
+
+- Switch to the simpler one-request-dispatch-at-a-time scheme as in CFQ.
+
+- Ported cooperating-queues merging from CFQ (6d048f5, 1afba04,
+ d9e7620, a36e71f, 04dc6e7, 26a2ac0, 3ac6c9f, f2d1f0a, 83096eb,
+ 2e46e8b, df5fe3e, b3b6d04, e6c5bc7, c0324a0, f04a642, 8682e1f,
+ b9d8f4c, 2f7a2d8, ae54abe, e9ce335, 39c01b2, d02a2c0, c10b61f).
+ Contributed by Arianna Avanzini. Queues of processes performing IO
+ on interleaved, yet contiguous disk zones are merged to boost the
+ throughput. Some little optimizations to get a more stable throughput
+ have been added to the original CFQ version.
+
+- Added static fallback queue for extreme OOM conditions (porting of
+ CFQ commits d5036d7, 6118b70, b706f64, 32f2e80). Port contributed by
+ Francesco Allertsen.
+
+- Ported CFQ commits b0b78f8, 40bb54d, 30996f4, dddb745, ad5ebd2, cf7c25c;
+ mainly code cleanup and fix of minor bugs. Port contributed by
+ Francesco Allertsen.
+
+v2:
+
+- An issue that may cause a small throughput loss on fast disks has been solved.
+ BFQ-v1 and CFQ may suffer from this problem.
+- The disk-idling timeout has been better tuned to further reduce file latency
+ (especially for the idle- or light-loaded-disk scenarios).
+- One of the parameters of the low-latency heuristics has been tuned a little
+ bit more, so as to reduce the probability that a disk-bound process may
+ hamper the reduction of the latency of interactive and soft real-time
+ applications.
+
+ - Same low-latency guarantees with and without NCQ.
+
+ - Latency for interactive applications about halved with respect to BFQ-v1.
+
+ - When the low_latency tunable is set, also soft real-time applications
+ now enjoy reduced latency.
+
+ - A small minimum bandwidth is now guaranteed to the
+ Idle IO-scheduling class also when the other classes are
+ backlogged, just to prevent them from starving.
+
+v1:
+
+This is a new version of BFQ with respect to the versions you can
+find on Fabio's site: http://feanor.sssup.it/~fabio/linux/bfq. 
+Here is what we changed with respect to the previous versions:
+
+1) re-tuned the budget feedback mechanism: it is now slightly more
+biased toward assigning high budgets, to boost the aggregated
+throughput more, and more quickly as new processes are started
+
+2) introduced more tolerance toward seeky queues (I verified that the
+phenomena described below used to occur systematically):
+
+ 2a: if a queue is expired after having received very little
+ service, then it is not punished as a seeky queue, even if it
+ happened to consume that little service too slowly; the
+ rationale is that, if the new active queue has been served for
+ too short a time interval, then its possible sequential
+ accesses may not yet prevail over the initial latencies for
+ moving the disk head on the first sector requested
+
+ 2b: the waiting time (disk idling) of a queue detected as seeky as
+ a function of the position of the requests it issued is reduced
+ to a very low value only after the queue has consumed a minimum
+ fraction of the assigned budget; this prevents processes
+ generating (partly) seeky workloads from being too ill-treated
+
+ 2c: if a queue has consumed 'enough' budget upon a budget timeout, then,
+ even if it did not consume all of its budget, that queue is not punished
+ as a seeky queue; the rationale is that, depending on the disk zones,
+ a queue may be served at a lower rate than the estimated peak rate.
+
+ Changes 2a and 2b have been critical in lowering latencies, whereas
+ change 2c, in addition to change 1, helped a lot to increase the disk
+ throughput.
+
+3) slightly changed the peak rate estimator: a low-pass filter is now
+used instead of just keeping the highest rate sampled; the rationale
+is that the peak rate of a disk should be quite stable, so the filter
+should converge more or less smoothly to the right value; it seemed to
+correctly catch the peak rate with all disks we used
+
+4) added the low latency mechanism described in detail in
+http://algogroup.unimore.it/people/paolo/disk_sched/description.php.
+
diff --git a/fs-aufs4.patch b/fs-aufs4.patch
index d153623..7e802f7 100644
--- a/fs-aufs4.patch
+++ b/fs-aufs4.patch
@@ -1771,10 +1771,10 @@
+Currently this approach is applied to address_space_operations for
+regular files only. 
diff --git a/MAINTAINERS b/MAINTAINERS -index 9c567a4..a62aea4 100644 +index 8c20323..d170184 100644 --- a/MAINTAINERS +++ b/MAINTAINERS -@@ -2128,6 +2128,19 @@ F: include/linux/audit.h +@@ -2213,6 +2213,19 @@ F: include/linux/audit.h F: include/uapi/linux/audit.h F: kernel/audit* @@ -1795,7 +1795,7 @@ index 9c567a4..a62aea4 100644 M: Miguel Ojeda Sandonis W: http://miguelojeda.es/auxdisplay.htm diff --git a/drivers/block/loop.c b/drivers/block/loop.c -index 80cf8ad..ba9e4a7 100644 +index 1fa8cc2..7339e65 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -712,6 +712,24 @@ static inline int is_loop_device(struct file *file) @@ -1824,10 +1824,10 @@ index 80cf8ad..ba9e4a7 100644 static ssize_t loop_attr_show(struct device *dev, char *page, diff --git a/fs/Kconfig b/fs/Kconfig -index 6725f59..2ced198 100644 +index b8fcb41..78adefb 100644 --- a/fs/Kconfig +++ b/fs/Kconfig -@@ -235,6 +235,7 @@ source "fs/pstore/Kconfig" +@@ -236,6 +236,7 @@ source "fs/pstore/Kconfig" source "fs/sysv/Kconfig" source "fs/ufs/Kconfig" source "fs/exofs/Kconfig" @@ -2152,10 +2152,10 @@ index 0000000..e48d268 +#endif /* __AUFS_H__ */ diff --git a/fs/aufs/branch.c b/fs/aufs/branch.c new file mode 100644 -index 0000000..9b1f7c8 +index 0000000..7a60c73 --- /dev/null +++ b/fs/aufs/branch.c -@@ -0,0 +1,1406 @@ +@@ -0,0 +1,1409 @@ +/* + * Copyright (C) 2005-2016 Junjiro R. Okajima + * @@ -2209,7 +2209,7 @@ index 0000000..9b1f7c8 + + if (br->br_fhsm) { + au_br_fhsm_fin(br->br_fhsm); -+ kfree(br->br_fhsm); ++ au_delayed_kfree(br->br_fhsm); + } + + key = br->br_dykey; @@ -2223,8 +2223,9 @@ index 0000000..9b1f7c8 + lockdep_off(); + path_put(&br->br_path); + lockdep_on(); -+ kfree(wbr); -+ kfree(br); ++ if (wbr) ++ au_delayed_kfree(wbr); ++ au_delayed_kfree(br); +} + +/* @@ -2322,11 +2323,12 @@ index 0000000..9b1f7c8 + return add_branch; /* success */ + +out_wbr: -+ kfree(add_branch->br_wbr); ++ if (add_branch->br_wbr) ++ au_delayed_kfree(add_branch->br_wbr); +out_hnotify: + au_hnotify_fin_br(add_branch); +out_br: -+ kfree(add_branch); ++ au_delayed_kfree(add_branch); +out: + return ERR_PTR(err); +} @@ -2471,7 +2473,7 @@ index 0000000..9b1f7c8 + bindex = au_br_index(sb, br->br_id); + if (0 <= bindex) { + hdir = au_hi(d_inode(sb->s_root), bindex); -+ au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT); ++ au_hn_inode_lock_nested(hdir, AuLsc_I_PARENT); + } else { + h_dentry = au_br_dentry(br); + h_inode = d_inode(h_dentry); @@ -2485,14 +2487,14 @@ index 0000000..9b1f7c8 + wbr_wh_write_unlock(wbr); + } + if (hdir) -+ au_hn_imtx_unlock(hdir); ++ au_hn_inode_unlock(hdir); + else + inode_unlock(h_inode); + vfsub_mnt_drop_write(au_br_mnt(br)); + br->br_perm = old_perm; + + if (!err && wbr && !au_br_writable(new_perm)) { -+ kfree(wbr); ++ au_delayed_kfree(wbr); + br->br_wbr = NULL; + } + @@ -2596,7 +2598,7 @@ index 0000000..9b1f7c8 + + AuRwMustWriteLock(&dinfo->di_rwsem); + -+ hdp = dinfo->di_hdentry + bindex; ++ hdp = au_hdentry(dinfo, bindex); + memmove(hdp + 1, hdp, sizeof(*hdp) * amount); + au_h_dentry_init(hdp); + dinfo->di_bbot++; @@ -2652,6 +2654,7 @@ index 0000000..9b1f7c8 + root = sb->s_root; + root_inode = d_inode(root); + IMustLock(root_inode); ++ IiMustWriteLock(root_inode); + err = test_add(sb, add, remount); + if (unlikely(err < 0)) + goto out; @@ -3071,14 +3074,13 @@ index 0000000..9b1f7c8 + + AuRwMustWriteLock(&dinfo->di_rwsem); + -+ hdp = dinfo->di_hdentry; ++ hdp = au_hdentry(dinfo, bindex); + if (bindex < bbot) -+ memmove(hdp + bindex, hdp + bindex + 1, -+ sizeof(*hdp) * (bbot - bindex)); -+ hdp[0 + 
bbot].hd_dentry = NULL; ++ memmove(hdp, hdp + 1, sizeof(*hdp) * (bbot - bindex)); ++ /* au_h_dentry_init(au_hdentry(dinfo, bbot); */ + dinfo->di_bbot--; + -+ p = krealloc(hdp, sizeof(*p) * bbot, AuGFP_SBILIST); ++ p = krealloc(dinfo->di_hdentry, sizeof(*p) * bbot, AuGFP_SBILIST); + if (p) + dinfo->di_hdentry = p; + /* harmless error */ @@ -3286,7 +3288,7 @@ index 0000000..9b1f7c8 + inode = ilookup(sb, ibusy.ino); + if (!inode + || inode->i_ino == AUFS_ROOT_INO -+ || is_bad_inode(inode)) ++ || au_is_bad_inode(inode)) + goto out_unlock; + + ii_read_lock_child(inode); @@ -3515,7 +3517,7 @@ index 0000000..9b1f7c8 + if (br->br_wbr) { + err = au_wbr_init(br, sb, mod->perm); + if (unlikely(err)) { -+ kfree(br->br_wbr); ++ au_delayed_kfree(br->br_wbr); + br->br_wbr = NULL; + } + } @@ -3527,7 +3529,7 @@ index 0000000..9b1f7c8 + if (!au_br_fhsm(mod->perm)) { + /* fhsm --> non-fhsm */ + au_br_fhsm_fin(br->br_fhsm); -+ kfree(br->br_fhsm); ++ au_delayed_kfree(br->br_fhsm); + br->br_fhsm = NULL; + } + } else if (au_br_fhsm(mod->perm)) @@ -3539,7 +3541,8 @@ index 0000000..9b1f7c8 + goto out; /* success */ + +out_bf: -+ kfree(bf); ++ if (bf) ++ au_delayed_kfree(bf); +out: + AuTraceErr(err); + return err; @@ -3923,10 +3926,10 @@ index 0000000..0bbb2d3 +-include ${srctree}/${src}/conf_priv.mk diff --git a/fs/aufs/cpup.c b/fs/aufs/cpup.c new file mode 100644 -index 0000000..584870a +index 0000000..0a06bf2 --- /dev/null +++ b/fs/aufs/cpup.c -@@ -0,0 +1,1379 @@ +@@ -0,0 +1,1383 @@ +/* + * Copyright (C) 2005-2016 Junjiro R. Okajima + * @@ -4286,9 +4289,9 @@ index 0000000..584870a + dst->f_pos = 0; + err = au_do_copy_file(dst, src, len, buf, blksize); + if (do_kfree) -+ kfree(buf); ++ au_delayed_kfree(buf); + else -+ free_page((unsigned long)buf); ++ au_delayed_free_page((unsigned long)buf); + +out: + return err; @@ -4448,7 +4451,7 @@ index 0000000..584870a + sym.k[symlen] = 0; + err = vfsub_symlink(h_dir, h_path, sym.k); + } -+ free_page((unsigned long)sym.k); ++ au_delayed_free_page((unsigned long)sym.k); + +out: + return err; @@ -4819,7 +4822,7 @@ index 0000000..584870a + } +out_parent: + dput(dst_parent); -+ kfree(a); ++ au_delayed_kfree(a); +out: + return err; +} @@ -5029,23 +5032,27 @@ index 0000000..584870a + int err; + unsigned int flags_orig; + aufs_bindex_t bsrc_orig; -+ struct dentry *h_d_dst, *h_d_start; + struct au_dinfo *dinfo; -+ struct au_hdentry *hdp; ++ struct { ++ struct au_hdentry *hd; ++ struct dentry *h_dentry; ++ } hdst, hsrc; + + dinfo = au_di(cpg->dentry); + AuRwMustWriteLock(&dinfo->di_rwsem); + + bsrc_orig = cpg->bsrc; + cpg->bsrc = dinfo->di_btop; -+ hdp = dinfo->di_hdentry; -+ h_d_dst = hdp[0 + cpg->bdst].hd_dentry; ++ hdst.hd = au_hdentry(dinfo, cpg->bdst); ++ hdst.h_dentry = hdst.hd->hd_dentry; ++ hdst.hd->hd_dentry = wh_dentry; + dinfo->di_btop = cpg->bdst; -+ hdp[0 + cpg->bdst].hd_dentry = wh_dentry; -+ h_d_start = NULL; ++ ++ hsrc.h_dentry = NULL; + if (file) { -+ h_d_start = hdp[0 + cpg->bsrc].hd_dentry; -+ hdp[0 + cpg->bsrc].hd_dentry = au_hf_top(file)->f_path.dentry; ++ hsrc.hd = au_hdentry(dinfo, cpg->bsrc); ++ hsrc.h_dentry = hsrc.hd->hd_dentry; ++ hsrc.hd->hd_dentry = au_hf_top(file)->f_path.dentry; + } + flags_orig = cpg->flags; + cpg->flags = !AuCpup_DTIME; @@ -5054,9 +5061,9 @@ index 0000000..584870a + if (file) { + if (!err) + err = au_reopen_nondir(file); -+ hdp[0 + cpg->bsrc].hd_dentry = h_d_start; ++ hsrc.hd->hd_dentry = hsrc.h_dentry; + } -+ hdp[0 + cpg->bdst].hd_dentry = h_d_dst; ++ hdst.hd->hd_dentry = hdst.h_dentry; + dinfo->di_btop = cpg->bsrc; + cpg->bsrc = 
bsrc_orig; + @@ -5408,10 +5415,10 @@ index 0000000..7721429 +#endif /* __AUFS_CPUP_H__ */ diff --git a/fs/aufs/dbgaufs.c b/fs/aufs/dbgaufs.c new file mode 100644 -index 0000000..ef297ab +index 0000000..4ab45c0 --- /dev/null +++ b/fs/aufs/dbgaufs.c -@@ -0,0 +1,432 @@ +@@ -0,0 +1,438 @@ +/* + * Copyright (C) 2005-2016 Junjiro R. Okajima + * @@ -5455,7 +5462,7 @@ index 0000000..ef297ab +static int dbgaufs_xi_release(struct inode *inode __maybe_unused, + struct file *file) +{ -+ kfree(file->private_data); ++ au_delayed_kfree(file->private_data); + return 0; +} + @@ -5517,7 +5524,7 @@ index 0000000..ef297ab +static int dbgaufs_plink_release(struct inode *inode __maybe_unused, + struct file *file) +{ -+ free_page((unsigned long)file->private_data); ++ au_delayed_free_page((unsigned long)file->private_data); + return 0; +} + @@ -5581,7 +5588,7 @@ index 0000000..ef297ab + goto out; /* success */ + +out_free: -+ free_page((unsigned long)p); ++ au_delayed_free_page((unsigned long)p); +out: + return err; +} @@ -5683,7 +5690,10 @@ index 0000000..ef297ab + for (; bindex <= bbot; bindex++) { + br = au_sbr(sb, bindex); + xi = &br->br_xino; ++ /* debugfs acquires the parent i_mutex */ ++ lockdep_off(); + debugfs_remove(xi->xi_dbgaufs); ++ lockdep_on(); + xi->xi_dbgaufs = NULL; + } +} @@ -5708,8 +5718,11 @@ index 0000000..ef297ab + br = au_sbr(sb, bindex); + xi = &br->br_xino; + AuDebugOn(xi->xi_dbgaufs); ++ /* debugfs acquires the parent i_mutex */ ++ lockdep_off(); + xi->xi_dbgaufs = debugfs_create_file(name, dbgaufs_mode, parent, + sbinfo, &dbgaufs_xino_fop); ++ lockdep_on(); + /* ignore an error */ + if (unlikely(!xi->xi_dbgaufs)) + AuWarn1("failed %s under debugfs\n", name); @@ -5900,7 +5913,7 @@ index 0000000..d1e09bd +#endif /* __DBGAUFS_H__ */ diff --git a/fs/aufs/dcsub.c b/fs/aufs/dcsub.c new file mode 100644 -index 0000000..832baa4 +index 0000000..a88a26d --- /dev/null +++ b/fs/aufs/dcsub.c @@ -0,0 +1,224 @@ @@ -5935,7 +5948,7 @@ index 0000000..832baa4 + p = dpage->dentries; + for (i = 0; i < dpage->ndentry; i++) + dput(*p++); -+ free_page((unsigned long)dpage->dentries); ++ au_delayed_free_page((unsigned long)dpage->dentries); +} + +int au_dpages_init(struct au_dcsub_pages *dpages, gfp_t gfp) @@ -5958,7 +5971,7 @@ index 0000000..832baa4 + return 0; /* success */ + +out_dpages: -+ kfree(dpages->dpages); ++ au_delayed_kfree(dpages->dpages); +out: + return err; +} @@ -5971,7 +5984,7 @@ index 0000000..832baa4 + p = dpages->dpages; + for (i = 0; i < dpages->ndpage; i++) + au_dpage_free(p++); -+ kfree(dpages->dpages); ++ au_delayed_kfree(dpages->dpages); +} + +static int au_dpages_append(struct au_dcsub_pages *dpages, @@ -6272,10 +6285,10 @@ index 0000000..9f4a2b5 +#endif /* __AUFS_DCSUB_H__ */ diff --git a/fs/aufs/debug.c b/fs/aufs/debug.c new file mode 100644 -index 0000000..5fa12e9 +index 0000000..47e1c06 --- /dev/null +++ b/fs/aufs/debug.c -@@ -0,0 +1,442 @@ +@@ -0,0 +1,440 @@ +/* + * Copyright (C) 2005-2016 Junjiro R. 
Okajima + * @@ -6424,7 +6437,7 @@ index 0000000..5fa12e9 + int err, hn; + + err = do_pri_inode(-1, inode, -1, NULL); -+ if (err || !au_test_aufs(inode->i_sb) || is_bad_inode(inode)) ++ if (err || !au_test_aufs(inode->i_sb) || au_is_bad_inode(inode)) + return; + + iinfo = au_ii(inode); @@ -6454,9 +6467,9 @@ index 0000000..5fa12e9 +{ + struct dentry *wh = NULL; + int hn; ++ struct inode *inode; + struct au_iinfo *iinfo; + struct au_hinode *hi; -+ struct inode *inode; + + if (!dentry || IS_ERR(dentry)) { + dpri("d%d: err %ld\n", bindex, PTR_ERR(dentry)); @@ -6471,11 +6484,12 @@ index 0000000..5fa12e9 + d_unhashed(dentry) ? "un" : ""); + hn = -1; + inode = NULL; -+ if (bindex >= 0 -+ && d_is_positive(dentry) -+ && au_test_aufs(dentry->d_sb)) ++ if (d_is_positive(dentry)) + inode = d_inode(dentry); -+ if (inode && !is_bad_inode(inode)) { ++ if (inode ++ && au_test_aufs(dentry->d_sb) ++ && bindex >= 0 ++ && !au_is_bad_inode(inode)) { + iinfo = au_ii(inode); + hi = au_hinode(iinfo, bindex); + hn = !!au_hn(hi); @@ -6490,7 +6504,6 @@ index 0000000..5fa12e9 + struct au_dinfo *dinfo; + aufs_bindex_t bindex; + int err; -+ struct au_hdentry *hdp; + + err = do_pri_dentry(-1, dentry); + if (err || !au_test_aufs(dentry->d_sb)) @@ -6505,9 +6518,8 @@ index 0000000..5fa12e9 + dinfo->di_tmpfile); + if (dinfo->di_btop < 0) + return; -+ hdp = dinfo->di_hdentry; + for (bindex = dinfo->di_btop; bindex <= dinfo->di_bbot; bindex++) -+ do_pri_dentry(bindex, hdp[0 + bindex].hd_dentry); ++ do_pri_dentry(bindex, au_hdentry(dinfo, bindex)->hd_dentry); +} + +static int do_pri_file(aufs_bindex_t bindex, struct file *file) @@ -6615,7 +6627,7 @@ index 0000000..5fa12e9 + au_br_count_init(&a->fake); + err = do_pri_br(-1, &a->fake); + au_br_count_fin(&a->fake); -+ kfree(a); ++ au_delayed_kfree(a); + dpri("dev 0x%x\n", sb->s_dev); + if (err || !au_test_aufs(sb)) + return; @@ -6623,9 +6635,8 @@ index 0000000..5fa12e9 + sbinfo = au_sbi(sb); + if (!sbinfo) + return; -+ dpri("nw %lld, gen %u, kobj %d\n", -+ percpu_counter_sum(&sbinfo->si_nowait.nw_len), -+ sbinfo->si_generation, ++ dpri("nw %d, gen %u, kobj %d\n", ++ atomic_read(&sbinfo->si_nowait.nw_len), sbinfo->si_generation, + atomic_read(&sbinfo->si_kobj.kref.refcount)); + for (bindex = 0; bindex <= sbinfo->si_bbot; bindex++) + do_pri_br(bindex, sbinfo->si_branch[0 + bindex]); @@ -6951,10 +6962,10 @@ index 0000000..cd5fc3f +#endif /* __AUFS_DEBUG_H__ */ diff --git a/fs/aufs/dentry.c b/fs/aufs/dentry.c new file mode 100644 -index 0000000..6f6fe25 +index 0000000..d6867c8 --- /dev/null +++ b/fs/aufs/dentry.c -@@ -0,0 +1,1136 @@ +@@ -0,0 +1,1128 @@ +/* + * Copyright (C) 2005-2016 Junjiro R. Okajima + * @@ -6979,14 +6990,6 @@ index 0000000..6f6fe25 +#include +#include "aufs.h" + -+#define AuLkup_ALLOW_NEG 1 -+#define AuLkup_IGNORE_PERM (1 << 1) -+#define au_ftest_lkup(flags, name) ((flags) & AuLkup_##name) -+#define au_fset_lkup(flags, name) \ -+ do { (flags) |= AuLkup_##name; } while (0) -+#define au_fclr_lkup(flags, name) \ -+ do { (flags) &= ~AuLkup_##name; } while (0) -+ +struct au_do_lookup_args { + unsigned int flags; + mode_t type; @@ -7089,15 +7092,15 @@ index 0000000..6f6fe25 + * otherwise an error. + * can be called at unlinking with @type is zero. 
+ */ -+int au_lkup_dentry(struct dentry *dentry, aufs_bindex_t btop, mode_t type) ++int au_lkup_dentry(struct dentry *dentry, aufs_bindex_t btop, ++ unsigned int flags) +{ + int npositive, err; + aufs_bindex_t bindex, btail, bdiropq; + unsigned char isdir, dirperm1; + struct qstr whname; + struct au_do_lookup_args args = { -+ .flags = 0, -+ .type = type ++ .flags = flags + }; + const struct qstr *name = &dentry->d_name; + struct dentry *parent; @@ -7113,8 +7116,6 @@ index 0000000..6f6fe25 + goto out; + + isdir = !!d_is_dir(dentry); -+ if (!type) -+ au_fset_lkup(args.flags, ALLOW_NEG); + dirperm1 = !!au_opt_test(au_mntflags(sb), DIRPERM1); + + npositive = 0; @@ -7128,9 +7129,7 @@ index 0000000..6f6fe25 + if (h_dentry) { + if (d_is_positive(h_dentry)) + npositive++; -+ if (type != S_IFDIR) -+ break; -+ continue; ++ break; + } + h_parent = au_h_dptr(parent, bindex); + if (!h_parent || !d_is_dir(h_parent)) @@ -7183,7 +7182,7 @@ index 0000000..6f6fe25 + +out_parent: + dput(parent); -+ kfree(whname.name); ++ au_delayed_kfree(whname.name); +out: + return err; +} @@ -7364,8 +7363,9 @@ index 0000000..6f6fe25 + bbot = dinfo->di_bbot; + bwh = dinfo->di_bwh; + bdiropq = dinfo->di_bdiropq; -+ p = dinfo->di_hdentry + dinfo->di_btop; -+ for (bindex = dinfo->di_btop; bindex <= bbot; bindex++, p++) { ++ bindex = dinfo->di_btop; ++ p = au_hdentry(dinfo, bindex); ++ for (; bindex <= bbot; bindex++, p++) { + if (!p->hd_dentry) + continue; + @@ -7384,7 +7384,7 @@ index 0000000..6f6fe25 + } + + /* swap two lower dentries, and loop again */ -+ q = dinfo->di_hdentry + new_bindex; ++ q = au_hdentry(dinfo, new_bindex); + tmp = *q; + *q = *p; + *p = tmp; @@ -7408,16 +7408,18 @@ index 0000000..6f6fe25 + dinfo->di_btop = -1; + dinfo->di_bbot = -1; + bbot = au_dbbot(parent); -+ p = dinfo->di_hdentry; -+ for (bindex = 0; bindex <= bbot; bindex++, p++) ++ bindex = 0; ++ p = au_hdentry(dinfo, bindex); ++ for (; bindex <= bbot; bindex++, p++) + if (p->hd_dentry) { + dinfo->di_btop = bindex; + break; + } + + if (dinfo->di_btop >= 0) { -+ p = dinfo->di_hdentry + bbot; -+ for (bindex = bbot; bindex >= 0; bindex--, p--) ++ bindex = bbot; ++ p = au_hdentry(dinfo, bindex); ++ for (; bindex >= 0; bindex--, p--) + if (p->hd_dentry) { + dinfo->di_bbot = bindex; + err = 0; @@ -7534,14 +7536,14 @@ index 0000000..6f6fe25 + err = 0; + AuDebugOn(dinfo->di_btop < 0); + orig_h.mode = 0; -+ orig_h.dentry = dinfo->di_hdentry[dinfo->di_btop].hd_dentry; ++ orig_h.dentry = au_hdentry(dinfo, dinfo->di_btop)->hd_dentry; + orig_h.inode = NULL; + if (d_is_positive(orig_h.dentry)) { + orig_h.inode = d_inode(orig_h.dentry); + orig_h.mode = orig_h.inode->i_mode & S_IFMT; + } + if (tmp->di_btop >= 0) { -+ tmp_h.dentry = tmp->di_hdentry[tmp->di_btop].hd_dentry; ++ tmp_h.dentry = au_hdentry(tmp, tmp->di_btop)->hd_dentry; + if (d_is_positive(tmp_h.dentry)) { + tmp_h.inode = d_inode(tmp_h.dentry); + tmp_h.mode = tmp_h.inode->i_mode & S_IFMT; @@ -7570,7 +7572,7 @@ index 0000000..6f6fe25 + AuDebugOn(dinfo->di_btop != dinfo->di_bbot); + au_set_h_dptr(dentry, dinfo->di_btop, NULL); + au_di_cp(dinfo, tmp); -+ hd = tmp->di_hdentry + tmp->di_btop; ++ hd = au_hdentry(tmp, tmp->di_btop); + au_set_h_dptr(dentry, tmp->di_btop, + dget(hd->hd_dentry)); + } @@ -7621,13 +7623,13 @@ index 0000000..6f6fe25 + dinfo->di_bbot = tmp->di_bbot; + dinfo->di_bwh = tmp->di_bwh; + dinfo->di_bdiropq = tmp->di_bdiropq; -+ hd = tmp->di_hdentry; + bbot = dinfo->di_bbot; -+ for (bindex = tmp->di_btop; bindex <= bbot; -+ bindex++) { ++ bindex = tmp->di_btop; ++ hd = au_hdentry(tmp, 
bindex); ++ for (; bindex <= bbot; bindex++, hd++) { + if (au_h_dptr(dentry, bindex)) + continue; -+ h_dentry = hd[bindex].hd_dentry; ++ h_dentry = hd->hd_dentry; + if (!h_dentry) + continue; + AuDebugOn(d_is_negative(h_dentry)); @@ -7638,7 +7640,8 @@ index 0000000..6f6fe25 + au_set_h_dptr(dentry, bindex, + dget(h_dentry)); + } -+ err = au_refresh_hinode(inode, dentry); ++ if (inode) ++ err = au_refresh_hinode(inode, dentry); + au_dbg_verify_dinode(dentry); + } + } else { @@ -7727,7 +7730,7 @@ index 0000000..6f6fe25 + * if current working dir is removed, it returns an error. + * but the dentry is legal. + */ -+ err = au_lkup_dentry(dentry, /*btop*/0, /*type*/0); ++ err = au_lkup_dentry(dentry, /*btop*/0, AuLkup_ALLOW_NEG); + AuDbgDentry(dentry); + au_di_swap(tmp, dinfo); + if (err == -ENOENT) @@ -8006,7 +8009,7 @@ index 0000000..6f6fe25 + inode = NULL; + if (d_really_is_positive(dentry)) + inode = d_inode(dentry); -+ if (unlikely(inode && is_bad_inode(inode))) { ++ if (unlikely(inode && au_is_bad_inode(inode))) { + err = -EINVAL; + AuTraceErr(err); + goto out_dgrade; @@ -8093,10 +8096,10 @@ index 0000000..6f6fe25 +}; diff --git a/fs/aufs/dentry.h b/fs/aufs/dentry.h new file mode 100644 -index 0000000..238909f +index 0000000..94a3753 --- /dev/null +++ b/fs/aufs/dentry.h -@@ -0,0 +1,234 @@ +@@ -0,0 +1,255 @@ +/* + * Copyright (C) 2005-2016 Junjiro R. Okajima + * @@ -8137,11 +8140,25 @@ index 0000000..238909f + struct au_rwsem di_rwsem; + aufs_bindex_t di_btop, di_bbot, di_bwh, di_bdiropq; + unsigned char di_tmpfile; /* to allow the different name */ -+ struct au_hdentry *di_hdentry; ++ union { ++ struct au_hdentry *di_hdentry; ++ struct llist_node di_lnode; /* delayed free */ ++ }; +} ____cacheline_aligned_in_smp; + +/* ---------------------------------------------------------------------- */ + ++/* flags for au_lkup_dentry() */ ++#define AuLkup_ALLOW_NEG 1 ++#define AuLkup_IGNORE_PERM (1 << 1) ++#define au_ftest_lkup(flags, name) ((flags) & AuLkup_##name) ++#define au_fset_lkup(flags, name) \ ++ do { (flags) |= AuLkup_##name; } while (0) ++#define au_fclr_lkup(flags, name) \ ++ do { (flags) &= ~AuLkup_##name; } while (0) ++ ++/* ---------------------------------------------------------------------- */ ++ +/* dentry.c */ +extern const struct dentry_operations aufs_dop, aufs_dop_noreval; +struct au_branch; @@ -8149,7 +8166,8 @@ index 0000000..238909f +int au_h_verify(struct dentry *h_dentry, unsigned int udba, struct inode *h_dir, + struct dentry *h_parent, struct au_branch *br); + -+int au_lkup_dentry(struct dentry *dentry, aufs_bindex_t btop, mode_t type); ++int au_lkup_dentry(struct dentry *dentry, aufs_bindex_t btop, ++ unsigned int flags); +int au_lkup_neg(struct dentry *dentry, aufs_bindex_t bindex, int wh); +int au_refresh_dentry(struct dentry *dentry, struct dentry *parent); +int au_reval_dpath(struct dentry *dentry, unsigned int sigen); @@ -8257,6 +8275,12 @@ index 0000000..238909f + hdentry->hd_dentry = NULL; +} + ++static inline struct au_hdentry *au_hdentry(struct au_dinfo *di, ++ aufs_bindex_t bindex) ++{ ++ return di->di_hdentry + bindex; ++} ++ +static inline void au_hdput(struct au_hdentry *hd) +{ + if (hd) @@ -8333,10 +8357,10 @@ index 0000000..238909f +#endif /* __AUFS_DENTRY_H__ */ diff --git a/fs/aufs/dinfo.c b/fs/aufs/dinfo.c new file mode 100644 -index 0000000..ff8d5fd +index 0000000..25e71e9 --- /dev/null +++ b/fs/aufs/dinfo.c -@@ -0,0 +1,548 @@ +@@ -0,0 +1,552 @@ +/* + * Copyright (C) 2005-2016 Junjiro R. 
Okajima + * @@ -8392,7 +8416,7 @@ index 0000000..ff8d5fd + goto out; + } + -+ au_cache_free_dinfo(dinfo); ++ au_cache_dfree_dinfo(dinfo); + dinfo = NULL; + +out: @@ -8408,12 +8432,12 @@ index 0000000..ff8d5fd + bindex = dinfo->di_btop; + if (bindex >= 0) { + bbot = dinfo->di_bbot; -+ p = dinfo->di_hdentry + bindex; ++ p = au_hdentry(dinfo, bindex); + while (bindex++ <= bbot) + au_hdput(p++); + } -+ kfree(dinfo->di_hdentry); -+ au_cache_free_dinfo(dinfo); ++ au_delayed_kfree(dinfo->di_hdentry); ++ au_cache_dfree_dinfo(dinfo); +} + +void au_di_swap(struct au_dinfo *a, struct au_dinfo *b) @@ -8660,7 +8684,7 @@ index 0000000..ff8d5fd + if (au_dbtop(dentry) < 0 || bindex < au_dbtop(dentry)) + return NULL; + AuDebugOn(bindex < 0); -+ d = au_di(dentry)->di_hdentry[0 + bindex].hd_dentry; ++ d = au_hdentry(au_di(dentry), bindex)->hd_dentry; + AuDebugOn(d && au_dcount(d) <= 0); + return d; +} @@ -8749,11 +8773,14 @@ index 0000000..ff8d5fd +void au_set_h_dptr(struct dentry *dentry, aufs_bindex_t bindex, + struct dentry *h_dentry) +{ -+ struct au_hdentry *hd = au_di(dentry)->di_hdentry + bindex; ++ struct au_dinfo *dinfo; ++ struct au_hdentry *hd; + struct au_branch *br; + + DiMustWriteLock(dentry); + ++ dinfo = au_di(dentry); ++ hd = au_hdentry(dinfo, bindex); + au_hdput(hd); + hd->hd_dentry = h_dentry; + if (h_dentry) { @@ -8803,6 +8830,7 @@ index 0000000..ff8d5fd + struct au_dinfo *dinfo; + struct dentry *h_d; + struct au_hdentry *hdp; ++ aufs_bindex_t bindex, bbot; + + DiMustWriteLock(dentry); + @@ -8810,21 +8838,21 @@ index 0000000..ff8d5fd + if (!dinfo || dinfo->di_btop < 0) + return; + -+ hdp = dinfo->di_hdentry; + if (do_put_zero) { -+ aufs_bindex_t bindex, bbot; -+ + bbot = dinfo->di_bbot; -+ for (bindex = dinfo->di_btop; bindex <= bbot; bindex++) { -+ h_d = hdp[0 + bindex].hd_dentry; ++ bindex = dinfo->di_btop; ++ hdp = au_hdentry(dinfo, bindex); ++ for (; bindex <= bbot; bindex++, hdp++) { ++ h_d = hdp->hd_dentry; + if (h_d && d_is_negative(h_d)) + au_set_h_dptr(dentry, bindex, NULL); + } + } + -+ dinfo->di_btop = -1; -+ while (++dinfo->di_btop <= dinfo->di_bbot) -+ if (hdp[0 + dinfo->di_btop].hd_dentry) ++ dinfo->di_btop = 0; ++ hdp = au_hdentry(dinfo, dinfo->di_btop); ++ for (; dinfo->di_btop <= dinfo->di_bbot; dinfo->di_btop++, hdp++) ++ if (hdp->hd_dentry) + break; + if (dinfo->di_btop > dinfo->di_bbot) { + dinfo->di_btop = -1; @@ -8832,9 +8860,9 @@ index 0000000..ff8d5fd + return; + } + -+ dinfo->di_bbot++; -+ while (0 <= --dinfo->di_bbot) -+ if (hdp[0 + dinfo->di_bbot].hd_dentry) ++ hdp = au_hdentry(dinfo, dinfo->di_bbot); ++ for (; dinfo->di_bbot >= 0; dinfo->di_bbot--, hdp--) ++ if (hdp->hd_dentry) + break; + AuDebugOn(dinfo->di_btop > dinfo->di_bbot || dinfo->di_bbot < 0); +} @@ -8887,10 +8915,10 @@ index 0000000..ff8d5fd +} diff --git a/fs/aufs/dir.c b/fs/aufs/dir.c new file mode 100644 -index 0000000..fd38c54 +index 0000000..bd6c868 --- /dev/null +++ b/fs/aufs/dir.c -@@ -0,0 +1,756 @@ +@@ -0,0 +1,762 @@ +/* + * Copyright (C) 2005-2016 Junjiro R. 
Okajima + * @@ -9033,14 +9061,14 @@ index 0000000..fd38c54 + if (err) + goto out_unlock; + hdir = au_hi(dir, btop); -+ au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT); ++ au_hn_inode_lock_nested(hdir, AuLsc_I_PARENT); + h_dir = au_h_iptr(dir, btop); + if (h_dir->i_nlink + && timespec_compare(&h_dir->i_mtime, &dt.dt_mtime) < 0) { + dt.dt_h_path = h_path; + au_dtime_revert(&dt); + } -+ au_hn_imtx_unlock(hdir); ++ au_hn_inode_unlock(hdir); + vfsub_mnt_drop_write(h_path.mnt); + au_cpup_attr_timesizes(dir); + @@ -9049,7 +9077,7 @@ index 0000000..fd38c54 +out: + dput(a->dentry); + au_nwt_done(&au_sbi(sb)->si_nowait); -+ kfree(arg); ++ au_delayed_kfree(arg); +} + +void au_dir_ts(struct inode *dir, aufs_bindex_t bindex) @@ -9085,7 +9113,7 @@ index 0000000..fd38c54 + if (unlikely(wkq_err)) { + pr_err("wkq %d\n", wkq_err); + dput(dentry); -+ kfree(arg); ++ au_delayed_kfree(arg); + } + +out: @@ -9204,7 +9232,7 @@ index 0000000..fd38c54 + }; + err = au_do_open(file, &args); + if (unlikely(err)) -+ kfree(fidir); ++ au_delayed_kfree(fidir); + } + si_read_unlock(sb); + return err; @@ -9216,8 +9244,11 @@ index 0000000..fd38c54 + struct au_vdir *vdir_cache; + struct au_finfo *finfo; + struct au_fidir *fidir; ++ struct au_hfile *hf; + aufs_bindex_t bindex, bbot; ++ int execed, delayed; + ++ delayed = (current->flags & PF_KTHREAD) || in_interrupt(); + finfo = au_fi(file); + fidir = finfo->fi_hdir; + if (fidir) { @@ -9225,22 +9256,25 @@ index 0000000..fd38c54 + &au_sbi(file->f_path.dentry->d_sb)->si_files); + vdir_cache = fidir->fd_vdir_cache; /* lock-free */ + if (vdir_cache) -+ au_vdir_free(vdir_cache); ++ au_vdir_free(vdir_cache, delayed); + + bindex = finfo->fi_btop; + if (bindex >= 0) { ++ execed = vfsub_file_execed(file); ++ hf = fidir->fd_hfile + bindex; + /* + * calls fput() instead of filp_close(), + * since no dnotify or lock for the lower file. + */ + bbot = fidir->fd_bbot; -+ for (; bindex <= bbot; bindex++) -+ au_set_h_fptr(file, bindex, NULL); ++ for (; bindex <= bbot; bindex++, hf++) ++ if (hf->hf_file) ++ au_hfput(hf, execed); + } -+ kfree(fidir); ++ au_delayed_kfree(fidir); + finfo->fi_hdir = NULL; + } -+ au_finfo_fin(file); ++ au_finfo_fin(file, delayed); + return 0; +} + @@ -9359,7 +9393,7 @@ index 0000000..fd38c54 + +/* ---------------------------------------------------------------------- */ + -+static int aufs_iterate(struct file *file, struct dir_context *ctx) ++static int aufs_iterate_shared(struct file *file, struct dir_context *ctx) +{ + int err; + struct dentry *dentry; @@ -9637,7 +9671,7 @@ index 0000000..fd38c54 + .owner = THIS_MODULE, + .llseek = default_llseek, + .read = generic_read_dir, -+ .iterate = aufs_iterate, ++ .iterate_shared = aufs_iterate_shared, + .unlocked_ioctl = aufs_ioctl_dir, +#ifdef CONFIG_COMPAT + .compat_ioctl = aufs_compat_ioctl_dir, @@ -9649,10 +9683,10 @@ index 0000000..fd38c54 +}; diff --git a/fs/aufs/dir.h b/fs/aufs/dir.h new file mode 100644 -index 0000000..16821f9 +index 0000000..8a145f1 --- /dev/null +++ b/fs/aufs/dir.h -@@ -0,0 +1,131 @@ +@@ -0,0 +1,137 @@ +/* + * Copyright (C) 2005-2016 Junjiro R. 
Okajima + * @@ -9697,7 +9731,10 @@ index 0000000..16821f9 + +struct au_vdir_dehstr { + struct hlist_node hash; -+ struct au_vdir_destr *str; ++ union { ++ struct au_vdir_destr *str; ++ struct llist_node lnode; /* delayed free */ ++ }; +} ____cacheline_aligned_in_smp; + +struct au_vdir_de { @@ -9735,7 +9772,10 @@ index 0000000..16821f9 + + unsigned long vd_version; + unsigned int vd_deblk_sz; -+ unsigned long vd_jiffy; ++ union { ++ unsigned long vd_jiffy; ++ struct llist_node vd_lnode; /* delayed free */ ++ }; +} ____cacheline_aligned_in_smp; + +/* ---------------------------------------------------------------------- */ @@ -9759,7 +9799,7 @@ index 0000000..16821f9 +int au_nhash_append_wh(struct au_nhash *whlist, char *name, int nlen, ino_t ino, + unsigned int d_type, aufs_bindex_t bindex, + unsigned char shwh); -+void au_vdir_free(struct au_vdir *vdir); ++void au_vdir_free(struct au_vdir *vdir, int atonce); +int au_vdir_init(struct file *file); +int au_vdir_fill_de(struct file *file, struct dir_context *ctx); + @@ -9786,7 +9826,7 @@ index 0000000..16821f9 +#endif /* __AUFS_DIR_H__ */ diff --git a/fs/aufs/dynop.c b/fs/aufs/dynop.c new file mode 100644 -index 0000000..e0c77bc +index 0000000..197be54 --- /dev/null +++ b/fs/aufs/dynop.c @@ -0,0 +1,369 @@ @@ -9819,17 +9859,17 @@ index 0000000..e0c77bc + * How large will these lists be? + * Usually just a few elements, 20-30 at most for each, I guess. + */ -+static struct au_splhead dynop[AuDyLast]; ++static struct au_sphlhead dynop[AuDyLast]; + -+static struct au_dykey *dy_gfind_get(struct au_splhead *spl, const void *h_op) ++static struct au_dykey *dy_gfind_get(struct au_sphlhead *sphl, const void *h_op) +{ + struct au_dykey *key, *tmp; -+ struct list_head *head; ++ struct hlist_head *head; + + key = NULL; -+ head = &spl->head; ++ head = &sphl->head; + rcu_read_lock(); -+ list_for_each_entry_rcu(tmp, head, dk_list) ++ hlist_for_each_entry_rcu(tmp, head, dk_hnode) + if (tmp->dk_op.dy_hop == h_op) { + key = tmp; + kref_get(&key->dk_kref); @@ -9876,24 +9916,24 @@ index 0000000..e0c77bc +} + +/* kref_get() if @key is already added */ -+static struct au_dykey *dy_gadd(struct au_splhead *spl, struct au_dykey *key) ++static struct au_dykey *dy_gadd(struct au_sphlhead *sphl, struct au_dykey *key) +{ + struct au_dykey *tmp, *found; -+ struct list_head *head; ++ struct hlist_head *head; + const void *h_op = key->dk_op.dy_hop; + + found = NULL; -+ head = &spl->head; -+ spin_lock(&spl->spin); -+ list_for_each_entry(tmp, head, dk_list) ++ head = &sphl->head; ++ spin_lock(&sphl->spin); ++ hlist_for_each_entry(tmp, head, dk_hnode) + if (tmp->dk_op.dy_hop == h_op) { + kref_get(&tmp->dk_kref); + found = tmp; + break; + } + if (!found) -+ list_add_rcu(&key->dk_list, head); -+ spin_unlock(&spl->spin); ++ hlist_add_head_rcu(&key->dk_hnode, head); ++ spin_unlock(&sphl->spin); + + if (!found) + DyPrSym(key); @@ -9906,17 +9946,17 @@ index 0000000..e0c77bc + + key = container_of(rcu, struct au_dykey, dk_rcu); + DyPrSym(key); -+ kfree(key); ++ kfree(key); /* not delayed */ +} + +static void dy_free(struct kref *kref) +{ + struct au_dykey *key; -+ struct au_splhead *spl; ++ struct au_sphlhead *sphl; + + key = container_of(kref, struct au_dykey, dk_kref); -+ spl = dynop + key->dk_op.dy_type; -+ au_spl_del_rcu(&key->dk_list, spl); ++ sphl = dynop + key->dk_op.dy_type; ++ au_sphl_del_rcu(&key->dk_hnode, sphl); + call_rcu(&key->dk_rcu, dy_free_rcu); +} + @@ -10001,7 +10041,7 @@ index 0000000..e0c77bc +static struct au_dykey *dy_get(struct au_dynop *op, struct au_branch 
*br) +{ + struct au_dykey *key, *old; -+ struct au_splhead *spl; ++ struct au_sphlhead *sphl; + struct op { + unsigned int sz; + void (*set)(struct au_dykey *key, const void *h_op, @@ -10015,8 +10055,8 @@ index 0000000..e0c77bc + }; + const struct op *p; + -+ spl = dynop + op->dy_type; -+ key = dy_gfind_get(spl, op->dy_hop); ++ sphl = dynop + op->dy_type; ++ key = dy_gfind_get(sphl, op->dy_hop); + if (key) + goto out_add; /* success */ + @@ -10030,9 +10070,9 @@ index 0000000..e0c77bc + key->dk_op.dy_hop = op->dy_hop; + kref_init(&key->dk_kref); + p->set(key, op->dy_hop, au_br_sb(br)); -+ old = dy_gadd(spl, key); ++ old = dy_gadd(sphl, key); + if (old) { -+ kfree(key); ++ au_delayed_kfree(key); + key = old; + } + @@ -10127,16 +10167,16 @@ index 0000000..e0c77bc + +void au_dy_arefresh(int do_dx) +{ -+ struct au_splhead *spl; -+ struct list_head *head; ++ struct au_sphlhead *sphl; ++ struct hlist_head *head; + struct au_dykey *key; + -+ spl = dynop + AuDy_AOP; -+ head = &spl->head; -+ spin_lock(&spl->spin); -+ list_for_each_entry(key, head, dk_list) ++ sphl = dynop + AuDy_AOP; ++ head = &sphl->head; ++ spin_lock(&sphl->spin); ++ hlist_for_each_entry(key, head, dk_hnode) + dy_adx((void *)key, do_dx); -+ spin_unlock(&spl->spin); ++ spin_unlock(&sphl->spin); +} + +/* ---------------------------------------------------------------------- */ @@ -10149,7 +10189,7 @@ index 0000000..e0c77bc + BUILD_BUG_ON(offsetof(struct au_dyaop, da_key)); + + for (i = 0; i < AuDyLast; i++) -+ au_spl_init(dynop + i); ++ au_sphl_init(dynop + i); +} + +void au_dy_fin(void) @@ -10157,11 +10197,11 @@ index 0000000..e0c77bc + int i; + + for (i = 0; i < AuDyLast; i++) -+ WARN_ON(!list_empty(&dynop[i].head)); ++ WARN_ON(!hlist_empty(&dynop[i].head)); +} diff --git a/fs/aufs/dynop.h b/fs/aufs/dynop.h new file mode 100644 -index 0000000..92f61f4 +index 0000000..054e7032 --- /dev/null +++ b/fs/aufs/dynop.h @@ -0,0 +1,74 @@ @@ -10206,7 +10246,7 @@ index 0000000..92f61f4 + +struct au_dykey { + union { -+ struct list_head dk_list; ++ struct hlist_node dk_hnode; + struct rcu_head dk_rcu; + }; + struct au_dynop dk_op; @@ -10241,7 +10281,7 @@ index 0000000..92f61f4 +#endif /* __AUFS_DYNOP_H__ */ diff --git a/fs/aufs/export.c b/fs/aufs/export.c new file mode 100644 -index 0000000..dcdfe1d +index 0000000..8ffc8e7 --- /dev/null +++ b/fs/aufs/export.c @@ -0,0 +1,837 @@ @@ -10480,7 +10520,7 @@ index 0000000..dcdfe1d + + dentry = ERR_PTR(-ESTALE); + sigen = au_sigen(sb); -+ if (unlikely(is_bad_inode(inode) ++ if (unlikely(au_is_bad_inode(inode) + || IS_DEADDIR(inode) + || sigen != au_iigen(inode, NULL))) + goto out_iput; @@ -10664,7 +10704,7 @@ index 0000000..dcdfe1d + } + +out_name: -+ free_page((unsigned long)arg.name); ++ au_delayed_free_page((unsigned long)arg.name); +out_file: + fput(file); +out: @@ -10818,7 +10858,7 @@ index 0000000..dcdfe1d + dentry = ERR_PTR(-ESTALE); + } +out_pathname: -+ free_page((unsigned long)pathname); ++ au_delayed_free_page((unsigned long)pathname); +out_h_parent: + dput(h_parent); +out: @@ -11084,10 +11124,10 @@ index 0000000..dcdfe1d +} diff --git a/fs/aufs/f_op.c b/fs/aufs/f_op.c new file mode 100644 -index 0000000..0b54eef +index 0000000..0f570bc --- /dev/null +++ b/fs/aufs/f_op.c -@@ -0,0 +1,770 @@ +@@ -0,0 +1,772 @@ +/* + * Copyright (C) 2005-2016 Junjiro R. 
Okajima + * @@ -11189,6 +11229,7 @@ index 0000000..0b54eef +{ + struct au_finfo *finfo; + aufs_bindex_t bindex; ++ int delayed; + + finfo = au_fi(file); + au_sphl_del(&finfo->fi_hlist, @@ -11197,7 +11238,8 @@ index 0000000..0b54eef + if (bindex >= 0) + au_set_h_fptr(file, bindex, NULL); + -+ au_finfo_fin(file); ++ delayed = (current->flags & PF_KTHREAD) || in_interrupt(); ++ au_finfo_fin(file, delayed); + return 0; +} + @@ -11444,11 +11486,20 @@ index 0000000..0b54eef + sb = inode->i_sb; + si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW); + -+ h_file = au_read_pre(file, /*keep_fi*/0); ++ h_file = au_read_pre(file, /*keep_fi*/1); + err = PTR_ERR(h_file); + if (IS_ERR(h_file)) + goto out; + ++ if (au_test_loopback_kthread()) { ++ au_warn_loopback(h_file->f_path.dentry->d_sb); ++ if (file->f_mapping != h_file->f_mapping) { ++ file->f_mapping = h_file->f_mapping; ++ smp_mb(); /* unnecessary? */ ++ } ++ } ++ fi_read_unlock(file); ++ + err = au_do_iter(h_file, MAY_READ, kio, iov_iter); + /* todo: necessary? */ + /* file->f_ra = h_file->f_ra; */ @@ -11497,20 +11548,11 @@ index 0000000..0b54eef + sb = inode->i_sb; + si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW); + -+ h_file = au_read_pre(file, /*keep_fi*/1); ++ h_file = au_read_pre(file, /*keep_fi*/0); + err = PTR_ERR(h_file); + if (IS_ERR(h_file)) + goto out; + -+ if (au_test_loopback_kthread()) { -+ au_warn_loopback(h_file->f_path.dentry->d_sb); -+ if (file->f_mapping != h_file->f_mapping) { -+ file->f_mapping = h_file->f_mapping; -+ smp_mb(); /* unnecessary? */ -+ } -+ } -+ fi_read_unlock(file); -+ + err = vfsub_splice_to(h_file, ppos, pipe, len, flags); + /* todo: necessasry? */ + /* file->f_ra = h_file->f_ra; */ @@ -12292,10 +12334,10 @@ index 0000000..40289e4 +} diff --git a/fs/aufs/file.c b/fs/aufs/file.c new file mode 100644 -index 0000000..65926e8 +index 0000000..33dde75 --- /dev/null +++ b/fs/aufs/file.c -@@ -0,0 +1,844 @@ +@@ -0,0 +1,845 @@ +/* + * Copyright (C) 2005-2016 Junjiro R. 
Okajima + * @@ -12571,7 +12613,7 @@ index 0000000..65926e8 + } + if (unlikely(err)) { + finfo->fi_hdir = NULL; -+ au_finfo_fin(file); ++ au_finfo_fin(file, /*atonce*/0); + } + +out: @@ -12650,11 +12692,11 @@ index 0000000..65926e8 + + btop = dinfo->di_btop; + dinfo->di_btop = btgt; -+ hdp = dinfo->di_hdentry; -+ h_dentry = hdp[0 + btgt].hd_dentry; -+ hdp[0 + btgt].hd_dentry = hi_wh; ++ hdp = au_hdentry(dinfo, btgt); ++ h_dentry = hdp->hd_dentry; ++ hdp->hd_dentry = hi_wh; + err = au_reopen_nondir(file); -+ hdp[0 + btgt].hd_dentry = h_dentry; ++ hdp->hd_dentry = h_dentry; + dinfo->di_btop = btop; + + return err; @@ -12891,6 +12933,7 @@ index 0000000..65926e8 + +static void au_do_refresh_dir(struct file *file) +{ ++ int execed; + aufs_bindex_t bindex, bbot, new_bindex, brid; + struct au_hfile *p, tmp, *q; + struct au_finfo *finfo; @@ -12929,6 +12972,7 @@ index 0000000..65926e8 + } + } + ++ execed = vfsub_file_execed(file); + p = fidir->fd_hfile; + if (!au_test_mmapped(file) && !d_unlinked(file->f_path.dentry)) { + bbot = au_sbbot(sb); @@ -12937,14 +12981,14 @@ index 0000000..65926e8 + if (p->hf_file) { + if (file_inode(p->hf_file)) + break; -+ au_hfput(p, file); ++ au_hfput(p, execed); + } + } else { + bbot = au_br_index(sb, brid); + for (finfo->fi_btop = 0; finfo->fi_btop < bbot; + finfo->fi_btop++, p++) + if (p->hf_file) -+ au_hfput(p, file); ++ au_hfput(p, execed); + bbot = au_sbbot(sb); + } + @@ -12954,7 +12998,7 @@ index 0000000..65926e8 + if (p->hf_file) { + if (file_inode(p->hf_file)) + break; -+ au_hfput(p, file); ++ au_hfput(p, execed); + } + AuDebugOn(fidir->fd_bbot < finfo->fi_btop); +} @@ -13069,8 +13113,7 @@ index 0000000..65926e8 +} + +/* it will never be called, but necessary to support O_DIRECT */ -+static ssize_t aufs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, -+ loff_t offset) ++static ssize_t aufs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) +{ BUG(); return 0; } + +/* they will never be called. */ @@ -13142,10 +13185,10 @@ index 0000000..65926e8 +}; diff --git a/fs/aufs/file.h b/fs/aufs/file.h new file mode 100644 -index 0000000..cc98251 +index 0000000..4698c98 --- /dev/null +++ b/fs/aufs/file.h -@@ -0,0 +1,291 @@ +@@ -0,0 +1,294 @@ +/* + * Copyright (C) 2005-2016 Junjiro R. 
Okajima + * @@ -13211,7 +13254,10 @@ index 0000000..cc98251 + struct au_fidir *fi_hdir; /* for dir only */ + + struct hlist_node fi_hlist; -+ struct file *fi_file; /* very ugly */ ++ union { ++ struct file *fi_file; /* very ugly */ ++ struct llist_node fi_lnode; /* delayed free */ ++ }; +} ____cacheline_aligned_in_smp; + +/* ---------------------------------------------------------------------- */ @@ -13262,7 +13308,7 @@ index 0000000..cc98251 +struct file *au_read_pre(struct file *file, int keep_fi); + +/* finfo.c */ -+void au_hfput(struct au_hfile *hf, struct file *file); ++void au_hfput(struct au_hfile *hf, int execed); +void au_set_h_fptr(struct file *file, aufs_bindex_t bindex, + struct file *h_file); + @@ -13271,7 +13317,7 @@ index 0000000..cc98251 +int au_fidir_realloc(struct au_finfo *finfo, int nbr); + +void au_fi_init_once(void *_fi); -+void au_finfo_fin(struct file *file); ++void au_finfo_fin(struct file *file, int atonce); +int au_finfo_init(struct file *file, struct au_fidir *fidir); + +/* ioctl.c */ @@ -13439,10 +13485,10 @@ index 0000000..cc98251 +#endif /* __AUFS_FILE_H__ */ diff --git a/fs/aufs/finfo.c b/fs/aufs/finfo.c new file mode 100644 -index 0000000..179f21a +index 0000000..dfb905d --- /dev/null +++ b/fs/aufs/finfo.c -@@ -0,0 +1,149 @@ +@@ -0,0 +1,151 @@ +/* + * Copyright (C) 2005-2016 Junjiro R. Okajima + * @@ -13466,10 +13512,9 @@ index 0000000..179f21a + +#include "aufs.h" + -+void au_hfput(struct au_hfile *hf, struct file *file) ++void au_hfput(struct au_hfile *hf, int execed) +{ -+ /* todo: direct access f_flags */ -+ if (vfsub_file_flags(file) & __FMODE_EXEC) ++ if (execed) + allow_write_access(hf->hf_file); + fput(hf->hf_file); + hf->hf_file = NULL; @@ -13491,7 +13536,7 @@ index 0000000..179f21a + hf = fidir->fd_hfile + bindex; + + if (hf && hf->hf_file) -+ au_hfput(hf, file); ++ au_hfput(hf, vfsub_file_execed(file)); + if (val) { + FiMustWriteLock(file); + AuDebugOn(IS_ERR_OR_NULL(file->f_path.dentry)); @@ -13548,7 +13593,7 @@ index 0000000..179f21a + +/* ---------------------------------------------------------------------- */ + -+void au_finfo_fin(struct file *file) ++void au_finfo_fin(struct file *file, int atonce) +{ + struct au_finfo *finfo; + @@ -13557,7 +13602,10 @@ index 0000000..179f21a + finfo = au_fi(file); + AuDebugOn(finfo->fi_hdir); + AuRwDestroy(&finfo->fi_rwsem); -+ au_cache_free_finfo(finfo); ++ if (!atonce) ++ au_cache_dfree_finfo(finfo); ++ else ++ au_cache_free_finfo(finfo); +} + +void au_fi_init_once(void *_finfo) @@ -13594,7 +13642,7 @@ index 0000000..179f21a +} diff --git a/fs/aufs/fstype.h b/fs/aufs/fstype.h new file mode 100644 -index 0000000..2842400 +index 0000000..6c4a5d5 --- /dev/null +++ b/fs/aufs/fstype.h @@ -0,0 +1,400 @@ @@ -13641,7 +13689,7 @@ index 0000000..2842400 + +static inline int au_test_iso9660(struct super_block *sb __maybe_unused) +{ -+#if defined(CONFIG_ISO9660_FS) || defined(CONFIG_ISO9660_FS_MODULE) ++#if IS_ENABLED(CONFIG_ISO9660_FS) + return sb->s_magic == ISOFS_SUPER_MAGIC; +#else + return 0; @@ -13650,7 +13698,7 @@ index 0000000..2842400 + +static inline int au_test_romfs(struct super_block *sb __maybe_unused) +{ -+#if defined(CONFIG_ROMFS_FS) || defined(CONFIG_ROMFS_FS_MODULE) ++#if IS_ENABLED(CONFIG_ROMFS_FS) + return sb->s_magic == ROMFS_MAGIC; +#else + return 0; @@ -13659,7 +13707,7 @@ index 0000000..2842400 + +static inline int au_test_cramfs(struct super_block *sb __maybe_unused) +{ -+#if defined(CONFIG_CRAMFS) || defined(CONFIG_CRAMFS_MODULE) ++#if IS_ENABLED(CONFIG_CRAMFS) + return sb->s_magic == 
CRAMFS_MAGIC; +#endif + return 0; @@ -13667,7 +13715,7 @@ index 0000000..2842400 + +static inline int au_test_nfs(struct super_block *sb __maybe_unused) +{ -+#if defined(CONFIG_NFS_FS) || defined(CONFIG_NFS_FS_MODULE) ++#if IS_ENABLED(CONFIG_NFS_FS) + return sb->s_magic == NFS_SUPER_MAGIC; +#else + return 0; @@ -13676,7 +13724,7 @@ index 0000000..2842400 + +static inline int au_test_fuse(struct super_block *sb __maybe_unused) +{ -+#if defined(CONFIG_FUSE_FS) || defined(CONFIG_FUSE_FS_MODULE) ++#if IS_ENABLED(CONFIG_FUSE_FS) + return sb->s_magic == FUSE_SUPER_MAGIC; +#else + return 0; @@ -13685,7 +13733,7 @@ index 0000000..2842400 + +static inline int au_test_xfs(struct super_block *sb __maybe_unused) +{ -+#if defined(CONFIG_XFS_FS) || defined(CONFIG_XFS_FS_MODULE) ++#if IS_ENABLED(CONFIG_XFS_FS) + return sb->s_magic == XFS_SB_MAGIC; +#else + return 0; @@ -13703,7 +13751,7 @@ index 0000000..2842400 + +static inline int au_test_ecryptfs(struct super_block *sb __maybe_unused) +{ -+#if defined(CONFIG_ECRYPT_FS) || defined(CONFIG_ECRYPT_FS_MODULE) ++#if IS_ENABLED(CONFIG_ECRYPT_FS) + return !strcmp(au_sbtype(sb), "ecryptfs"); +#else + return 0; @@ -13717,7 +13765,7 @@ index 0000000..2842400 + +static inline int au_test_ubifs(struct super_block *sb __maybe_unused) +{ -+#if defined(CONFIG_UBIFS_FS) || defined(CONFIG_UBIFS_FS_MODULE) ++#if IS_ENABLED(CONFIG_UBIFS_FS) + return sb->s_magic == UBIFS_SUPER_MAGIC; +#else + return 0; @@ -13744,7 +13792,7 @@ index 0000000..2842400 + +static inline int au_test_configfs(struct super_block *sb __maybe_unused) +{ -+#if defined(CONFIG_CONFIGFS_FS) || defined(CONFIG_CONFIGFS_FS_MODULE) ++#if IS_ENABLED(CONFIG_CONFIGFS_FS) + return sb->s_magic == CONFIGFS_MAGIC; +#else + return 0; @@ -13753,7 +13801,7 @@ index 0000000..2842400 + +static inline int au_test_minix(struct super_block *sb __maybe_unused) +{ -+#if defined(CONFIG_MINIX_FS) || defined(CONFIG_MINIX_FS_MODULE) ++#if IS_ENABLED(CONFIG_MINIX_FS) + return sb->s_magic == MINIX3_SUPER_MAGIC + || sb->s_magic == MINIX2_SUPER_MAGIC + || sb->s_magic == MINIX2_SUPER_MAGIC2 @@ -13766,7 +13814,7 @@ index 0000000..2842400 + +static inline int au_test_fat(struct super_block *sb __maybe_unused) +{ -+#if defined(CONFIG_FAT_FS) || defined(CONFIG_FAT_FS_MODULE) ++#if IS_ENABLED(CONFIG_FAT_FS) + return sb->s_magic == MSDOS_SUPER_MAGIC; +#else + return 0; @@ -13794,7 +13842,7 @@ index 0000000..2842400 + +static inline int au_test_squashfs(struct super_block *sb __maybe_unused) +{ -+#if defined(CONFIG_SQUASHFS) || defined(CONFIG_SQUASHFS_MODULE) ++#if IS_ENABLED(CONFIG_SQUASHFS) + return sb->s_magic == SQUASHFS_MAGIC; +#else + return 0; @@ -13803,7 +13851,7 @@ index 0000000..2842400 + +static inline int au_test_btrfs(struct super_block *sb __maybe_unused) +{ -+#if defined(CONFIG_BTRFS_FS) || defined(CONFIG_BTRFS_FS_MODULE) ++#if IS_ENABLED(CONFIG_BTRFS_FS) + return sb->s_magic == BTRFS_SUPER_MAGIC; +#else + return 0; @@ -13812,7 +13860,7 @@ index 0000000..2842400 + +static inline int au_test_xenfs(struct super_block *sb __maybe_unused) +{ -+#if defined(CONFIG_XENFS) || defined(CONFIG_XENFS_MODULE) ++#if IS_ENABLED(CONFIG_XENFS) + return sb->s_magic == XENFS_SUPER_MAGIC; +#else + return 0; @@ -13830,7 +13878,7 @@ index 0000000..2842400 + +static inline int au_test_nilfs(struct super_block *sb __maybe_unused) +{ -+#if defined(CONFIG_NILFS) || defined(CONFIG_NILFS_MODULE) ++#if IS_ENABLED(CONFIG_NILFS) + return sb->s_magic == NILFS_SUPER_MAGIC; +#else + return 0; @@ -13839,7 +13887,7 @@ index 0000000..2842400 + +static inline 
int au_test_hfsplus(struct super_block *sb __maybe_unused) +{ -+#if defined(CONFIG_HFSPLUS_FS) || defined(CONFIG_HFSPLUS_FS_MODULE) ++#if IS_ENABLED(CONFIG_HFSPLUS_FS) + return sb->s_magic == HFSPLUS_SUPER_MAGIC; +#else + return 0; @@ -14000,7 +14048,7 @@ index 0000000..2842400 +#endif /* __AUFS_FSTYPE_H__ */ diff --git a/fs/aufs/hfsnotify.c b/fs/aufs/hfsnotify.c new file mode 100644 -index 0000000..501b551 +index 0000000..485587a --- /dev/null +++ b/fs/aufs/hfsnotify.c @@ -0,0 +1,287 @@ @@ -14037,8 +14085,8 @@ index 0000000..501b551 +{ + struct au_hnotify *hn = container_of(mark, struct au_hnotify, + hn_mark); -+ AuDbg("here\n"); -+ au_cache_free_hnotify(hn); ++ /* AuDbg("here\n"); */ ++ au_cache_dfree_hnotify(hn); + smp_mb__before_atomic(); + if (atomic64_dec_and_test(&au_hfsn_ifree)) + wake_up(&au_hfsn_wq); @@ -14161,8 +14209,8 @@ index 0000000..501b551 +{ + struct au_br_hfsnotify *hfsn = group->private; + -+ AuDbg("here\n"); -+ kfree(hfsn); ++ /* AuDbg("here\n"); */ ++ au_delayed_kfree(hfsn); +} + +static int au_hfsn_handle_event(struct fsnotify_group *group, @@ -14256,7 +14304,7 @@ index 0000000..501b551 + goto out; /* success */ + +out_hfsn: -+ kfree(hfsn); ++ au_delayed_kfree(hfsn); +out: + return err; +} @@ -14355,10 +14403,10 @@ index 0000000..af256fa +} diff --git a/fs/aufs/hnotify.c b/fs/aufs/hnotify.c new file mode 100644 -index 0000000..d948ed5 +index 0000000..4444fe1 --- /dev/null +++ b/fs/aufs/hnotify.c -@@ -0,0 +1,710 @@ +@@ -0,0 +1,723 @@ +/* + * Copyright (C) 2005-2016 Junjiro R. Okajima + * @@ -14396,7 +14444,7 @@ index 0000000..d948ed5 + AuTraceErr(err); + if (unlikely(err)) { + hinode->hi_notify = NULL; -+ au_cache_free_hnotify(hn); ++ au_cache_dfree_hnotify(hn); + /* + * The upper dir was removed by udba, but the same named + * dir left. 
In this case, aufs assignes a new inode @@ -14420,7 +14468,7 @@ index 0000000..d948ed5 + if (hn) { + hinode->hi_notify = NULL; + if (au_hnotify_op.free(hinode, hn)) -+ au_cache_free_hnotify(hn); ++ au_cache_dfree_hnotify(hn); + } +} + @@ -14854,7 +14902,7 @@ index 0000000..d948ed5 + || au_ftest_hnjob(a->flags[AuHn_CHILD], GEN))) { + inode = lookup_wlock_by_ino(sb, bfound, h_ino); + try_iput = 1; -+ } ++ } + + args.flags = a->flags[AuHn_CHILD]; + args.dentry = dentry; @@ -14893,7 +14941,7 @@ index 0000000..d948ed5 + iput(a->dir); + si_write_unlock(sb); + au_nwt_done(&sbinfo->si_nowait); -+ kfree(a); ++ au_delayed_kfree(a); +} + +/* ---------------------------------------------------------------------- */ @@ -14999,7 +15047,7 @@ index 0000000..d948ed5 + iput(args->h_child_inode); + iput(args->h_dir); + iput(args->dir); -+ kfree(args); ++ au_delayed_kfree(args); + } + +out: @@ -15040,17 +15088,26 @@ index 0000000..d948ed5 + +static void au_hn_destroy_cache(void) +{ -+ kmem_cache_destroy(au_cachep[AuCache_HNOTIFY]); -+ au_cachep[AuCache_HNOTIFY] = NULL; ++ struct au_cache *cp; ++ ++ flush_delayed_work(&au_dfree.dwork); ++ cp = au_dfree.cache + AuCache_HNOTIFY; ++ AuDebugOn(!llist_empty(&cp->llist)); ++ kmem_cache_destroy(cp->cache); ++ cp->cache = NULL; +} + ++AU_CACHE_DFREE_FUNC(hnotify, HNOTIFY, hn_lnode); ++ +int __init au_hnotify_init(void) +{ + int err; ++ struct au_cache *cp; + + err = -ENOMEM; -+ au_cachep[AuCache_HNOTIFY] = AuCache(au_hnotify); -+ if (au_cachep[AuCache_HNOTIFY]) { ++ cp = au_dfree.cache + AuCache_HNOTIFY; ++ cp->cache = AuCache(au_hnotify); ++ if (cp->cache) { + err = 0; + if (au_hnotify_op.init) + err = au_hnotify_op.init(); @@ -15063,18 +15120,22 @@ index 0000000..d948ed5 + +void au_hnotify_fin(void) +{ ++ struct au_cache *cp; ++ + if (au_hnotify_op.fin) + au_hnotify_op.fin(); ++ + /* cf. au_cache_fin() */ -+ if (au_cachep[AuCache_HNOTIFY]) ++ cp = au_dfree.cache + AuCache_HNOTIFY; ++ if (cp->cache) + au_hn_destroy_cache(); +} diff --git a/fs/aufs/i_op.c b/fs/aufs/i_op.c new file mode 100644 -index 0000000..6f3fedb +index 0000000..8517019 --- /dev/null +++ b/fs/aufs/i_op.c -@@ -0,0 +1,1407 @@ +@@ -0,0 +1,1413 @@ +/* + * Copyright (C) 2005-2016 Junjiro R. Okajima + * @@ -15274,8 +15335,9 @@ index 0000000..6f3fedb + if (!err) + err = au_digen_test(parent, au_sigen(sb)); + if (!err) { ++ /* regardless LOOKUP_CREATE, always ALLOW_NEG */ + npositive = au_lkup_dentry(dentry, au_dbtop(parent), -+ /*type*/0); ++ AuLkup_ALLOW_NEG); + err = npositive; + } + di_read_unlock(parent, AuLock_IR); @@ -15356,8 +15418,8 @@ index 0000000..6f3fedb + umode_t create_mode, int *opened) +{ + int err, h_opened = *opened; -+ struct dentry *parent; -+ struct dentry *d; ++ unsigned int lkup_flags; ++ struct dentry *parent, *d; + struct au_sphlhead *aopen; + struct vfsub_aopen_args args = { + .open_flag = open_flag, @@ -15369,14 +15431,18 @@ index 0000000..6f3fedb + }; + + IMustLock(dir); -+ AuDbg("open_flag 0x%x\n", open_flag); ++ AuDbg("open_flag 0%o\n", open_flag); + AuDbgDentry(dentry); + + err = 0; + if (!au_di(dentry)) { -+ d = aufs_lookup(dir, dentry, /*flags*/0); ++ lkup_flags = LOOKUP_OPEN; ++ if (open_flag & O_CREAT) ++ lkup_flags |= LOOKUP_CREATE; ++ d = aufs_lookup(dir, dentry, lkup_flags); + if (IS_ERR(d)) { + err = PTR_ERR(d); ++ AuTraceErr(err); + goto out; + } else if (d) { + /* @@ -15384,8 +15450,8 @@ index 0000000..6f3fedb + * another error will be returned later. 
+ */ + d_drop(d); -+ dput(d); + AuDbgDentry(d); ++ dput(d); + } + AuDbgDentry(dentry); + } @@ -15402,7 +15468,7 @@ index 0000000..6f3fedb + + parent = dentry->d_parent; /* dir is locked */ + di_write_lock_parent(parent); -+ err = au_lkup_dentry(dentry, /*btop*/0, /*type*/0); ++ err = au_lkup_dentry(dentry, /*btop*/0, AuLkup_ALLOW_NEG); + if (unlikely(err)) + goto out_unlock; + @@ -15448,10 +15514,10 @@ index 0000000..6f3fedb + di_write_unlock(parent); + aufs_read_unlock(dentry, AuLock_DW); + AuDbgDentry(dentry); -+ if (unlikely(err)) ++ if (unlikely(err < 0)) + goto out; +out_no_open: -+ if (!err && !(*opened & FILE_CREATED)) { ++ if (err >= 0 && !(*opened & FILE_CREATED)) { + AuLabel(out_no_open); + dget(dentry); + err = finish_no_open(file, dentry); @@ -15598,7 +15664,7 @@ index 0000000..6f3fedb +void au_pin_hdir_unlock(struct au_pin *p) +{ + if (p->hdir) -+ au_hn_imtx_unlock(p->hdir); ++ au_hn_inode_unlock(p->hdir); +} + +int au_pin_hdir_lock(struct au_pin *p) @@ -15610,7 +15676,7 @@ index 0000000..6f3fedb + goto out; + + /* even if an error happens later, keep this lock */ -+ au_hn_imtx_lock_nested(p->hdir, p->lsc_hi); ++ au_hn_inode_lock_nested(p->hdir, p->lsc_hi); + + err = -EBUSY; + if (unlikely(p->hdir->hi_inode != d_inode(p->h_parent))) @@ -15651,17 +15717,17 @@ index 0000000..6f3fedb + return err; +} + -+void au_pin_hdir_set_owner(struct au_pin *p, struct task_struct *task) ++static void au_pin_hdir_set_owner(struct au_pin *p, struct task_struct *task) +{ -+#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP) -+ p->hdir->hi_inode->i_mutex.owner = task; ++#if !defined(CONFIG_RWSEM_GENERIC_SPINLOCK) && defined(CONFIG_RWSEM_SPIN_ON_OWNER) ++ p->hdir->hi_inode->i_rwsem.owner = task; +#endif +} + +void au_pin_hdir_acquire_nest(struct au_pin *p) +{ + if (p->hdir) { -+ mutex_acquire_nest(&p->hdir->hi_inode->i_mutex.dep_map, ++ rwsem_acquire_nest(&p->hdir->hi_inode->i_rwsem.dep_map, + p->lsc_hi, 0, NULL, _RET_IP_); + au_pin_hdir_set_owner(p, current); + } @@ -15671,7 +15737,7 @@ index 0000000..6f3fedb +{ + if (p->hdir) { + au_pin_hdir_set_owner(p, p->task); -+ mutex_release(&p->hdir->hi_inode->i_mutex.dep_map, 1, _RET_IP_); ++ rwsem_release(&p->hdir->hi_inode->i_rwsem.dep_map, 1, _RET_IP_); + } +} + @@ -16065,7 +16131,7 @@ index 0000000..6f3fedb +out_si: + si_read_unlock(sb); +out_kfree: -+ kfree(a); ++ au_delayed_kfree(a); +out: + AuTraceErr(err); + return err; @@ -16100,15 +16166,15 @@ index 0000000..6f3fedb + return err; +} + -+ssize_t au_srxattr(struct dentry *dentry, struct au_srxattr *arg) ++ssize_t au_srxattr(struct dentry *dentry, struct inode *inode, ++ struct au_srxattr *arg) +{ + int err; + struct path h_path; + struct super_block *sb; + struct au_icpup_args *a; -+ struct inode *inode, *h_inode; ++ struct inode *h_inode; + -+ inode = d_inode(dentry); + IMustLock(inode); + + err = -ENOMEM; @@ -16130,6 +16196,7 @@ index 0000000..6f3fedb + inode_unlock(a->h_inode); + switch (arg->type) { + case AU_XATTR_SET: ++ AuDebugOn(d_is_negative(h_path.dentry)); + err = vfsub_setxattr(h_path.dentry, + arg->u.set.name, arg->u.set.value, + arg->u.set.size, arg->u.set.flags); @@ -16157,7 +16224,7 @@ index 0000000..6f3fedb + di_write_unlock(dentry); + si_read_unlock(sb); +out_kfree: -+ kfree(a); ++ au_delayed_kfree(a); +out: + AuTraceErr(err); + return err; @@ -16484,7 +16551,7 @@ index 0000000..6f3fedb +}; diff --git a/fs/aufs/i_op_add.c b/fs/aufs/i_op_add.c new file mode 100644 -index 0000000..e0833c5 +index 0000000..4ce147b --- /dev/null +++ b/fs/aufs/i_op_add.c @@ -0,0 +1,924 @@ @@ 
-16835,7 +16902,7 @@ index 0000000..e0833c5 + if (!try_aopen) + aufs_read_unlock(dentry, AuLock_DW); +out_free: -+ kfree(a); ++ au_delayed_kfree(a); +out: + return err; +} @@ -17299,7 +17366,7 @@ index 0000000..e0833c5 + } + aufs_read_and_write_unlock2(dentry, src_dentry); +out_kfree: -+ kfree(a); ++ au_delayed_kfree(a); +out: + AuTraceErr(err); + return err; @@ -17408,16 +17475,16 @@ index 0000000..e0833c5 + } + aufs_read_unlock(dentry, AuLock_DW); +out_free: -+ kfree(a); ++ au_delayed_kfree(a); +out: + return err; +} diff --git a/fs/aufs/i_op_del.c b/fs/aufs/i_op_del.c new file mode 100644 -index 0000000..3fe30b6 +index 0000000..27c1fb43 --- /dev/null +++ b/fs/aufs/i_op_del.c -@@ -0,0 +1,510 @@ +@@ -0,0 +1,511 @@ +/* + * Copyright (C) 2005-2016 Junjiro R. Okajima + * @@ -17487,7 +17554,8 @@ index 0000000..3fe30b6 + au_di_cp(tmp, dinfo); + au_di_swap(tmp, dinfo); + /* returns the number of positive dentries */ -+ need_wh = au_lkup_dentry(dentry, btop + 1, /*type*/0); ++ need_wh = au_lkup_dentry(dentry, btop + 1, ++ /* AuLkup_IGNORE_PERM */ 0); + au_di_swap(tmp, dinfo); + au_rw_write_unlock(&tmp->di_rwsem); + au_di_free(tmp); @@ -17813,7 +17881,7 @@ index 0000000..3fe30b6 +out_unlock: + aufs_read_unlock(dentry, AuLock_DW); +out_free: -+ kfree(a); ++ au_delayed_kfree(a); +out: + return err; +} @@ -17923,14 +17991,14 @@ index 0000000..3fe30b6 +out_unlock: + aufs_read_unlock(dentry, AuLock_DW); +out_free: -+ kfree(a); ++ au_delayed_kfree(a); +out: + AuTraceErr(err); + return err; +} diff --git a/fs/aufs/i_op_ren.c b/fs/aufs/i_op_ren.c new file mode 100644 -index 0000000..cd2204b +index 0000000..200b4d5 --- /dev/null +++ b/fs/aufs/i_op_ren.c @@ -0,0 +1,1015 @@ @@ -18041,9 +18109,9 @@ index 0000000..cd2204b +{ + int rerr; + -+ au_hn_imtx_lock_nested(a->src_hinode, AuLsc_I_CHILD); ++ au_hn_inode_lock_nested(a->src_hinode, AuLsc_I_CHILD); + rerr = au_diropq_remove(a->src_dentry, a->btgt); -+ au_hn_imtx_unlock(a->src_hinode); ++ au_hn_inode_unlock(a->src_hinode); + au_set_dbdiropq(a->src_dentry, a->src_bdiropq); + if (rerr) + RevertFailure("remove diropq %pd", a->src_dentry); @@ -18198,9 +18266,9 @@ index 0000000..cd2204b + err = 0; + a->src_bdiropq = au_dbdiropq(a->src_dentry); + a->src_hinode = au_hi(a->src_inode, a->btgt); -+ au_hn_imtx_lock_nested(a->src_hinode, AuLsc_I_CHILD); ++ au_hn_inode_lock_nested(a->src_hinode, AuLsc_I_CHILD); + diropq = au_diropq_create(a->src_dentry, a->btgt); -+ au_hn_imtx_unlock(a->src_hinode); ++ au_hn_inode_unlock(a->src_hinode); + if (IS_ERR(diropq)) + err = PTR_ERR(diropq); + else @@ -18944,17 +19012,17 @@ index 0000000..cd2204b + iput(a->dst_inode); + if (a->thargs) + au_whtmp_rmdir_free(a->thargs); -+ kfree(a); ++ au_delayed_kfree(a); +out: + AuTraceErr(err); + return err; +} diff --git a/fs/aufs/iinfo.c b/fs/aufs/iinfo.c new file mode 100644 -index 0000000..29023ce +index 0000000..db04b18 --- /dev/null +++ b/fs/aufs/iinfo.c -@@ -0,0 +1,280 @@ +@@ -0,0 +1,284 @@ +/* + * Copyright (C) 2005-2016 Junjiro R. 
Okajima + * @@ -19092,7 +19160,7 @@ index 0000000..29023ce + struct au_iinfo *iinfo; + aufs_bindex_t bindex, bbot; + -+ AuDebugOn(is_bad_inode(inode)); ++ AuDebugOn(au_is_bad_inode(inode)); + IiMustWriteLock(inode); + + iinfo = au_ii(inode); @@ -19150,6 +19218,7 @@ index 0000000..29023ce +{ + struct au_iinfo *iinfo; + struct super_block *sb; ++ struct au_hinode *hi; + int nbr, i; + + sb = inode->i_sb; @@ -19157,12 +19226,13 @@ index 0000000..29023ce + nbr = au_sbbot(sb) + 1; + if (unlikely(nbr <= 0)) + nbr = 1; -+ iinfo->ii_hinode = kmalloc_array(nbr, sizeof(*iinfo->ii_hinode), -+ GFP_NOFS); -+ if (iinfo->ii_hinode) { ++ hi = kmalloc_array(nbr, sizeof(*iinfo->ii_hinode), GFP_NOFS); ++ if (hi) { + au_ninodes_inc(sb); -+ for (i = 0; i < nbr; i++) -+ au_hinode_init(iinfo->ii_hinode + i); ++ ++ iinfo->ii_hinode = hi; ++ for (i = 0; i < nbr; i++, hi++) ++ au_hinode_init(hi); + + iinfo->ii_generation.ig_generation = au_sigen(sb); + iinfo->ii_btop = -1; @@ -19183,9 +19253,11 @@ index 0000000..29023ce + err = -ENOMEM; + hip = krealloc(iinfo->ii_hinode, sizeof(*hip) * nbr, GFP_NOFS); + if (hip) { -+ for (i = iinfo->ii_bbot + 1; i < nbr; i++) -+ au_hinode_init(hip + i); + iinfo->ii_hinode = hip; ++ i = iinfo->ii_bbot + 1; ++ hip += i; ++ for (; i < nbr; i++, hip++) ++ au_hinode_init(hip); + err = 0; + } + @@ -19200,7 +19272,7 @@ index 0000000..29023ce + aufs_bindex_t bindex, bbot; + const unsigned char unlinked = !inode->i_nlink; + -+ AuDebugOn(is_bad_inode(inode)); ++ AuDebugOn(au_is_bad_inode(inode)); + + sb = inode->i_sb; + au_ninodes_dec(sb); @@ -19220,7 +19292,7 @@ index 0000000..29023ce + + iinfo = au_ii(inode); + if (iinfo->ii_vdir) -+ au_vdir_free(iinfo->ii_vdir); ++ au_vdir_free(iinfo->ii_vdir, /*atonce*/0); + + bindex = iinfo->ii_btop; + if (bindex >= 0) { @@ -19232,12 +19304,12 @@ index 0000000..29023ce + hi++; + } + } -+ kfree(iinfo->ii_hinode); ++ au_delayed_kfree(iinfo->ii_hinode); + AuRwDestroy(&iinfo->ii_rwsem); +} diff --git a/fs/aufs/inode.c b/fs/aufs/inode.c new file mode 100644 -index 0000000..1794574 +index 0000000..2234241 --- /dev/null +++ b/fs/aufs/inode.c @@ -0,0 +1,517 @@ @@ -19290,7 +19362,7 @@ index 0000000..1794574 + struct au_iinfo *iinfo; + struct au_hinode *p, *q, tmp; + -+ AuDebugOn(is_bad_inode(inode)); ++ AuDebugOn(au_is_bad_inode(inode)); + IiMustWriteLock(inode); + + *update = 0; @@ -19760,10 +19832,10 @@ index 0000000..1794574 +} diff --git a/fs/aufs/inode.h b/fs/aufs/inode.h new file mode 100644 -index 0000000..38db636 +index 0000000..f433330 --- /dev/null +++ b/fs/aufs/inode.h -@@ -0,0 +1,689 @@ +@@ -0,0 +1,700 @@ +/* + * Copyright (C) 2005-2016 Junjiro R. 
Okajima + * @@ -19801,7 +19873,10 @@ index 0000000..38db636 + /* never use fsnotify_add_vfsmount_mark() */ + struct fsnotify_mark hn_mark; +#endif -+ struct inode *hn_aufs_inode; /* no get/put */ ++ union { ++ struct inode *hn_aufs_inode; /* no get/put */ ++ struct llist_node hn_lnode; /* delayed free */ ++ }; +#endif +} ____cacheline_aligned_in_smp; + @@ -19844,7 +19919,10 @@ index 0000000..38db636 +struct au_icntnr { + struct au_iinfo iinfo; + struct inode vfs_inode; -+ struct hlist_node plink; ++ union { ++ struct hlist_node plink; ++ struct llist_node lnode; /* delayed free */ ++ }; +} ____cacheline_aligned_in_smp; + +/* au_pin flags */ @@ -19877,7 +19955,6 @@ index 0000000..38db636 +void au_pin_hdir_unlock(struct au_pin *p); +int au_pin_hdir_lock(struct au_pin *p); +int au_pin_hdir_relock(struct au_pin *p); -+void au_pin_hdir_set_owner(struct au_pin *p, struct task_struct *task); +void au_pin_hdir_acquire_nest(struct au_pin *p); +void au_pin_hdir_release(struct au_pin *p); + @@ -20072,10 +20149,10 @@ index 0000000..38db636 +int au_cpup_xattr(struct dentry *h_dst, struct dentry *h_src, int ignore_flags, + unsigned int verbose); +ssize_t aufs_listxattr(struct dentry *dentry, char *list, size_t size); -+ssize_t aufs_getxattr(struct dentry *dentry, const char *name, void *value, -+ size_t size); -+int aufs_setxattr(struct dentry *dentry, const char *name, const void *value, -+ size_t size, int flags); ++ssize_t aufs_getxattr(struct dentry *dentry, struct inode *inode, ++ const char *name, void *value, size_t size); ++int aufs_setxattr(struct dentry *dentry, struct inode *inode, const char *name, ++ const void *value, size_t size, int flags); +int aufs_removexattr(struct dentry *dentry, const char *name); + +/* void au_xattr_init(struct super_block *sb); */ @@ -20115,7 +20192,8 @@ index 0000000..38db636 + } acl_set; + } u; +}; -+ssize_t au_srxattr(struct dentry *dentry, struct au_srxattr *arg); ++ssize_t au_srxattr(struct dentry *dentry, struct inode *inode, ++ struct au_srxattr *arg); +#endif + +/* ---------------------------------------------------------------------- */ @@ -20246,6 +20324,11 @@ index 0000000..38db636 + return iinfo->ii_hinode + bindex; +} + ++static inline int au_is_bad_inode(struct inode *inode) ++{ ++ return !!(is_bad_inode(inode) || !au_hinode(au_ii(inode), 0)); ++} ++ +static inline aufs_bindex_t au_ii_br_id(struct inode *inode, + aufs_bindex_t bindex) +{ @@ -20432,20 +20515,20 @@ index 0000000..38db636 + au_hn_ctl(hdir, /*do_set*/1); +} + -+static inline void au_hn_imtx_lock(struct au_hinode *hdir) ++static inline void au_hn_inode_lock(struct au_hinode *hdir) +{ + inode_lock(hdir->hi_inode); + au_hn_suspend(hdir); +} + -+static inline void au_hn_imtx_lock_nested(struct au_hinode *hdir, ++static inline void au_hn_inode_lock_nested(struct au_hinode *hdir, + unsigned int sc __maybe_unused) +{ + inode_lock_nested(hdir->hi_inode, sc); + au_hn_suspend(hdir); +} + -+static inline void au_hn_imtx_unlock(struct au_hinode *hdir) ++static inline void au_hn_inode_unlock(struct au_hinode *hdir) +{ + au_hn_resume(hdir); + inode_unlock(hdir->hi_inode); @@ -20680,7 +20763,7 @@ index 0000000..fc5529b +#endif diff --git a/fs/aufs/loop.c b/fs/aufs/loop.c new file mode 100644 -index 0000000..8a69754 +index 0000000..77df479 --- /dev/null +++ b/fs/aufs/loop.c @@ -0,0 +1,146 @@ @@ -20828,7 +20911,7 @@ index 0000000..8a69754 +{ + if (backing_file_func) + symbol_put(loop_backing_file); -+ kfree(au_warn_loopback_array); ++ au_delayed_kfree(au_warn_loopback_array); +} diff --git 
a/fs/aufs/loop.h b/fs/aufs/loop.h new file mode 100644 @@ -20926,10 +21009,10 @@ index 0000000..4f83bdf +endif diff --git a/fs/aufs/module.c b/fs/aufs/module.c new file mode 100644 -index 0000000..4a2e668 +index 0000000..01e55ab --- /dev/null +++ b/fs/aufs/module.c -@@ -0,0 +1,223 @@ +@@ -0,0 +1,289 @@ +/* + * Copyright (C) 2005-2016 Junjiro R. Okajima + * @@ -20967,17 +21050,64 @@ index 0000000..4a2e668 +} + +/* ---------------------------------------------------------------------- */ -+ +/* + * aufs caches + */ -+struct kmem_cache *au_cachep[AuCache_Last] = { -+ [0] = NULL -+}; ++ ++struct au_dfree au_dfree; ++ ++/* delayed free */ ++static void au_do_dfree(struct work_struct *work __maybe_unused) ++{ ++ struct llist_head *head; ++ struct llist_node *node, *next; ++ ++#define AU_CACHE_DFREE_DO_BODY(name, idx, lnode) do { \ ++ head = &au_dfree.cache[AuCache_##idx].llist; \ ++ node = llist_del_all(head); \ ++ for (; node; node = next) { \ ++ struct au_##name *p \ ++ = llist_entry(node, struct au_##name, \ ++ lnode); \ ++ next = llist_next(node); \ ++ au_cache_free_##name(p); \ ++ } \ ++ } while (0) ++ ++ AU_CACHE_DFREE_DO_BODY(dinfo, DINFO, di_lnode); ++ AU_CACHE_DFREE_DO_BODY(icntnr, ICNTNR, lnode); ++ AU_CACHE_DFREE_DO_BODY(finfo, FINFO, fi_lnode); ++ AU_CACHE_DFREE_DO_BODY(vdir, VDIR, vd_lnode); ++ AU_CACHE_DFREE_DO_BODY(vdir_dehstr, DEHSTR, lnode); ++#ifdef CONFIG_AUFS_HNOTIFY ++ AU_CACHE_DFREE_DO_BODY(hnotify, HNOTIFY, hn_lnode); ++#endif ++ ++#define AU_DFREE_DO_BODY(llist, func) do { \ ++ node = llist_del_all(llist); \ ++ for (; node; node = next) { \ ++ next = llist_next(node); \ ++ func(node); \ ++ } \ ++ } while (0) ++ ++ AU_DFREE_DO_BODY(au_dfree.llist + AU_DFREE_KFREE, kfree); ++ AU_DFREE_DO_BODY(au_dfree.llist + AU_DFREE_FREE_PAGE, au_free_page); ++ ++#undef AU_CACHE_DFREE_DO_BODY ++#undef AU_DFREE_DO_BODY ++} ++ ++AU_CACHE_DFREE_FUNC(dinfo, DINFO, di_lnode); ++AU_CACHE_DFREE_FUNC(icntnr, ICNTNR, lnode); ++AU_CACHE_DFREE_FUNC(finfo, FINFO, fi_lnode); ++AU_CACHE_DFREE_FUNC(vdir, VDIR, vd_lnode); ++AU_CACHE_DFREE_FUNC(vdir_dehstr, DEHSTR, lnode); + +static void au_cache_fin(void) +{ + int i; ++ struct au_cache *cp; + + /* + * Make sure all delayed rcu free inodes are flushed before we @@ -20987,27 +21117,33 @@ index 0000000..4a2e668 + + /* excluding AuCache_HNOTIFY */ + BUILD_BUG_ON(AuCache_HNOTIFY + 1 != AuCache_Last); ++ flush_delayed_work(&au_dfree.dwork); + for (i = 0; i < AuCache_HNOTIFY; i++) { -+ kmem_cache_destroy(au_cachep[i]); -+ au_cachep[i] = NULL; ++ cp = au_dfree.cache + i; ++ AuDebugOn(!llist_empty(&cp->llist)); ++ kmem_cache_destroy(cp->cache); ++ cp->cache = NULL; + } +} + +static int __init au_cache_init(void) +{ -+ au_cachep[AuCache_DINFO] = AuCacheCtor(au_dinfo, au_di_init_once); -+ if (au_cachep[AuCache_DINFO]) ++ struct au_cache *cp; ++ ++ cp = au_dfree.cache; ++ cp[AuCache_DINFO].cache = AuCacheCtor(au_dinfo, au_di_init_once); ++ if (cp[AuCache_DINFO].cache) + /* SLAB_DESTROY_BY_RCU */ -+ au_cachep[AuCache_ICNTNR] = AuCacheCtor(au_icntnr, -+ au_icntnr_init_once); -+ if (au_cachep[AuCache_ICNTNR]) -+ au_cachep[AuCache_FINFO] = AuCacheCtor(au_finfo, -+ au_fi_init_once); -+ if (au_cachep[AuCache_FINFO]) -+ au_cachep[AuCache_VDIR] = AuCache(au_vdir); -+ if (au_cachep[AuCache_VDIR]) -+ au_cachep[AuCache_DEHSTR] = AuCache(au_vdir_dehstr); -+ if (au_cachep[AuCache_DEHSTR]) ++ cp[AuCache_ICNTNR].cache = AuCacheCtor(au_icntnr, ++ au_icntnr_init_once); ++ if (cp[AuCache_ICNTNR].cache) ++ cp[AuCache_FINFO].cache = AuCacheCtor(au_finfo, ++ au_fi_init_once); ++ if 
(cp[AuCache_FINFO].cache) ++ cp[AuCache_VDIR].cache = AuCache(au_vdir); ++ if (cp[AuCache_VDIR].cache) ++ cp[AuCache_DEHSTR].cache = AuCache(au_vdir_dehstr); ++ if (cp[AuCache_DEHSTR].cache) + return 0; + + au_cache_fin(); @@ -21070,6 +21206,7 @@ index 0000000..4a2e668 +{ + int err, i; + char *p; ++ struct au_cache *cp; + + p = au_esc_chars; + for (i = 1; i <= ' '; i++) @@ -21084,6 +21221,16 @@ index 0000000..4a2e668 + for (i = 0; i < AuIop_Last; i++) + aufs_iop_nogetattr[i].getattr = NULL; + ++ /* First, initialize au_dfree */ ++ for (i = 0; i < AuCache_Last; i++) { /* including hnotify */ ++ cp = au_dfree.cache + i; ++ cp->cache = NULL; ++ init_llist_head(&cp->llist); ++ } ++ for (i = 0; i < AU_DFREE_Last; i++) ++ init_llist_head(au_dfree.llist + i); ++ INIT_DELAYED_WORK(&au_dfree.dwork, au_do_dfree); ++ + au_sbilist_init(); + sysaufs_brs_init(); + au_debug_init(); @@ -21134,6 +21281,7 @@ index 0000000..4a2e668 +out_sysaufs: + sysaufs_fin(); + au_dy_fin(); ++ flush_delayed_work(&au_dfree.dwork); +out: + return err; +} @@ -21149,16 +21297,17 @@ index 0000000..4a2e668 + au_procfs_fin(); + sysaufs_fin(); + au_dy_fin(); ++ flush_delayed_work(&au_dfree.dwork); +} + +module_init(aufs_init); +module_exit(aufs_exit); diff --git a/fs/aufs/module.h b/fs/aufs/module.h new file mode 100644 -index 0000000..21344c6 +index 0000000..c81b221 --- /dev/null +++ b/fs/aufs/module.h -@@ -0,0 +1,89 @@ +@@ -0,0 +1,144 @@ +/* + * Copyright (C) 2005-2016 Junjiro R. Okajima + * @@ -21186,6 +21335,7 @@ index 0000000..21344c6 +#ifdef __KERNEL__ + +#include ++#include "debug.h" + +struct path; +struct seq_file; @@ -21212,7 +21362,7 @@ index 0000000..21344c6 + +/* ---------------------------------------------------------------------- */ + -+/* kmem cache */ ++/* kmem cache and delayed free */ +enum { + AuCache_DINFO, + AuCache_ICNTNR, @@ -21223,19 +21373,54 @@ index 0000000..21344c6 + AuCache_Last +}; + ++enum { ++ AU_DFREE_KFREE, ++ AU_DFREE_FREE_PAGE, ++ AU_DFREE_Last ++}; ++ ++struct au_cache { ++ struct kmem_cache *cache; ++ struct llist_head llist; /* delayed free */ ++}; ++ ++/* ++ * in order to reduce the cost of the internal timer, consolidate all the ++ * delayed free works into a single delayed_work. 
++ */ ++struct au_dfree { ++ struct au_cache cache[AuCache_Last]; ++ struct llist_head llist[AU_DFREE_Last]; ++ struct delayed_work dwork; ++}; ++ ++extern struct au_dfree au_dfree; ++ +#define AuCacheFlags (SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD) +#define AuCache(type) KMEM_CACHE(type, AuCacheFlags) +#define AuCacheCtor(type, ctor) \ + kmem_cache_create(#type, sizeof(struct type), \ + __alignof__(struct type), AuCacheFlags, ctor) + -+extern struct kmem_cache *au_cachep[]; ++#define AU_DFREE_DELAY msecs_to_jiffies(10) ++#define AU_DFREE_BODY(lnode, llist) do { \ ++ if (llist_add(lnode, llist)) \ ++ schedule_delayed_work(&au_dfree.dwork, \ ++ AU_DFREE_DELAY); \ ++ } while (0) ++#define AU_CACHE_DFREE_FUNC(name, idx, lnode) \ ++ void au_cache_dfree_##name(struct au_##name *p) \ ++ { \ ++ struct au_cache *cp = au_dfree.cache + AuCache_##idx; \ ++ AU_DFREE_BODY(&p->lnode, &cp->llist); \ ++ } + +#define AuCacheFuncs(name, index) \ +static inline struct au_##name *au_cache_alloc_##name(void) \ -+{ return kmem_cache_alloc(au_cachep[AuCache_##index], GFP_NOFS); } \ ++{ return kmem_cache_alloc(au_dfree.cache[AuCache_##index].cache, GFP_NOFS); } \ +static inline void au_cache_free_##name(struct au_##name *p) \ -+{ kmem_cache_free(au_cachep[AuCache_##index], p); } ++{ kmem_cache_free(au_dfree.cache[AuCache_##index].cache, p); } \ ++void au_cache_dfree_##name(struct au_##name *p) + +AuCacheFuncs(dinfo, DINFO); +AuCacheFuncs(icntnr, ICNTNR); @@ -21246,14 +21431,33 @@ index 0000000..21344c6 +AuCacheFuncs(hnotify, HNOTIFY); +#endif + ++static inline void au_delayed_kfree(const void *p) ++{ ++ AuDebugOn(!p); ++ AuDebugOn(ksize(p) < sizeof(struct llist_node)); ++ ++ AU_DFREE_BODY((void *)p, au_dfree.llist + AU_DFREE_KFREE); ++} ++ ++/* cast only */ ++static inline void au_free_page(void *p) ++{ ++ free_page((unsigned long)p); ++} ++ ++static inline void au_delayed_free_page(unsigned long addr) ++{ ++ AU_DFREE_BODY((void *)addr, au_dfree.llist + AU_DFREE_FREE_PAGE); ++} ++ +#endif /* __KERNEL__ */ +#endif /* __AUFS_MODULE_H__ */ diff --git a/fs/aufs/mvdown.c b/fs/aufs/mvdown.c new file mode 100644 -index 0000000..9aff9fc +index 0000000..802ef09 --- /dev/null +++ b/fs/aufs/mvdown.c -@@ -0,0 +1,703 @@ +@@ -0,0 +1,704 @@ +/* + * Copyright (C) 2011-2016 Junjiro R. Okajima + * @@ -21720,7 +21924,8 @@ index 0000000..9aff9fc + au_di_swap(tmp, dinfo); + + /* returns the number of positive dentries */ -+ err = au_lkup_dentry(a->dentry, a->mvd_bsrc + 1, /*type*/0); ++ err = au_lkup_dentry(a->dentry, a->mvd_bsrc + 1, ++ /* AuLkup_IGNORE_PERM */ 0); + if (!err) + a->bwh = au_dbwh(a->dentry); + else if (err > 0) @@ -21952,17 +22157,17 @@ index 0000000..9aff9fc + e = copy_to_user(uarg, &args->mvdown, sizeof(args->mvdown)); + if (unlikely(e)) + err = -EFAULT; -+ kfree(args); ++ au_delayed_kfree(args); +out: + AuTraceErr(err); + return err; +} diff --git a/fs/aufs/opts.c b/fs/aufs/opts.c new file mode 100644 -index 0000000..6d53b2e +index 0000000..ddb2965 --- /dev/null +++ b/fs/aufs/opts.c -@@ -0,0 +1,1859 @@ +@@ -0,0 +1,1860 @@ +/* + * Copyright (C) 2005-2016 Junjiro R. 
Okajima + * @@ -23221,7 +23426,7 @@ index 0000000..6d53b2e + } + } + -+ kfree(a); ++ au_delayed_kfree(a); + dump_opts(opts); + if (unlikely(err)) + au_opts_free(opts); @@ -23634,16 +23839,17 @@ index 0000000..6d53b2e + continue; + + hdir = au_hi(dir, bindex); -+ au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT); ++ au_hn_inode_lock_nested(hdir, AuLsc_I_PARENT); + if (wbr) + wbr_wh_write_lock(wbr); + err = au_wh_init(br, sb); + if (wbr) + wbr_wh_write_unlock(wbr); -+ au_hn_imtx_unlock(hdir); ++ au_hn_inode_unlock(hdir); + + if (!err && do_free) { -+ kfree(wbr); ++ if (wbr) ++ au_delayed_kfree(wbr); + br->br_wbr = NULL; + } + } @@ -24041,10 +24247,10 @@ index 0000000..8d0c534 +#endif /* __AUFS_OPTS_H__ */ diff --git a/fs/aufs/plink.c b/fs/aufs/plink.c new file mode 100644 -index 0000000..a565a11 +index 0000000..8a816b9 --- /dev/null +++ b/fs/aufs/plink.c -@@ -0,0 +1,502 @@ +@@ -0,0 +1,514 @@ +/* + * Copyright (C) 2005-2016 Junjiro R. Okajima + * @@ -24092,6 +24298,7 @@ index 0000000..a565a11 +{ + int err; + pid_t pid, ppid; ++ struct task_struct *parent, *prev; + struct au_sbinfo *sbi; + + SiMustAnyLock(sb); @@ -24106,11 +24313,22 @@ index 0000000..a565a11 + goto out; + + /* todo: it highly depends upon /sbin/mount.aufs */ ++ prev = NULL; ++ parent = current; ++ ppid = 0; + rcu_read_lock(); -+ ppid = task_pid_vnr(rcu_dereference(current->real_parent)); ++ while (1) { ++ parent = rcu_dereference(parent->real_parent); ++ if (parent == prev) ++ break; ++ ppid = task_pid_vnr(parent); ++ if (pid == ppid) { ++ rcu_read_unlock(); ++ goto out; ++ } ++ prev = parent; ++ } + rcu_read_unlock(); -+ if (pid == ppid) -+ goto out; + + if (au_ftest_lock(flags, NOPLMW)) { + /* if there is no i_mutex lock in VFS, we don't need to wait */ @@ -24607,7 +24825,7 @@ index 0000000..720b2ed +} diff --git a/fs/aufs/posix_acl.c b/fs/aufs/posix_acl.c new file mode 100644 -index 0000000..c44f63c +index 0000000..7981c43 --- /dev/null +++ b/fs/aufs/posix_acl.c @@ -0,0 +1,98 @@ @@ -24684,7 +24902,8 @@ index 0000000..c44f63c + }, + }; + -+ inode_lock(inode); ++ IMustLock(inode); ++ + if (inode->i_ino == AUFS_ROOT_INO) + dentry = dget(inode->i_sb->s_root); + else { @@ -24699,14 +24918,13 @@ index 0000000..c44f63c + } + } + -+ ssz = au_srxattr(dentry, &arg); ++ ssz = au_srxattr(dentry, inode, &arg); + dput(dentry); + err = ssz; + if (ssz >= 0) + err = 0; + +out: -+ inode_unlock(inode); + return err; +} diff --git a/fs/aufs/procfs.c b/fs/aufs/procfs.c @@ -24886,10 +25104,10 @@ index 0000000..a334330 +} diff --git a/fs/aufs/rdu.c b/fs/aufs/rdu.c new file mode 100644 -index 0000000..3774f4e +index 0000000..238c568 --- /dev/null +++ b/fs/aufs/rdu.c -@@ -0,0 +1,389 @@ +@@ -0,0 +1,381 @@ +/* + * Copyright (C) 2005-2016 Junjiro R. 
Okajima + * @@ -25040,7 +25258,7 @@ index 0000000..3774f4e + arg.end += rdu->sz; + + err = -ENOTDIR; -+ if (unlikely(!file->f_op->iterate)) ++ if (unlikely(!file->f_op->iterate && !file->f_op->iterate_shared)) + goto out; + + err = security_file_permission(file, MAY_READ); @@ -25050,15 +25268,7 @@ index 0000000..3774f4e + + dentry = file->f_path.dentry; + inode = d_inode(dentry); -+#if 1 -+ inode_lock(inode); -+#else -+ /* todo: create a new inline func inode_lock_killable() */ -+ err = mutex_lock_killable(&inode->i_mutex); -+ AuTraceErr(err); -+ if (unlikely(err)) -+ goto out; -+#endif ++ inode_lock_shared(inode); + + arg.sb = inode->i_sb; + err = si_read_lock(arg.sb, AuLock_FLUSH | AuLock_NOPLM); @@ -25115,7 +25325,7 @@ index 0000000..3774f4e +out_si: + si_read_unlock(arg.sb); +out_mtx: -+ inode_unlock(inode); ++ inode_unlock_shared(inode); +out: + AuTraceErr(err); + return err; @@ -25485,10 +25695,10 @@ index 0000000..678fe6f +#endif /* __AUFS_RWSEM_H__ */ diff --git a/fs/aufs/sbinfo.c b/fs/aufs/sbinfo.c new file mode 100644 -index 0000000..3dd9fd6 +index 0000000..3859f7f --- /dev/null +++ b/fs/aufs/sbinfo.c -@@ -0,0 +1,353 @@ +@@ -0,0 +1,354 @@ +/* + * Copyright (C) 2005-2016 Junjiro R. Okajima + * @@ -25524,7 +25734,7 @@ index 0000000..3dd9fd6 + sbinfo = container_of(kobj, struct au_sbinfo, si_kobj); + for (i = 0; i < AuPlink_NHASH; i++) + AuDebugOn(!hlist_empty(&sbinfo->si_plink[i].head)); -+ au_nwt_fin(&sbinfo->si_nowait); ++ AuDebugOn(atomic_read(&sbinfo->si_nowait.nw_len)); + + AuDebugOn(percpu_counter_sum(&sbinfo->si_ninodes)); + percpu_counter_destroy(&sbinfo->si_ninodes); @@ -25535,14 +25745,15 @@ index 0000000..3dd9fd6 + au_br_free(sbinfo); + au_rw_write_unlock(&sbinfo->si_rwsem); + -+ kfree(sbinfo->si_branch); ++ au_delayed_kfree(sbinfo->si_branch); + for (i = 0; i < AU_NPIDMAP; i++) -+ kfree(sbinfo->au_si_pid.pid_bitmap[i]); ++ if (sbinfo->au_si_pid.pid_bitmap[i]) ++ au_delayed_kfree(sbinfo->au_si_pid.pid_bitmap[i]); + mutex_destroy(&sbinfo->au_si_pid.pid_mtx); + mutex_destroy(&sbinfo->si_xib_mtx); + AuRwDestroy(&sbinfo->si_rwsem); + -+ kfree(sbinfo); ++ au_delayed_kfree(sbinfo); +} + +int au_si_alloc(struct super_block *sb) @@ -25614,9 +25825,9 @@ index 0000000..3dd9fd6 + return 0; /* success */ + +out_br: -+ kfree(sbinfo->si_branch); ++ au_delayed_kfree(sbinfo->si_branch); +out_sbinfo: -+ kfree(sbinfo); ++ au_delayed_kfree(sbinfo); +out: + return err; +} @@ -25844,10 +26055,10 @@ index 0000000..3dd9fd6 +} diff --git a/fs/aufs/spl.h b/fs/aufs/spl.h new file mode 100644 -index 0000000..945343a +index 0000000..411f6c8 --- /dev/null +++ b/fs/aufs/spl.h -@@ -0,0 +1,111 @@ +@@ -0,0 +1,113 @@ +/* + * Copyright (C) 2005-2016 Junjiro R. Okajima + * @@ -25874,6 +26085,7 @@ index 0000000..945343a + +#ifdef __KERNEL__ + ++#if 0 +struct au_splhead { + spinlock_t spin; + struct list_head head; @@ -25906,6 +26118,7 @@ index 0000000..945343a + list_del_rcu(list); + spin_unlock(&spl->spin); +} ++#endif + +/* ---------------------------------------------------------------------- */ + @@ -25961,10 +26174,10 @@ index 0000000..945343a +#endif /* __AUFS_SPL_H__ */ diff --git a/fs/aufs/super.c b/fs/aufs/super.c new file mode 100644 -index 0000000..ff7e582 +index 0000000..093fba6 --- /dev/null +++ b/fs/aufs/super.c -@@ -0,0 +1,1039 @@ +@@ -0,0 +1,1038 @@ +/* + * Copyright (C) 2005-2016 Junjiro R. 
Okajima + * @@ -26003,6 +26216,7 @@ index 0000000..ff7e582 + if (c) { + au_icntnr_init(c); + c->vfs_inode.i_version = 1; /* sigen(sb); */ ++ c->iinfo.ii_hinode = NULL; + return &c->vfs_inode; + } + return NULL; @@ -26012,13 +26226,12 @@ index 0000000..ff7e582 +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + -+ INIT_HLIST_HEAD(&inode->i_dentry); -+ au_cache_free_icntnr(container_of(inode, struct au_icntnr, vfs_inode)); ++ au_cache_dfree_icntnr(container_of(inode, struct au_icntnr, vfs_inode)); +} + +static void aufs_destroy_inode(struct inode *inode) +{ -+ if (!is_bad_inode(inode)) ++ if (!au_is_bad_inode(inode)) + au_iinfo_fin(inode); + call_rcu(&inode->i_rcu, aufs_destroy_inode_cb); +} @@ -26065,11 +26278,12 @@ index 0000000..ff7e582 + + err = 0; + bbot = au_sbbot(sb); -+ hdp = au_di(sb->s_root)->di_hdentry; -+ for (bindex = 0; !err && bindex <= bbot; bindex++) { ++ bindex = 0; ++ hdp = au_hdentry(au_di(sb->s_root), bindex); ++ for (; !err && bindex <= bbot; bindex++, hdp++) { + br = au_sbr(sb, bindex); + path.mnt = au_br_mnt(br); -+ path.dentry = hdp[bindex].hd_dentry; ++ path.dentry = hdp->hd_dentry; + err = au_seq_path(seq, &path); + if (!err) { + au_optstr_br_perm(&perm, br->br_perm); @@ -26144,7 +26358,6 @@ index 0000000..ff7e582 + struct qstr *name; + struct file *f; + struct dentry *d, *h_root; -+ struct au_hdentry *hdp; + + AuRwMustAnyLock(&sbinfo->si_rwsem); + @@ -26158,8 +26371,7 @@ index 0000000..ff7e582 + brid = au_xino_brid(sb); + if (brid >= 0) { + bindex = au_br_index(sb, brid); -+ hdp = au_di(sb->s_root)->di_hdentry; -+ h_root = hdp[0 + bindex].hd_dentry; ++ h_root = au_hdentry(au_di(sb->s_root), bindex)->hd_dentry; + } + d = f->f_path.dentry; + name = &d->d_name; @@ -26483,7 +26695,7 @@ index 0000000..ff7e582 + head = arg; + spin_lock(&sb->s_inode_list_lock); + list_for_each_entry(inode, head, i_sb_list) { -+ if (!is_bad_inode(inode) ++ if (!au_is_bad_inode(inode) + && au_ii(inode)->ii_btop >= 0) { + spin_lock(&inode->i_lock); + if (atomic_read(&inode->i_count)) { @@ -26796,7 +27008,7 @@ index 0000000..ff7e582 +out_mtx: + inode_unlock(inode); +out_opts: -+ free_page((unsigned long)opts.opt); ++ au_delayed_free_page((unsigned long)opts.opt); +out: + err = cvt_err(err); + AuTraceErr(err); @@ -26937,7 +27149,7 @@ index 0000000..ff7e582 + kobject_put(&sbinfo->si_kobj); + sb->s_fs_info = NULL; +out_opts: -+ free_page((unsigned long)opts.opt); ++ au_delayed_free_page((unsigned long)opts.opt); +out: + AuTraceErr(err); + err = cvt_err(err); @@ -27867,7 +28079,7 @@ index 0000000..14975c9 +#endif /* __SYSAUFS_H__ */ diff --git a/fs/aufs/sysfs.c b/fs/aufs/sysfs.c new file mode 100644 -index 0000000..ff1c510 +index 0000000..36e66d6 --- /dev/null +++ b/fs/aufs/sysfs.c @@ -0,0 +1,376 @@ @@ -28082,7 +28294,7 @@ index 0000000..ff1c510 + if (unlikely(err == PAGE_SIZE)) + err = -EFBIG; + } -+ kfree(seq); ++ au_delayed_kfree(seq); +out_unlock: + si_read_unlock(sb); +out: @@ -28153,9 +28365,9 @@ index 0000000..ff1c510 + err = -EFAULT; + +out_seq: -+ kfree(seq); ++ au_delayed_kfree(seq); +out_buf: -+ free_page((unsigned long)buf); ++ au_delayed_free_page((unsigned long)buf); +out: + si_read_unlock(sb); + return err; @@ -28412,10 +28624,10 @@ index 0000000..cbebb37 +} diff --git a/fs/aufs/vdir.c b/fs/aufs/vdir.c new file mode 100644 -index 0000000..fc5ae1e +index 0000000..1fe1c42 --- /dev/null +++ b/fs/aufs/vdir.c -@@ -0,0 +1,888 @@ +@@ -0,0 +1,899 @@ +/* + * Copyright (C) 2005-2016 Junjiro R. 
Okajima + * @@ -28528,7 +28740,7 @@ index 0000000..fc5ae1e + struct hlist_node *node; + + hlist_for_each_entry_safe(pos, node, head, wh_hash) -+ kfree(pos); ++ au_delayed_kfree(pos); +} + +static void au_nhash_de_do_free(struct hlist_head *head) @@ -28537,7 +28749,7 @@ index 0000000..fc5ae1e + struct hlist_node *node; + + hlist_for_each_entry_safe(pos, node, head, hash) -+ au_cache_free_vdir_dehstr(pos); ++ au_cache_dfree_vdir_dehstr(pos); +} + +static void au_nhash_do_free(struct au_nhash *nhash, @@ -28555,7 +28767,7 @@ index 0000000..fc5ae1e + nhash_count(head); + free(head++); + } -+ kfree(nhash->nh_head); ++ au_delayed_kfree(nhash->nh_head); +} + +void au_nhash_wh_free(struct au_nhash *whlist) @@ -28598,6 +28810,8 @@ index 0000000..fc5ae1e + AuDebugOn(!nhash->nh_num || !nhash->nh_head); + + v = 0; ++ if (len > 8) ++ len = 8; + while (len--) + v += *name++; + /* v = hash_long(v, magic_bit); */ @@ -28766,15 +28980,23 @@ index 0000000..fc5ae1e + +/* ---------------------------------------------------------------------- */ + -+void au_vdir_free(struct au_vdir *vdir) ++void au_vdir_free(struct au_vdir *vdir, int atonce) +{ + unsigned char **deblk; + + deblk = vdir->vd_deblk; -+ while (vdir->vd_nblk--) -+ kfree(*deblk++); -+ kfree(vdir->vd_deblk); -+ au_cache_free_vdir(vdir); ++ if (!atonce) { ++ while (vdir->vd_nblk--) ++ au_delayed_kfree(*deblk++); ++ au_delayed_kfree(vdir->vd_deblk); ++ au_cache_dfree_vdir(vdir); ++ } else { ++ /* not delayed */ ++ while (vdir->vd_nblk--) ++ kfree(*deblk++); ++ kfree(vdir->vd_deblk); ++ au_cache_free_vdir(vdir); ++ } +} + +static struct au_vdir *alloc_vdir(struct file *file) @@ -28808,10 +29030,10 @@ index 0000000..fc5ae1e + if (!err) + return vdir; /* success */ + -+ kfree(vdir->vd_deblk); ++ au_delayed_kfree(vdir->vd_deblk); + +out_free: -+ au_cache_free_vdir(vdir); ++ au_cache_dfree_vdir(vdir); +out: + vdir = ERR_PTR(err); + return vdir; @@ -28823,7 +29045,7 @@ index 0000000..fc5ae1e + union au_vdir_deblk_p p, deblk_end; + + while (vdir->vd_nblk > 1) { -+ kfree(vdir->vd_deblk[vdir->vd_nblk - 1]); ++ au_delayed_kfree(vdir->vd_deblk[vdir->vd_nblk - 1]); + /* vdir->vd_deblk[vdir->vd_nblk - 1] = NULL; */ + vdir->vd_nblk--; + } @@ -28954,7 +29176,7 @@ index 0000000..fc5ae1e + } + } + -+ free_page((unsigned long)o); ++ au_delayed_free_page((unsigned long)o); + +out: + AuTraceErr(err); @@ -29055,6 +29277,7 @@ index 0000000..fc5ae1e + err = 0; + inode = file_inode(file); + IMustLock(inode); ++ IiMustWriteLock(inode); + SiMustAnyLock(inode->i_sb); + + allocated = NULL; @@ -29092,7 +29315,7 @@ index 0000000..fc5ae1e + if (allocated) + au_set_ivdir(inode, allocated); + } else if (allocated) -+ au_vdir_free(allocated); ++ au_vdir_free(allocated, /*atonce*/0); + +out: + return err; @@ -29186,7 +29409,7 @@ index 0000000..fc5ae1e + if (allocated) + au_set_fvdir_cache(file, allocated); + } else if (allocated) -+ au_vdir_free(allocated); ++ au_vdir_free(allocated, /*atonce*/0); + +out: + return err; @@ -30196,10 +30419,10 @@ index 0000000..2d01dd8 +} diff --git a/fs/aufs/vfsub.h b/fs/aufs/vfsub.h new file mode 100644 -index 0000000..b0f1c53 +index 0000000..69c60e9 --- /dev/null +++ b/fs/aufs/vfsub.h -@@ -0,0 +1,310 @@ +@@ -0,0 +1,316 @@ +/* + * Copyright (C) 2005-2016 Junjiro R. 
Okajima + * @@ -30398,6 +30621,12 @@ index 0000000..b0f1c53 + return flags; +} + ++static inline int vfsub_file_execed(struct file *file) ++{ ++ /* todo: direct access f_flags */ ++ return !!(vfsub_file_flags(file) & __FMODE_EXEC); ++} ++ +#if 0 /* reserved */ +static inline void vfsub_file_accessed(struct file *h_file) +{ @@ -30422,7 +30651,7 @@ index 0000000..b0f1c53 +static inline int vfsub_update_time(struct inode *h_inode, struct timespec *ts, + int flags) +{ -+ return generic_update_time(h_inode, ts, flags); ++ return update_time(h_inode, ts, flags); + /* no vfsub_update_h_iattr() since we don't have struct path */ +} + @@ -30512,7 +30741,7 @@ index 0000000..b0f1c53 +#endif /* __AUFS_VFSUB_H__ */ diff --git a/fs/aufs/wbr_policy.c b/fs/aufs/wbr_policy.c new file mode 100644 -index 0000000..22f4cef +index 0000000..9e508d8 --- /dev/null +++ b/fs/aufs/wbr_policy.c @@ -0,0 +1,765 @@ @@ -30979,7 +31208,7 @@ index 0000000..22f4cef + + mfs->mfsrr_bytes = bavail; + AuDbg("b%d\n", mfs->mfs_bindex); -+ kfree(st); ++ au_delayed_kfree(st); +} + +static int au_wbr_create_mfs(struct dentry *dentry, unsigned int flags) @@ -31283,7 +31512,7 @@ index 0000000..22f4cef +}; diff --git a/fs/aufs/whout.c b/fs/aufs/whout.c new file mode 100644 -index 0000000..4a781db +index 0000000..cdd3a8f --- /dev/null +++ b/fs/aufs/whout.c @@ -0,0 +1,1060 @@ @@ -31451,7 +31680,7 @@ index 0000000..4a781db + +out_name: + if (name != defname) -+ kfree(name); ++ au_delayed_kfree(name); +out: + AuTraceErrPtr(dentry); + return dentry; @@ -31855,7 +32084,7 @@ index 0000000..4a781db + h_root = au_h_dptr(a->sb->s_root, bindex); + AuDebugOn(h_root != au_br_dentry(a->br)); + -+ au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT); ++ au_hn_inode_lock_nested(hdir, AuLsc_I_PARENT); + wbr_wh_write_lock(wbr); + err = au_h_verify(wbr->wbr_whbase, au_opt_udba(a->sb), hdir->hi_inode, + h_root, a->br); @@ -31879,7 +32108,7 @@ index 0000000..4a781db + if (!err) + err = au_wh_init(a->br, a->sb); + wbr_wh_write_unlock(wbr); -+ au_hn_imtx_unlock(hdir); ++ au_hn_inode_unlock(hdir); + di_read_unlock(a->sb->s_root, AuLock_IR); + if (!err) + au_fhsm_wrote(a->sb, bindex, /*force*/0); @@ -31890,7 +32119,7 @@ index 0000000..4a781db + au_br_put(a->br); + si_write_unlock(a->sb); + au_nwt_done(&au_sbi(a->sb)->si_nowait); -+ kfree(arg); ++ au_delayed_kfree(arg); + if (unlikely(err)) + AuIOErr("err %d\n", err); +} @@ -31918,7 +32147,7 @@ index 0000000..4a781db + if (unlikely(wkq_err)) { + atomic_dec(&br->br_wbr->wbr_wh_running); + au_br_put(br); -+ kfree(arg); ++ au_delayed_kfree(arg); + } + do_dec = 0; + } @@ -32077,7 +32306,7 @@ index 0000000..4a781db + wh_dentry = ERR_PTR(err); + if (!err) { + wh_dentry = vfsub_lkup_one(&wh_name, h_parent); -+ kfree(wh_name.name); ++ au_delayed_kfree(wh_name.name); + } + return wh_dentry; +} @@ -32153,7 +32382,7 @@ index 0000000..4a781db + break; + } + } -+ free_page((unsigned long)wh_name.name); ++ au_delayed_free_page((unsigned long)wh_name.name); + +out: + return err; @@ -32195,7 +32424,7 @@ index 0000000..4a781db + rdhash = AUFS_RDHASH_DEF; + err = au_nhash_alloc(&whtmp->whlist, rdhash, gfp); + if (unlikely(err)) { -+ kfree(whtmp); ++ au_delayed_kfree(whtmp); + whtmp = ERR_PTR(err); + } + @@ -32210,7 +32439,7 @@ index 0000000..4a781db + dput(whtmp->wh_dentry); + iput(whtmp->dir); + au_nhash_wh_free(&whtmp->whlist); -+ kfree(whtmp); ++ au_delayed_kfree(whtmp); +} + +/* @@ -32307,12 +32536,12 @@ index 0000000..4a781db + err = vfsub_mnt_want_write(au_br_mnt(a->br)); + if (unlikely(err)) + goto out_mnt; -+ 
au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT); ++ au_hn_inode_lock_nested(hdir, AuLsc_I_PARENT); + err = au_h_verify(a->wh_dentry, au_opt_udba(sb), h_dir, h_parent, + a->br); + if (!err) + err = au_whtmp_rmdir(a->dir, bindex, a->wh_dentry, &a->whlist); -+ au_hn_imtx_unlock(hdir); ++ au_hn_inode_unlock(hdir); + vfsub_mnt_drop_write(au_br_mnt(a->br)); + +out_mnt: @@ -32440,10 +32669,10 @@ index 0000000..5a5c378 +#endif /* __AUFS_WHOUT_H__ */ diff --git a/fs/aufs/wkq.c b/fs/aufs/wkq.c new file mode 100644 -index 0000000..8c17dc7 +index 0000000..b18cdab --- /dev/null +++ b/fs/aufs/wkq.c -@@ -0,0 +1,218 @@ +@@ -0,0 +1,213 @@ +/* + * Copyright (C) 2005-2016 Junjiro R. Okajima + * @@ -32500,7 +32729,7 @@ index 0000000..8c17dc7 + else { + kobject_put(wkinfo->kobj); + module_put(THIS_MODULE); /* todo: ?? */ -+ kfree(wkinfo); ++ au_delayed_kfree(wkinfo); + } +} + @@ -32523,7 +32752,7 @@ index 0000000..8c17dc7 + +static void au_wkq_comp_free(struct completion *comp) +{ -+ kfree(comp); ++ au_delayed_kfree(comp); +} + +#else @@ -32604,7 +32833,7 @@ index 0000000..8c17dc7 + int err; + struct au_wkinfo *wkinfo; + -+ percpu_counter_inc(&au_sbi(sb)->si_nowait.nw_len); ++ atomic_inc(&au_sbi(sb)->si_nowait.nw_len); + + /* + * wkq_func() must free this wkinfo. @@ -32634,16 +32863,11 @@ index 0000000..8c17dc7 + +void au_nwt_init(struct au_nowait_tasks *nwt) +{ -+ percpu_counter_init(&nwt->nw_len, 0, GFP_NOFS); ++ atomic_set(&nwt->nw_len, 0); ++ /* smp_mb(); */ /* atomic_set */ + init_waitqueue_head(&nwt->nw_wq); +} + -+void au_nwt_fin(struct au_nowait_tasks *nwt) -+{ -+ AuDebugOn(percpu_counter_sum(&nwt->nw_len)); -+ percpu_counter_destroy(&nwt->nw_len); -+} -+ +void au_wkq_fin(void) +{ + destroy_workqueue(au_wkq); @@ -32664,7 +32888,7 @@ index 0000000..8c17dc7 +} diff --git a/fs/aufs/wkq.h b/fs/aufs/wkq.h new file mode 100644 -index 0000000..7314558 +index 0000000..9b87836 --- /dev/null +++ b/fs/aufs/wkq.h @@ -0,0 +1,93 @@ @@ -32695,6 +32919,8 @@ index 0000000..7314558 + +#ifdef __KERNEL__ + ++#include ++ +struct super_block; + +/* ---------------------------------------------------------------------- */ @@ -32703,7 +32929,7 @@ index 0000000..7314558 + * in the next operation, wait for the 'nowait' tasks in system-wide workqueue + */ +struct au_nowait_tasks { -+ struct percpu_counter nw_len; ++ atomic_t nw_len; + wait_queue_head_t nw_wq; +}; + @@ -32730,7 +32956,6 @@ index 0000000..7314558 +int au_wkq_nowait(au_wkq_func_t func, void *args, struct super_block *sb, + unsigned int flags); +void au_nwt_init(struct au_nowait_tasks *nwt); -+void au_nwt_fin(struct au_nowait_tasks *nwt); +int __init au_wkq_init(void); +void au_wkq_fin(void); + @@ -32748,14 +32973,13 @@ index 0000000..7314558 + +static inline void au_nwt_done(struct au_nowait_tasks *nwt) +{ -+ percpu_counter_dec(&nwt->nw_len); -+ if (!percpu_counter_sum(&nwt->nw_len)) ++ if (atomic_dec_and_test(&nwt->nw_len)) + wake_up_all(&nwt->nw_wq); +} + +static inline int au_nwt_flush(struct au_nowait_tasks *nwt) +{ -+ wait_event(nwt->nw_wq, !percpu_counter_sum(&nwt->nw_len)); ++ wait_event(nwt->nw_wq, !atomic_read(&nwt->nw_len)); + return 0; +} + @@ -32763,10 +32987,10 @@ index 0000000..7314558 +#endif /* __AUFS_WKQ_H__ */ diff --git a/fs/aufs/xattr.c b/fs/aufs/xattr.c new file mode 100644 -index 0000000..26223a2 +index 0000000..e919044 --- /dev/null +++ b/fs/aufs/xattr.c -@@ -0,0 +1,344 @@ +@@ -0,0 +1,347 @@ +/* + * Copyright (C) 2014-2016 Junjiro R. 
Okajima + * @@ -32945,10 +33169,12 @@ index 0000000..26223a2 + AuTraceErr(err); + } + -+ kfree(value); ++ if (value) ++ au_delayed_kfree(value); + +out_free: -+ kfree(o); ++ if (o) ++ au_delayed_kfree(o); +out: + if (!unlocked) + inode_unlock(h_isrc); @@ -33002,6 +33228,7 @@ index 0000000..26223a2 + arg->u.list.list, arg->u.list.size); + break; + case AU_XATTR_GET: ++ AuDebugOn(d_is_negative(h_path.dentry)); + err = vfs_getxattr(h_path.dentry, + arg->u.get.name, arg->u.get.value, + arg->u.get.size); @@ -33030,8 +33257,8 @@ index 0000000..26223a2 + return au_lgxattr(dentry, &arg); +} + -+ssize_t aufs_getxattr(struct dentry *dentry, const char *name, void *value, -+ size_t size) ++ssize_t aufs_getxattr(struct dentry *dentry, struct inode *inode __maybe_unused, ++ const char *name, void *value, size_t size) +{ + struct au_lgxattr arg = { + .type = AU_XATTR_GET, @@ -33045,8 +33272,8 @@ index 0000000..26223a2 + return au_lgxattr(dentry, &arg); +} + -+int aufs_setxattr(struct dentry *dentry, const char *name, const void *value, -+ size_t size, int flags) ++int aufs_setxattr(struct dentry *dentry, struct inode *inode, const char *name, ++ const void *value, size_t size, int flags) +{ + struct au_srxattr arg = { + .type = AU_XATTR_SET, @@ -33058,7 +33285,7 @@ index 0000000..26223a2 + }, + }; + -+ return au_srxattr(dentry, &arg); ++ return au_srxattr(dentry, inode, &arg); +} + +int aufs_removexattr(struct dentry *dentry, const char *name) @@ -33070,7 +33297,7 @@ index 0000000..26223a2 + }, + }; + -+ return au_srxattr(dentry, &arg); ++ return au_srxattr(dentry, d_inode(dentry), &arg); +} + +/* ---------------------------------------------------------------------- */ @@ -33113,10 +33340,10 @@ index 0000000..26223a2 +#endif diff --git a/fs/aufs/xino.c b/fs/aufs/xino.c new file mode 100644 -index 0000000..ed7b243 +index 0000000..7f62beb --- /dev/null +++ b/fs/aufs/xino.c -@@ -0,0 +1,1317 @@ +@@ -0,0 +1,1318 @@ +/* + * Copyright (C) 2005-2016 Junjiro R. 
Okajima + * @@ -33364,7 +33591,7 @@ index 0000000..ed7b243 + bindex = au_br_index(sb, brid); + if (bindex >= 0) { + ldir->hdir = au_hi(d_inode(sb->s_root), bindex); -+ au_hn_imtx_lock_nested(ldir->hdir, AuLsc_I_PARENT); ++ au_hn_inode_lock_nested(ldir->hdir, AuLsc_I_PARENT); + } else { + ldir->parent = dget_parent(xino->f_path.dentry); + ldir->dir = d_inode(ldir->parent); @@ -33375,7 +33602,7 @@ index 0000000..ed7b243 +static void au_xino_unlock_dir(struct au_xino_lock_dir *ldir) +{ + if (ldir->hdir) -+ au_hn_imtx_unlock(ldir->hdir); ++ au_hn_inode_unlock(ldir->hdir); + else { + inode_unlock(ldir->dir); + dput(ldir->parent); @@ -33457,7 +33684,7 @@ index 0000000..ed7b243 + AuErr1("statfs err %d, ignored\n", err); + +out_st: -+ kfree(st); ++ au_delayed_kfree(st); +out: + return err; +} @@ -33492,7 +33719,7 @@ index 0000000..ed7b243 + au_br_put(br); + si_write_unlock(sb); + au_nwt_done(&au_sbi(sb)->si_nowait); -+ kfree(args); ++ au_delayed_kfree(args); +} + +static int xino_trunc_test(struct super_block *sb, struct au_branch *br) @@ -33534,7 +33761,7 @@ index 0000000..ed7b243 + args = kmalloc(sizeof(*args), GFP_NOFS); + if (unlikely(!args)) { + AuErr1("no memory\n"); -+ goto out_args; ++ goto out; + } + + au_br_get(br); @@ -33546,9 +33773,8 @@ index 0000000..ed7b243 + + pr_err("wkq %d\n", wkq_err); + au_br_put(br); ++ au_delayed_kfree(args); + -+out_args: -+ kfree(args); +out: + atomic_dec(&br->br_xino_running); +} @@ -33717,7 +33943,7 @@ index 0000000..ed7b243 + struct au_branch *br; + vfs_writef_t xwrite; + -+ AuDebugOn(is_bad_inode(inode)); ++ AuDebugOn(au_is_bad_inode(inode)); + + sb = inode->i_sb; + mnt_flags = au_mntflags(sb); @@ -34071,7 +34297,7 @@ index 0000000..ed7b243 + (sb, au_sbr(sb, bindex)->br_xino.xi_file, page); + else + AuDbg("b%d\n", bindex); -+ free_page((unsigned long)page); ++ au_delayed_free_page((unsigned long)page); + +out: + return err; @@ -34148,7 +34374,8 @@ index 0000000..ed7b243 + if (sbinfo->si_xib) + fput(sbinfo->si_xib); + sbinfo->si_xib = NULL; -+ free_page((unsigned long)sbinfo->si_xib_buf); ++ if (sbinfo->si_xib_buf) ++ au_delayed_free_page((unsigned long)sbinfo->si_xib_buf); + sbinfo->si_xib_buf = NULL; +} + @@ -34191,7 +34418,8 @@ index 0000000..ed7b243 + goto out; /* success */ + +out_free: -+ free_page((unsigned long)sbinfo->si_xib_buf); ++ if (sbinfo->si_xib_buf) ++ au_delayed_free_page((unsigned long)sbinfo->si_xib_buf); + sbinfo->si_xib_buf = NULL; + if (err >= 0) + err = -EIO; @@ -34245,7 +34473,6 @@ index 0000000..ed7b243 + ino = AUFS_ROOT_INO; + writef = au_sbi(sb)->si_xwrite; + for (bindex = 0, p = fpair; bindex <= bbot; bindex++, p++) { -+ br = au_sbr(sb, bindex); + bshared = is_sb_shared(sb, bindex, bindex - 1); + if (bshared >= 0) { + /* shared xino */ @@ -34255,6 +34482,7 @@ index 0000000..ed7b243 + + if (!p->new) { + /* new xino */ ++ br = au_sbr(sb, bindex); + p->old = br->br_xino.xi_file; + p->new = au_xino_create2(base, br->br_xino.xi_file); + err = PTR_ERR(p->new); @@ -34284,7 +34512,7 @@ index 0000000..ed7b243 + fput(p->new); + else + break; -+ kfree(fpair); ++ au_delayed_kfree(fpair); +out: + return err; +} @@ -34395,7 +34623,7 @@ index 0000000..ed7b243 + if (!IS_ERR(file)) + au_xino_brid_set(sb, br->br_id); + } -+ free_page((unsigned long)page); ++ au_delayed_free_page((unsigned long)page); + } else { + file = au_xino_create(sb, AUFS_XINO_DEFPATH, /*silent*/0); + if (IS_ERR(file)) @@ -34435,10 +34663,10 @@ index 0000000..ed7b243 + return err; +} diff --git a/fs/dcache.c b/fs/dcache.c -index d5ecc6e..0dd0237 100644 +index 
1ed81bb..34f4ea4 100644 --- a/fs/dcache.c +++ b/fs/dcache.c -@@ -1156,7 +1156,7 @@ enum d_walk_ret { +@@ -1205,7 +1205,7 @@ enum d_walk_ret { * * The @enter() and @finish() callbacks are called with d_lock held. */ @@ -34447,7 +34675,7 @@ index d5ecc6e..0dd0237 100644 enum d_walk_ret (*enter)(void *, struct dentry *), void (*finish)(void *)) { -@@ -1261,6 +1261,7 @@ rename_retry: +@@ -1313,6 +1313,7 @@ rename_retry: seq = 1; goto again; } @@ -34456,7 +34684,7 @@ index d5ecc6e..0dd0237 100644 /* * Search for at least 1 mount point in the dentry's subdirs. diff --git a/fs/exec.c b/fs/exec.c -index c4010b8..c2b225f 100644 +index 887c1c9..40e8767 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -104,6 +104,7 @@ bool path_noexec(const struct path *path) @@ -34534,10 +34762,10 @@ index ad17e05..ae9f267 100644 void __init files_init(void) { diff --git a/fs/inode.c b/fs/inode.c -index 69b8b52..f46e5c6 100644 +index 9ea4219..ef8c6907 100644 --- a/fs/inode.c +++ b/fs/inode.c -@@ -850,6 +850,8 @@ unsigned int get_next_ino(void) +@@ -851,6 +851,8 @@ unsigned int get_next_ino(void) unsigned int *p = &get_cpu_var(last_ino); unsigned int res = *p; @@ -34546,7 +34774,7 @@ index 69b8b52..f46e5c6 100644 #ifdef CONFIG_SMP if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) { static atomic_t shared_last_ino; -@@ -862,7 +864,7 @@ unsigned int get_next_ino(void) +@@ -863,7 +865,7 @@ unsigned int get_next_ino(void) res++; /* get_next_ino should not provide a 0 inode number */ if (unlikely(!res)) @@ -34555,8 +34783,25 @@ index 69b8b52..f46e5c6 100644 *p = res; put_cpu_var(last_ino); return res; +@@ -1591,7 +1593,7 @@ EXPORT_SYMBOL(generic_update_time); + * This does the actual work of updating an inodes time or version. Must have + * had called mnt_want_write() before calling this. + */ +-static int update_time(struct inode *inode, struct timespec *time, int flags) ++int update_time(struct inode *inode, struct timespec *time, int flags) + { + int (*update_time)(struct inode *, struct timespec *, int); + +@@ -1600,6 +1602,7 @@ static int update_time(struct inode *inode, struct timespec *time, int flags) + + return update_time(inode, time, flags); + } ++EXPORT_SYMBOL_GPL(update_time); + + /** + * touch_atime - update the access time diff --git a/fs/namespace.c b/fs/namespace.c -index 4fb1691..97654d2 100644 +index 419f746..9c0e0af 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -463,6 +463,7 @@ void __mnt_drop_write(struct vfsmount *mnt) @@ -34567,7 +34812,7 @@ index 4fb1691..97654d2 100644 /** * mnt_drop_write - give up write access to a mount -@@ -1811,6 +1812,7 @@ int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, +@@ -1812,6 +1813,7 @@ int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, } return 0; } @@ -34576,7 +34821,7 @@ index 4fb1691..97654d2 100644 static void cleanup_group_ids(struct mount *mnt, struct mount *end) { diff --git a/fs/notify/group.c b/fs/notify/group.c -index d16b62c..53e45b6 100644 +index 3e2dd85..b17cb4b 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -22,6 +22,7 @@ @@ -34587,7 +34832,7 @@ index d16b62c..53e45b6 100644 #include #include "fsnotify.h" -@@ -72,6 +73,7 @@ void fsnotify_get_group(struct fsnotify_group *group) +@@ -81,6 +82,7 @@ void fsnotify_get_group(struct fsnotify_group *group) { atomic_inc(&group->refcnt); } @@ -34595,7 +34840,7 @@ index d16b62c..53e45b6 100644 /* * Drop a reference to a group. Free it if it's through. 
-@@ -81,6 +83,7 @@ void fsnotify_put_group(struct fsnotify_group *group) +@@ -90,6 +92,7 @@ void fsnotify_put_group(struct fsnotify_group *group) if (atomic_dec_and_test(&group->refcnt)) fsnotify_final_destroy_group(group); } @@ -34603,7 +34848,7 @@ index d16b62c..53e45b6 100644 /* * Create a new fsnotify_group and hold a reference for the group returned. -@@ -109,6 +112,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops) +@@ -118,6 +121,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops) return group; } @@ -34612,7 +34857,7 @@ index d16b62c..53e45b6 100644 int fsnotify_fasync(int fd, struct file *file, int on) { diff --git a/fs/notify/mark.c b/fs/notify/mark.c -index 7115c5d..ac2bd69 100644 +index d3fea0b..5fc06ad 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -113,6 +113,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) @@ -34623,7 +34868,7 @@ index 7115c5d..ac2bd69 100644 /* Calculate mask of events for a list of marks */ u32 fsnotify_recalc_mask(struct hlist_head *head) -@@ -213,6 +214,7 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark, +@@ -230,6 +231,7 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark, mutex_unlock(&group->mark_mutex); fsnotify_free_mark(mark); } @@ -34631,7 +34876,7 @@ index 7115c5d..ac2bd69 100644 void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock) { -@@ -398,6 +400,7 @@ err: +@@ -415,6 +417,7 @@ err: return ret; } @@ -34639,16 +34884,16 @@ index 7115c5d..ac2bd69 100644 int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group, struct inode *inode, struct vfsmount *mnt, int allow_dups) -@@ -498,6 +501,7 @@ void fsnotify_init_mark(struct fsnotify_mark *mark, +@@ -533,6 +536,7 @@ void fsnotify_init_mark(struct fsnotify_mark *mark, atomic_set(&mark->refcnt, 1); mark->free_mark = free_mark; } +EXPORT_SYMBOL_GPL(fsnotify_init_mark); - static void fsnotify_mark_destroy(struct work_struct *work) - { + /* + * Destroy all marks in destroy_list, waits for SRCU period to finish before diff --git a/fs/open.c b/fs/open.c -index 081d3d6..b4359e4 100644 +index 93ae3cd..d25b9bd 100644 --- a/fs/open.c +++ b/fs/open.c @@ -64,6 +64,7 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, @@ -34657,7 +34902,7 @@ index 081d3d6..b4359e4 100644 } +EXPORT_SYMBOL_GPL(do_truncate); - long vfs_truncate(struct path *path, loff_t length) + long vfs_truncate(const struct path *path, loff_t length) { @@ -678,6 +679,7 @@ int open_check_o_direct(struct file *f) } @@ -34668,10 +34913,10 @@ index 081d3d6..b4359e4 100644 static int do_dentry_open(struct file *f, struct inode *inode, diff --git a/fs/proc/base.c b/fs/proc/base.c -index 0d163a8..b958f79 100644 +index a11eb71..8f10865 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c -@@ -1934,7 +1934,7 @@ static int map_files_get_link(struct dentry *dentry, struct path *path) +@@ -1939,7 +1939,7 @@ static int map_files_get_link(struct dentry *dentry, struct path *path) down_read(&mm->mmap_sem); vma = find_exact_vma(mm, vm_start, vm_end); if (vma && vma->vm_file) { @@ -34697,7 +34942,7 @@ index f8595e8..cb8eda0 100644 ino = inode->i_ino; } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c -index 5415835..c41eb73 100644 +index 4648c7f..061cb85 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -298,7 +298,10 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) @@ -34712,7 +34957,7 @@ index 5415835..c41eb73 100644 dev = inode->i_sb->s_dev; ino = inode->i_ino; pgoff = 
((loff_t)vma->vm_pgoff) << PAGE_SHIFT; -@@ -1617,7 +1620,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) +@@ -1624,7 +1627,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) struct proc_maps_private *proc_priv = &numa_priv->proc_maps; struct vm_area_struct *vma = v; struct numa_maps *md = &numa_priv->md; @@ -34738,10 +34983,10 @@ index faacb0c..17b43be 100644 ino = inode->i_ino; pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT; diff --git a/fs/read_write.c b/fs/read_write.c -index cf377cf..0a43d7b 100644 +index 933b53a..260c4a4 100644 --- a/fs/read_write.c +++ b/fs/read_write.c -@@ -534,6 +534,30 @@ ssize_t __vfs_write(struct file *file, const char __user *p, size_t count, +@@ -515,6 +515,30 @@ ssize_t __vfs_write(struct file *file, const char __user *p, size_t count, } EXPORT_SYMBOL(__vfs_write); @@ -34814,7 +35059,7 @@ index dd9bf7e..0606690 100644 /** * splice_direct_to_actor - splices data directly between two non-pipes diff --git a/fs/xattr.c b/fs/xattr.c -index 4861322..c4bb039 100644 +index 4beafc4..e118715 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -207,6 +207,7 @@ vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value, @@ -34826,7 +35071,7 @@ index 4861322..c4bb039 100644 ssize_t vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size) diff --git a/include/linux/file.h b/include/linux/file.h -index f87d308..9a290b3 100644 +index 7444f5f..bdac0be 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -19,6 +19,7 @@ struct dentry; @@ -34838,10 +35083,10 @@ index f87d308..9a290b3 100644 static inline void fput_light(struct file *file, int fput_needed) { diff --git a/include/linux/fs.h b/include/linux/fs.h -index 70e61b5..351bb05 100644 +index dd28814..b689a48 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h -@@ -1277,6 +1277,7 @@ extern void fasync_free(struct fasync_struct *); +@@ -1306,6 +1306,7 @@ extern void fasync_free(struct fasync_struct *); /* can be called from interrupts */ extern void kill_fasync(struct fasync_struct **, int, int); @@ -34849,7 +35094,7 @@ index 70e61b5..351bb05 100644 extern void __f_setown(struct file *filp, struct pid *, enum pid_type, int force); extern void f_setown(struct file *filp, unsigned long arg, int force); extern void f_delown(struct file *filp); -@@ -1660,6 +1661,7 @@ struct file_operations { +@@ -1690,6 +1691,7 @@ struct file_operations { ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); int (*check_flags)(int); @@ -34857,7 +35102,7 @@ index 70e61b5..351bb05 100644 int (*flock) (struct file *, int, struct file_lock *); ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); -@@ -1718,6 +1720,12 @@ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, +@@ -1750,6 +1752,12 @@ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, struct iovec *fast_pointer, struct iovec **ret_pointer); @@ -34870,11 +35115,19 @@ index 70e61b5..351bb05 100644 extern ssize_t __vfs_read(struct file *, char __user *, size_t, loff_t *); extern ssize_t __vfs_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); +@@ -2105,6 +2113,7 @@ extern int current_umask(void); + 
extern void ihold(struct inode * inode); + extern void iput(struct inode *); + extern int generic_update_time(struct inode *, struct timespec *, int); ++extern int update_time(struct inode *, struct timespec *, int); + + /* /sys/fs */ + extern struct kobject *fs_kobj; diff --git a/include/linux/mm.h b/include/linux/mm.h -index 02f7f31..689b05b 100644 +index ece042d..1e24513 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h -@@ -1251,6 +1251,28 @@ static inline int fixup_user_fault(struct task_struct *tsk, +@@ -1239,6 +1239,28 @@ static inline int fixup_user_fault(struct task_struct *tsk, } #endif @@ -34904,10 +35157,10 @@ index 02f7f31..689b05b 100644 extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, int write); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h -index c2d75b4..9e324fe 100644 +index ca3e517..10bc491 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h -@@ -269,6 +269,7 @@ struct vm_region { +@@ -274,6 +274,7 @@ struct vm_region { unsigned long vm_top; /* region allocated to here */ unsigned long vm_pgoff; /* the offset in vm_file corresponding to vm_start */ struct file *vm_file; /* the backing file or NULL */ @@ -34915,7 +35168,7 @@ index c2d75b4..9e324fe 100644 int vm_usage; /* region usage count (access under nommu_region_sem) */ bool vm_icache_flushed : 1; /* true if the icache has been flushed for -@@ -343,6 +344,7 @@ struct vm_area_struct { +@@ -348,6 +349,7 @@ struct vm_area_struct { unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE units */ struct file * vm_file; /* File we map to (can be NULL). */ @@ -34939,7 +35192,7 @@ index da2751d..2e0fca6 100644 + unsigned int flags); #endif diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild -index 813ffb2e..ac2202e 100644 +index ec10cfe..800211b 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -59,6 +59,7 @@ header-y += atmsvc.h @@ -34952,7 +35205,7 @@ index 813ffb2e..ac2202e 100644 header-y += auxvec.h diff --git a/include/uapi/linux/aufs_type.h b/include/uapi/linux/aufs_type.h new file mode 100644 -index 0000000..e27dde7 +index 0000000..cacb35d --- /dev/null +++ b/include/uapi/linux/aufs_type.h @@ -0,0 +1,419 @@ @@ -34997,7 +35250,7 @@ index 0000000..e27dde7 + +#include + -+#define AUFS_VERSION "4.6-20160530" ++#define AUFS_VERSION "4.7-20160822" + +/* todo? 
move this to linux-2.6.19/include/magic.h */ +#define AUFS_SUPER_MAGIC ('a' << 24 | 'u' << 16 | 'f' << 8 | 's') @@ -35376,10 +35629,10 @@ index 0000000..e27dde7 + +#endif /* __AUFS_TYPE_H__ */ diff --git a/kernel/fork.c b/kernel/fork.c -index d277e83..683b8a2 100644 +index 4a7ec0c..8c8f7ac 100644 --- a/kernel/fork.c +++ b/kernel/fork.c -@@ -475,7 +475,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) +@@ -479,7 +479,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) struct inode *inode = file_inode(file); struct address_space *mapping = file->f_mapping; @@ -35398,7 +35651,7 @@ index 53fa971..bce3211 100644 } +EXPORT_SYMBOL_GPL(task_work_run); diff --git a/mm/Makefile b/mm/Makefile -index deb467e..0f6ae63 100644 +index 78c6f7d..aea4230 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -37,7 +37,7 @@ obj-y := filemap.o mempool.o oom_kill.o \ @@ -35411,10 +35664,10 @@ index deb467e..0f6ae63 100644 obj-y += init-mm.o diff --git a/mm/filemap.c b/mm/filemap.c -index f2479af..31f4b0d 100644 +index 20f3b1f..ee827ce 100644 --- a/mm/filemap.c +++ b/mm/filemap.c -@@ -2211,7 +2211,7 @@ int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +@@ -2208,7 +2208,7 @@ int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) int ret = VM_FAULT_LOCKED; sb_start_pagefault(inode->i_sb); @@ -35424,10 +35677,10 @@ index f2479af..31f4b0d 100644 if (page->mapping != inode->i_mapping) { unlock_page(page); diff --git a/mm/memory.c b/mm/memory.c -index 07493e3..dc696bc 100644 +index 9e04681..06980d1 100644 --- a/mm/memory.c +++ b/mm/memory.c -@@ -2098,7 +2098,7 @@ static inline int wp_page_reuse(struct mm_struct *mm, +@@ -2100,7 +2100,7 @@ static inline int wp_page_reuse(struct mm_struct *mm, } if (!page_mkwrite) @@ -35437,10 +35690,10 @@ index 07493e3..dc696bc 100644 return VM_FAULT_WRITE; diff --git a/mm/mmap.c b/mm/mmap.c -index bd2e1a53..7328b74 100644 +index de2c176..b7f391c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c -@@ -166,7 +166,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) +@@ -162,7 +162,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) if (vma->vm_ops && vma->vm_ops->close) vma->vm_ops->close(vma); if (vma->vm_file) @@ -35449,7 +35702,7 @@ index bd2e1a53..7328b74 100644 mpol_put(vma_policy(vma)); kmem_cache_free(vm_area_cachep, vma); return next; -@@ -785,7 +785,7 @@ again: remove_next = 1 + (end > next->vm_end); +@@ -782,7 +782,7 @@ again: remove_next = 1 + (end > next->vm_end); if (remove_next) { if (file) { uprobe_munmap(next, next->vm_start, next->vm_end); @@ -35458,7 +35711,7 @@ index bd2e1a53..7328b74 100644 } if (next->anon_vma) anon_vma_merge(vma, next); -@@ -1566,8 +1566,8 @@ out: +@@ -1563,8 +1563,8 @@ out: return addr; unmap_and_free_vma: @@ -35468,7 +35721,7 @@ index bd2e1a53..7328b74 100644 /* Undo any partial mapping done by a device driver. 
*/ unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); -@@ -2362,7 +2362,7 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, +@@ -2358,7 +2358,7 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, goto out_free_mpol; if (new->vm_file) @@ -35477,7 +35730,7 @@ index bd2e1a53..7328b74 100644 if (new->vm_ops && new->vm_ops->open) new->vm_ops->open(new); -@@ -2381,7 +2381,7 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, +@@ -2377,7 +2377,7 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, if (new->vm_ops && new->vm_ops->close) new->vm_ops->close(new); if (new->vm_file) @@ -35486,7 +35739,7 @@ index bd2e1a53..7328b74 100644 unlink_anon_vmas(new); out_free_mpol: mpol_put(vma_policy(new)); -@@ -2523,7 +2523,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, +@@ -2528,7 +2528,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, struct vm_area_struct *vma; unsigned long populate = 0; unsigned long ret = -EINVAL; @@ -35495,7 +35748,7 @@ index bd2e1a53..7328b74 100644 pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/vm/remap_file_pages.txt.\n", current->comm, current->pid); -@@ -2590,10 +2590,27 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, +@@ -2597,10 +2597,27 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, } } @@ -35524,7 +35777,7 @@ index bd2e1a53..7328b74 100644 out: up_write(&mm->mmap_sem); if (populate) -@@ -2864,7 +2881,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, +@@ -2873,7 +2890,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, if (anon_vma_clone(new_vma, vma)) goto out_free_mempol; if (new_vma->vm_file) @@ -35534,7 +35787,7 @@ index bd2e1a53..7328b74 100644 new_vma->vm_ops->open(new_vma); vma_link(mm, new_vma, prev, rb_link, rb_parent); diff --git a/mm/nommu.c b/mm/nommu.c -index c8bd59a..4cfc2fc 100644 +index c2e588802..c39edc4 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -644,7 +644,7 @@ static void __put_nommu_region(struct vm_region *region) @@ -35670,7 +35923,7 @@ index 0000000..b323b8a +} +#endif /* !CONFIG_MMU */ diff --git a/security/commoncap.c b/security/commoncap.c -index 48071ed..50a1a40 100644 +index e7fadde..6423e53 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -1058,12 +1058,14 @@ int cap_mmap_addr(unsigned long addr) @@ -35709,58 +35962,58 @@ index 03c1652..f88c84b 100644 int devcgroup_inode_mknod(int mode, dev_t dev) { diff --git a/security/security.c b/security/security.c -index 3644b03..593879b 100644 +index 7095693..da7fe2c 100644 --- a/security/security.c +++ b/security/security.c -@@ -433,6 +433,7 @@ int security_path_rmdir(struct path *dir, struct dentry *dentry) +@@ -434,6 +434,7 @@ int security_path_rmdir(const struct path *dir, struct dentry *dentry) return 0; return call_int_hook(path_rmdir, 0, dir, dentry); } +EXPORT_SYMBOL_GPL(security_path_rmdir); - int security_path_unlink(struct path *dir, struct dentry *dentry) + int security_path_unlink(const struct path *dir, struct dentry *dentry) { -@@ -449,6 +450,7 @@ int security_path_symlink(struct path *dir, struct dentry *dentry, +@@ -450,6 +451,7 @@ int security_path_symlink(const struct path *dir, struct dentry *dentry, return 0; return call_int_hook(path_symlink, 0, dir, dentry, old_name); } +EXPORT_SYMBOL_GPL(security_path_symlink); - int security_path_link(struct dentry *old_dentry, struct 
path *new_dir, + int security_path_link(struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry) -@@ -457,6 +459,7 @@ int security_path_link(struct dentry *old_dentry, struct path *new_dir, +@@ -458,6 +460,7 @@ int security_path_link(struct dentry *old_dentry, const struct path *new_dir, return 0; return call_int_hook(path_link, 0, old_dentry, new_dir, new_dentry); } +EXPORT_SYMBOL_GPL(security_path_link); - int security_path_rename(struct path *old_dir, struct dentry *old_dentry, - struct path *new_dir, struct dentry *new_dentry, -@@ -484,6 +487,7 @@ int security_path_truncate(struct path *path) + int security_path_rename(const struct path *old_dir, struct dentry *old_dentry, + const struct path *new_dir, struct dentry *new_dentry, +@@ -485,6 +488,7 @@ int security_path_truncate(const struct path *path) return 0; return call_int_hook(path_truncate, 0, path); } +EXPORT_SYMBOL_GPL(security_path_truncate); - int security_path_chmod(struct path *path, umode_t mode) + int security_path_chmod(const struct path *path, umode_t mode) { -@@ -491,6 +495,7 @@ int security_path_chmod(struct path *path, umode_t mode) +@@ -492,6 +496,7 @@ int security_path_chmod(const struct path *path, umode_t mode) return 0; return call_int_hook(path_chmod, 0, path, mode); } +EXPORT_SYMBOL_GPL(security_path_chmod); - int security_path_chown(struct path *path, kuid_t uid, kgid_t gid) + int security_path_chown(const struct path *path, kuid_t uid, kgid_t gid) { -@@ -498,6 +503,7 @@ int security_path_chown(struct path *path, kuid_t uid, kgid_t gid) +@@ -499,6 +504,7 @@ int security_path_chown(const struct path *path, kuid_t uid, kgid_t gid) return 0; return call_int_hook(path_chown, 0, path, uid, gid); } +EXPORT_SYMBOL_GPL(security_path_chown); - int security_path_chroot(struct path *path) + int security_path_chroot(const struct path *path) { -@@ -583,6 +589,7 @@ int security_inode_readlink(struct dentry *dentry) +@@ -584,6 +590,7 @@ int security_inode_readlink(struct dentry *dentry) return 0; return call_int_hook(inode_readlink, 0, dentry); } @@ -35768,7 +36021,7 @@ index 3644b03..593879b 100644 int security_inode_follow_link(struct dentry *dentry, struct inode *inode, bool rcu) -@@ -598,6 +605,7 @@ int security_inode_permission(struct inode *inode, int mask) +@@ -599,6 +606,7 @@ int security_inode_permission(struct inode *inode, int mask) return 0; return call_int_hook(inode_permission, 0, inode, mask); } @@ -35776,7 +36029,7 @@ index 3644b03..593879b 100644 int security_inode_setattr(struct dentry *dentry, struct iattr *attr) { -@@ -736,6 +744,7 @@ int security_file_permission(struct file *file, int mask) +@@ -737,6 +745,7 @@ int security_file_permission(struct file *file, int mask) return fsnotify_perm(file, mask); } @@ -35784,7 +36037,7 @@ index 3644b03..593879b 100644 int security_file_alloc(struct file *file) { -@@ -795,6 +804,7 @@ int security_mmap_file(struct file *file, unsigned long prot, +@@ -796,6 +805,7 @@ int security_mmap_file(struct file *file, unsigned long prot, return ret; return ima_file_mmap(file, prot); } diff --git a/hp-wmi-rfkill-fix.patch b/hp-wmi-rfkill-fix.patch deleted file mode 100644 index 2f3d09c..0000000 --- a/hp-wmi-rfkill-fix.patch +++ /dev/null @@ -1,31 +0,0 @@ -This patch fixes the problem encountered on many HP laptops: in some cases, -WiFi becomes hard-blocked and cannot be unblocked since then. Seen that on -HP 6730b and others. 
- -This is https://bugzilla.kernel.org/show_bug.cgi?id=69131 - -diff --git a/drivers/platform/x86/hp-wmi.c b/drivers/platform/x86/hp-wmi.c -index 0669731..37000f0 100644 ---- a/drivers/platform/x86/hp-wmi.c -+++ b/drivers/platform/x86/hp-wmi.c -@@ -714,6 +714,11 @@ static int __init hp_wmi_rfkill_setup(struct platform_device *device) - if (err) - return err; - -+ err = hp_wmi_perform_query(HPWMI_WIRELESS_QUERY, 1, &wireless, -+ sizeof(wireless), 0); -+ if (err) -+ return err; -+ - if (wireless & 0x1) { - wifi_rfkill = rfkill_alloc("hp-wifi", &device->dev, - RFKILL_TYPE_WLAN, -@@ -901,7 +906,7 @@ static int __init hp_wmi_bios_setup(struct platform_device *device) - wwan_rfkill = NULL; - rfkill2_count = 0; - -- if (hp_wmi_bios_2009_later() || hp_wmi_rfkill_setup(device)) -+ if (hp_wmi_rfkill_setup(device)) - hp_wmi_rfkill2_setup(device); - - err = device_create_file(&device->dev, &dev_attr_display); diff --git a/kernel-i586.config b/kernel-i586.config index ec661b7..5a3eed1 100644 --- a/kernel-i586.config +++ b/kernel-i586.config @@ -1,12 +1,11 @@ # # Automatically generated file; DO NOT EDIT. -# Linux/x86 4.6.3-nrj-desktop-pae-1rosa-i586 Kernel Configuration +# Linux/x86 4.7.x-nrj-desktop Kernel Configuration # # CONFIG_64BIT is not set CONFIG_X86_32=y CONFIG_X86=y CONFIG_INSTRUCTION_DECODER=y -CONFIG_PERF_EVENTS_INTEL_UNCORE=y CONFIG_OUTPUT_FORMAT="elf32-i386" CONFIG_LOCKDEP_SUPPORT=y CONFIG_STACKTRACE_SUPPORT=y @@ -107,7 +106,7 @@ CONFIG_TASK_IO_ACCOUNTING=y CONFIG_PREEMPT_RCU=y # CONFIG_RCU_EXPERT is not set CONFIG_SRCU=y -# CONFIG_TASKS_RCU is not set +CONFIG_TASKS_RCU=y CONFIG_RCU_STALL_COMMON=y # CONFIG_TREE_RCU_TRACE is not set # CONFIG_RCU_EXPEDITE_BOOT is not set @@ -116,6 +115,7 @@ CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=18 CONFIG_LOG_CPU_MAX_BUF_SHIFT=12 +CONFIG_NMI_LOG_BUF_SHIFT=13 CONFIG_CGROUPS=y CONFIG_PAGE_COUNTER=y CONFIG_MEMCG=y @@ -155,6 +155,7 @@ CONFIG_RD_LZMA=y CONFIG_RD_XZ=y CONFIG_RD_LZO=y CONFIG_RD_LZ4=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set CONFIG_SYSCTL=y CONFIG_ANON_INODES=y @@ -223,6 +224,7 @@ CONFIG_CC_STACKPROTECTOR_NONE=y # CONFIG_CC_STACKPROTECTOR_STRONG is not set CONFIG_MODULES_USE_ELF_REL=y CONFIG_ARCH_MMAP_RND_BITS=8 +CONFIG_ISA_BUS_API=y CONFIG_CLONE_BACKWARDS=y CONFIG_OLD_SIGSUSPEND3=y CONFIG_OLD_SIGACTION=y @@ -380,7 +382,6 @@ CONFIG_CPU_SUP_TRANSMETA_32=y CONFIG_CPU_SUP_UMC_32=y CONFIG_HPET_TIMER=y CONFIG_HPET_EMULATE_RTC=y -CONFIG_APB_TIMER=y CONFIG_DMI=y CONFIG_SWIOTLB=y CONFIG_IOMMU_HELPER=y @@ -401,6 +402,14 @@ CONFIG_X86_MCE_AMD=y CONFIG_X86_MCE_THRESHOLD=y # CONFIG_X86_MCE_INJECT is not set CONFIG_X86_THERMAL_VECTOR=y + +# +# Performance monitoring +# +CONFIG_PERF_EVENTS_INTEL_UNCORE=y +CONFIG_PERF_EVENTS_INTEL_RAPL=y +CONFIG_PERF_EVENTS_INTEL_CSTATE=y +CONFIG_PERF_EVENTS_AMD_POWER=m # CONFIG_X86_LEGACY_VM86 is not set # CONFIG_VM86 is not set CONFIG_X86_16BIT=y @@ -412,7 +421,6 @@ CONFIG_MICROCODE=y CONFIG_MICROCODE_INTEL=y CONFIG_MICROCODE_AMD=y CONFIG_MICROCODE_OLD_INTERFACE=y -CONFIG_PERF_EVENTS_AMD_POWER=m CONFIG_X86_MSR=m CONFIG_X86_CPUID=m # CONFIG_NOHIGHMEM is not set @@ -439,7 +447,6 @@ CONFIG_BALLOON_COMPACTION=y CONFIG_COMPACTION=y CONFIG_MIGRATION=y CONFIG_PHYS_ADDR_T_64BIT=y -CONFIG_ZONE_DMA_FLAG=1 CONFIG_BOUNCE=y CONFIG_VIRT_TO_BUS=y CONFIG_MMU_NOTIFIER=y @@ -459,6 +466,7 @@ CONFIG_CMA_AREAS=7 CONFIG_ZSWAP=y CONFIG_ZPOOL=y CONFIG_ZBUD=y +CONFIG_Z3FOLD=m CONFIG_ZSMALLOC=y # CONFIG_PGTABLE_MAPPING is not set # CONFIG_ZSMALLOC_STAT is not set @@ -549,7 +557,7 @@ 
CONFIG_ACPI_PROCESSOR_AGGREGATOR=m CONFIG_ACPI_THERMAL=y CONFIG_ACPI_CUSTOM_DSDT_FILE="" # CONFIG_ACPI_CUSTOM_DSDT is not set -CONFIG_ACPI_INITRD_TABLE_OVERRIDE=y +CONFIG_ACPI_TABLE_UPGRADE=y # CONFIG_ACPI_DEBUG is not set CONFIG_ACPI_PCI_SLOT=y CONFIG_X86_PM_TIMER=y @@ -584,6 +592,7 @@ CONFIG_APM=y # CPU Frequency scaling # CONFIG_CPU_FREQ=y +CONFIG_CPU_FREQ_GOV_ATTR_SET=y CONFIG_CPU_FREQ_GOV_COMMON=y CONFIG_CPU_FREQ_STAT=m CONFIG_CPU_FREQ_STAT_DETAILS=y @@ -592,11 +601,13 @@ CONFIG_CPU_FREQ_STAT_DETAILS=y # CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y # CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL is not set CONFIG_CPU_FREQ_GOV_PERFORMANCE=y CONFIG_CPU_FREQ_GOV_POWERSAVE=m CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_GOV_ONDEMAND=y CONFIG_CPU_FREQ_GOV_CONSERVATIVE=m +CONFIG_CPU_FREQ_GOV_SCHEDUTIL=m # # CPU frequency scaling drivers @@ -605,7 +616,6 @@ CONFIG_X86_INTEL_PSTATE=y CONFIG_X86_PCC_CPUFREQ=m CONFIG_X86_ACPI_CPUFREQ=m CONFIG_X86_ACPI_CPUFREQ_CPB=y -CONFIG_X86_SFI_CPUFREQ=m CONFIG_X86_POWERNOW_K6=m CONFIG_X86_POWERNOW_K7=m CONFIG_X86_POWERNOW_K7_ACPI=y @@ -660,6 +670,7 @@ CONFIG_PCIEASPM_DEFAULT=y # CONFIG_PCIEASPM_POWERSAVE is not set # CONFIG_PCIEASPM_PERFORMANCE is not set CONFIG_PCIE_PME=y +CONFIG_PCIE_DPC=m CONFIG_PCI_BUS_ADDR_T_64BIT=y CONFIG_PCI_MSI=y CONFIG_PCI_MSI_IRQ_DOMAIN=y @@ -687,6 +698,7 @@ CONFIG_HOTPLUG_PCI_SHPC=m # PCI host controller drivers # # CONFIG_PCIE_DW_PLAT is not set +# CONFIG_ISA_BUS is not set CONFIG_ISA_DMA_API=y CONFIG_ISA=y # CONFIG_EISA is not set @@ -738,6 +750,7 @@ CONFIG_RAPIDIO_CPS_GEN2=y # Executable file formats / Emulations # CONFIG_BINFMT_ELF=y +CONFIG_ELFCORE=y # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set CONFIG_BINFMT_SCRIPT=y CONFIG_BINFMT_AOUT=m @@ -842,6 +855,8 @@ CONFIG_IPV6_SIT=m CONFIG_IPV6_NDISC_NODETYPE=y CONFIG_IPV6_TUNNEL=m CONFIG_IPV6_GRE=m +CONFIG_IPV6_FOU=m +CONFIG_IPV6_FOU_TUNNEL=m CONFIG_IPV6_MULTIPLE_TABLES=y CONFIG_IPV6_SUBTREES=y CONFIG_IPV6_MROUTE=y @@ -1210,6 +1225,7 @@ CONFIG_SCTP_DEFAULT_COOKIE_HMAC_MD5=y # CONFIG_SCTP_DEFAULT_COOKIE_HMAC_NONE is not set CONFIG_SCTP_COOKIE_HMAC_MD5=y CONFIG_SCTP_COOKIE_HMAC_SHA1=y +CONFIG_INET_SCTP_DIAG=m CONFIG_RDS=m CONFIG_RDS_TCP=m # CONFIG_RDS_DEBUG is not set @@ -1234,7 +1250,6 @@ CONFIG_BRIDGE_VLAN_FILTERING=y CONFIG_NET_DSA=m CONFIG_NET_DSA_HWMON=y CONFIG_NET_DSA_TAG_BRCM=y -CONFIG_NET_DSA_TAG_DSA=y CONFIG_NET_DSA_TAG_EDSA=y CONFIG_NET_DSA_TAG_TRAILER=y CONFIG_VLAN_8021Q=m @@ -1570,7 +1585,7 @@ CONFIG_BT_ATH3K=m CONFIG_BT_WILINK=m CONFIG_AF_RXRPC=m # CONFIG_AF_RXRPC_DEBUG is not set -CONFIG_RXKAD=m +# CONFIG_RXKAD is not set CONFIG_AF_KCM=m CONFIG_FIB_RULES=y CONFIG_WIRELESS=y @@ -1635,7 +1650,6 @@ CONFIG_NFC_SHDLC=y # # Near Field Communication (NFC) devices # -CONFIG_NFC_PN533=m CONFIG_NFC_WILINK=m CONFIG_NFC_TRF7970A=m CONFIG_NFC_MEI_PHY=m @@ -1646,6 +1660,9 @@ CONFIG_NFC_FDP_I2C=m CONFIG_NFC_PN544=m CONFIG_NFC_PN544_I2C=m CONFIG_NFC_PN544_MEI=m +CONFIG_NFC_PN533=m +CONFIG_NFC_PN533_USB=m +CONFIG_NFC_PN533_I2C=m CONFIG_NFC_MICROREAD=m CONFIG_NFC_MICROREAD_I2C=m CONFIG_NFC_MICROREAD_MEI=m @@ -1932,7 +1949,6 @@ CONFIG_SENSORS_LIS3LV02D=m CONFIG_DUMMY_IRQ=m CONFIG_IBM_ASM=m CONFIG_PHANTOM=m -CONFIG_INTEL_MID_PTI=m CONFIG_SGI_IOC4=m CONFIG_TIFM_CORE=m CONFIG_TIFM_7XX1=m @@ -2225,7 +2241,6 @@ CONFIG_SCSI_GDTH=m CONFIG_SCSI_ISCI=m CONFIG_SCSI_GENERIC_NCR5380=m CONFIG_SCSI_GENERIC_NCR5380_MMIO=m -CONFIG_SCSI_GENERIC_NCR53C400=y CONFIG_SCSI_IPS=m CONFIG_SCSI_INITIO=m 
CONFIG_SCSI_INIA100=m @@ -2248,6 +2263,7 @@ CONFIG_SCSI_QLOGIC_FAS=m CONFIG_SCSI_QLOGIC_1280=m CONFIG_SCSI_QLA_FC=m CONFIG_TCM_QLA2XXX=m +# CONFIG_TCM_QLA2XXX_DEBUG is not set CONFIG_SCSI_QLA_ISCSI=m CONFIG_SCSI_LPFC=m # CONFIG_SCSI_LPFC_DEBUG_FS is not set @@ -2312,6 +2328,9 @@ CONFIG_ATA_BMDMA=y # SATA SFF controllers with BMDMA # CONFIG_ATA_PIIX=m +CONFIG_SATA_DWC=m +# CONFIG_SATA_DWC_OLD_DMA is not set +# CONFIG_SATA_DWC_DEBUG is not set CONFIG_SATA_MV=m CONFIG_SATA_NV=m CONFIG_SATA_PROMISE=m @@ -2436,6 +2455,7 @@ CONFIG_TCM_USER2=m CONFIG_LOOPBACK_TARGET=m CONFIG_TCM_FC=m CONFIG_ISCSI_TARGET=m +CONFIG_ISCSI_TARGET_CXGB4=m CONFIG_SBP_TARGET=m # @@ -2467,6 +2487,7 @@ CONFIG_MACVTAP=m CONFIG_IPVLAN=m CONFIG_VXLAN=m CONFIG_GENEVE=m +CONFIG_GTP=m CONFIG_MACSEC=m CONFIG_NETCONSOLE=m CONFIG_NETCONSOLE_DYNAMIC=y @@ -2501,13 +2522,8 @@ CONFIG_VHOST=m # # Distributed Switch Architecture drivers # -CONFIG_NET_DSA_MV88E6XXX=m CONFIG_NET_DSA_MV88E6060=m -CONFIG_NET_DSA_MV88E6XXX_NEED_PPU=y -CONFIG_NET_DSA_MV88E6131=m -CONFIG_NET_DSA_MV88E6123=m -CONFIG_NET_DSA_MV88E6171=m -CONFIG_NET_DSA_MV88E6352=m +CONFIG_NET_DSA_MV88E6XXX=m CONFIG_NET_DSA_BCM_SF2=m CONFIG_ETHERNET=y CONFIG_MDIO=m @@ -2567,7 +2583,7 @@ CONFIG_CHELSIO_T1_1G=y CONFIG_CHELSIO_T3=m CONFIG_CHELSIO_T4=m CONFIG_CHELSIO_T4_DCB=y -# CONFIG_CHELSIO_T4_UWIRE is not set +CONFIG_CHELSIO_T4_UWIRE=y CONFIG_CHELSIO_T4_FCOE=y CONFIG_CHELSIO_T4VF=m CONFIG_NET_VENDOR_CIRRUS=y @@ -2651,6 +2667,7 @@ CONFIG_MLXSW_CORE_HWMON=y CONFIG_MLXSW_PCI=m CONFIG_MLXSW_SWITCHX2=m CONFIG_MLXSW_SPECTRUM=m +CONFIG_MLXSW_SPECTRUM_DCB=y CONFIG_NET_VENDOR_MICREL=y CONFIG_KS8842=m CONFIG_KS8851=m @@ -2694,7 +2711,10 @@ CONFIG_QLCNIC_HWMON=y CONFIG_QLGE=m CONFIG_NETXEN_NIC=m CONFIG_QED=m +CONFIG_QED_SRIOV=y CONFIG_QEDE=m +# CONFIG_QEDE_VXLAN is not set +# CONFIG_QEDE_GENEVE is not set CONFIG_NET_VENDOR_QUALCOMM=y CONFIG_NET_VENDOR_REALTEK=y CONFIG_ATP=m @@ -2756,6 +2776,7 @@ CONFIG_WIZNET_W5300=m # CONFIG_WIZNET_BUS_DIRECT is not set # CONFIG_WIZNET_BUS_INDIRECT is not set CONFIG_WIZNET_BUS_ANY=y +CONFIG_WIZNET_W5100_SPI=m CONFIG_NET_VENDOR_XIRCOM=y CONFIG_PCMCIA_XIRC2PS=m CONFIG_NET_SB1000=m @@ -2975,14 +2996,12 @@ CONFIG_IWLDVM=m CONFIG_IWLMVM=m CONFIG_IWLWIFI_OPMODE_MODULAR=y # CONFIG_IWLWIFI_BCAST_FILTERING is not set -# CONFIG_IWLWIFI_UAPSD is not set # CONFIG_IWLWIFI_PCIE_RTPM is not set # # Debugging Options # CONFIG_IWLWIFI_DEBUG=y -# CONFIG_IWLWIFI_DEBUG_EXPERIMENTAL_UCODE is not set # CONFIG_IWLWIFI_DEVICE_TRACING is not set CONFIG_WLAN_VENDOR_INTERSIL=y CONFIG_HOSTAP=m @@ -3404,7 +3423,6 @@ CONFIG_TOUCHSCREEN_MMS114=m CONFIG_TOUCHSCREEN_MELFAS_MIP4=m CONFIG_TOUCHSCREEN_MTOUCH=m CONFIG_TOUCHSCREEN_INEXIO=m -CONFIG_TOUCHSCREEN_INTEL_MID=m CONFIG_TOUCHSCREEN_MK712=m CONFIG_TOUCHSCREEN_HTCPEN=m CONFIG_TOUCHSCREEN_PENMOUNT=m @@ -3548,7 +3566,6 @@ CONFIG_VT_CONSOLE_SLEEP=y CONFIG_HW_CONSOLE=y CONFIG_VT_HW_CONSOLE_BINDING=y CONFIG_UNIX98_PTYS=y -CONFIG_DEVPTS_MULTIPLE_INSTANCES=y CONFIG_LEGACY_PTYS=y CONFIG_LEGACY_PTY_COUNT=0 CONFIG_SERIAL_NONSTANDARD=y @@ -3576,6 +3593,7 @@ CONFIG_SERIAL_EARLYCON=y CONFIG_SERIAL_8250=y # CONFIG_SERIAL_8250_DEPRECATED_OPTIONS is not set CONFIG_SERIAL_8250_PNP=y +CONFIG_SERIAL_8250_FINTEK=y CONFIG_SERIAL_8250_CONSOLE=y CONFIG_SERIAL_8250_DMA=y CONFIG_SERIAL_8250_PCI=y @@ -3595,7 +3613,6 @@ CONFIG_SERIAL_8250_RSA=y # CONFIG_SERIAL_8250_FSL is not set CONFIG_SERIAL_8250_DW=m CONFIG_SERIAL_8250_RT288X=y -CONFIG_SERIAL_8250_FINTEK=m CONFIG_SERIAL_8250_MID=m CONFIG_SERIAL_8250_MOXA=m @@ -3625,7 +3642,6 @@ CONFIG_SERIAL_ARC_NR_PORTS=1 
CONFIG_SERIAL_RP2=m CONFIG_SERIAL_RP2_NR_UARTS=32 CONFIG_SERIAL_FSL_LPUART=m -# CONFIG_SERIAL_MVEBU_UART is not set # CONFIG_TTY_PRINTK is not set CONFIG_PRINTER=m # CONFIG_LP_CONSOLE is not set @@ -3672,8 +3688,8 @@ CONFIG_HPET=y CONFIG_HPET_MMAP=y CONFIG_HPET_MMAP_DEFAULT=y CONFIG_HANGCHECK_TIMER=m -CONFIG_TCG_TPM=m -CONFIG_TCG_TIS=m +CONFIG_TCG_TPM=y +CONFIG_TCG_TIS=y CONFIG_TCG_TIS_I2C_ATMEL=m CONFIG_TCG_TIS_I2C_INFINEON=m CONFIG_TCG_TIS_I2C_NUVOTON=m @@ -3806,6 +3822,7 @@ CONFIG_SPI_LM70_LLP=m CONFIG_SPI_OC_TINY=m CONFIG_SPI_PXA2XX=m CONFIG_SPI_PXA2XX_PCI=m +CONFIG_SPI_ROCKCHIP=m CONFIG_SPI_SC18IS602=m CONFIG_SPI_TOPCLIFF_PCH=m CONFIG_SPI_XCOMM=m @@ -3929,7 +3946,6 @@ CONFIG_GPIO_DA9055=m CONFIG_GPIO_DLN2=m CONFIG_GPIO_KEMPLD=m CONFIG_GPIO_LP3943=m -CONFIG_GPIO_MSIC=y CONFIG_GPIO_PALMAS=y CONFIG_GPIO_RC5T583=y CONFIG_GPIO_TPS65086=m @@ -4022,7 +4038,6 @@ CONFIG_BATTERY_MAX17040=m CONFIG_BATTERY_MAX17042=m CONFIG_BATTERY_TWL4030_MADC=m CONFIG_CHARGER_88PM860X=m -CONFIG_BATTERY_INTEL_MID=m CONFIG_BATTERY_RX51=m CONFIG_CHARGER_ISP1704=m CONFIG_CHARGER_MAX8903=m @@ -4120,6 +4135,7 @@ CONFIG_SENSORS_MAX16065=m CONFIG_SENSORS_MAX1619=m CONFIG_SENSORS_MAX1668=m CONFIG_SENSORS_MAX197=m +CONFIG_SENSORS_MAX31722=m CONFIG_SENSORS_MAX6639=m CONFIG_SENSORS_MAX6642=m CONFIG_SENSORS_MAX6650=m @@ -4235,9 +4251,15 @@ CONFIG_INTEL_POWERCLAMP=m CONFIG_X86_PKG_TEMP_THERMAL=m CONFIG_INTEL_SOC_DTS_IOSF_CORE=m CONFIG_INTEL_SOC_DTS_THERMAL=m + +# +# ACPI INT340X thermal drivers +# CONFIG_INT340X_THERMAL=m CONFIG_ACPI_THERMAL_REL=m +CONFIG_INT3406_THERMAL=m CONFIG_INTEL_PCH_THERMAL=m +CONFIG_GENERIC_ADC_THERMAL=m CONFIG_WATCHDOG=y CONFIG_WATCHDOG_CORE=y CONFIG_WATCHDOG_NOWAYOUT=y @@ -4277,8 +4299,6 @@ CONFIG_IBMASR=m CONFIG_WAFER_WDT=m CONFIG_I6300ESB_WDT=m CONFIG_IE6XX_WDT=m -CONFIG_INTEL_SCU_WATCHDOG=y -CONFIG_INTEL_MID_WATCHDOG=m CONFIG_ITCO_WDT=m CONFIG_ITCO_VENDOR_SUPPORT=y CONFIG_IT8712F_WDT=m @@ -4393,7 +4413,6 @@ CONFIG_INTEL_SOC_PMIC=y CONFIG_MFD_INTEL_LPSS=m CONFIG_MFD_INTEL_LPSS_ACPI=m CONFIG_MFD_INTEL_LPSS_PCI=m -CONFIG_MFD_INTEL_MSIC=y # CONFIG_MFD_JANZ_CMODIO is not set CONFIG_MFD_KEMPLD=m CONFIG_MFD_88PM800=m @@ -4506,7 +4525,6 @@ CONFIG_REGULATOR_MAX8660=m CONFIG_REGULATOR_MAX8907=m CONFIG_REGULATOR_MAX8925=m CONFIG_REGULATOR_MAX8952=m -CONFIG_REGULATOR_MAX8973=m CONFIG_REGULATOR_MAX8997=m CONFIG_REGULATOR_MAX8998=m CONFIG_REGULATOR_MAX77693=m @@ -4518,6 +4536,7 @@ CONFIG_REGULATOR_PALMAS=m CONFIG_REGULATOR_PCAP=m CONFIG_REGULATOR_PFUZE100=m CONFIG_REGULATOR_PV88060=m +CONFIG_REGULATOR_PV88080=m CONFIG_REGULATOR_PV88090=m CONFIG_REGULATOR_PWM=m CONFIG_REGULATOR_QCOM_SPMI=m @@ -4785,6 +4804,7 @@ CONFIG_VIDEO_MEYE=m CONFIG_VIDEO_SOLO6X10=m CONFIG_STA2X11_VIP=m CONFIG_VIDEO_TW68=m +CONFIG_VIDEO_TW686X=m CONFIG_VIDEO_ZORAN=m CONFIG_VIDEO_ZORAN_DC30=m CONFIG_VIDEO_ZORAN_ZR36060=m @@ -4930,6 +4950,7 @@ CONFIG_VIDEO_SAA7146_VV=m CONFIG_SMS_SIANO_MDTV=m CONFIG_SMS_SIANO_RC=y # CONFIG_SMS_SIANO_DEBUGFS is not set +CONFIG_VIDEO_V4L2_TPG=m # # Media ancillary drivers (tuners, sensors, i2c, frontends) @@ -5263,6 +5284,7 @@ CONFIG_DRM_RADEON_USERPTR=y CONFIG_DRM_AMDGPU=m CONFIG_DRM_AMDGPU_CIK=y # CONFIG_DRM_AMDGPU_USERPTR is not set +# CONFIG_DRM_AMDGPU_GART_DEBUGFS is not set CONFIG_DRM_AMD_POWERPLAY=y # @@ -5276,6 +5298,12 @@ CONFIG_DRM_NOUVEAU_BACKLIGHT=y CONFIG_DRM_I915=m # CONFIG_DRM_I915_PRELIMINARY_HW_SUPPORT is not set CONFIG_DRM_I915_USERPTR=y + +# +# drm/i915 Debugging +# +# CONFIG_DRM_I915_WERROR is not set +# CONFIG_DRM_I915_DEBUG is not set CONFIG_DRM_MGA=m CONFIG_DRM_SIS=m CONFIG_DRM_VIA=m @@ 
-5286,7 +5314,6 @@ CONFIG_DRM_VMWGFX_FBCON=y CONFIG_DRM_GMA500=m CONFIG_DRM_GMA600=y CONFIG_DRM_GMA3600=y -CONFIG_DRM_MEDFIELD=y CONFIG_DRM_UDL=m CONFIG_DRM_AST=m CONFIG_DRM_MGAG200=m @@ -5304,6 +5331,7 @@ CONFIG_DRM_BRIDGE=y # # Display Interface Bridges # +CONFIG_DRM_ANALOGIX_ANX78XX=m # # Frame buffer Devices @@ -5736,13 +5764,13 @@ CONFIG_SND_SOC_IMG_PARALLEL_OUT=m CONFIG_SND_SOC_IMG_SPDIF_IN=m CONFIG_SND_SOC_IMG_SPDIF_OUT=m CONFIG_SND_SOC_IMG_PISTACHIO_INTERNAL_DAC=m -# CONFIG_SND_MFLD_MACHINE is not set CONFIG_SND_SST_MFLD_PLATFORM=m CONFIG_SND_SST_IPC=m CONFIG_SND_SST_IPC_ACPI=m CONFIG_SND_SOC_INTEL_SST=m CONFIG_SND_SOC_INTEL_SST_ACPI=m CONFIG_SND_SOC_INTEL_SST_MATCH=m +CONFIG_SND_SOC_INTEL_BXT_RT298_MACH=m CONFIG_SND_SOC_INTEL_BYTCR_RT5640_MACH=m CONFIG_SND_SOC_INTEL_BYTCR_RT5651_MACH=m CONFIG_SND_SOC_INTEL_CHT_BSW_RT5672_MACH=m @@ -5805,6 +5833,7 @@ CONFIG_SND_SOC_PCM512x_SPI=m CONFIG_SND_SOC_RL6231=m CONFIG_SND_SOC_RL6347A=m CONFIG_SND_SOC_RT286=m +CONFIG_SND_SOC_RT298=m CONFIG_SND_SOC_RT5616=m CONFIG_SND_SOC_RT5631=m CONFIG_SND_SOC_RT5640=m @@ -5827,6 +5856,7 @@ CONFIG_SND_SOC_STI_SAS=m CONFIG_SND_SOC_TAS2552=m CONFIG_SND_SOC_TAS5086=m CONFIG_SND_SOC_TAS571X=m +CONFIG_SND_SOC_TAS5720=m CONFIG_SND_SOC_TFA9879=m CONFIG_SND_SOC_TLV320AIC23=m CONFIG_SND_SOC_TLV320AIC23_I2C=m @@ -5850,6 +5880,7 @@ CONFIG_SND_SOC_WM8804=m CONFIG_SND_SOC_WM8804_I2C=m CONFIG_SND_SOC_WM8804_SPI=m CONFIG_SND_SOC_WM8903=m +CONFIG_SND_SOC_WM8960=m CONFIG_SND_SOC_WM8962=m CONFIG_SND_SOC_WM8974=m CONFIG_SND_SOC_WM8978=m @@ -5894,6 +5925,7 @@ CONFIG_HID_ACRUX=m CONFIG_HID_ACRUX_FF=y CONFIG_HID_APPLE=m CONFIG_HID_APPLEIR=m +CONFIG_HID_ASUS=m CONFIG_HID_AUREAL=m CONFIG_HID_BELKIN=m CONFIG_HID_BETOP_FF=m @@ -6223,6 +6255,7 @@ CONFIG_USB_EZUSB_FX2=m CONFIG_USB_HSIC_USB3503=m # CONFIG_USB_LINK_LAYER_TEST is not set # CONFIG_USB_CHAOSKEY is not set +CONFIG_UCSI=m CONFIG_USB_ATM=m CONFIG_USB_SPEEDTOUCH=m CONFIG_USB_CXACRU=m @@ -6351,6 +6384,7 @@ CONFIG_LEDS_TRIGGERS=y CONFIG_LEDS_TRIGGER_TIMER=m CONFIG_LEDS_TRIGGER_ONESHOT=m CONFIG_LEDS_TRIGGER_IDE_DISK=y +# CONFIG_LEDS_TRIGGER_MTD is not set CONFIG_LEDS_TRIGGER_HEARTBEAT=m CONFIG_LEDS_TRIGGER_BACKLIGHT=m CONFIG_LEDS_TRIGGER_CPU=y @@ -6362,6 +6396,7 @@ CONFIG_LEDS_TRIGGER_DEFAULT_ON=m # CONFIG_LEDS_TRIGGER_TRANSIENT=m CONFIG_LEDS_TRIGGER_CAMERA=m +# CONFIG_LEDS_TRIGGER_PANIC is not set CONFIG_ACCESSIBILITY=y CONFIG_A11Y_BRAILLE_CONSOLE=y CONFIG_EDAC_ATOMIC_SCRUB=y @@ -6448,8 +6483,6 @@ CONFIG_RTC_DRV_RX8010=m CONFIG_RTC_DRV_RX8581=m CONFIG_RTC_DRV_RX8025=m CONFIG_RTC_DRV_EM3027=m -CONFIG_RTC_DRV_RV3029C2=m -CONFIG_RTC_DRV_RV3029_HWMON=y CONFIG_RTC_DRV_RV8803=m CONFIG_RTC_DRV_S5M=m @@ -6458,6 +6491,7 @@ CONFIG_RTC_DRV_S5M=m # CONFIG_RTC_DRV_M41T93=m CONFIG_RTC_DRV_M41T94=m +CONFIG_RTC_DRV_DS1302=m CONFIG_RTC_DRV_DS1305=m CONFIG_RTC_DRV_DS1343=m CONFIG_RTC_DRV_DS1347=m @@ -6476,12 +6510,13 @@ CONFIG_RTC_I2C_AND_SPI=y # CONFIG_RTC_DRV_DS3232=m CONFIG_RTC_DRV_PCF2127=m +CONFIG_RTC_DRV_RV3029C2=m +CONFIG_RTC_DRV_RV3029_HWMON=y # # Platform RTC drivers # CONFIG_RTC_DRV_CMOS=y -CONFIG_RTC_DRV_VRTC=m CONFIG_RTC_DRV_DS1286=m CONFIG_RTC_DRV_DS1511=m CONFIG_RTC_DRV_DS1553=m @@ -6536,13 +6571,17 @@ CONFIG_DW_DMAC_CORE=m CONFIG_DW_DMAC=m CONFIG_DW_DMAC_PCI=m CONFIG_HSU_DMA=m -CONFIG_HSU_DMA_PCI=m # # DMA Clients # CONFIG_ASYNC_TX_DMA=y CONFIG_DMATEST=m + +# +# DMABUF options +# +# CONFIG_SYNC_FILE is not set CONFIG_AUXDISPLAY=y CONFIG_KS0108=m CONFIG_KS0108_PORT=0x378 @@ -6755,9 +6794,7 @@ CONFIG_VT6656=m # CONFIG_ADIS16201=m CONFIG_ADIS16203=m -CONFIG_ADIS16204=m 
 CONFIG_ADIS16209=m
-CONFIG_ADIS16220=m
 CONFIG_ADIS16240=m
 CONFIG_LIS3L02DQ=m
 CONFIG_SCA3000=m
@@ -6955,12 +6992,8 @@ CONFIG_TOSHIBA_HAPS=m
 CONFIG_TOSHIBA_WMI=m
 CONFIG_ACPI_CMPC=m
 CONFIG_INTEL_HID_EVENT=m
-CONFIG_INTEL_SCU_IPC=y
-CONFIG_INTEL_SCU_IPC_UTIL=m
-CONFIG_GPIO_INTEL_PMIC=y
-CONFIG_INTEL_MID_POWER_BUTTON=m
-CONFIG_INTEL_MFLD_THERMAL=m
 CONFIG_INTEL_IPS=m
+# CONFIG_INTEL_PMC_CORE is not set
 CONFIG_IBM_RTL=m
 CONFIG_SAMSUNG_LAPTOP=m
 CONFIG_MXM_WMI=m
@@ -6979,6 +7012,7 @@ CONFIG_CHROMEOS_PSTORE=m
 CONFIG_CROS_EC_CHARDEV=m
 CONFIG_CROS_EC_LPC=m
 CONFIG_CROS_EC_PROTO=y
+CONFIG_CROS_KBD_LED_BACKLIGHT=m
 CONFIG_CLKDEV_LOOKUP=y
 CONFIG_COMMON_CLK=y
@@ -6995,6 +7029,8 @@ CONFIG_CLK_TWL6040=m
 CONFIG_COMMON_CLK_PALMAS=m
 CONFIG_COMMON_CLK_PWM=m
 # CONFIG_COMMON_CLK_PXA is not set
+# CONFIG_COMMON_CLK_PIC32 is not set
+# CONFIG_COMMON_CLK_OXNAS is not set

 #
 # Hardware Spinlock drivers
@@ -7007,7 +7043,6 @@ CONFIG_CLKSRC_I8253=y
 CONFIG_CLKEVT_I8253=y
 CONFIG_I8253_LOCK=y
 CONFIG_CLKBLD_I8253=y
-CONFIG_DW_APB_TIMER=y
 # CONFIG_ATMEL_PIT is not set
 # CONFIG_SH_TIMER_CMT is not set
 # CONFIG_SH_TIMER_MTU2 is not set
@@ -7051,6 +7086,7 @@ CONFIG_DEVFREQ_GOV_SIMPLE_ONDEMAND=y
 CONFIG_DEVFREQ_GOV_PERFORMANCE=y
 CONFIG_DEVFREQ_GOV_POWERSAVE=y
 CONFIG_DEVFREQ_GOV_USERSPACE=y
+CONFIG_DEVFREQ_GOV_PASSIVE=m

 #
 # DEVFREQ Drivers
@@ -7181,6 +7217,9 @@ CONFIG_AD5380=m
 CONFIG_AD5421=m
 CONFIG_AD5446=m
 CONFIG_AD5449=m
+CONFIG_AD5592R_BASE=m
+CONFIG_AD5592R=m
+CONFIG_AD5593R=m
 CONFIG_AD5504=m
 CONFIG_AD5624R_SPI=m
 CONFIG_AD5686=m
@@ -7245,6 +7284,7 @@ CONFIG_MAX30100=m
 #
 # Humidity sensors
 #
+CONFIG_AM2315=m
 CONFIG_DHT11=m
 CONFIG_HDC100X=m
 CONFIG_HTU21=m
@@ -7256,6 +7296,9 @@ CONFIG_SI7020=m
 #
 CONFIG_ADIS16400=m
 CONFIG_ADIS16480=m
+CONFIG_BMI160=m
+CONFIG_BMI160_I2C=m
+CONFIG_BMI160_SPI=m
 # CONFIG_KMX61 is not set
 CONFIG_INV_MPU6050_IIO=m
 CONFIG_INV_MPU6050_I2C=m
@@ -7272,6 +7315,7 @@ CONFIG_AL3320A=m
 CONFIG_APDS9300=m
 CONFIG_APDS9960=m
 CONFIG_BH1750=m
+CONFIG_BH1780=m
 CONFIG_CM32181=m
 CONFIG_CM3232=m
 CONFIG_CM3323=m
@@ -7283,6 +7327,7 @@ CONFIG_HID_SENSOR_PROX=m
 CONFIG_JSA1212=m
 CONFIG_RPR0521=m
 CONFIG_LTR501=m
+CONFIG_MAX44000=m
 CONFIG_OPT3001=m
 CONFIG_PA12203001=m
 CONFIG_STK3310=m
@@ -7292,6 +7337,7 @@ CONFIG_SENSORS_TSL2563=m
 CONFIG_TSL4531=m
 CONFIG_US5182D=m
 CONFIG_VCNL4000=m
+CONFIG_VEML6070=m

 #
 # Magnetometer sensors
@@ -7299,6 +7345,8 @@ CONFIG_VCNL4000=m
 CONFIG_AK8975=m
 CONFIG_AK09911=m
 CONFIG_BMC150_MAGN=m
+CONFIG_BMC150_MAGN_I2C=m
+CONFIG_BMC150_MAGN_SPI=m
 CONFIG_MAG3110=m
 CONFIG_HID_SENSOR_MAGNETOMETER_3D=m
 CONFIG_MMC35240=m
@@ -7325,14 +7373,16 @@ CONFIG_IIO_SYSFS_TRIGGER=m
 #
 # Digital potentiometers
 #
+CONFIG_DS1803=m
+CONFIG_MCP4131=m
 CONFIG_MCP4531=m
 CONFIG_TPL0102=m

 #
 # Pressure sensors
 #
-CONFIG_BMP280=m
 CONFIG_HID_SENSOR_PRESS=m
+CONFIG_HP03=m
 CONFIG_MPL115=m
 CONFIG_MPL115_I2C=m
 CONFIG_MPL115_SPI=m
@@ -7343,6 +7393,7 @@ CONFIG_IIO_ST_PRESS=m
 CONFIG_IIO_ST_PRESS_I2C=m
 CONFIG_IIO_ST_PRESS_SPI=m
 CONFIG_T5403=m
+CONFIG_HP206C=m

 #
 # Lightning sensors
@@ -7426,7 +7477,7 @@ CONFIG_INTEL_RAPL=m
 # Performance monitor support
 #
 CONFIG_RAS=y
-CONFIG_AMD_MCE_INJ=m
+CONFIG_MCE_AMD_INJ=m
 CONFIG_THUNDERBOLT=m

 #
@@ -7439,6 +7490,7 @@ CONFIG_ND_BLK=y
 CONFIG_ND_CLAIM=y
 CONFIG_ND_BTT=y
 CONFIG_BTT=y
+CONFIG_DEV_DAX=m
 CONFIG_NVMEM=m
 CONFIG_STM=m
 CONFIG_STM_DUMMY=m
@@ -7485,6 +7537,8 @@ CONFIG_EFI_VARS_PSTORE_DEFAULT_DISABLE=y
 CONFIG_EFI_RUNTIME_MAP=y
 # CONFIG_EFI_FAKE_MEMMAP is not set
 CONFIG_EFI_RUNTIME_WRAPPERS=y
+CONFIG_EFI_BOOTLOADER_CONTROL=m
+CONFIG_EFI_CAPSULE_LOADER=m
 CONFIG_UEFI_CPER=y

 #
@@ -7547,6 +7601,7 @@ CONFIG_F2FS_FS_SECURITY=y
 # CONFIG_F2FS_CHECK_FS is not set
 # CONFIG_F2FS_FS_ENCRYPTION is not set
 # CONFIG_F2FS_IO_TRACE is not set
+# CONFIG_F2FS_FAULT_INJECTION is not set
 CONFIG_FS_DAX=y
 CONFIG_FS_POSIX_ACL=y
 CONFIG_EXPORTFS=y
@@ -7852,9 +7907,6 @@ CONFIG_DYNAMIC_DEBUG=y
 # Compile-time checks and compiler options
 #
 # CONFIG_DEBUG_INFO is not set
-# CONFIG_DEBUG_INFO_SPLIT is not set
-# CONFIG_DEBUG_INFO_DWARF4 is not set
-# CONFIG_GDB_SCRIPTS is not set
 CONFIG_ENABLE_WARN_DEPRECATED=y
 CONFIG_ENABLE_MUST_CHECK=y
 CONFIG_FRAME_WARN=1024
@@ -7940,6 +7992,7 @@ CONFIG_DEBUG_BUGVERBOSE=y
 # CONFIG_PROVE_RCU is not set
 # CONFIG_SPARSE_RCU_POINTER is not set
 # CONFIG_TORTURE_TEST is not set
+# CONFIG_RCU_PERF_TEST is not set
 # CONFIG_RCU_TORTURE_TEST is not set
 CONFIG_RCU_CPU_STALL_TIMEOUT=60
 # CONFIG_RCU_TRACE is not set
@@ -7989,6 +8042,7 @@ CONFIG_FUNCTION_PROFILER=y
 CONFIG_FTRACE_MCOUNT_RECORD=y
 # CONFIG_FTRACE_STARTUP_TEST is not set
 CONFIG_MMIOTRACE=y
+# CONFIG_HIST_TRIGGERS is not set
 # CONFIG_MMIOTRACE_TEST is not set
 # CONFIG_TRACEPOINT_BENCHMARK is not set
 CONFIG_RING_BUFFER_BENCHMARK=m
@@ -8013,7 +8067,9 @@ CONFIG_INTERVAL_TREE_TEST=m
 # CONFIG_TEST_KSTRTOX is not set
 CONFIG_TEST_PRINTF=m
 # CONFIG_TEST_BITMAP is not set
+# CONFIG_TEST_UUID is not set
 # CONFIG_TEST_RHASHTABLE is not set
+# CONFIG_TEST_HASH is not set
 # CONFIG_PROVIDE_OHCI1394_DMA_INIT is not set
 # CONFIG_DMA_API_DEBUG is not set
 # CONFIG_TEST_LKM is not set
@@ -8067,6 +8123,7 @@ CONFIG_KEYS=y
 # CONFIG_BIG_KEYS is not set
 CONFIG_TRUSTED_KEYS=m
 CONFIG_ENCRYPTED_KEYS=m
+# CONFIG_KEY_DH_OPERATIONS is not set
 # CONFIG_SECURITY_DMESG_RESTRICT is not set
 CONFIG_SECURITY=y
 CONFIG_SECURITYFS=y
@@ -8086,12 +8143,14 @@ CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=0
 # CONFIG_SECURITY_SMACK is not set
 # CONFIG_SECURITY_TOMOYO is not set
 # CONFIG_SECURITY_APPARMOR is not set
+# CONFIG_SECURITY_LOADPIN is not set
 # CONFIG_SECURITY_YAMA is not set
 CONFIG_INTEGRITY=y
-# CONFIG_INTEGRITY_ASYMMETRIC_KEYS is not set
-CONFIG_INTEGRITY_AUDIT=y
 # CONFIG_INTEGRITY_SIGNATURE is not set
+CONFIG_INTEGRITY_AUDIT=y
 CONFIG_IMA=y
+CONFIG_IMA_MEASURE_PCR_IDX=10
+CONFIG_IMA_LSM_RULES=y
 # CONFIG_IMA_TEMPLATE is not set
 CONFIG_IMA_NG_TEMPLATE=y
 # CONFIG_IMA_SIG_TEMPLATE is not set
@@ -8099,13 +8158,13 @@ CONFIG_IMA_DEFAULT_HASH_SHA1=y
 # CONFIG_IMA_DEFAULT_HASH_SHA256 is not set
 # CONFIG_IMA_DEFAULT_HASH_SHA512 is not set
 # CONFIG_IMA_DEFAULT_HASH_WP512 is not set
-# CONFIG_EVM is not set
-# CONFIG_IMA_APPRAISE is not set
+CONFIG_IMA_DEFAULT_HASH="sha1"
 # CONFIG_IMA_WRITE_POLICY is not set
 # CONFIG_IMA_READ_POLICY is not set
-# CONFIG_DEFAULT_SECURITY_TOMOYO is not set
-# CONFIG_DEFAULT_SECURITY_DAC is not set
+# CONFIG_IMA_APPRAISE is not set
+# CONFIG_EVM is not set
 CONFIG_DEFAULT_SECURITY_SELINUX=y
+# CONFIG_DEFAULT_SECURITY_DAC is not set
 CONFIG_DEFAULT_SECURITY="selinux"
 CONFIG_XOR_BLOCKS=y
 CONFIG_ASYNC_CORE=m
@@ -8194,7 +8253,7 @@ CONFIG_CRYPTO_RMD128=m
 CONFIG_CRYPTO_RMD160=m
 CONFIG_CRYPTO_RMD256=m
 CONFIG_CRYPTO_RMD320=m
-CONFIG_CRYPTO_SHA1=m
+CONFIG_CRYPTO_SHA1=y
 CONFIG_CRYPTO_SHA256=y
 CONFIG_CRYPTO_SHA512=m
 CONFIG_CRYPTO_TGR192=m
@@ -8268,11 +8327,10 @@ CONFIG_CRYPTO_DEV_QAT_C62X=m
 CONFIG_CRYPTO_DEV_QAT_DH895xCCVF=m
 CONFIG_CRYPTO_DEV_QAT_C3XXXVF=m
 CONFIG_CRYPTO_DEV_QAT_C62XVF=m
-CONFIG_ASYMMETRIC_KEY_TYPE=m
+CONFIG_ASYMMETRIC_KEY_TYPE=y
 CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=m
 CONFIG_X509_CERTIFICATE_PARSER=m
 CONFIG_PKCS7_MESSAGE_PARSER=m
-# CONFIG_PKCS7_TEST_KEY is not set

 #
 # Certificates for signature checking
diff --git a/kernel-x86_64.config b/kernel-x86_64.config
index 08c52e2..296796f 100644
--- a/kernel-x86_64.config
+++ b/kernel-x86_64.config
@@ -1,12 +1,11 @@
 #
 # Automatically generated file; DO NOT EDIT.
-# Linux/x86 4.6.3-nrj-desktop-1rosa-x86_64 Kernel Configuration
+# Linux/x86 4.7.x-nrj-desktop Kernel Configuration
 #
 CONFIG_64BIT=y
 CONFIG_X86_64=y
 CONFIG_X86=y
 CONFIG_INSTRUCTION_DECODER=y
-CONFIG_PERF_EVENTS_INTEL_UNCORE=y
 CONFIG_OUTPUT_FORMAT="elf64-x86-64"
 CONFIG_LOCKDEP_SUPPORT=y
 CONFIG_STACKTRACE_SUPPORT=y
@@ -111,7 +110,7 @@ CONFIG_TASK_IO_ACCOUNTING=y
 CONFIG_PREEMPT_RCU=y
 # CONFIG_RCU_EXPERT is not set
 CONFIG_SRCU=y
-# CONFIG_TASKS_RCU is not set
+CONFIG_TASKS_RCU=y
 CONFIG_RCU_STALL_COMMON=y
 # CONFIG_TREE_RCU_TRACE is not set
 # CONFIG_RCU_EXPEDITE_BOOT is not set
@@ -120,6 +119,7 @@ CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
 CONFIG_LOG_BUF_SHIFT=18
 CONFIG_LOG_CPU_MAX_BUF_SHIFT=12
+CONFIG_NMI_LOG_BUF_SHIFT=13
 # CONFIG_NUMA_BALANCING is not set
 CONFIG_CGROUPS=y
 CONFIG_PAGE_COUNTER=y
@@ -160,6 +160,7 @@ CONFIG_RD_LZMA=y
 CONFIG_RD_XZ=y
 CONFIG_RD_LZO=y
 CONFIG_RD_LZ4=y
+CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y
 # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
 CONFIG_SYSCTL=y
 CONFIG_ANON_INODES=y
@@ -384,6 +385,14 @@ CONFIG_X86_MCE_AMD=y
 CONFIG_X86_MCE_THRESHOLD=y
 # CONFIG_X86_MCE_INJECT is not set
 CONFIG_X86_THERMAL_VECTOR=y
+
+#
+# Performance monitoring
+#
+CONFIG_PERF_EVENTS_INTEL_UNCORE=y
+CONFIG_PERF_EVENTS_INTEL_RAPL=y
+CONFIG_PERF_EVENTS_INTEL_CSTATE=y
+CONFIG_PERF_EVENTS_AMD_POWER=m
 # CONFIG_VM86 is not set
 CONFIG_X86_16BIT=y
 CONFIG_X86_ESPFIX64=y
@@ -393,7 +402,6 @@ CONFIG_MICROCODE=y
 CONFIG_MICROCODE_INTEL=y
 CONFIG_MICROCODE_AMD=y
 CONFIG_MICROCODE_OLD_INTERFACE=y
-CONFIG_PERF_EVENTS_AMD_POWER=m
 CONFIG_X86_MSR=m
 CONFIG_X86_CPUID=m
 CONFIG_X86_DIRECT_GBPAGES=y
@@ -417,6 +425,7 @@ CONFIG_MEMORY_ISOLATION=y
 # CONFIG_MOVABLE_NODE is not set
 CONFIG_MEMORY_HOTPLUG=y
 CONFIG_MEMORY_HOTPLUG_SPARSE=y
+# CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE is not set
 CONFIG_MEMORY_HOTREMOVE=y
 CONFIG_SPLIT_PTLOCK_CPUS=4
 CONFIG_MEMORY_BALLOON=y
@@ -424,7 +433,6 @@ CONFIG_BALLOON_COMPACTION=y
 CONFIG_COMPACTION=y
 CONFIG_MIGRATION=y
 CONFIG_PHYS_ADDR_T_64BIT=y
-CONFIG_ZONE_DMA_FLAG=1
 CONFIG_BOUNCE=y
 CONFIG_VIRT_TO_BUS=y
 CONFIG_MMU_NOTIFIER=y
@@ -445,6 +453,7 @@ CONFIG_CMA_AREAS=7
 CONFIG_ZSWAP=y
 CONFIG_ZPOOL=y
 CONFIG_ZBUD=y
+CONFIG_Z3FOLD=m
 CONFIG_ZSMALLOC=y
 # CONFIG_PGTABLE_MAPPING is not set
 # CONFIG_ZSMALLOC_STAT is not set
@@ -544,7 +553,7 @@ CONFIG_ACPI_THERMAL=y
 CONFIG_ACPI_NUMA=y
 CONFIG_ACPI_CUSTOM_DSDT_FILE=""
 # CONFIG_ACPI_CUSTOM_DSDT is not set
-CONFIG_ACPI_INITRD_TABLE_OVERRIDE=y
+CONFIG_ACPI_TABLE_UPGRADE=y
 # CONFIG_ACPI_DEBUG is not set
 CONFIG_ACPI_PCI_SLOT=y
 CONFIG_X86_PM_TIMER=y
@@ -573,6 +582,7 @@ CONFIG_SFI=y
 #
 # CPU Frequency scaling
 #
 CONFIG_CPU_FREQ=y
+CONFIG_CPU_FREQ_GOV_ATTR_SET=y
 CONFIG_CPU_FREQ_GOV_COMMON=y
 CONFIG_CPU_FREQ_STAT=m
 CONFIG_CPU_FREQ_STAT_DETAILS=y
@@ -581,11 +591,13 @@ CONFIG_CPU_FREQ_STAT_DETAILS=y
 # CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set
 CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
 # CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set
+# CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL is not set
 CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
 CONFIG_CPU_FREQ_GOV_POWERSAVE=m
 CONFIG_CPU_FREQ_GOV_USERSPACE=y
 CONFIG_CPU_FREQ_GOV_ONDEMAND=y
 CONFIG_CPU_FREQ_GOV_CONSERVATIVE=m
+CONFIG_CPU_FREQ_GOV_SCHEDUTIL=m

 #
 # CPU frequency scaling drivers
@@ -638,6 +650,7 @@ CONFIG_PCIEASPM_DEFAULT=y
 # CONFIG_PCIEASPM_POWERSAVE is not set
 # CONFIG_PCIEASPM_PERFORMANCE is not set
 CONFIG_PCIE_PME=y
+CONFIG_PCIE_DPC=m
 CONFIG_PCI_BUS_ADDR_T_64BIT=y
 CONFIG_PCI_MSI=y
 CONFIG_PCI_MSI_IRQ_DOMAIN=y
@@ -664,6 +677,7 @@ CONFIG_HOTPLUG_PCI_SHPC=m
 # PCI host controller drivers
 #
 # CONFIG_PCIE_DW_PLAT is not set
+# CONFIG_ISA_BUS is not set
 CONFIG_ISA_DMA_API=y
 CONFIG_AMD_NB=y
 CONFIG_PCCARD=m
@@ -706,6 +720,7 @@ CONFIG_RAPIDIO_CPS_GEN2=y
 #
 CONFIG_BINFMT_ELF=y
 CONFIG_COMPAT_BINFMT_ELF=y
+CONFIG_ELFCORE=y
 # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
 CONFIG_BINFMT_SCRIPT=y
 CONFIG_BINFMT_MISC=m
@@ -816,6 +831,8 @@ CONFIG_IPV6_SIT=m
 CONFIG_IPV6_NDISC_NODETYPE=y
 CONFIG_IPV6_TUNNEL=m
 CONFIG_IPV6_GRE=m
+CONFIG_IPV6_FOU=m
+CONFIG_IPV6_FOU_TUNNEL=m
 CONFIG_IPV6_MULTIPLE_TABLES=y
 CONFIG_IPV6_SUBTREES=y
 CONFIG_IPV6_MROUTE=y
@@ -1184,6 +1201,7 @@ CONFIG_SCTP_DEFAULT_COOKIE_HMAC_MD5=y
 # CONFIG_SCTP_DEFAULT_COOKIE_HMAC_NONE is not set
 CONFIG_SCTP_COOKIE_HMAC_MD5=y
 CONFIG_SCTP_COOKIE_HMAC_SHA1=y
+CONFIG_INET_SCTP_DIAG=m
 CONFIG_RDS=m
 CONFIG_RDS_TCP=m
 # CONFIG_RDS_DEBUG is not set
@@ -1208,7 +1226,6 @@ CONFIG_BRIDGE_VLAN_FILTERING=y
 CONFIG_NET_DSA=m
 CONFIG_NET_DSA_HWMON=y
 CONFIG_NET_DSA_TAG_BRCM=y
-CONFIG_NET_DSA_TAG_DSA=y
 CONFIG_NET_DSA_TAG_EDSA=y
 CONFIG_NET_DSA_TAG_TRAILER=y
 CONFIG_VLAN_8021Q=m
@@ -1534,7 +1551,7 @@ CONFIG_BT_ATH3K=m
 CONFIG_BT_WILINK=m
 CONFIG_AF_RXRPC=m
 # CONFIG_AF_RXRPC_DEBUG is not set
-CONFIG_RXKAD=m
+# CONFIG_RXKAD is not set
 CONFIG_AF_KCM=m
 CONFIG_FIB_RULES=y
 CONFIG_WIRELESS=y
@@ -1599,7 +1616,6 @@ CONFIG_NFC_SHDLC=y
 #
 # Near Field Communication (NFC) devices
 #
-CONFIG_NFC_PN533=m
 CONFIG_NFC_WILINK=m
 CONFIG_NFC_TRF7970A=m
 CONFIG_NFC_MEI_PHY=m
@@ -1610,6 +1626,9 @@ CONFIG_NFC_FDP=m
 CONFIG_NFC_PN544=m
 CONFIG_NFC_PN544_I2C=m
 CONFIG_NFC_PN544_MEI=m
+CONFIG_NFC_PN533=m
+CONFIG_NFC_PN533_USB=m
+CONFIG_NFC_PN533_I2C=m
 CONFIG_NFC_MICROREAD=m
 CONFIG_NFC_MICROREAD_I2C=m
 CONFIG_NFC_MICROREAD_MEI=m
@@ -2184,6 +2203,7 @@ CONFIG_SCSI_IPR=m
 CONFIG_SCSI_QLOGIC_1280=m
 CONFIG_SCSI_QLA_FC=m
 CONFIG_TCM_QLA2XXX=m
+# CONFIG_TCM_QLA2XXX_DEBUG is not set
 CONFIG_SCSI_QLA_ISCSI=m
 CONFIG_SCSI_LPFC=m
 # CONFIG_SCSI_LPFC_DEBUG_FS is not set
@@ -2239,6 +2259,9 @@ CONFIG_ATA_BMDMA=y
 #
 # SATA SFF controllers with BMDMA
 #
 CONFIG_ATA_PIIX=m
+CONFIG_SATA_DWC=m
+# CONFIG_SATA_DWC_OLD_DMA is not set
+# CONFIG_SATA_DWC_DEBUG is not set
 CONFIG_SATA_MV=m
 CONFIG_SATA_NV=m
 CONFIG_SATA_PROMISE=m
@@ -2355,6 +2378,7 @@ CONFIG_TCM_USER2=m
 CONFIG_LOOPBACK_TARGET=m
 CONFIG_TCM_FC=m
 CONFIG_ISCSI_TARGET=m
+CONFIG_ISCSI_TARGET_CXGB4=m
 CONFIG_SBP_TARGET=m

 #
@@ -2386,6 +2410,7 @@ CONFIG_MACVTAP=m
 CONFIG_IPVLAN=m
 CONFIG_VXLAN=m
 CONFIG_GENEVE=m
+CONFIG_GTP=m
 CONFIG_MACSEC=m
 CONFIG_NETCONSOLE=m
 CONFIG_NETCONSOLE_DYNAMIC=y
@@ -2420,13 +2445,8 @@ CONFIG_VHOST=m
 #
 # Distributed Switch Architecture drivers
 #
-CONFIG_NET_DSA_MV88E6XXX=m
 CONFIG_NET_DSA_MV88E6060=m
-CONFIG_NET_DSA_MV88E6XXX_NEED_PPU=y
-CONFIG_NET_DSA_MV88E6131=m
-CONFIG_NET_DSA_MV88E6123=m
-CONFIG_NET_DSA_MV88E6171=m
-CONFIG_NET_DSA_MV88E6352=m
+CONFIG_NET_DSA_MV88E6XXX=m
 CONFIG_NET_DSA_BCM_SF2=m
 CONFIG_ETHERNET=y
 CONFIG_MDIO=m
@@ -2486,7 +2506,7 @@ CONFIG_CHELSIO_T1_1G=y
 CONFIG_CHELSIO_T3=m
 CONFIG_CHELSIO_T4=m
 CONFIG_CHELSIO_T4_DCB=y
-# CONFIG_CHELSIO_T4_UWIRE is not set
+CONFIG_CHELSIO_T4_UWIRE=y
 CONFIG_CHELSIO_T4_FCOE=y
 CONFIG_CHELSIO_T4VF=m
 CONFIG_NET_VENDOR_CISCO=y
@@ -2569,6 +2589,7 @@ CONFIG_MLXSW_CORE_HWMON=y
 CONFIG_MLXSW_PCI=m
 CONFIG_MLXSW_SWITCHX2=m
 CONFIG_MLXSW_SPECTRUM=m
+CONFIG_MLXSW_SPECTRUM_DCB=y
 CONFIG_NET_VENDOR_MICREL=y
 CONFIG_KS8842=m
 CONFIG_KS8851=m
@@ -2609,7 +2630,10 @@ CONFIG_QLCNIC_HWMON=y
 CONFIG_QLGE=m
 CONFIG_NETXEN_NIC=m
 CONFIG_QED=m
+CONFIG_QED_SRIOV=y
 CONFIG_QEDE=m
+# CONFIG_QEDE_VXLAN is not set
+# CONFIG_QEDE_GENEVE is not set
 CONFIG_NET_VENDOR_QUALCOMM=y
 CONFIG_NET_VENDOR_REALTEK=y
 CONFIG_ATP=m
@@ -2670,6 +2694,7 @@ CONFIG_WIZNET_W5300=m
 # CONFIG_WIZNET_BUS_DIRECT is not set
 # CONFIG_WIZNET_BUS_INDIRECT is not set
 CONFIG_WIZNET_BUS_ANY=y
+CONFIG_WIZNET_W5100_SPI=m
 CONFIG_NET_VENDOR_XIRCOM=y
 CONFIG_PCMCIA_XIRC2PS=m
 # CONFIG_HIPPI is not set
@@ -2893,14 +2918,12 @@ CONFIG_IWLDVM=m
 CONFIG_IWLMVM=m
 CONFIG_IWLWIFI_OPMODE_MODULAR=y
 # CONFIG_IWLWIFI_BCAST_FILTERING is not set
-# CONFIG_IWLWIFI_UAPSD is not set
 # CONFIG_IWLWIFI_PCIE_RTPM is not set

 #
 # Debugging Options
 #
 CONFIG_IWLWIFI_DEBUG=y
-# CONFIG_IWLWIFI_DEBUG_EXPERIMENTAL_UCODE is not set
 # CONFIG_IWLWIFI_DEVICE_TRACING is not set
 CONFIG_WLAN_VENDOR_INTERSIL=y
 CONFIG_HOSTAP=m
@@ -3451,7 +3474,6 @@ CONFIG_VT_CONSOLE_SLEEP=y
 CONFIG_HW_CONSOLE=y
 CONFIG_VT_HW_CONSOLE_BINDING=y
 CONFIG_UNIX98_PTYS=y
-CONFIG_DEVPTS_MULTIPLE_INSTANCES=y
 CONFIG_LEGACY_PTYS=y
 CONFIG_LEGACY_PTY_COUNT=0
 CONFIG_SERIAL_NONSTANDARD=y
@@ -3493,7 +3515,7 @@ CONFIG_SERIAL_8250_RSA=y
 # CONFIG_SERIAL_8250_FSL is not set
 CONFIG_SERIAL_8250_DW=m
 CONFIG_SERIAL_8250_RT288X=y
-CONFIG_SERIAL_8250_FINTEK=m
+CONFIG_SERIAL_8250_FINTEK=y
 CONFIG_SERIAL_8250_MID=m
 CONFIG_SERIAL_8250_MOXA=m
@@ -3521,7 +3543,6 @@ CONFIG_SERIAL_ARC_NR_PORTS=1
 CONFIG_SERIAL_RP2=m
 CONFIG_SERIAL_RP2_NR_UARTS=32
 CONFIG_SERIAL_FSL_LPUART=m
-# CONFIG_SERIAL_MVEBU_UART is not set
 # CONFIG_TTY_PRINTK is not set
 CONFIG_PRINTER=m
 # CONFIG_LP_CONSOLE is not set
@@ -3565,8 +3586,8 @@ CONFIG_HPET=y
 CONFIG_HPET_MMAP=y
 CONFIG_HPET_MMAP_DEFAULT=y
 CONFIG_HANGCHECK_TIMER=m
-CONFIG_TCG_TPM=m
-CONFIG_TCG_TIS=m
+CONFIG_TCG_TPM=y
+CONFIG_TCG_TIS=y
 CONFIG_TCG_TIS_I2C_ATMEL=m
 CONFIG_TCG_TIS_I2C_INFINEON=m
 CONFIG_TCG_TIS_I2C_NUVOTON=m
@@ -3696,6 +3717,7 @@ CONFIG_SPI_LM70_LLP=m
 CONFIG_SPI_OC_TINY=m
 CONFIG_SPI_PXA2XX=m
 CONFIG_SPI_PXA2XX_PCI=m
+CONFIG_SPI_ROCKCHIP=m
 CONFIG_SPI_SC18IS602=m
 CONFIG_SPI_XCOMM=m
 CONFIG_SPI_XILINX=m
@@ -4003,6 +4025,7 @@ CONFIG_SENSORS_MAX16065=m
 CONFIG_SENSORS_MAX1619=m
 CONFIG_SENSORS_MAX1668=m
 CONFIG_SENSORS_MAX197=m
+CONFIG_SENSORS_MAX31722=m
 CONFIG_SENSORS_MAX6639=m
 CONFIG_SENSORS_MAX6642=m
 CONFIG_SENSORS_MAX6650=m
@@ -4118,9 +4141,15 @@ CONFIG_INTEL_POWERCLAMP=m
 CONFIG_X86_PKG_TEMP_THERMAL=m
 CONFIG_INTEL_SOC_DTS_IOSF_CORE=m
 CONFIG_INTEL_SOC_DTS_THERMAL=m
+
+#
+# ACPI INT340X thermal drivers
+#
 CONFIG_INT340X_THERMAL=m
 CONFIG_ACPI_THERMAL_REL=m
+CONFIG_INT3406_THERMAL=m
 CONFIG_INTEL_PCH_THERMAL=m
+CONFIG_GENERIC_ADC_THERMAL=m
 CONFIG_WATCHDOG=y
 CONFIG_WATCHDOG_CORE=y
 CONFIG_WATCHDOG_NOWAYOUT=y
@@ -4149,7 +4178,6 @@ CONFIG_ACQUIRE_WDT=m
 CONFIG_ADVANTECH_WDT=m
 CONFIG_ALIM1535_WDT=m
 CONFIG_ALIM7101_WDT=m
-CONFIG_EBC_C384_WDT=m
 CONFIG_F71808E_WDT=m
 CONFIG_SP5100_TCO=m
 CONFIG_SBC_FITPC2_WATCHDOG=m
@@ -4373,7 +4401,6 @@ CONFIG_REGULATOR_MAX8660=m
 CONFIG_REGULATOR_MAX8907=m
 CONFIG_REGULATOR_MAX8925=m
 CONFIG_REGULATOR_MAX8952=m
-CONFIG_REGULATOR_MAX8973=m
 CONFIG_REGULATOR_MAX8997=m
 CONFIG_REGULATOR_MAX8998=m
 CONFIG_REGULATOR_MAX77693=m
@@ -4385,6 +4412,7 @@ CONFIG_REGULATOR_PALMAS=m
 CONFIG_REGULATOR_PCAP=m
 CONFIG_REGULATOR_PFUZE100=m
 CONFIG_REGULATOR_PV88060=m
+CONFIG_REGULATOR_PV88080=m
 CONFIG_REGULATOR_PV88090=m
 CONFIG_REGULATOR_PWM=m
 CONFIG_REGULATOR_QCOM_SPMI=m
@@ -4651,6 +4679,7 @@ CONFIG_MEDIA_PCI_SUPPORT=y
 CONFIG_VIDEO_MEYE=m
 CONFIG_VIDEO_SOLO6X10=m
 CONFIG_VIDEO_TW68=m
+CONFIG_VIDEO_TW686X=m
 # CONFIG_VIDEO_ZORAN is not set

 #
@@ -4775,6 +4804,7 @@ CONFIG_VIDEO_SAA7146_VV=m
 CONFIG_SMS_SIANO_MDTV=m
 CONFIG_SMS_SIANO_RC=y
 # CONFIG_SMS_SIANO_DEBUGFS is not set
+CONFIG_VIDEO_V4L2_TPG=m

 #
 # Media ancillary drivers (tuners, sensors, i2c, frontends)
@@ -5092,6 +5122,7 @@ CONFIG_DRM_RADEON_USERPTR=y
 CONFIG_DRM_AMDGPU=m
 CONFIG_DRM_AMDGPU_CIK=y
 # CONFIG_DRM_AMDGPU_USERPTR is not set
+# CONFIG_DRM_AMDGPU_GART_DEBUGFS is not set
 CONFIG_DRM_AMD_POWERPLAY=y

 #
@@ -5105,6 +5136,12 @@ CONFIG_DRM_NOUVEAU_BACKLIGHT=y
 CONFIG_DRM_I915=m
 # CONFIG_DRM_I915_PRELIMINARY_HW_SUPPORT is not set
 CONFIG_DRM_I915_USERPTR=y
+
+#
+# drm/i915 Debugging
+#
+# CONFIG_DRM_I915_WERROR is not set
+# CONFIG_DRM_I915_DEBUG is not set
 CONFIG_DRM_MGA=m
 CONFIG_DRM_SIS=m
 CONFIG_DRM_VIA=m
@@ -5132,6 +5169,7 @@ CONFIG_DRM_BRIDGE=y
 #
 # Display Interface Bridges
 #
+CONFIG_DRM_ANALOGIX_ANX78XX=m
 CONFIG_HSA_AMD=m

 #
@@ -5527,6 +5565,7 @@ CONFIG_SND_SST_IPC_ACPI=m
 CONFIG_SND_SOC_INTEL_SST=m
 CONFIG_SND_SOC_INTEL_SST_ACPI=m
 CONFIG_SND_SOC_INTEL_SST_MATCH=m
+CONFIG_SND_SOC_INTEL_BXT_RT298_MACH=m
 CONFIG_SND_SOC_INTEL_BYTCR_RT5640_MACH=m
 CONFIG_SND_SOC_INTEL_BYTCR_RT5651_MACH=m
 CONFIG_SND_SOC_INTEL_CHT_BSW_RT5672_MACH=m
@@ -5589,6 +5628,7 @@ CONFIG_SND_SOC_PCM512x_SPI=m
 CONFIG_SND_SOC_RL6231=m
 CONFIG_SND_SOC_RL6347A=m
 CONFIG_SND_SOC_RT286=m
+CONFIG_SND_SOC_RT298=m
 CONFIG_SND_SOC_RT5616=m
 CONFIG_SND_SOC_RT5631=m
 CONFIG_SND_SOC_RT5640=m
@@ -5611,6 +5651,7 @@ CONFIG_SND_SOC_STI_SAS=m
 CONFIG_SND_SOC_TAS2552=m
 CONFIG_SND_SOC_TAS5086=m
 CONFIG_SND_SOC_TAS571X=m
+CONFIG_SND_SOC_TAS5720=m
 CONFIG_SND_SOC_TFA9879=m
 CONFIG_SND_SOC_TLV320AIC23=m
 CONFIG_SND_SOC_TLV320AIC23_I2C=m
@@ -5634,6 +5675,7 @@ CONFIG_SND_SOC_WM8804=m
 CONFIG_SND_SOC_WM8804_I2C=m
 CONFIG_SND_SOC_WM8804_SPI=m
 CONFIG_SND_SOC_WM8903=m
+CONFIG_SND_SOC_WM8960=m
 CONFIG_SND_SOC_WM8962=m
 CONFIG_SND_SOC_WM8974=m
 CONFIG_SND_SOC_WM8978=m
@@ -5676,6 +5718,7 @@ CONFIG_HID_ACRUX=m
 CONFIG_HID_ACRUX_FF=y
 CONFIG_HID_APPLE=m
 CONFIG_HID_APPLEIR=m
+CONFIG_HID_ASUS=m
 CONFIG_HID_AUREAL=m
 CONFIG_HID_BELKIN=m
 CONFIG_HID_BETOP_FF=m
@@ -6005,6 +6048,7 @@ CONFIG_USB_EZUSB_FX2=m
 CONFIG_USB_HSIC_USB3503=m
 # CONFIG_USB_LINK_LAYER_TEST is not set
 # CONFIG_USB_CHAOSKEY is not set
+CONFIG_UCSI=m
 CONFIG_USB_ATM=m
 CONFIG_USB_SPEEDTOUCH=m
 CONFIG_USB_CXACRU=m
@@ -6131,6 +6175,7 @@ CONFIG_LEDS_TRIGGERS=y
 CONFIG_LEDS_TRIGGER_TIMER=m
 CONFIG_LEDS_TRIGGER_ONESHOT=m
 CONFIG_LEDS_TRIGGER_IDE_DISK=y
+# CONFIG_LEDS_TRIGGER_MTD is not set
 CONFIG_LEDS_TRIGGER_HEARTBEAT=m
 CONFIG_LEDS_TRIGGER_BACKLIGHT=m
 CONFIG_LEDS_TRIGGER_CPU=y
@@ -6142,6 +6187,7 @@ CONFIG_LEDS_TRIGGER_DEFAULT_ON=m
 #
 CONFIG_LEDS_TRIGGER_TRANSIENT=m
 CONFIG_LEDS_TRIGGER_CAMERA=m
+# CONFIG_LEDS_TRIGGER_PANIC is not set
 CONFIG_ACCESSIBILITY=y
 CONFIG_A11Y_BRAILLE_CONSOLE=y
 CONFIG_EDAC_ATOMIC_SCRUB=y
@@ -6224,8 +6270,6 @@ CONFIG_RTC_DRV_RX8010=m
 CONFIG_RTC_DRV_RX8581=m
 CONFIG_RTC_DRV_RX8025=m
 CONFIG_RTC_DRV_EM3027=m
-CONFIG_RTC_DRV_RV3029C2=m
-CONFIG_RTC_DRV_RV3029_HWMON=y
 CONFIG_RTC_DRV_RV8803=m
 CONFIG_RTC_DRV_S5M=m

@@ -6234,6 +6278,7 @@ CONFIG_RTC_DRV_S5M=m
 #
 CONFIG_RTC_DRV_M41T93=m
 CONFIG_RTC_DRV_M41T94=m
+CONFIG_RTC_DRV_DS1302=m
 CONFIG_RTC_DRV_DS1305=m
 CONFIG_RTC_DRV_DS1343=m
 CONFIG_RTC_DRV_DS1347=m
@@ -6252,6 +6297,8 @@ CONFIG_RTC_I2C_AND_SPI=y
 #
 CONFIG_RTC_DRV_DS3232=m
 CONFIG_RTC_DRV_PCF2127=m
+CONFIG_RTC_DRV_RV3029C2=m
+CONFIG_RTC_DRV_RV3029_HWMON=y

 #
 # Platform RTC drivers
@@ -6319,6 +6366,11 @@ CONFIG_HSU_DMA=m
 CONFIG_ASYNC_TX_DMA=y
 CONFIG_DMATEST=m
 CONFIG_DMA_ENGINE_RAID=y
+
+#
+# DMABUF options
+#
+# CONFIG_SYNC_FILE is not set
 CONFIG_DCA=m
 CONFIG_AUXDISPLAY=y
 CONFIG_KS0108=m
@@ -6560,9 +6612,7 @@ CONFIG_VT6656=m
 #
 CONFIG_ADIS16201=m
 CONFIG_ADIS16203=m
-CONFIG_ADIS16204=m
 CONFIG_ADIS16209=m
-CONFIG_ADIS16220=m
 CONFIG_ADIS16240=m
 CONFIG_LIS3L02DQ=m
 CONFIG_SCA3000=m
@@ -6759,6 +6809,7 @@ CONFIG_TOSHIBA_WMI=m
 CONFIG_ACPI_CMPC=m
 CONFIG_INTEL_HID_EVENT=m
 CONFIG_INTEL_IPS=m
+# CONFIG_INTEL_PMC_CORE is not set
 CONFIG_IBM_RTL=m
 CONFIG_SAMSUNG_LAPTOP=m
 CONFIG_MXM_WMI=m
@@ -6778,6 +6829,7 @@ CONFIG_CHROMEOS_PSTORE=m
 CONFIG_CROS_EC_CHARDEV=m
 CONFIG_CROS_EC_LPC=m
 CONFIG_CROS_EC_PROTO=y
+CONFIG_CROS_KBD_LED_BACKLIGHT=m
 CONFIG_CLKDEV_LOOKUP=y
 CONFIG_COMMON_CLK=y
@@ -6794,6 +6846,8 @@ CONFIG_CLK_TWL6040=m
 CONFIG_COMMON_CLK_PALMAS=m
 CONFIG_COMMON_CLK_PWM=m
 # CONFIG_COMMON_CLK_PXA is not set
+# CONFIG_COMMON_CLK_PIC32 is not set
+# CONFIG_COMMON_CLK_OXNAS is not set

 #
 # Hardware Spinlock drivers
@@ -6819,7 +6873,6 @@ CONFIG_IOMMU_SUPPORT=y
 #
 CONFIG_IOMMU_IOVA=y
 CONFIG_AMD_IOMMU=y
-# CONFIG_AMD_IOMMU_STATS is not set
 CONFIG_AMD_IOMMU_V2=m
 CONFIG_DMAR_TABLE=y
 CONFIG_INTEL_IOMMU=y
@@ -6852,6 +6905,7 @@ CONFIG_DEVFREQ_GOV_SIMPLE_ONDEMAND=y
 CONFIG_DEVFREQ_GOV_PERFORMANCE=y
 CONFIG_DEVFREQ_GOV_POWERSAVE=y
 CONFIG_DEVFREQ_GOV_USERSPACE=y
+CONFIG_DEVFREQ_GOV_PASSIVE=m

 #
 # DEVFREQ Drivers
@@ -6982,6 +7036,9 @@ CONFIG_AD5380=m
 CONFIG_AD5421=m
 CONFIG_AD5446=m
 CONFIG_AD5449=m
+CONFIG_AD5592R_BASE=m
+CONFIG_AD5592R=m
+CONFIG_AD5593R=m
 CONFIG_AD5504=m
 CONFIG_AD5624R_SPI=m
 CONFIG_AD5686=m
@@ -7045,6 +7102,7 @@ CONFIG_MAX30100=m
 #
 # Humidity sensors
 #
+CONFIG_AM2315=m
 CONFIG_DHT11=m
 CONFIG_HDC100X=m
 CONFIG_HTU21=m
@@ -7056,6 +7114,9 @@ CONFIG_SI7020=m
 #
 CONFIG_ADIS16400=m
 CONFIG_ADIS16480=m
+CONFIG_BMI160=m
+CONFIG_BMI160_I2C=m
+CONFIG_BMI160_SPI=m
 # CONFIG_KMX61 is not set
 CONFIG_INV_MPU6050_IIO=m
 CONFIG_INV_MPU6050_I2C=m
@@ -7072,6 +7133,7 @@ CONFIG_AL3320A=m
 CONFIG_APDS9300=m
 CONFIG_APDS9960=m
 CONFIG_BH1750=m
+CONFIG_BH1780=m
 CONFIG_CM32181=m
 CONFIG_CM3232=m
 CONFIG_CM3323=m
@@ -7083,6 +7145,7 @@ CONFIG_HID_SENSOR_PROX=m
 CONFIG_JSA1212=m
 CONFIG_RPR0521=m
 CONFIG_LTR501=m
+CONFIG_MAX44000=m
 CONFIG_OPT3001=m
 CONFIG_PA12203001=m
 CONFIG_STK3310=m
@@ -7092,6 +7155,7 @@ CONFIG_SENSORS_TSL2563=m
 CONFIG_TSL4531=m
 CONFIG_US5182D=m
 CONFIG_VCNL4000=m
+CONFIG_VEML6070=m

 #
 # Magnetometer sensors
@@ -7099,6 +7163,8 @@ CONFIG_VCNL4000=m
 CONFIG_AK8975=m
 CONFIG_AK09911=m
 CONFIG_BMC150_MAGN=m
+CONFIG_BMC150_MAGN_I2C=m
+CONFIG_BMC150_MAGN_SPI=m
 CONFIG_MAG3110=m
 CONFIG_HID_SENSOR_MAGNETOMETER_3D=m
 CONFIG_MMC35240=m
@@ -7125,14 +7191,16 @@ CONFIG_IIO_SYSFS_TRIGGER=m
 #
 # Digital potentiometers
 #
+CONFIG_DS1803=m
+CONFIG_MCP4131=m
 CONFIG_MCP4531=m
 CONFIG_TPL0102=m

 #
 # Pressure sensors
 #
-CONFIG_BMP280=m
 CONFIG_HID_SENSOR_PRESS=m
+CONFIG_HP03=m
 CONFIG_MPL115=m
 CONFIG_MPL115_I2C=m
 CONFIG_MPL115_SPI=m
@@ -7143,6 +7211,7 @@ CONFIG_IIO_ST_PRESS=m
 CONFIG_IIO_ST_PRESS_I2C=m
 CONFIG_IIO_ST_PRESS_SPI=m
 CONFIG_T5403=m
+CONFIG_HP206C=m

 #
 # Lightning sensors
@@ -7226,7 +7295,7 @@ CONFIG_INTEL_RAPL=m
 # Performance monitor support
 #
 CONFIG_RAS=y
-CONFIG_AMD_MCE_INJ=m
+CONFIG_MCE_AMD_INJ=m
 CONFIG_THUNDERBOLT=m

 #
@@ -7239,6 +7308,7 @@ CONFIG_ND_BLK=y
 CONFIG_ND_CLAIM=y
 CONFIG_ND_BTT=y
 CONFIG_BTT=y
+CONFIG_DEV_DAX=m
 CONFIG_NVMEM=m
 CONFIG_STM=m
 CONFIG_STM_DUMMY=m
@@ -7285,6 +7355,8 @@ CONFIG_EFI_VARS_PSTORE_DEFAULT_DISABLE=y
 CONFIG_EFI_RUNTIME_MAP=y
 # CONFIG_EFI_FAKE_MEMMAP is not set
 CONFIG_EFI_RUNTIME_WRAPPERS=y
+CONFIG_EFI_BOOTLOADER_CONTROL=m
+CONFIG_EFI_CAPSULE_LOADER=m
 CONFIG_UEFI_CPER=y

 #
@@ -7347,6 +7419,7 @@ CONFIG_F2FS_FS_SECURITY=y
 # CONFIG_F2FS_CHECK_FS is not set
 # CONFIG_F2FS_FS_ENCRYPTION is not set
 # CONFIG_F2FS_IO_TRACE is not set
+# CONFIG_F2FS_FAULT_INJECTION is not set
 CONFIG_FS_DAX=y
 CONFIG_FS_POSIX_ACL=y
 CONFIG_EXPORTFS=y
@@ -7648,9 +7721,6 @@ CONFIG_DYNAMIC_DEBUG=y
 # Compile-time checks and compiler options
 #
 # CONFIG_DEBUG_INFO is not set
-# CONFIG_DEBUG_INFO_SPLIT is not set
-# CONFIG_DEBUG_INFO_DWARF4 is not set
-# CONFIG_GDB_SCRIPTS is not set
 CONFIG_ENABLE_WARN_DEPRECATED=y
 CONFIG_ENABLE_MUST_CHECK=y
 CONFIG_FRAME_WARN=2048
@@ -7739,6 +7809,7 @@ CONFIG_DEBUG_BUGVERBOSE=y
 # CONFIG_PROVE_RCU is not set
 # CONFIG_SPARSE_RCU_POINTER is not set
 # CONFIG_TORTURE_TEST is not set
+# CONFIG_RCU_PERF_TEST is not set
 # CONFIG_RCU_TORTURE_TEST is not set
 CONFIG_RCU_CPU_STALL_TIMEOUT=60
 # CONFIG_RCU_TRACE is not set
@@ -7788,6 +7859,7 @@ CONFIG_FUNCTION_PROFILER=y
 CONFIG_FTRACE_MCOUNT_RECORD=y
 # CONFIG_FTRACE_STARTUP_TEST is not set
 CONFIG_MMIOTRACE=y
+# CONFIG_HIST_TRIGGERS is not set
 # CONFIG_MMIOTRACE_TEST is not set
 # CONFIG_TRACEPOINT_BENCHMARK is not set
 CONFIG_RING_BUFFER_BENCHMARK=m
@@ -7812,7 +7884,9 @@ CONFIG_INTERVAL_TREE_TEST=m
 # CONFIG_TEST_KSTRTOX is not set
 # CONFIG_TEST_PRINTF is not set
 # CONFIG_TEST_BITMAP is not set
+# CONFIG_TEST_UUID is not set
 # CONFIG_TEST_RHASHTABLE is not set
+# CONFIG_TEST_HASH is not set
 # CONFIG_PROVIDE_OHCI1394_DMA_INIT is not set
 # CONFIG_DMA_API_DEBUG is not set
 # CONFIG_TEST_LKM is not set
@@ -7867,6 +7941,7 @@ CONFIG_KEYS=y
 # CONFIG_BIG_KEYS is not set
 CONFIG_TRUSTED_KEYS=m
 CONFIG_ENCRYPTED_KEYS=m
+# CONFIG_KEY_DH_OPERATIONS is not set
 # CONFIG_SECURITY_DMESG_RESTRICT is not set
 CONFIG_SECURITY=y
 CONFIG_SECURITYFS=y
@@ -7886,26 +7961,29 @@ CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=0
 # CONFIG_SECURITY_SMACK is not set
 # CONFIG_SECURITY_TOMOYO is not set
 # CONFIG_SECURITY_APPARMOR is not set
+# CONFIG_SECURITY_LOADPIN is not set
 # CONFIG_SECURITY_YAMA is not set
 CONFIG_INTEGRITY=y
-# CONFIG_INTEGRITY_ASYMMETRIC_KEYS is not set
-CONFIG_INTEGRITY_AUDIT=y
 # CONFIG_INTEGRITY_SIGNATURE is not set
+CONFIG_INTEGRITY_AUDIT=y
 CONFIG_IMA=y
+CONFIG_IMA_MEASURE_PCR_IDX=10
+CONFIG_IMA_LSM_RULES=y
 # CONFIG_IMA_TEMPLATE is not set
 CONFIG_IMA_NG_TEMPLATE=y
 # CONFIG_IMA_SIG_TEMPLATE is not set
+CONFIG_IMA_DEFAULT_TEMPLATE="ima-ng"
 CONFIG_IMA_DEFAULT_HASH_SHA1=y
 # CONFIG_IMA_DEFAULT_HASH_SHA256 is not set
 # CONFIG_IMA_DEFAULT_HASH_SHA512 is not set
 # CONFIG_IMA_DEFAULT_HASH_WP512 is not set
-# CONFIG_EVM is not set
-# CONFIG_IMA_APPRAISE is not set
+CONFIG_IMA_DEFAULT_HASH="sha1"
 # CONFIG_IMA_WRITE_POLICY is not set
 # CONFIG_IMA_READ_POLICY is not set
-# CONFIG_DEFAULT_SECURITY_TOMOYO is not set
-# CONFIG_DEFAULT_SECURITY_DAC is not set
+# CONFIG_IMA_APPRAISE is not set
+# CONFIG_EVM is not set
 CONFIG_DEFAULT_SECURITY_SELINUX=y
+# CONFIG_DEFAULT_SECURITY_DAC is not set
 CONFIG_DEFAULT_SECURITY="selinux"
 CONFIG_XOR_BLOCKS=y
 CONFIG_ASYNC_CORE=m
@@ -7996,7 +8074,7 @@ CONFIG_CRYPTO_RMD128=m
 CONFIG_CRYPTO_RMD160=m
 CONFIG_CRYPTO_RMD256=m
 CONFIG_CRYPTO_RMD320=m
-CONFIG_CRYPTO_SHA1=m
+CONFIG_CRYPTO_SHA1=y
 CONFIG_CRYPTO_SHA1_SSSE3=m
 CONFIG_CRYPTO_SHA256_SSSE3=m
 CONFIG_CRYPTO_SHA512_SSSE3=m
@@ -8086,7 +8164,7 @@ CONFIG_CRYPTO_DEV_QAT_C62X=m
 CONFIG_CRYPTO_DEV_QAT_DH895xCCVF=m
 CONFIG_CRYPTO_DEV_QAT_C3XXXVF=m
 CONFIG_CRYPTO_DEV_QAT_C62XVF=m
-CONFIG_ASYMMETRIC_KEY_TYPE=m
+CONFIG_ASYMMETRIC_KEY_TYPE=y
 CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=m
 CONFIG_X509_CERTIFICATE_PARSER=m
 CONFIG_PKCS7_MESSAGE_PARSER=m
@@ -8175,6 +8253,7 @@ CONFIG_TEXTSEARCH_BM=m
 CONFIG_TEXTSEARCH_FSM=m
 CONFIG_BTREE=y
 CONFIG_INTERVAL_TREE=y
+CONFIG_RADIX_TREE_MULTIORDER=y
 CONFIG_ASSOCIATIVE_ARRAY=y
 CONFIG_HAS_IOMEM=y
 CONFIG_HAS_IOPORT_MAP=y
diff --git a/kernel.spec b/kernel.spec
index 9e544e6..892134d 100644
--- a/kernel.spec
+++ b/kernel.spec
@@ -1,7 +1,7 @@
 %define kernelversion 4
-%define patchlevel 6
+%define patchlevel 7
 # sublevel is now used for -stable patches
-%define sublevel 7
+%define sublevel 2

 # Release number. Increase this before a rebuild.
 %define rpmrel 1
@@ -204,13 +204,11 @@ Patch108: ata-prefer-ata-drivers-over-ide-drivers-when-both-are-built.patch
 # AUFS from http://aufs.sourceforge.net/
 Patch109: fs-aufs4.patch

-# https://bugzilla.kernel.org/show_bug.cgi?id=69131
-Patch110: hp-wmi-rfkill-fix.patch
-
 # BFQ IO scheduler, http://algogroup.unimore.it/people/paolo/disk_sched/
-Patch111: 0001-block-cgroups-kconfig-build-bits-for-BFQ-v7r11-4.5.0.patch
-Patch112: 0002-block-introduce-the-BFQ-v7r11-I-O-sched-for-4.5.0.patch
+Patch111: 0001-block-cgroups-kconfig-build-bits-for-BFQ-v7r11-4.7.0.patch
+Patch112: 0002-block-introduce-the-BFQ-v7r11-I-O-sched-for-4.7.0.patch
 Patch113: 0003-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r11-for.patch
+Patch114: 0004-block-bfq-turn-BFQ-v7r11-for-4.7.0-into-BFQ-v8r2-for.patch

 # Sanitizing kernel memory
 # We do not use "Patch:" here because apply_patched would always apply it
diff --git a/linux-4.6.tar.sign b/linux-4.6.tar.sign
deleted file mode 100644
index 29b51ba..0000000
--- a/linux-4.6.tar.sign
+++ /dev/null
@@ -1,11 +0,0 @@
------BEGIN PGP SIGNATURE-----
-Version: GnuPG v1
-
-iQEcBAABAgAGBQJXOP0sAAoJEHm+PkMAQRiGcSkH/397p/jPGp8XUNp/gRZuf42j
-nepYZuFvOrO6iVFHKQp8bdVhPXukQq8QgmGcXowMKZntvA6w+mqw9pMOd0/imq1r
-kkd6Hc/qCZxsZjCD27lDaHL25J/yMImxApNHGFGxI6sFJpGmIp0hb2vC3j6LHMPf
-e8rIEQcm+wRs+RYFQVBGi1XMFc+QU8hbdHzNw3wNd+EacKXYcacJVBETRQrPs8OY
-aLZV8GRe9zAJiuIj1YF7sy6Y3Jt2D3yI1wD+ps+hClFPkw8hlkch+8uSiOPqULeD
-lwJTq1cj/RW85Fav1EoeiyVFOlLDvG3ndXD6WwnfxdTBFT3QXBuGaLgvhoJLZRk=
-=ktV3
------END PGP SIGNATURE-----
diff --git a/linux-4.7.tar.sign b/linux-4.7.tar.sign
new file mode 100644
index 0000000..7d4db9d
--- /dev/null
+++ b/linux-4.7.tar.sign
@@ -0,0 +1,11 @@
+-----BEGIN PGP SIGNATURE-----
+Version: GnuPG v1
+
+iQEcBAABAgAGBQJXlR20AAoJEHm+PkMAQRiGE28H/2prCblJSfXW40RNq3uQydhl
+AT8Vo/VnsO3lIa41Py9Iet8ZXP+Wg5ed4nGNXs6myxwW/wxFDx1+peD1pJWWOqf9
+krt1FA7jB4gmqNSsd+AgiUy9ZaRFxTXFXqPdMbiwU8O+UEhYllMJGfobH1RMu4Ul
+4uLszvNlppbYxQeB94Ft0cOGeRxJE5jBltc9KJvYOSog1upa+1vNiwHBD5BAOOUC
+LJtpUbTr0p3D5/mpzhkGaam7hjRsgqsm6X84ebUdR9RqoMkYaGj2cT9n2NR1FS29
+kwH2rRB2VYIYkUGVb3ELHZTVFJyf4yQWo6/t6frrEmYi3n+Xrc6aNQ3gVKQyEH4=
+=xcAK
+-----END PGP SIGNATURE-----
diff --git a/patch-4.6.7.sign b/patch-4.6.7.sign
deleted file mode 100644
index 22581b1..0000000
--- a/patch-4.6.7.sign
+++ /dev/null
@@ -1,17 +0,0 @@
------BEGIN PGP SIGNATURE-----
-Version: GnuPG v2
-
-iQIcBAABCAAGBQJXssHkAAoJEDjbvchgkmk+TZsP/1DUuZ64TlYgLKvJgPjzwIE1
-rklRklYGwqg0YX3Yb6g7Gq33KgyOjtvmpm4FP6LCwUb+9WAZahDXhGYRrpnmLKGU
-JavlgbHGHtN5FGqeSMIAS02jBvSpBvHyRqc1NF3BBrR67sL/TTmY0O/NuAgb/UkA
-vtipSiWZFHiPOWakQyG2ZjjX88Zb2xcqLJnGTAs7jzZwldJTVSxPCgVIJ3lqKtIW
-WWnePdJGCErtyYg2xhM29xpwNwmbm5Li2u1ZLMp/MTvX6T0VAp2d53/nqgKRhgft
-s2vLRYIcnZT6k68TTOj+mRyT+5FBdsd+vWa7MKULLMTJc28YyDNFbtOFSRCpdFaq
-bwSJi34l6aeBef9fEbnp9dNoU/KIqoUsBBlVRFrV3aNFlvWCJMHWTKDx77Qisi6M
-JlfuLRT6XSbuNSm37P2A4hqOOgVQDA2SLv4yaTrfVCPQu+QGg8h0eHe7OoNc6UxU
-8n5hV8JTeLDSw3jkgEb2eLq9isKijuOABKG4yepCVB4BiDgkup2t02d7Tw8xrGHT
-YchQIpUjbVhlmahOMxgv10i8Mv4ZEm0B3VLNAv3SK1XdF2kUiyYiiKFHh3kYyp6G
-n48zQITc+lZVKHsbWey6KuDaVwJjpT2a6fW8LEy4JfVYrrC1bWktlLFjQXiyqtyl
-qxu8zAArUm2z20nF+VcB
-=4Blr
------END PGP SIGNATURE-----
diff --git a/patch-4.7.2.sign b/patch-4.7.2.sign
new file mode 100644
index 0000000..4a11627
--- /dev/null
+++ b/patch-4.7.2.sign
@@ -0,0 +1,17 @@
+-----BEGIN PGP SIGNATURE-----
+Version: GnuPG v2
+
+iQIcBAABCAAGBQJXuIE5AAoJEDjbvchgkmk+0wgQAKfn5FWSSxyYkrmkv+HuVAtl
+XPcuJaiHCjLFGkgHt9jk9rKuwucyAEnKArXEDH0O8TICrv6IZ3gzzXsNWVxaJCzq
+l1ElPQvKj8oKRQee37fXR6xnpZ2v8WhKuEjlYxmp0zoCpzxkWWmMhW6Wkl0LHtV6
+jeBaVF2Boo/O8HlRgZq0V1wTOQnGgpGL2IcweSNa6T3MvrW5lIkcoEijcXfhA6OC
+EUuEgNb+mB4UWKVMvtrYT3TpQpRDyYc/QK6PDaumGHYAer8vFubgx4A/ZhMA6zh5
+QTDi4N2A6Bk1p+jpVpgFfSY+W3u+2S5MzEbLQdILiPt+RGhaqJwAv82jHesMyJ58
+tTZ735RGdfGQQdUM54UmFIGskSs6jUITIoSrMGH04ufGtdzzTscJ7pv4igocWByH
+pwyZRj8I0awyzZMk6tt8w0y6+9a198Gko2NUWybOkTBJSwJM6NioILsHE3yVZl9k
+celTY6lua77vKwHw2KzcAn7Cs7x+VpWQu6e7W6+vWDQhOIl3udyPyVEr9yStfDxa
+D7/yuXROeSbeWHoSrxFBtJJz/WaY7faJTntWpjcHo70HybpNgmeCTkAOweyUAz+m
+v6AX4NW63Mz8CQvLDGzIzolvRmpFLsI/x8JExeAvacTmp5ly2Rx0MuvBR6JQgKHe
+ajZc1OFSNqx18elDXr/H
+=ZLOb
+-----END PGP SIGNATURE-----
diff --git a/sanitize-memory.patch b/sanitize-memory.patch
index 3e42580..f166334 100644
--- a/sanitize-memory.patch
+++ b/sanitize-memory.patch
@@ -1,8 +1,8 @@
 diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
-index 0b3de80..2167ea0 100644
+index 82b42c9..090568e 100644
 --- a/Documentation/kernel-parameters.txt
 +++ b/Documentation/kernel-parameters.txt
-@@ -2818,6 +2818,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
+@@ -2862,6 +2862,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
  	the specified number of seconds. This is to be used if
  	your oopses keep scrolling off the screen.
@@ -14,7 +14,7 @@ index 0b3de80..2167ea0 100644
  	pcd. [PARIDE]
 diff --git a/fs/buffer.c b/fs/buffer.c
-index af0d9a8..2437a67 100644
+index 754813a..4c25e3c 100644
 --- a/fs/buffer.c
 +++ b/fs/buffer.c
 @@ -3406,7 +3406,7 @@ void __init buffer_init(void)
 	/*
 diff --git a/fs/dcache.c b/fs/dcache.c
-index 44008e3..e5c7f9d 100644
+index 1ed81bb..30f6c6b 100644
 --- a/fs/dcache.c
 +++ b/fs/dcache.c
-@@ -3444,7 +3444,8 @@ void __init vfs_caches_init_early(void)
+@@ -3709,7 +3709,8 @@ void __init vfs_caches_init_early(void)
 void __init vfs_caches_init(void)
 {
 	names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0,
@@ -41,7 +41,7 @@ index 44008e3..e5c7f9d 100644
 	dcache_init();
 	inode_init();
 diff --git a/include/linux/slab.h b/include/linux/slab.h
-index 508bd82..35f172f 100644
+index aeb3e6d..df60597 100644
 --- a/include/linux/slab.h
 +++ b/include/linux/slab.h
 @@ -23,6 +23,13 @@
 #define SLAB_CACHE_DMA		0x00004000UL	/* Use GFP_DMA memory */
 #define SLAB_STORE_USER		0x00010000UL	/* DEBUG: Store the last owner for bug hunting */
 diff --git a/kernel/fork.c b/kernel/fork.c
-index d277e83..3ee91a2 100644
+index 4a7ec0c..49f43cb 100644
 --- a/kernel/fork.c
 +++ b/kernel/fork.c
-@@ -1882,7 +1882,7 @@ void __init proc_caches_init(void)
+@@ -1913,7 +1913,7 @@ void __init proc_caches_init(void)
 		sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
 		SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL);
 	nsproxy_cache_init();
 }
 diff --git a/mm/rmap.c b/mm/rmap.c
-index 3ebf9c4..4e0d554 100644
+index 701b93f..22ab5d9 100644
 --- a/mm/rmap.c
 +++ b/mm/rmap.c
 @@ -429,10 +429,10 @@ static void anon_vma_ctor(void *data)
 	/*
 diff --git a/mm/slab.c b/mm/slab.c
-index 17e2848..25c241f 100644
+index cc8bbc1..6bd0823 100644
 --- a/mm/slab.c
 +++ b/mm/slab.c
+@@ -3560,6 +3560,17 @@ void ___cache_free(struct kmem_cache *cachep, void *objp,
+	struct array_cache *ac = cpu_cache_get(cachep);
 	check_irq_off();
 +
 	objp = cache_free_debugcheck(cachep, objp, caller);
 diff --git a/mm/slab.h b/mm/slab.h
-index 5969769..2f0bbc6 100644
+index dedb1a9..1d157d4 100644
 --- a/mm/slab.h
 +++ b/mm/slab.h
 @@ -70,6 +70,15 @@ extern struct list_head slab_caches;
 	unsigned long align, unsigned long size);
 diff --git a/mm/slab_common.c b/mm/slab_common.c
-index 3239bfd..8a974f5 100644
+index 82317ab..a5e0b77 100644
 --- a/mm/slab_common.c
 +++ b/mm/slab_common.c
 @@ -44,7 +44,11 @@ struct kmem_cache *kmem_cache;
 index 5ec1580..385cdbc 100644
 	/* This slob page is about to become partially free. Easy! */
 	sp->units = units;
 diff --git a/mm/slub.c b/mm/slub.c
-index 4dbb109e..da2dc67 100644
+index 825ff45..c4eb91d 100644
 --- a/mm/slub.c
 +++ b/mm/slub.c
 @@ -2778,6 +2778,22 @@ static __always_inline void slab_free(struct kmem_cache *s, struct page *page,
 	/*
 	 * Relocate free pointer after the object if it is not
 diff --git a/net/core/skbuff.c b/net/core/skbuff.c
-index 59bf4d7..69f08cd 100644
+index eb12d21..9d8f097 100644
 --- a/net/core/skbuff.c
 +++ b/net/core/skbuff.c
-@@ -3396,12 +3396,14 @@ void __init skb_init(void)
+@@ -3426,12 +3426,14 @@ void __init skb_init(void)
 	skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
 		sizeof(struct sk_buff), 0,
 }
 diff --git a/security/Kconfig b/security/Kconfig
 index e452378..a7ca1d9 100644
 --- a/security/Kconfig
 +++ b/security/Kconfig
 @@ -6,6 +6,37 @@ menu "Security options"
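
The sanitize-memory.patch hunks above rebase the patch onto the allocator free paths of the 4.7 kernel (mm/slab.c, mm/slub.c) and onto kmem_cache_create() call sites such as fs/dcache.c, kernel/fork.c, and net/core/skbuff.c. For readers who want the shape of the idea without digging through the kernel tree, here is a minimal user-space sketch in C of what sanitizing a slab-style cache means: a freed object is wiped before its memory can be reused, unless the cache was created with an opt-out flag. This is only an illustration under assumed names; EX_NO_SANITIZE, ex_cache, and ex_cache_free are invented for the example, since the patch's own identifiers are not visible in the hunks shown here.

/* Illustrative sketch only -- not code from sanitize-memory.patch.
 * It mimics in user space what the mm/slab.c and mm/slub.c hunks do
 * in the kernel: wipe an object when it is freed back to its cache,
 * unless the cache opted out via a creation flag.
 * All identifiers are made up for the example. */
#include <stdlib.h>
#include <string.h>

#define EX_NO_SANITIZE 0x1UL  /* stand-in for a per-cache opt-out flag */

struct ex_cache {
	size_t object_size;   /* fixed size of every object in the cache */
	unsigned long flags;  /* creation flags, as with kmem_cache_create() */
};

static void ex_cache_free(struct ex_cache *cache, void *obj)
{
	/* The sanitizing step: clear the freed object so stale data
	 * (keys, file names, packet contents) cannot leak to the next
	 * user of this memory. */
	if (!(cache->flags & EX_NO_SANITIZE))
		memset(obj, 0, cache->object_size);
	free(obj);
}

int main(void)
{
	struct ex_cache names = { .object_size = 64, .flags = 0 };
	char *obj = malloc(names.object_size);

	strcpy(obj, "transient secret");
	ex_cache_free(&names, obj);  /* wiped before release */
	return 0;
}

Hot caches such as the skbuff caches in net/core/skbuff.c are the kind of caches a patch like this would typically let you exempt per cache, which matches the fact that the hunks above touch kmem_cache_create() call sites rather than only the allocator core.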